Spaces:

AuraSystems
/

spanish-embeddings-api

Sleeping

App Files Files Community

Jordi Catafal committed on Jun 1

Commit

909d9bf

1 Parent(s): 0a6cb95

Add Catalan RoBERTa model - now supporting 5 models

Browse files

Files changed (7) hide show

Dockerfile +5 -2
README.md +44 -10
app.py +17 -7
models/schemas.py +2 -2
requirements.txt +2 -1
utils/__init__.py +0 -1
utils/helpers.py +29 -2

Dockerfile CHANGED Viewed

@@ -5,9 +5,12 @@ ENV PYTHONUNBUFFERED=1
 ENV TRANSFORMERS_CACHE=/app/cache
 ENV HF_HOME=/app/cache
 ENV PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.6,max_split_size_mb:128
-# Add this to handle the larger models
 ENV TRANSFORMERS_OFFLINE=0
 ENV HF_HUB_ENABLE_HF_TRANSFER=1
 # Install system dependencies for better performance
 RUN apt-get update && apt-get install -y \
@@ -39,4 +42,4 @@ RUN mkdir -p /app/cache
 EXPOSE 7860
 # Run the application
-CMD ["python", "app.py"]

 ENV TRANSFORMERS_CACHE=/app/cache
 ENV HF_HOME=/app/cache
 ENV PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.6,max_split_size_mb:128
+# Optimize for multiple large models
 ENV TRANSFORMERS_OFFLINE=0
 ENV HF_HUB_ENABLE_HF_TRANSFER=1
+ENV TOKENIZERS_PARALLELISM=false
+# Lower glibc's malloc trim threshold (bytes) so freed heap memory is returned to the OS sooner
+ENV MALLOC_TRIM_THRESHOLD_=100000
 # Install system dependencies for better performance
 RUN apt-get update && apt-get install -y \
 EXPOSE 7860
 # Run the application
+CMD ["python", "-u", "app.py"]

README.md CHANGED Viewed

@@ -10,10 +10,9 @@ pinned: false
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 --------------------------------
-# Spanish & Legal Embeddings API
-A high-performance API for generating embeddings from Spanish, English, and multilingual text using state-of-the-art models. This API provides access to four specialized models optimized for different use cases and languages.
 ## 🚀 Quick Start
@@ -29,6 +28,7 @@ A high-performance API for generating embeddings from Spanish, English, and mult
 | **robertalex** | 512 | Spanish | 768 | Spanish legal documents, formal Spanish |
 | **jina-v3** | 8,192 | Multilingual (30+ languages) | 1,024 | Superior multilingual embeddings, long context |
 | **legal-bert** | 512 | English | 768 | English legal documents, contracts, law texts |
 ## 🔗 API Endpoints
@@ -96,7 +96,22 @@ multilingual_response = requests.post(
 )
 print(f"Jina v3 dimensions: {multilingual_response.json()['dimensions']}")  # 1024 dims
-# Example 3: Legal text with RoBERTalex (Spanish)
 spanish_legal_response = requests.post(
     f"{API_URL}/embed",
     json={
@@ -109,7 +124,7 @@ spanish_legal_response = requests.post(
     }
 )
-# Example 4: Legal text with Legal-BERT (English)
 english_legal_response = requests.post(
     f"{API_URL}/embed",
     json={
@@ -122,11 +137,12 @@ english_legal_response = requests.post(
     }
 )
-# Example 5: Compare similarity across models
-text = "artificial intelligence and law"
 models_comparison = {}
-for model in ["jina", "jina-v3", "legal-bert"]:
     resp = requests.post(
         f"{API_URL}/embed",
         json={"texts": [text], "model": model, "normalize": True}
@@ -148,6 +164,15 @@ curl -X POST "https://aurasystems-spanish-embeddings-api.hf.space/embed" \
        "normalize": true
      }'
 # Using Jina v3 for multilingual embeddings
 curl -X POST "https://aurasystems-spanish-embeddings-api.hf.space/embed" \
      -H "Content-Type: application/json" \
@@ -234,7 +259,7 @@ class MultilingualEmbeddings(Embeddings):
         Initialize embeddings
         Args:
-            model: One of "jina", "robertalex", "jina-v3", "legal-bert"
         """
         self.api_url = "https://aurasystems-spanish-embeddings-api.hf.space/embed"
         self.model = model
@@ -262,12 +287,21 @@ spanish_docs = spanish_embeddings.embed_documents([
     "Segundo documento en español"
 ])
 # Multilingual embeddings with Jina v3
 multilingual_embeddings = MultilingualEmbeddings(model="jina-v3")
 mixed_docs = multilingual_embeddings.embed_documents([
     "English document",
     "Documento en español",
-    "Document en français"
 ])
 # Legal embeddings for English

 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 --------------------------------
+# Multilingual & Legal Embeddings API
+A high-performance API for generating embeddings from Spanish, Catalan, English, and multilingual text using state-of-the-art models. This API provides access to five specialized models optimized for different use cases and languages.
 ## 🚀 Quick Start
 | **robertalex** | 512 | Spanish | 768 | Spanish legal documents, formal Spanish |
 | **jina-v3** | 8,192 | Multilingual (30+ languages) | 1,024 | Superior multilingual embeddings, long context |
 | **legal-bert** | 512 | English | 768 | English legal documents, contracts, law texts |
+| **roberta-ca** | 512 | Catalan | 1,024 | Catalan text, general purpose, RoBERTa-large architecture |
 ## 🔗 API Endpoints
 )
 print(f"Jina v3 dimensions: {multilingual_response.json()['dimensions']}")  # 1024 dims
+# Example 3: Catalan text with RoBERTa-ca
+catalan_response = requests.post(
+    f"{API_URL}/embed",
+    json={
+        "texts": [
+            "Bon dia, com estàs?",
+            "M'agrada programar en Python",
+            "Barcelona és una ciutat meravellosa"
+        ],
+        "model": "roberta-ca",
+        "normalize": True
+    }
+)
+print(f"Catalan RoBERTa dimensions: {catalan_response.json()['dimensions']}")  # 1024 dims
+# Example 4: Legal text with RoBERTalex (Spanish)
 spanish_legal_response = requests.post(
     f"{API_URL}/embed",
     json={
     }
 )
+# Example 5: Legal text with Legal-BERT (English)
 english_legal_response = requests.post(
     f"{API_URL}/embed",
     json={
     }
 )
+# Example 6: Compare similarity across models
+text_es = "inteligencia artificial"
+text_ca = "intel·ligència artificial"
 models_comparison = {}
+for model, text in [("jina", text_es), ("roberta-ca", text_ca), ("jina-v3", text_es)]:
     resp = requests.post(
         f"{API_URL}/embed",
         json={"texts": [text], "model": model, "normalize": True}
        "normalize": true
      }'
+# Catalan text with RoBERTa-ca
+curl -X POST "https://aurasystems-spanish-embeddings-api.hf.space/embed" \
+     -H "Content-Type: application/json" \
+     -d '{
+       "texts": ["Bon dia", "Com està vostè?", "Catalunya és meravellosa"],
+       "model": "roberta-ca",
+       "normalize": true
+     }'
 # Using Jina v3 for multilingual embeddings
 curl -X POST "https://aurasystems-spanish-embeddings-api.hf.space/embed" \
      -H "Content-Type: application/json" \
         Initialize embeddings
         Args:
+            model: One of "jina", "robertalex", "jina-v3", "legal-bert", "roberta-ca"
         """
         self.api_url = "https://aurasystems-spanish-embeddings-api.hf.space/embed"
         self.model = model
     "Segundo documento en español"
 ])
+# Catalan embeddings
+catalan_embeddings = MultilingualEmbeddings(model="roberta-ca")
+catalan_docs = catalan_embeddings.embed_documents([
+    "Primer document en català",
+    "Segon document en català",
+    "La cultura catalana és rica i diversa"
+])
 # Multilingual embeddings with Jina v3
 multilingual_embeddings = MultilingualEmbeddings(model="jina-v3")
 mixed_docs = multilingual_embeddings.embed_documents([
     "English document",
     "Documento en español",
+    "Document en français",
+    "Document en català"
 ])
 # Legal embeddings for English

app.py CHANGED Viewed

@@ -9,9 +9,9 @@ from models.schemas import EmbeddingRequest, EmbeddingResponse, ModelInfo
 from utils.helpers import load_models, get_embeddings, cleanup_memory
 app = FastAPI(
-    title="Spanish & Legal Embedding API",
-    description="Multi-model embedding API for Spanish and Legal texts",
-    version="2.0.0"
 )
 # Global model cache
@@ -27,10 +27,11 @@ async def startup_event():
 @app.get("/")
 async def root():
     return {
-        "message": "Spanish & Legal Embedding API",
-        "models": ["jina", "robertalex", "jina-v3", "legal-bert"],
         "status": "running",
-        "docs": "/docs"
     }
 @app.post("/embed", response_model=EmbeddingResponse)
@@ -106,6 +107,15 @@ async def list_models():
             languages=["English"],
             model_type="legal domain",
             description="English legal domain BERT model"
         )
     ]
@@ -114,7 +124,7 @@ async def health_check():
     """Health check endpoint"""
     return {
         "status": "healthy",
-        "models_loaded": len(models_cache) == 4,
         "available_models": list(models_cache.keys())
     }

 from utils.helpers import load_models, get_embeddings, cleanup_memory
 app = FastAPI(
+    title="Multilingual & Legal Embedding API",
+    description="Multi-model embedding API for Spanish, Catalan, English and Legal texts",
+    version="3.0.0"
 )
 # Global model cache
 @app.get("/")
 async def root():
     return {
+        "message": "Multilingual & Legal Embedding API",
+        "models": ["jina", "robertalex", "jina-v3", "legal-bert", "roberta-ca"],
         "status": "running",
+        "docs": "/docs",
+        "total_models": 5
     }
 @app.post("/embed", response_model=EmbeddingResponse)
             languages=["English"],
             model_type="legal domain",
             description="English legal domain BERT model"
+        ),
+        ModelInfo(
+            model_id="roberta-ca",
+            name="projecte-aina/roberta-large-ca-v2",
+            dimensions=1024,
+            max_sequence_length=512,
+            languages=["Catalan"],
+            model_type="general",
+            description="Catalan RoBERTa-large model trained on large corpus"
         )
     ]
     """Health check endpoint"""
     return {
         "status": "healthy",
+        "models_loaded": len(models_cache) == 5,
         "available_models": list(models_cache.keys())
     }

models/schemas.py CHANGED Viewed

@@ -11,7 +11,7 @@ class EmbeddingRequest(BaseModel):
         description="List of texts to embed",
         example=["Hola mundo", "¿Cómo estás?"]
     )
-    model: Literal["jina", "robertalex", "jina-v3", "legal-bert"] = Field(
         default="jina",
         description="Model to use for embeddings"
     )
@@ -41,7 +41,7 @@ class EmbeddingRequest(BaseModel):
             model = values.get('model', 'jina')
             if model in ['jina', 'jina-v3'] and v > 8192:
                 raise ValueError(f"Max length for {model} model is 8192")
-            elif model in ['robertalex', 'legal-bert'] and v > 512:
                 raise ValueError(f"Max length for {model} model is 512")
             if v < 1:
                 raise ValueError("Max length must be positive")

         description="List of texts to embed",
         example=["Hola mundo", "¿Cómo estás?"]
     )
+    model: Literal["jina", "robertalex", "jina-v3", "legal-bert", "roberta-ca"] = Field(
         default="jina",
         description="Model to use for embeddings"
     )
             model = values.get('model', 'jina')
             if model in ['jina', 'jina-v3'] and v > 8192:
                 raise ValueError(f"Max length for {model} model is 8192")
+            elif model in ['robertalex', 'legal-bert', 'roberta-ca'] and v > 512:
                 raise ValueError(f"Max length for {model} model is 512")
             if v < 1:
                 raise ValueError("Max length must be positive")

requirements.txt CHANGED Viewed

@@ -8,4 +8,5 @@ scikit-learn==1.3.2
 pydantic==2.5.0
 huggingface-hub==0.19.4
 python-multipart==0.0.6
-protobuf>=3.20.0

 pydantic==2.5.0
 huggingface-hub==0.19.4
 python-multipart==0.0.6
+protobuf>=3.20.0
+sentencepiece==0.1.99

utils/__init__.py CHANGED Viewed

@@ -1,4 +1,3 @@
 # utils/__init__.py
 """Utils package for helper functions"""



1	# utils/__init__.py
2	"""Utils package for helper functions"""
3

utils/helpers.py CHANGED Viewed

@@ -69,6 +69,15 @@ def load_models() -> Dict:
         ).to(device)
         legal_bert_model.eval()
         models_cache = {
             'jina': {
                 'tokenizer': jina_tokenizer,
@@ -93,6 +102,12 @@ def load_models() -> Dict:
                 'model': legal_bert_model,
                 'device': device,
                 'pooling': 'cls'
             }
         }
@@ -152,11 +167,16 @@ def get_embeddings(
     if max_length is None:
         if model_name in ['jina', 'jina-v3']:
             max_length = 8192
-        else:  # robertalex, legal-bert
             max_length = 512
     # Process in batches for memory efficiency
-    batch_size = 8 if len(texts) > 8 else len(texts)
     all_embeddings = []
     for i in range(0, len(texts), batch_size):
@@ -259,6 +279,13 @@ def get_model_info(model_name: str) -> Dict:
             'max_length': 512,
             'pooling': 'cls',
             'languages': ['English']
         }
     }

         ).to(device)
         legal_bert_model.eval()
+        # Load Catalan RoBERTa model
+        print("Loading Catalan RoBERTa-large model...")
+        roberta_ca_tokenizer = AutoTokenizer.from_pretrained('projecte-aina/roberta-large-ca-v2')
+        roberta_ca_model = AutoModel.from_pretrained(
+            'projecte-aina/roberta-large-ca-v2',
+            torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+        ).to(device)
+        roberta_ca_model.eval()
         models_cache = {
             'jina': {
                 'tokenizer': jina_tokenizer,
                 'model': legal_bert_model,
                 'device': device,
                 'pooling': 'cls'
+            },
+            'roberta-ca': {
+                'tokenizer': roberta_ca_tokenizer,
+                'model': roberta_ca_model,
+                'device': device,
+                'pooling': 'cls'
             }
         }
     if max_length is None:
         if model_name in ['jina', 'jina-v3']:
             max_length = 8192
+        else:  # robertalex, legal-bert, roberta-ca
             max_length = 512
     # Process in batches for memory efficiency
+    # Reduce batch size for large models
+    if model_name in ['jina-v3', 'roberta-ca']:
+        batch_size = 4 if len(texts) > 4 else len(texts)
+    else:
+        batch_size = 8 if len(texts) > 8 else len(texts)
     all_embeddings = []
     for i in range(0, len(texts), batch_size):
             'max_length': 512,
             'pooling': 'cls',
             'languages': ['English']
+        },
+        'roberta-ca': {
+            'full_name': 'projecte-aina/roberta-large-ca-v2',
+            'dimensions': 1024,
+            'max_length': 512,
+            'pooling': 'cls',
+            'languages': ['Catalan']
         }
     }