Jordi Catafal committed on
Commit
909d9bf
1 Parent(s): 0a6cb95

Add Catalan RoBERTa model - now supporting 5 models

Files changed (7)
  1. Dockerfile +5 -2
  2. README.md +44 -10
  3. app.py +17 -7
  4. models/schemas.py +2 -2
  5. requirements.txt +2 -1
  6. utils/__init__.py +0 -1
  7. utils/helpers.py +29 -2
Dockerfile CHANGED
@@ -5,9 +5,12 @@ ENV PYTHONUNBUFFERED=1
 ENV TRANSFORMERS_CACHE=/app/cache
 ENV HF_HOME=/app/cache
 ENV PYTORCH_CUDA_ALLOC_CONF=garbage_collection_threshold:0.6,max_split_size_mb:128
-# Add this to handle the larger models
+# Optimize for multiple large models
 ENV TRANSFORMERS_OFFLINE=0
 ENV HF_HUB_ENABLE_HF_TRANSFER=1
+ENV TOKENIZERS_PARALLELISM=false
+# Reduce memory fragmentation
+ENV MALLOC_TRIM_THRESHOLD_=100000
 
 # Install system dependencies for better performance
 RUN apt-get update && apt-get install -y \
@@ -39,4 +42,4 @@ RUN mkdir -p /app/cache
 EXPOSE 7860
 
 # Run the application
-CMD ["python", "app.py"]
+CMD ["python", "-u", "app.py"]
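
The added environment variables only help if they are actually visible to the Python process inside the container. A minimal sketch for sanity-checking them at runtime (variable names and expected values come from the Dockerfile above; the script itself is illustrative and not part of this commit):

```python
# check_env.py - illustrative sanity check for the Dockerfile ENV settings
import os

# Names and expected values taken from the Dockerfile in this commit
expected = {
    "TRANSFORMERS_CACHE": "/app/cache",
    "HF_HOME": "/app/cache",
    "TRANSFORMERS_OFFLINE": "0",
    "HF_HUB_ENABLE_HF_TRANSFER": "1",
    "TOKENIZERS_PARALLELISM": "false",
    "MALLOC_TRIM_THRESHOLD_": "100000",
}

for name, want in expected.items():
    got = os.environ.get(name)
    status = "OK" if got == want else "MISMATCH"
    print(f"{status}: {name}={got!r} (expected {want!r})")
```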
README.md CHANGED
@@ -10,10 +10,9 @@ pinned: false
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 --------------------------------
+# Multilingual & Legal Embeddings API
 
-# Spanish & Legal Embeddings API
-
-A high-performance API for generating embeddings from Spanish, English, and multilingual text using state-of-the-art models. This API provides access to four specialized models optimized for different use cases and languages.
+A high-performance API for generating embeddings from Spanish, Catalan, English, and multilingual text using state-of-the-art models. This API provides access to five specialized models optimized for different use cases and languages.
 
 ## 🚀 Quick Start
 
@@ -29,6 +28,7 @@ A high-performance API for generating embeddings from Spanish, English, and mult
 | **robertalex** | 512 | Spanish | 768 | Spanish legal documents, formal Spanish |
 | **jina-v3** | 8,192 | Multilingual (30+ languages) | 1,024 | Superior multilingual embeddings, long context |
 | **legal-bert** | 512 | English | 768 | English legal documents, contracts, law texts |
+| **roberta-ca** | 512 | Catalan | 1,024 | Catalan text, general purpose, RoBERTa-large architecture |
 
 ## 🔗 API Endpoints
 
@@ -96,7 +96,22 @@ multilingual_response = requests.post(
 )
 print(f"Jina v3 dimensions: {multilingual_response.json()['dimensions']}") # 1024 dims
 
-# Example 3: Legal text with RoBERTalex (Spanish)
+# Example 3: Catalan text with RoBERTa-ca
+catalan_response = requests.post(
+    f"{API_URL}/embed",
+    json={
+        "texts": [
+            "Bon dia, com estàs?",
+            "M'agrada programar en Python",
+            "Barcelona és una ciutat meravellosa"
+        ],
+        "model": "roberta-ca",
+        "normalize": True
+    }
+)
+print(f"Catalan RoBERTa dimensions: {catalan_response.json()['dimensions']}") # 1024 dims
+
+# Example 4: Legal text with RoBERTalex (Spanish)
 spanish_legal_response = requests.post(
     f"{API_URL}/embed",
     json={
@@ -109,7 +124,7 @@ spanish_legal_response = requests.post(
     }
 )
 
-# Example 4: Legal text with Legal-BERT (English)
+# Example 5: Legal text with Legal-BERT (English)
 english_legal_response = requests.post(
     f"{API_URL}/embed",
     json={
@@ -122,11 +137,12 @@ english_legal_response = requests.post(
     }
 )
 
-# Example 5: Compare similarity across models
-text = "artificial intelligence and law"
+# Example 6: Compare similarity across models
+text_es = "inteligencia artificial"
+text_ca = "intel·ligència artificial"
 models_comparison = {}
 
-for model in ["jina", "jina-v3", "legal-bert"]:
+for model, text in [("jina", text_es), ("roberta-ca", text_ca), ("jina-v3", text_es)]:
     resp = requests.post(
         f"{API_URL}/embed",
         json={"texts": [text], "model": model, "normalize": True}
@@ -148,6 +164,15 @@ curl -X POST "https://aurasystems-spanish-embeddings-api.hf.space/embed" \
     "normalize": true
   }'
 
+# Catalan text with RoBERTa-ca
+curl -X POST "https://aurasystems-spanish-embeddings-api.hf.space/embed" \
+  -H "Content-Type: application/json" \
+  -d '{
+    "texts": ["Bon dia", "Com està vostè?", "Catalunya és meravellosa"],
+    "model": "roberta-ca",
+    "normalize": true
+  }'
+
 # Using Jina v3 for multilingual embeddings
 curl -X POST "https://aurasystems-spanish-embeddings-api.hf.space/embed" \
   -H "Content-Type: application/json" \
@@ -234,7 +259,7 @@ class MultilingualEmbeddings(Embeddings):
         Initialize embeddings
 
         Args:
-            model: One of "jina", "robertalex", "jina-v3", "legal-bert"
+            model: One of "jina", "robertalex", "jina-v3", "legal-bert", "roberta-ca"
         """
         self.api_url = "https://aurasystems-spanish-embeddings-api.hf.space/embed"
         self.model = model
@@ -262,12 +287,21 @@ spanish_docs = spanish_embeddings.embed_documents([
     "Segundo documento en español"
 ])
 
+# Catalan embeddings
+catalan_embeddings = MultilingualEmbeddings(model="roberta-ca")
+catalan_docs = catalan_embeddings.embed_documents([
+    "Primer document en català",
+    "Segon document en català",
+    "La cultura catalana és rica i diversa"
+])
+
 # Multilingual embeddings with Jina v3
 multilingual_embeddings = MultilingualEmbeddings(model="jina-v3")
 mixed_docs = multilingual_embeddings.embed_documents([
     "English document",
     "Documento en español",
-    "Document en français"
+    "Document en français",
+    "Document en català"
 ])
 
 # Legal embeddings for English
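
Example 6 above collects one embedding per model but stops before comparing anything. A possible follow-up, shown as a sketch: similarity is only meaningful within a single model's vector space, so embed both phrases with the same model and take the dot product of the normalized vectors. This assumes the `/embed` response carries the vectors under an `embeddings` key, which is not shown in the excerpts above.

```python
# Illustrative follow-up to Example 6 (assumes the response JSON exposes an
# "embeddings" field holding one vector per input text).
import numpy as np
import requests

API_URL = "https://aurasystems-spanish-embeddings-api.hf.space"

resp = requests.post(
    f"{API_URL}/embed",
    json={
        "texts": ["inteligencia artificial", "intel·ligència artificial"],
        "model": "jina-v3",
        "normalize": True,
    },
)
es_vec, ca_vec = (np.asarray(v) for v in resp.json()["embeddings"])

# With normalize=True both vectors are unit length, so the dot product
# equals the cosine similarity.
print(f"ES/CA similarity under jina-v3: {float(es_vec @ ca_vec):.3f}")
```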
app.py CHANGED
@@ -9,9 +9,9 @@ from models.schemas import EmbeddingRequest, EmbeddingResponse, ModelInfo
 from utils.helpers import load_models, get_embeddings, cleanup_memory
 
 app = FastAPI(
-    title="Spanish & Legal Embedding API",
-    description="Multi-model embedding API for Spanish and Legal texts",
-    version="2.0.0"
+    title="Multilingual & Legal Embedding API",
+    description="Multi-model embedding API for Spanish, Catalan, English and Legal texts",
+    version="3.0.0"
 )
 
 # Global model cache
@@ -27,10 +27,11 @@ async def startup_event():
 @app.get("/")
 async def root():
     return {
-        "message": "Spanish & Legal Embedding API",
-        "models": ["jina", "robertalex", "jina-v3", "legal-bert"],
+        "message": "Multilingual & Legal Embedding API",
+        "models": ["jina", "robertalex", "jina-v3", "legal-bert", "roberta-ca"],
         "status": "running",
-        "docs": "/docs"
+        "docs": "/docs",
+        "total_models": 5
     }
 
 @app.post("/embed", response_model=EmbeddingResponse)
@@ -106,6 +107,15 @@ async def list_models():
             languages=["English"],
             model_type="legal domain",
             description="English legal domain BERT model"
+        ),
+        ModelInfo(
+            model_id="roberta-ca",
+            name="projecte-aina/roberta-large-ca-v2",
+            dimensions=1024,
+            max_sequence_length=512,
+            languages=["Catalan"],
+            model_type="general",
+            description="Catalan RoBERTa-large model trained on large corpus"
         )
     ]
 
@@ -114,7 +124,7 @@ async def health_check():
     """Health check endpoint"""
     return {
         "status": "healthy",
-        "models_loaded": len(models_cache) == 4,
+        "models_loaded": len(models_cache) == 5,
         "available_models": list(models_cache.keys())
     }
 
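Since `/health` now only reports `models_loaded` as true when all five entries are in the cache, a quick post-deploy smoke test can catch a model that failed to load. A sketch against the endpoints defined above (the Space URL is the one used in the README examples):

```python
# Illustrative smoke test for the updated app.py endpoints
import requests

BASE = "https://aurasystems-spanish-embeddings-api.hf.space"

health = requests.get(f"{BASE}/health").json()
print("models_loaded:", health["models_loaded"])      # True once all 5 models load
print("available:", health["available_models"])       # should include "roberta-ca"

root = requests.get(f"{BASE}/").json()
assert root["total_models"] == 5
assert "roberta-ca" in root["models"]
```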
models/schemas.py CHANGED
@@ -11,7 +11,7 @@ class EmbeddingRequest(BaseModel):
         description="List of texts to embed",
         example=["Hola mundo", "¿Cómo estás?"]
     )
-    model: Literal["jina", "robertalex", "jina-v3", "legal-bert"] = Field(
+    model: Literal["jina", "robertalex", "jina-v3", "legal-bert", "roberta-ca"] = Field(
         default="jina",
         description="Model to use for embeddings"
     )
@@ -41,7 +41,7 @@ class EmbeddingRequest(BaseModel):
         model = values.get('model', 'jina')
         if model in ['jina', 'jina-v3'] and v > 8192:
             raise ValueError(f"Max length for {model} model is 8192")
-        elif model in ['robertalex', 'legal-bert'] and v > 512:
+        elif model in ['robertalex', 'legal-bert', 'roberta-ca'] and v > 512:
             raise ValueError(f"Max length for {model} model is 512")
         if v < 1:
             raise ValueError("Max length must be positive")
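
The extended validator caps the request length at 512 tokens for `roberta-ca`, so an over-long request should be rejected before any model runs. A sketch of exercising that path over HTTP, assuming the request field is named `max_length` and that FastAPI surfaces the validator's `ValueError` as its usual 422 response:

```python
# Illustrative check of the roberta-ca max_length validation
import requests

API_URL = "https://aurasystems-spanish-embeddings-api.hf.space"

bad = requests.post(
    f"{API_URL}/embed",
    json={"texts": ["Bon dia"], "model": "roberta-ca", "max_length": 1024},
)
print(bad.status_code)   # expected 422: "Max length for roberta-ca model is 512"

ok = requests.post(
    f"{API_URL}/embed",
    json={"texts": ["Bon dia"], "model": "roberta-ca", "max_length": 512},
)
print(ok.status_code)    # expected 200
```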
requirements.txt CHANGED
@@ -8,4 +8,5 @@ scikit-learn==1.3.2
 pydantic==2.5.0
 huggingface-hub==0.19.4
 python-multipart==0.0.6
-protobuf>=3.20.0
+protobuf>=3.20.0
+sentencepiece==0.1.99
utils/__init__.py CHANGED
@@ -1,4 +1,3 @@
-
 # utils/__init__.py
 """Utils package for helper functions"""
 
utils/helpers.py CHANGED
@@ -69,6 +69,15 @@ def load_models() -> Dict:
     ).to(device)
     legal_bert_model.eval()
 
+    # Load Catalan RoBERTa model
+    print("Loading Catalan RoBERTa-large model...")
+    roberta_ca_tokenizer = AutoTokenizer.from_pretrained('projecte-aina/roberta-large-ca-v2')
+    roberta_ca_model = AutoModel.from_pretrained(
+        'projecte-aina/roberta-large-ca-v2',
+        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
+    ).to(device)
+    roberta_ca_model.eval()
+
     models_cache = {
         'jina': {
             'tokenizer': jina_tokenizer,
@@ -93,6 +102,12 @@ def load_models() -> Dict:
             'model': legal_bert_model,
             'device': device,
             'pooling': 'cls'
+        },
+        'roberta-ca': {
+            'tokenizer': roberta_ca_tokenizer,
+            'model': roberta_ca_model,
+            'device': device,
+            'pooling': 'cls'
         }
     }
 
@@ -152,11 +167,16 @@ def get_embeddings(
     if max_length is None:
         if model_name in ['jina', 'jina-v3']:
             max_length = 8192
-        else:  # robertalex, legal-bert
+        else:  # robertalex, legal-bert, roberta-ca
             max_length = 512
 
     # Process in batches for memory efficiency
-    batch_size = 8 if len(texts) > 8 else len(texts)
+    # Reduce batch size for large models
+    if model_name in ['jina-v3', 'roberta-ca']:
+        batch_size = 4 if len(texts) > 4 else len(texts)
+    else:
+        batch_size = 8 if len(texts) > 8 else len(texts)
+
     all_embeddings = []
 
     for i in range(0, len(texts), batch_size):
@@ -259,6 +279,13 @@ def get_model_info(model_name: str) -> Dict:
             'max_length': 512,
             'pooling': 'cls',
             'languages': ['English']
+        },
+        'roberta-ca': {
+            'full_name': 'projecte-aina/roberta-large-ca-v2',
+            'dimensions': 1024,
+            'max_length': 512,
+            'pooling': 'cls',
+            'languages': ['Catalan']
         }
     }
 
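
The new cache entry registers `roberta-ca` with CLS pooling, fp16 on GPU, and a 512-token limit. The sketch below shows what that configuration amounts to as a standalone script; it mirrors the settings added in `load_models()` and `get_model_info()` rather than quoting `get_embeddings()`, which is not shown in full here:

```python
# Standalone sketch of roberta-ca embeddings with CLS pooling, mirroring the
# cache entry added in load_models() (illustrative, not the repo's code path).
import torch
from transformers import AutoModel, AutoTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if torch.cuda.is_available() else torch.float32

tokenizer = AutoTokenizer.from_pretrained("projecte-aina/roberta-large-ca-v2")
model = AutoModel.from_pretrained(
    "projecte-aina/roberta-large-ca-v2", torch_dtype=dtype
).to(device)
model.eval()

texts = ["Bon dia, com estàs?", "Barcelona és una ciutat meravellosa"]
inputs = tokenizer(
    texts, padding=True, truncation=True, max_length=512, return_tensors="pt"
).to(device)

with torch.no_grad():
    # 'cls' pooling: take the hidden state of the first token of each sequence
    hidden = model(**inputs).last_hidden_state
    embeddings = torch.nn.functional.normalize(hidden[:, 0], p=2, dim=1)

print(embeddings.shape)  # expected (2, 1024) for roberta-large-ca-v2
```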