Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -9,7 +9,6 @@ import gradio as gr
|
|
| 9 |
from pylate import indexes, models, retrieve
|
| 10 |
from documents import MULTILINGUAL_DOCUMENTS
|
| 11 |
|
| 12 |
-
# Configure logging
|
| 13 |
logging.basicConfig(
|
| 14 |
level=logging.INFO,
|
| 15 |
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
|
@@ -26,14 +25,13 @@ class CrossLingualRetriever:
|
|
| 26 |
|
| 27 |
self.model = models.ColBERT(model_name_or_path=model_name)
|
| 28 |
|
| 29 |
-
# Set padding token
|
| 30 |
-
|
| 31 |
-
self.model.tokenizer.pad_token = self.model.tokenizer.eos_token
|
| 32 |
|
| 33 |
# Initialize PLAID index
|
| 34 |
self.index = indexes.PLAID(
|
| 35 |
index_folder="pylate-index",
|
| 36 |
-
index_name="
|
| 37 |
override=True,
|
| 38 |
)
|
| 39 |
|
|
@@ -163,24 +161,22 @@ def search_documents(query: str, top_k: int) -> Tuple[str, str]:
|
|
| 163 |
|
| 164 |
# Example queries in different languages
|
| 165 |
EXAMPLE_QUERIES = [
|
| 166 |
-
["What is artificial intelligence?",
|
| 167 |
-
["¿Qué es el cambio climático?",
|
| 168 |
-
["양자 컴퓨팅이란 무엇인가요?",
|
| 169 |
["ما هي الصحة النفسية؟", 5],
|
| 170 |
-
["量子计算是什么?",
|
| 171 |
]
|
| 172 |
|
| 173 |
|
| 174 |
# Build Gradio interface
|
| 175 |
-
with gr.Blocks(title="Cross-Lingual Retrieval Demo", theme=gr.themes.Soft()) as demo:
|
| 176 |
gr.Markdown(
|
| 177 |
"""
|
| 178 |
# 🌍 Cross-Lingual Document Retrieval
|
| 179 |
### Powered by [LiquidAI/LFM2-ColBERT-350M](https://huggingface.co/LiquidAI/LFM2-ColBERT-350M)
|
| 180 |
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
The model finds semantically similar documents regardless of the language mismatch.
|
| 184 |
|
| 185 |
**Supported Languages:** English, Arabic, Chinese, French, German, Japanese, Korean, and Spanish
|
| 186 |
"""
|
|
@@ -196,7 +192,7 @@ with gr.Blocks(title="Cross-Lingual Retrieval Demo", theme=gr.themes.Soft()) as
|
|
| 196 |
|
| 197 |
top_k_slider = gr.Slider(
|
| 198 |
minimum=1,
|
| 199 |
-
maximum=
|
| 200 |
value=5,
|
| 201 |
step=1,
|
| 202 |
label="Number of results to retrieve",
|
|
@@ -244,7 +240,7 @@ with gr.Blocks(title="Cross-Lingual Retrieval Demo", theme=gr.themes.Soft()) as
|
|
| 244 |
)
|
| 245 |
|
| 246 |
# Examples section
|
| 247 |
-
gr.Markdown("### 💡
|
| 248 |
gr.Examples(
|
| 249 |
examples=EXAMPLE_QUERIES,
|
| 250 |
inputs=[query_input, top_k_slider],
|
|
@@ -255,7 +251,6 @@ with gr.Blocks(title="Cross-Lingual Retrieval Demo", theme=gr.themes.Soft()) as
|
|
| 255 |
|
| 256 |
gr.Markdown(
|
| 257 |
"""
|
| 258 |
-
---
|
| 259 |
**How it works:** This demo uses the LiquidAI LFM2-ColBERT-350M model with late interaction retrieval.
|
| 260 |
The model encodes both queries and documents into token-level embeddings, enabling fine-grained matching
|
| 261 |
across languages with high speed and accuracy.
|
|
|
|
| 9 |
from pylate import indexes, models, retrieve
|
| 10 |
from documents import MULTILINGUAL_DOCUMENTS
|
| 11 |
|
|
|
|
| 12 |
logging.basicConfig(
|
| 13 |
level=logging.INFO,
|
| 14 |
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
|
|
|
| 25 |
|
| 26 |
self.model = models.ColBERT(model_name_or_path=model_name)
|
| 27 |
|
| 28 |
+
# Set padding token
|
| 29 |
+
self.model.tokenizer.pad_token = self.model.tokenizer.eos_token
|
|
|
|
| 30 |
|
| 31 |
# Initialize PLAID index
|
| 32 |
self.index = indexes.PLAID(
|
| 33 |
index_folder="pylate-index",
|
| 34 |
+
index_name="index",
|
| 35 |
override=True,
|
| 36 |
)
|
| 37 |
|
|
|
|
| 161 |
|
| 162 |
# Example queries in different languages
|
| 163 |
EXAMPLE_QUERIES = [
|
| 164 |
+
["What is artificial intelligence?", 8],
|
| 165 |
+
["¿Qué es el cambio climático?", 4],
|
| 166 |
+
["양자 컴퓨팅이란 무엇인가요?", 6],
|
| 167 |
["ما هي الصحة النفسية؟", 5],
|
| 168 |
+
["量子计算是什么?", 8],
|
| 169 |
]
|
| 170 |
|
| 171 |
|
| 172 |
# Build Gradio interface
|
| 173 |
+
with gr.Blocks(title="Cross-Lingual Retrieval Demo", theme=gr.themes.Soft(primary_hue="purple")) as demo:
|
| 174 |
gr.Markdown(
|
| 175 |
"""
|
| 176 |
# 🌍 Cross-Lingual Document Retrieval
|
| 177 |
### Powered by [LiquidAI/LFM2-ColBERT-350M](https://huggingface.co/LiquidAI/LFM2-ColBERT-350M)
|
| 178 |
|
| 179 |
+
Find semantically similar documents across different languages.
|
|
|
|
|
|
|
| 180 |
|
| 181 |
**Supported Languages:** English, Arabic, Chinese, French, German, Japanese, Korean, and Spanish
|
| 182 |
"""
|
|
|
|
| 192 |
|
| 193 |
top_k_slider = gr.Slider(
|
| 194 |
minimum=1,
|
| 195 |
+
maximum=12,
|
| 196 |
value=5,
|
| 197 |
step=1,
|
| 198 |
label="Number of results to retrieve",
|
|
|
|
| 240 |
)
|
| 241 |
|
| 242 |
# Examples section
|
| 243 |
+
gr.Markdown("### 💡 Example queries:")
|
| 244 |
gr.Examples(
|
| 245 |
examples=EXAMPLE_QUERIES,
|
| 246 |
inputs=[query_input, top_k_slider],
|
|
|
|
| 251 |
|
| 252 |
gr.Markdown(
|
| 253 |
"""
|
|
|
|
| 254 |
**How it works:** This demo uses the LiquidAI LFM2-ColBERT-350M model with late interaction retrieval.
|
| 255 |
The model encodes both queries and documents into token-level embeddings, enabling fine-grained matching
|
| 256 |
across languages with high speed and accuracy.
|