Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -7,7 +7,7 @@ DATASETS = [
|
|
| 7 |
"mMARCO-fr",
|
| 8 |
"BSARD",
|
| 9 |
]
|
| 10 |
-
|
| 11 |
"antoinelouis/biencoder-camemberta-base-mmarcoFR",
|
| 12 |
"antoinelouis/biencoder-camembert-base-mmarcoFR",
|
| 13 |
"antoinelouis/biencoder-distilcamembert-mmarcoFR",
|
|
@@ -22,15 +22,15 @@ DENSE_SINGLE_BIENCODERS = [
|
|
| 22 |
"OrdalieTech/Solon-embeddings-large-0.1",
|
| 23 |
"OrdalieTech/Solon-embeddings-base-0.1",
|
| 24 |
]
|
| 25 |
-
|
| 26 |
"antoinelouis/colbertv1-camembert-base-mmarcoFR",
|
| 27 |
"antoinelouis/colbertv2-camembert-L4-mmarcoFR",
|
| 28 |
"antoinelouis/colbert-xm",
|
| 29 |
]
|
| 30 |
-
|
| 31 |
"antoinelouis/spladev2-camembert-base-mmarcoFR",
|
| 32 |
]
|
| 33 |
-
|
| 34 |
"antoinelouis/crossencoder-camemberta-L2-mmarcoFR",
|
| 35 |
"antoinelouis/crossencoder-camemberta-L4-mmarcoFR",
|
| 36 |
"antoinelouis/crossencoder-camemberta-L6-mmarcoFR",
|
|
@@ -57,7 +57,6 @@ CROSS_ENCODERS = [
|
|
| 57 |
"antoinelouis/crossencoder-mMiniLMv2-L12-mmarcoFR",
|
| 58 |
"antoinelouis/crossencoder-mMiniLMv2-L6-mmarcoFR",
|
| 59 |
]
|
| 60 |
-
LLMS = []
|
| 61 |
COLUMNS = {
|
| 62 |
"Model": "html",
|
| 63 |
"#Params (M)": "number",
|
|
@@ -81,7 +80,7 @@ def get_model_info(model_id: str, model_type: str) -> pd.DataFrame:
|
|
| 81 |
if result.dataset_name in DATASETS and result.dataset_name not in data:
|
| 82 |
data[result.dataset_name] = {key: None for key in COLUMNS.keys()}
|
| 83 |
data[result.dataset_name]["Model"] = f'<a href="https://huggingface.co/{model_id}" target="_blank" style="color: blue; text-decoration: none;">{model_id}</a>'
|
| 84 |
-
data[result.dataset_name]["#Params (M)"] = round(model_info.safetensors.total/1e6) if model_info.safetensors else None
|
| 85 |
data[result.dataset_name]["Type"] = model_type
|
| 86 |
data[result.dataset_name]["Dataset"] = result.dataset_name
|
| 87 |
|
|
@@ -91,17 +90,24 @@ def get_model_info(model_id: str, model_type: str) -> pd.DataFrame:
|
|
| 91 |
return pd.DataFrame(list(data.values()))
|
| 92 |
|
| 93 |
def load_all_results() -> pd.DataFrame:
|
| 94 |
-
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 105 |
return df
|
| 106 |
|
| 107 |
def filter_dataf_by_dataset(dataf: pd.DataFrame, dataset_name: str, sort_by: str) -> pd.DataFrame:
|
|
@@ -111,35 +117,24 @@ def filter_dataf_by_dataset(dataf: pd.DataFrame, dataset_name: str, sort_by: str
|
|
| 111 |
.sort_values(by=sort_by, ascending=False)
|
| 112 |
)
|
| 113 |
|
| 114 |
-
|
| 115 |
def update_table(dataf: pd.DataFrame, query: str, selected_types: list, selected_sizes: list) -> pd.DataFrame:
|
| 116 |
filtered_df = dataf.copy()
|
| 117 |
-
conditions = []
|
| 118 |
|
| 119 |
-
|
| 120 |
-
|
| 121 |
-
conditions.append((filtered_df['Type'] == 'DSVBE'))
|
| 122 |
-
elif val == 'Dense multi-vector bi-encoder (DMVBE)':
|
| 123 |
-
conditions.append((filtered_df['Type'] == 'DMVBE'))
|
| 124 |
-
elif val == 'Sparse single-vector bi-encoder (SSVBE)':
|
| 125 |
-
conditions.append((filtered_df['Type'] == 'SSVBE'))
|
| 126 |
-
elif val == 'Cross-encoder (CE)':
|
| 127 |
-
conditions.append((filtered_df['Type'] == 'CE'))
|
| 128 |
-
elif val == 'LLM':
|
| 129 |
-
conditions.append((filtered_df['Type'] == 'LLM'))
|
| 130 |
|
|
|
|
| 131 |
for val in selected_sizes:
|
| 132 |
if val == 'Small (< 100M)':
|
| 133 |
-
|
| 134 |
elif val == 'Base (100M-300M)':
|
| 135 |
-
|
| 136 |
elif val == 'Large (300M-500M)':
|
| 137 |
-
|
| 138 |
elif val == 'Extra-large (500M+)':
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
filtered_df = filtered_df[pd.concat(conditions, axis=1).any(axis=1)]
|
| 143 |
|
| 144 |
if query:
|
| 145 |
filtered_df = filtered_df[filtered_df['Model'].str.contains(query, case=False)]
|
|
@@ -171,11 +166,10 @@ with gr.Blocks() as demo:
|
|
| 171 |
filter_type = gr.CheckboxGroup(
|
| 172 |
label="Model type",
|
| 173 |
choices=[
|
| 174 |
-
'
|
| 175 |
-
'
|
| 176 |
-
'Sparse
|
| 177 |
-
'Cross-encoder (
|
| 178 |
-
'LLM',
|
| 179 |
],
|
| 180 |
value=[],
|
| 181 |
interactive=True,
|
|
@@ -220,41 +214,11 @@ with gr.Blocks() as demo:
|
|
| 220 |
# elem_classes="text-sm",
|
| 221 |
# )
|
| 222 |
|
| 223 |
-
# Update tables on
|
| 224 |
-
search_bar
|
| 225 |
-
|
| 226 |
-
inputs=[
|
| 227 |
-
outputs=
|
| 228 |
-
)
|
| 229 |
-
# search_bar.change(
|
| 230 |
-
# fn=lambda x: update_table(dataf=bsard_df, query=x, selected_types=filter_type.value, selected_sizes=filter_size.value),
|
| 231 |
-
# inputs=[search_bar],
|
| 232 |
-
# outputs=bsard_table,
|
| 233 |
-
# )
|
| 234 |
-
|
| 235 |
-
# Update tables on model type filter.
|
| 236 |
-
filter_type.change(
|
| 237 |
-
fn=lambda selected_types: update_table(mmarco_df, search_bar.value, selected_types, filter_size.value),
|
| 238 |
-
inputs=[filter_type],
|
| 239 |
-
outputs=mmarco_table,
|
| 240 |
-
)
|
| 241 |
-
# filter_type.change(
|
| 242 |
-
# fn=lambda selected_types: update_table(bsard_df, search_bar.value, selected_types, filter_size.value),
|
| 243 |
-
# inputs=[filter_type],
|
| 244 |
-
# outputs=bsard_table,
|
| 245 |
-
# )
|
| 246 |
-
|
| 247 |
-
# Update tables on model size filter.
|
| 248 |
-
filter_size.change(
|
| 249 |
-
fn=lambda selected_sizes: update_table(mmarco_df, search_bar.value, filter_type.value, selected_sizes),
|
| 250 |
-
inputs=[filter_size],
|
| 251 |
-
outputs=mmarco_table,
|
| 252 |
-
)
|
| 253 |
-
# filter_size.change(
|
| 254 |
-
# fn=lambda selected_sizes: update_table(bsard_df, search_bar.value, filter_type.value, selected_sizes),
|
| 255 |
-
# inputs=[filter_size],
|
| 256 |
-
# outputs=bsard_table,
|
| 257 |
-
# )
|
| 258 |
|
| 259 |
# Citation
|
| 260 |
with gr.Column():
|
|
|
|
| 7 |
"mMARCO-fr",
|
| 8 |
"BSARD",
|
| 9 |
]
|
| 10 |
+
SINGLE_VECTOR_MODELS = [
|
| 11 |
"antoinelouis/biencoder-camemberta-base-mmarcoFR",
|
| 12 |
"antoinelouis/biencoder-camembert-base-mmarcoFR",
|
| 13 |
"antoinelouis/biencoder-distilcamembert-mmarcoFR",
|
|
|
|
| 22 |
"OrdalieTech/Solon-embeddings-large-0.1",
|
| 23 |
"OrdalieTech/Solon-embeddings-base-0.1",
|
| 24 |
]
|
| 25 |
+
MULTI_VECTOR_MODELS = [
|
| 26 |
"antoinelouis/colbertv1-camembert-base-mmarcoFR",
|
| 27 |
"antoinelouis/colbertv2-camembert-L4-mmarcoFR",
|
| 28 |
"antoinelouis/colbert-xm",
|
| 29 |
]
|
| 30 |
+
SPARSE_LEXICAL_MODELS = [
|
| 31 |
"antoinelouis/spladev2-camembert-base-mmarcoFR",
|
| 32 |
]
|
| 33 |
+
CROSS_ENCODER_MODELS = [
|
| 34 |
"antoinelouis/crossencoder-camemberta-L2-mmarcoFR",
|
| 35 |
"antoinelouis/crossencoder-camemberta-L4-mmarcoFR",
|
| 36 |
"antoinelouis/crossencoder-camemberta-L6-mmarcoFR",
|
|
|
|
| 57 |
"antoinelouis/crossencoder-mMiniLMv2-L12-mmarcoFR",
|
| 58 |
"antoinelouis/crossencoder-mMiniLMv2-L6-mmarcoFR",
|
| 59 |
]
|
|
|
|
| 60 |
COLUMNS = {
|
| 61 |
"Model": "html",
|
| 62 |
"#Params (M)": "number",
|
|
|
|
| 80 |
if result.dataset_name in DATASETS and result.dataset_name not in data:
|
| 81 |
data[result.dataset_name] = {key: None for key in COLUMNS.keys()}
|
| 82 |
data[result.dataset_name]["Model"] = f'<a href="https://huggingface.co/{model_id}" target="_blank" style="color: blue; text-decoration: none;">{model_id}</a>'
|
| 83 |
+
data[result.dataset_name]["#Params (M)"] = round(model_info.safetensors.total/1e6, 0) if model_info.safetensors else None
|
| 84 |
data[result.dataset_name]["Type"] = model_type
|
| 85 |
data[result.dataset_name]["Dataset"] = result.dataset_name
|
| 86 |
|
|
|
|
| 90 |
return pd.DataFrame(list(data.values()))
|
| 91 |
|
| 92 |
def load_all_results() -> pd.DataFrame:
    """Assemble all leaderboard results into a single DataFrame.

    Reads the external baseline results from ``./baselines.csv``, appends the
    frame returned by ``get_model_info`` for every model ID in the four model
    lists (tagged "SINGLE", "MULTI", "SPARSE" and "CROSS" respectively), and
    rounds every metric column to one decimal.

    Returns:
        The combined results. The frames are concatenated without resetting
        the index, so row labels are not guaranteed to be unique.
    """
    # Load results from external baseline models.
    frames = [pd.read_csv('./baselines.csv')]

    # Load results from own Hugging Face models. The per-model frames are
    # collected first and concatenated once, so the accumulated rows are not
    # re-copied on every iteration.
    models_by_type = {
        "SINGLE": SINGLE_VECTOR_MODELS,
        "MULTI": MULTI_VECTOR_MODELS,
        "SPARSE": SPARSE_LEXICAL_MODELS,
        "CROSS": CROSS_ENCODER_MODELS,
    }
    for model_type, model_ids in models_by_type.items():
        frames.extend(
            get_model_info(model_id, model_type=model_type)
            for model_id in model_ids
        )
    df = pd.concat(frames)

    # Round all metrics to 1 decimal. A column counts as a metric when its
    # name contains one of these markers.
    metric_markers = ("Recall", "MRR", "nDCG", "MAP")
    for col in df.columns:
        if any(marker in col for marker in metric_markers):
            df[col] = df[col].round(1)

    return df
|
| 112 |
|
| 113 |
def filter_dataf_by_dataset(dataf: pd.DataFrame, dataset_name: str, sort_by: str) -> pd.DataFrame:
|
|
|
|
| 117 |
.sort_values(by=sort_by, ascending=False)
|
| 118 |
)
|
| 119 |
|
|
|
|
| 120 |
def update_table(dataf: pd.DataFrame, query: str, selected_types: list, selected_sizes: list) -> pd.DataFrame:
|
| 121 |
filtered_df = dataf.copy()
|
|
|
|
| 122 |
|
| 123 |
+
if selected_types:
|
| 124 |
+
filtered_df = filtered_df[filtered_df['Type'].isin([t.split()[-1][1:-1] for t in selected_types])]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 125 |
|
| 126 |
+
size_conditions = []
|
| 127 |
for val in selected_sizes:
|
| 128 |
if val == 'Small (< 100M)':
|
| 129 |
+
size_conditions.append(filtered_df['#Params (M)'] < 100)
|
| 130 |
elif val == 'Base (100M-300M)':
|
| 131 |
+
size_conditions.append((filtered_df['#Params (M)'] >= 100) & (filtered_df['#Params (M)'] <= 300))
|
| 132 |
elif val == 'Large (300M-500M)':
|
| 133 |
+
size_conditions.append((filtered_df['#Params (M)'] >= 300) & (filtered_df['#Params (M)'] <= 500))
|
| 134 |
elif val == 'Extra-large (500M+)':
|
| 135 |
+
size_conditions.append(filtered_df['#Params (M)'] > 500)
|
| 136 |
+
if size_conditions:
|
| 137 |
+
filtered_df = filtered_df[pd.concat(size_conditions, axis=1).any(axis=1)]
|
|
|
|
| 138 |
|
| 139 |
if query:
|
| 140 |
filtered_df = filtered_df[filtered_df['Model'].str.contains(query, case=False)]
|
|
|
|
| 166 |
filter_type = gr.CheckboxGroup(
|
| 167 |
label="Model type",
|
| 168 |
choices=[
|
| 169 |
+
'Single-vector dense bi-encoder (SINGLE)',
|
| 170 |
+
'Multi-vector dense bi-encoder (MULTI)',
|
| 171 |
+
'Sparse lexical model (SPARSE)',
|
| 172 |
+
'Cross-encoder (CROSS)',
|
|
|
|
| 173 |
],
|
| 174 |
value=[],
|
| 175 |
interactive=True,
|
|
|
|
| 214 |
# elem_classes="text-sm",
|
| 215 |
# )
|
| 216 |
|
| 217 |
+
# Update tables on filter widgets change.
|
| 218 |
+
widgets = [search_bar, filter_type, filter_size]
|
| 219 |
+
for w in widgets:
|
| 220 |
+
w.change(fn=lambda q, t, s: update_table(dataf=mmarco_df, query=q, selected_types=t, selected_sizes=s), inputs=widgets, outputs=[mmarco_table])
|
| 221 |
+
#w.change(fn=lambda q, t, s: update_table(dataf=bsard_df, query=q, selected_types=t, selected_sizes=s), inputs=widgets, outputs=[bsard_table])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
# Citation
|
| 224 |
with gr.Column():
|