Merge main

Files changed:
- config.yaml  (+34 -11)
- refresh.py   (+11 -4)
config.yaml  CHANGED

@@ -20,7 +20,7 @@ tasks:
     task_description: "Clustering is the task of grouping similar documents together."
   PairClassification:
     icon: "🎭"
-    metric:
+    metric: max_ap
     metric_description: "Average Precision (AP) based on the models similarity metric (usually cosine)"
     task_description: "Pair classification is the task of determining whether two texts are similar."
   Reranking:

@@ -35,14 +35,19 @@ tasks:
     task_description: "Retrieval is the task of finding relevant documents for a query."
   STS:
     icon: "☘️"
-    metric:
+    metric: cosine_spearman
     metric_description: "Spearman correlation based on the model's similarity metric (usually cosine)"
     task_description: "Semantic Textual Similarity is the task of determining how similar two texts are."
   Summarization:
     icon: "📜"
-    metric:
+    metric: cosine_spearman
     metric_description: "Spearman correlation based on the model's similarity metric (usually cosine)"
     task_description: "Summarization is the task of generating a summary of a text."
+  MultilabelClassification:
+    icon: "🏷️"
+    metric: accuracy
+    metric_description: "Accuracy"
+    task_description: "Multilabel classification is the task of assigning multiple labels to a text."
   InstructionRetrieval:
     icon: "🔎📋"
     metric: "p-MRR"
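Each entry under tasks: defines the icon, the primary metric, and the descriptions shown on the leaderboard, and refresh.py builds its task-to-metric lookup from this same mapping. Below is a minimal sketch of that round trip; loading config.yaml with PyYAML into TASKS_CONFIG is an assumption (the loading code is not part of this diff), while the dict comprehension is taken verbatim from the refresh.py hunk further down.

# Sketch only: load the tasks: section shown above and derive the
# task -> [primary metric] lookup the way refresh.py does.
import yaml  # assumption: PyYAML is available in the Space

with open("config.yaml") as f:
    TASKS_CONFIG = yaml.safe_load(f)["tasks"]

TASK_TO_METRIC = {k: [v["metric"]] for k, v in TASKS_CONFIG.items()}

print(TASK_TO_METRIC["MultilabelClassification"])  # ['accuracy']
print(TASK_TO_METRIC["PairClassification"])        # ['max_ap']

With the new MultilabelClassification block in place, the lookup resolves the added task to ["accuracy"] before any legacy metric names are appended.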
@@ -347,6 +352,8 @@ boards:
         - RuReviewsClassification (rus-Cyrl)
         - RuSciBenchGRNTIClassification (rus-Cyrl)
         - RuSciBenchOECDClassification (rus-Cyrl)
+        - MassiveIntentClassification (rus-Cyrl)
+        - MassiveScenarioClassification (rus-Cyrl)
       Clustering:
         - GeoreviewClusteringP2P (rus-Cyrl)
         - RuSciBenchGRNTIClusteringP2P (rus-Cyrl)

@@ -355,12 +362,18 @@ boards:
         - TERRa (rus-Cyrl)
       Reranking:
         - RuBQReranking (rus-Cyrl)
+        - MIRACLReranking (rus-Cyrl)
       Retrieval:
         - RiaNewsRetrieval (rus-Cyrl)
         - RuBQRetrieval (rus-Cyrl)
+        - MIRACLRetrieval (rus-Cyrl)
       STS:
         - RUParaPhraserSTS (rus-Cyrl)
         - RuSTSBenchmarkSTS (rus-Cyrl)
+        - STS22 (rus-Cyrl)
+      MultilabelClassification:
+        - CEDRClassification (rus-Cyrl)
+        - SensitiveTopicsClassification (rus-Cyrl)
   se:
     title: Swedish
     language_long: Swedish

@@ -517,13 +530,23 @@ boards:
     metric: nDCG@10
     tasks:
       Retrieval:
-        - AppsRetrieval
-        - CodeFeedbackMT
-        - CodeFeedbackST
-        - CodeSearchNetCCRetrieval
-        - CodeSearchNetRetrieval
-        - CodeTransOceanContest
+        - AppsRetrieval (eng-Latn_python-Code)
+        - CodeFeedbackMT (c-Code_sql-Code_python-Code_shell-Code_swift-Code_eng-Latn)
+        - CodeFeedbackST (python-Code_javascript-Code_go-Code_ruby-Code_java-Code_php-Code_eng-Latn)
+        - CodeSearchNetCCRetrieval (python-Code)
+        - CodeSearchNetCCRetrieval (javascript-Code)
+        - CodeSearchNetCCRetrieval (go-Code)
+        - CodeSearchNetCCRetrieval (ruby-Code)
+        - CodeSearchNetCCRetrieval (java-Code)
+        - CodeSearchNetCCRetrieval (php-Code)
+        - CodeSearchNetRetrieval (python-Code)
+        - CodeSearchNetRetrieval (javascript-Code)
+        - CodeSearchNetRetrieval (go-Code)
+        - CodeSearchNetRetrieval (ruby-Code)
+        - CodeSearchNetRetrieval (java-Code)
+        - CodeSearchNetRetrieval (php-Code)
+        - CodeTransOceanContest (python-Code_c++-Code)
         - CodeTransOceanDL
-        - CosQA
+        - CosQA (eng-Latn_python-Code)
         - StackOverflowQA
-        - SyntheticText2SQL
+        - SyntheticText2SQL (eng-Latn_sql-Code)
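Board entries follow the pattern "DatasetName (lang-Script)", with code datasets using a "-Code" suffix in place of a script and multiple subsets joined by underscores. The hypothetical helper below only illustrates that format; it is not part of the leaderboard code.

# Hypothetical helper (not in the diff): split a board entry into its dataset
# name and its language/script annotations.
def parse_board_entry(entry: str) -> tuple[str, list[str]]:
    name, _, annotation = entry.partition(" (")
    langs = annotation.rstrip(")").split("_") if annotation else []
    return name, langs

print(parse_board_entry("RuBQReranking (rus-Cyrl)"))
# ('RuBQReranking', ['rus-Cyrl'])
print(parse_board_entry("CodeFeedbackMT (c-Code_sql-Code_python-Code_shell-Code_swift-Code_eng-Latn)"))
# ('CodeFeedbackMT', ['c-Code', 'sql-Code', 'python-Code', 'shell-Code', 'swift-Code', 'eng-Latn'])
print(parse_board_entry("CodeTransOceanDL"))
# ('CodeTransOceanDL', [])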
refresh.py  CHANGED

@@ -30,9 +30,10 @@ PRETTY_NAMES = {
 TASK_TO_METRIC = {k: [v["metric"]] for k, v in TASKS_CONFIG.items()}
 # Add legacy metric names
 TASK_TO_METRIC["STS"].append("cos_sim_spearman")
-TASK_TO_METRIC["STS"].append("cosine_spearman")
+TASK_TO_METRIC["STS"].append("spearman")
 TASK_TO_METRIC["Summarization"].append("cos_sim_spearman")
-TASK_TO_METRIC["Summarization"].append("cosine_spearman")
+TASK_TO_METRIC["Summarization"].append("spearman")
+TASK_TO_METRIC["PairClassification"].append("ap")
 TASK_TO_METRIC["PairClassification"].append("cos_sim_ap")
 TASK_TO_METRIC["PairClassification"].append("cosine_ap")

@@ -166,6 +167,8 @@ def filter_metric_external(x, task, metrics) -> bool:
         return bool(x["mteb_task"] == task and x["metric"] == "ndcg_at_1")
     elif (x["mteb_dataset_name"].startswith("BrightRetrieval") and (x["split"] == "long")):
         return bool(x["mteb_task"] == task and x["metric"] in ["recall_at_1"])
+    elif x["mteb_dataset_name"] == "MIRACLReranking":
+        return bool(x["mteb_task"] == task and x["metric"] in ["NDCG@10(MIRACL)"])
     else:
         return bool(x["mteb_task"] == task and x["metric"] in metrics)

@@ -258,6 +261,10 @@ def get_external_model_results():
                 download_mode="force_redownload",
                 verification_mode="no_checks",
             )
+        except ValueError as e:
+            print(f"Can't find model {model} in results repository. Exception: {e}")
+            continue
+
         ds = ds.map(add_lang)
         ds = ds.map(add_task)
         base_dict = {

@@ -273,8 +280,8 @@ def get_external_model_results():
             ds_sub = ds.filter(lambda x: filter_metric_external(x, task, metrics))[
                 "test"
             ]
-
-            for metric in metrics:
+            current_task_metrics = ds_sub.unique("metric")
+            for metric in current_task_metrics:
                 ds_dict = ds_sub.filter(lambda x: x["metric"] == metric).to_dict()
                 ds_dict = {
                     k: round(v, 2)
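The two behavioural changes to get_external_model_results can be seen in isolation in the sketch below: a model whose results cannot be loaded is now skipped instead of aborting the refresh, and only the metrics actually present in the filtered split are iterated. The results repository id and the model list here are placeholders; the load_dataset keyword arguments, the except/continue pattern, and the unique("metric") loop mirror the diff above.

# Sketch under assumptions: "mteb/results" and the model list are placeholders,
# not taken from refresh.py.
from datasets import load_dataset

models = ["intfloat/e5-base"]  # hypothetical external model list

for model in models:
    try:
        ds = load_dataset(
            "mteb/results",
            model,
            download_mode="force_redownload",
            verification_mode="no_checks",
        )
    except ValueError as e:
        # Missing result configs no longer crash the whole refresh.
        print(f"Can't find model {model} in results repository. Exception: {e}")
        continue

    ds_sub = ds["test"]
    # Iterate only over metrics that actually occur in this subset rather than
    # over a fixed metric list.
    for metric in ds_sub.unique("metric"):
        rows = ds_sub.filter(lambda x: x["metric"] == metric)
        print(metric, len(rows))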