Commit 099d855 (parent: 6e58d27): Add Model Size (GB)

app.py CHANGED
@@ -2,7 +2,7 @@ import json
 
 from datasets import load_dataset
 import gradio as gr
-from huggingface_hub import HfApi, hf_hub_download
+from huggingface_hub import get_hf_file_metadata, HfApi, hf_hub_download, hf_hub_url
 from huggingface_hub.repocard import metadata_load
 import pandas as pd
 
@@ -233,6 +233,7 @@ EXTERNAL_MODEL_TO_LINK = {
     "all-mpnet-base-v2": "https://huggingface.co/sentence-transformers/all-mpnet-base-v2",
     "paraphrase-multilingual-mpnet-base-v2": "https://huggingface.co/sentence-transformers/paraphrase-multilingual-mpnet-base-v2",
     "paraphrase-multilingual-MiniLM-L12-v2": "https://huggingface.co/sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2",
+    "contriever-base-msmarco": "https://huggingface.co/nthakur/contriever-base-msmarco",
 }
 
 EXTERNAL_MODEL_TO_DIM = {
@@ -338,6 +339,39 @@ EXTERNAL_MODEL_TO_SEQLEN = {
     "unsup-simcse-bert-base-uncased": 512,
 }
 
+EXTERNAL_MODEL_TO_SIZE = {
+    "gtr-t5-xxl": 9.73,
+    "gtr-t5-xl": 2.48,
+    "gtr-t5-large": 0.67,
+    "gtr-t5-base": 0.22,
+    "sentence-t5-xxl": 9.73,
+    "sentence-t5-xl": 2.48,
+    "sentence-t5-large": 0.67,
+    "sentence-t5-base": 0.22,
+    "all-mpnet-base-v2": 0.44,
+    "all-MiniLM-L12-v2": 0.13,
+    "all-MiniLM-L6-v2": 0.09,
+    "contriever-base-msmarco": 0.44,
+    "paraphrase-multilingual-mpnet-base-v2": 1.11,
+    "paraphrase-multilingual-MiniLM-L12-v2": 0.47,
+    "msmarco-bert-co-condensor": 0.44,
+    "sup-simcse-bert-base-uncased": 0.44,
+    "unsup-simcse-bert-base-uncased": 0.44,
+    "LaBSE": 1.88,
+    "komninos": 0.27,
+    "glove.6B.300d": 0.48,
+    "allenai-specter": 0.44,
+    "bert-base-uncased": 0.44,
+    "LASER2": 0.17,
+    "cross-en-de-roberta-sentence-transformer": 1.11,
+    "gbert-base": 0.44,
+    "gbert-large": 1.35,
+    "gelectra-base": 0.44,
+    "gelectra-large": 1.34,
+    "use-cmlm-multilingual": 1.89,
+    "xlm-roberta-large": 2.24,
+    "gottbert-base": 0.51
+}
+
 MODELS_TO_SKIP = {
     "baseplate/instructor-large-1", # Duplicate
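Note: the sizes above are checkpoint sizes in GB (1 GB = 1e9 bytes), the same quantity the get_dim_seq_size helper below derives from pytorch_model.bin. As a rough cross-check, an fp32 checkpoint stores about 4 bytes per parameter; a minimal sketch, using an illustrative parameter count for a BERT-base-sized model (not part of the commit):

    # Rough fp32 estimate: ~4 bytes per parameter (illustrative, not the app's code).
    def approx_size_gb(num_params, bytes_per_param=4):
        return round(bytes_per_param * num_params / 1e9, 2)

    approx_size_gb(110_000_000)  # ~0.44, consistent with the BERT-base-sized entries above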
@@ -404,9 +438,9 @@ for model in EXTERNAL_MODELS:
         ds_dict = {k: round(v, 2) for k, v in zip(ds_dict["mteb_dataset_name_with_lang"], ds_dict["score"])}
         EXTERNAL_MODEL_RESULTS[model][task][metric].append({**base_dict, **ds_dict})
 
-def get_dim_seq(model):
+def get_dim_seq_size(model):
     filenames = [sib.rfilename for sib in model.siblings]
-    dim, seq = "", ""
+    dim, seq, size = "", "", ""
     if "1_Pooling/config.json" in filenames:
         st_config_path = hf_hub_download(model.modelId, filename="1_Pooling/config.json")
         dim = json.load(open(st_config_path)).get("word_embedding_dimension", "")
@@ -419,7 +453,23 @@ def get_dim_seq(model):
     if not dim:
         dim = config.get("hidden_dim", config.get("hidden_size", config.get("d_model", "")))
     seq = config.get("n_positions", config.get("max_position_embeddings", config.get("n_ctx", config.get("seq_length", ""))))
-    return dim, seq
+    # Get model file size without downloading
+    if "pytorch_model.bin" in filenames:
+        url = hf_hub_url(model.modelId, filename="pytorch_model.bin")
+        meta = get_hf_file_metadata(url)
+        size = round(meta.size / 1e9, 2)
+    elif "pytorch_model.bin.index.json" in filenames:
+        index_path = hf_hub_download(model.modelId, filename="pytorch_model.bin.index.json")
+        """
+        {
+        "metadata": {
+            "total_size": 28272820224
+        },....
+        """
+        size = json.load(open(index_path))
+        if ("metadata" in size) and ("total_size" in size["metadata"]):
+            size = round(size["metadata"]["total_size"] / 1e9, 2)
+    return dim, seq, size
 
 def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC):
     api = HfApi()
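Note: the size lookup above works because the Hub serves per-file metadata over HTTP, so no weights are downloaded. A minimal standalone sketch of the same two huggingface_hub calls (the model ID is chosen for illustration):

    from huggingface_hub import get_hf_file_metadata, hf_hub_url

    # Metadata-only request: returns the file's size without fetching the weights.
    url = hf_hub_url("sentence-transformers/all-MiniLM-L6-v2", filename="pytorch_model.bin")
    meta = get_hf_file_metadata(url)
    print(round(meta.size / 1e9, 2))  # checkpoint size in GB, e.g. 0.09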
@@ -439,6 +489,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC):
         # Model & at least one result
         if len(res) > 1:
             if add_emb_dim:
+                res["Model Size (GB)"] = EXTERNAL_MODEL_TO_SIZE.get(model, "")
                 res["Embedding Dimensions"] = EXTERNAL_MODEL_TO_DIM.get(model, "")
                 res["Sequence Length"] = EXTERNAL_MODEL_TO_SEQLEN.get(model, "")
             df_list.append(res)
@@ -474,7 +525,7 @@ def get_mteb_data(tasks=["Clustering"], langs=[], datasets=[], fillna=True, add_emb_dim=False, task_to_metric=TASK_TO_METRIC):
         # Model & at least one result
         if len(out) > 1:
             if add_emb_dim:
-                out["Embedding Dimensions"], out["Sequence Length"] = get_dim_seq(model)
+                out["Embedding Dimensions"], out["Sequence Length"], out["Model Size (GB)"] = get_dim_seq_size(model)
             df_list.append(out)
     df = pd.DataFrame(df_list)
     # Put 'Model' column first
@@ -532,7 +583,7 @@ def get_mteb_average():
     DATA_STS_EN = DATA_OVERALL[["Model"] + TASK_LIST_STS]
     DATA_SUMMARIZATION = DATA_OVERALL[["Model"] + TASK_LIST_SUMMARIZATION]
 
-    DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
+    DATA_OVERALL = DATA_OVERALL[["Rank", "Model", "Model Size (GB)", "Embedding Dimensions", "Sequence Length", f"Average ({len(TASK_LIST_EN)} datasets)", f"Classification Average ({len(TASK_LIST_CLASSIFICATION)} datasets)", f"Clustering Average ({len(TASK_LIST_CLUSTERING)} datasets)", f"Pair Classification Average ({len(TASK_LIST_PAIR_CLASSIFICATION)} datasets)", f"Reranking Average ({len(TASK_LIST_RERANKING)} datasets)", f"Retrieval Average ({len(TASK_LIST_RETRIEVAL)} datasets)", f"STS Average ({len(TASK_LIST_STS)} datasets)", f"Summarization Average ({len(TASK_LIST_SUMMARIZATION)} dataset)"]]
 
     return DATA_OVERALL
 
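Note: for sharded checkpoints, the elif branch in get_dim_seq_size reads pytorch_model.bin.index.json instead of any weight file. A minimal sketch of that parse, using the total_size value quoted in the commit's inline comment:

    import json

    # A shard index holds a metadata block plus a weight_map, e.g.
    # {"metadata": {"total_size": 28272820224}, "weight_map": {...}}
    with open("pytorch_model.bin.index.json") as f:
        index = json.load(f)

    total = index.get("metadata", {}).get("total_size")
    if total is not None:
        print(round(total / 1e9, 2))  # 28.27 GB for the example value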