Spaces:
Running
Running
Upload from GitHub Actions: Get more results, compute average based on all tasks
Browse files
- datasets.json +8 -8
- evals/backend.py +3 -4
- evals/datasets_/truthfulqa.py +30 -0
- evals/main.py +1 -1
- evals/models.py +11 -13
- evals/tasks.py +79 -9
- frontend/src/components/ScoreColumns.js +1 -1
- languages.json +6 -6
- models.json +121 -4
- results.json +0 -0
datasets.json
CHANGED
|
@@ -256,7 +256,7 @@
|
|
| 256 |
"parallel": true,
|
| 257 |
"translation": "machine",
|
| 258 |
"base": "MMLU",
|
| 259 |
-
"implemented":
|
| 260 |
"group": "Multitask Language Understanding"
|
| 261 |
},
|
| 262 |
{
|
|
@@ -300,7 +300,7 @@
|
|
| 300 |
"parallel": true,
|
| 301 |
"translation": "machine",
|
| 302 |
"base": "MGSM",
|
| 303 |
-
"implemented":
|
| 304 |
"group": "Grade School Math"
|
| 305 |
},
|
| 306 |
{
|
|
@@ -315,7 +315,7 @@
|
|
| 315 |
"parallel": true,
|
| 316 |
"translation": "machine",
|
| 317 |
"base": "MGSM",
|
| 318 |
-
"implemented":
|
| 319 |
"group": "Grade School Math"
|
| 320 |
},
|
| 321 |
{
|
|
@@ -345,7 +345,7 @@
|
|
| 345 |
"parallel": true,
|
| 346 |
"translation": "machine",
|
| 347 |
"base": "AI2 ARC",
|
| 348 |
-
"implemented":
|
| 349 |
"group": "Abstract Reasoning"
|
| 350 |
},
|
| 351 |
{
|
|
@@ -360,7 +360,7 @@
|
|
| 360 |
"parallel": true,
|
| 361 |
"translation": "machine",
|
| 362 |
"base": "AI2 ARC",
|
| 363 |
-
"implemented":
|
| 364 |
"group": "Abstract Reasoning"
|
| 365 |
},
|
| 366 |
{
|
|
@@ -375,7 +375,7 @@
|
|
| 375 |
"parallel": true,
|
| 376 |
"translation": "human",
|
| 377 |
"base": "TruthfulQA",
|
| 378 |
-
"implemented":
|
| 379 |
"group": "Truthfulness"
|
| 380 |
},
|
| 381 |
{
|
|
@@ -390,7 +390,7 @@
|
|
| 390 |
"parallel": true,
|
| 391 |
"translation": "machine",
|
| 392 |
"base": "TruthfulQA",
|
| 393 |
-
"implemented":
|
| 394 |
"group": "Truthfulness"
|
| 395 |
},
|
| 396 |
{
|
|
@@ -405,7 +405,7 @@
|
|
| 405 |
"parallel": true,
|
| 406 |
"translation": "machine",
|
| 407 |
"base": "TruthfulQA",
|
| 408 |
-
"implemented":
|
| 409 |
"group": "Truthfulness"
|
| 410 |
},
|
| 411 |
{
|
|
|
|
| 256 |
"parallel": true,
|
| 257 |
"translation": "machine",
|
| 258 |
"base": "MMLU",
|
| 259 |
+
"implemented": true,
|
| 260 |
"group": "Multitask Language Understanding"
|
| 261 |
},
|
| 262 |
{
|
|
|
|
| 300 |
"parallel": true,
|
| 301 |
"translation": "machine",
|
| 302 |
"base": "MGSM",
|
| 303 |
+
"implemented": true,
|
| 304 |
"group": "Grade School Math"
|
| 305 |
},
|
| 306 |
{
|
|
|
|
| 315 |
"parallel": true,
|
| 316 |
"translation": "machine",
|
| 317 |
"base": "MGSM",
|
| 318 |
+
"implemented": true,
|
| 319 |
"group": "Grade School Math"
|
| 320 |
},
|
| 321 |
{
|
|
|
|
| 345 |
"parallel": true,
|
| 346 |
"translation": "machine",
|
| 347 |
"base": "AI2 ARC",
|
| 348 |
+
"implemented": true,
|
| 349 |
"group": "Abstract Reasoning"
|
| 350 |
},
|
| 351 |
{
|
|
|
|
| 360 |
"parallel": true,
|
| 361 |
"translation": "machine",
|
| 362 |
"base": "AI2 ARC",
|
| 363 |
+
"implemented": true,
|
| 364 |
"group": "Abstract Reasoning"
|
| 365 |
},
|
| 366 |
{
|
|
|
|
| 375 |
"parallel": true,
|
| 376 |
"translation": "human",
|
| 377 |
"base": "TruthfulQA",
|
| 378 |
+
"implemented": true,
|
| 379 |
"group": "Truthfulness"
|
| 380 |
},
|
| 381 |
{
|
|
|
|
| 390 |
"parallel": true,
|
| 391 |
"translation": "machine",
|
| 392 |
"base": "TruthfulQA",
|
| 393 |
+
"implemented": true,
|
| 394 |
"group": "Truthfulness"
|
| 395 |
},
|
| 396 |
{
|
|
|
|
| 405 |
"parallel": true,
|
| 406 |
"translation": "machine",
|
| 407 |
"base": "TruthfulQA",
|
| 408 |
+
"implemented": true,
|
| 409 |
"group": "Truthfulness"
|
| 410 |
},
|
| 411 |
{
|
evals/backend.py
CHANGED
|
@@ -26,11 +26,10 @@ task_metrics = [
|
|
| 26 |
"classification_accuracy",
|
| 27 |
"mmlu_accuracy",
|
| 28 |
"arc_accuracy",
|
|
|
|
| 29 |
"mgsm_accuracy",
|
| 30 |
]
|
| 31 |
|
| 32 |
-
task_metrics_basic = ["translation_from_bleu", "translation_to_bleu", "classification_accuracy"]
|
| 33 |
-
|
| 34 |
|
| 35 |
def compute_normalized_average(df, metrics):
|
| 36 |
"""Compute average of min-max normalized metric columns."""
|
|
@@ -58,7 +57,7 @@ def make_model_table(df, models):
|
|
| 58 |
for metric in task_metrics:
|
| 59 |
if metric not in df.columns:
|
| 60 |
df[metric] = np.nan
|
| 61 |
-
df["average"] = compute_normalized_average(df,
|
| 62 |
df = df.sort_values(by="average", ascending=False).reset_index()
|
| 63 |
df = pd.merge(df, models, left_on="model", right_on="id", how="left")
|
| 64 |
df["rank"] = df.index + 1
|
|
@@ -93,7 +92,7 @@ def make_language_table(df, languages):
|
|
| 93 |
for metric in task_metrics:
|
| 94 |
if metric not in df.columns:
|
| 95 |
df[metric] = np.nan
|
| 96 |
-
df["average"] = compute_normalized_average(df,
|
| 97 |
df = pd.merge(languages, df, on="bcp_47", how="outer")
|
| 98 |
df = df.sort_values(by="speakers", ascending=False)
|
| 99 |
df = df[
|
|
|
|
| 26 |
"classification_accuracy",
|
| 27 |
"mmlu_accuracy",
|
| 28 |
"arc_accuracy",
|
| 29 |
+
"truthfulqa_accuracy",
|
| 30 |
"mgsm_accuracy",
|
| 31 |
]
|
| 32 |
|
|
|
|
|
|
|
| 33 |
|
| 34 |
def compute_normalized_average(df, metrics):
|
| 35 |
"""Compute average of min-max normalized metric columns."""
|
|
|
|
| 57 |
for metric in task_metrics:
|
| 58 |
if metric not in df.columns:
|
| 59 |
df[metric] = np.nan
|
| 60 |
+
df["average"] = compute_normalized_average(df, task_metrics)
|
| 61 |
df = df.sort_values(by="average", ascending=False).reset_index()
|
| 62 |
df = pd.merge(df, models, left_on="model", right_on="id", how="left")
|
| 63 |
df["rank"] = df.index + 1
|
|
|
|
| 92 |
for metric in task_metrics:
|
| 93 |
if metric not in df.columns:
|
| 94 |
df[metric] = np.nan
|
| 95 |
+
df["average"] = compute_normalized_average(df, task_metrics)
|
| 96 |
df = pd.merge(languages, df, on="bcp_47", how="outer")
|
| 97 |
df = df.sort_values(by="speakers", ascending=False)
|
| 98 |
df = df[
|
evals/datasets_/truthfulqa.py
ADDED
|
@@ -0,0 +1,30 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import random
|
| 2 |
+
from collections import Counter, defaultdict
|
| 3 |
+
|
| 4 |
+
from langcodes import Language, standardize_tag
|
| 5 |
+
from rich import print
|
| 6 |
+
|
| 7 |
+
from datasets_.util import _get_dataset_config_names, _load_dataset
|
| 8 |
+
|
| 9 |
+
# Dataset identifier handed to the project's dataset helpers below.
slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa"

# Maps a standardized language tag (taken from the part of the config name
# before the first "_") to the full config name. Only configs whose name ends
# in "multiple_choice" are kept.
tags_uhura_truthfulqa = {
    standardize_tag(config_name.split("_")[0], macro=True): config_name
    for config_name in _get_dataset_config_names(slug_uhura_truthfulqa)
    if config_name.endswith("multiple_choice")
}
|
| 14 |
+
|
| 15 |
+
|
| 16 |
+
def add_choices(row):
    """Expose the ``mc1_targets`` choices and labels as top-level keys of ``row``.

    The row is modified in place and also returned, so the function can be
    used as a per-row mapping callback.
    """
    targets = row["mc1_targets"]
    for key in ("choices", "labels"):
        row[key] = targets[key]
    return row
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def load_truthfulqa(language_bcp_47, nr):
    """Load one TruthfulQA test question plus example rows for a language.

    Args:
        language_bcp_47: Language tag, looked up in ``tags_uhura_truthfulqa``.
        nr: Index of the question within the dataset's ``"test"`` split.

    Returns:
        ``(dataset_slug, examples, task)`` where ``examples`` is the
        ``"train"`` split and ``task`` is row ``nr`` of the ``"test"`` split,
        both after ``add_choices`` has been mapped over the dataset. Returns
        ``(None, None, None)`` when the language has no entry in
        ``tags_uhura_truthfulqa``.
    """
    # Single lookup instead of a membership test followed by indexing.
    config_name = tags_uhura_truthfulqa.get(language_bcp_47)
    if config_name is None:
        return None, None, None

    ds = _load_dataset(slug_uhura_truthfulqa, config_name).map(add_choices)
    examples = ds["train"]
    task = ds["test"][nr]
    # Report the slug that was actually loaded rather than a duplicated literal.
    return slug_uhura_truthfulqa, examples, task
|
evals/main.py
CHANGED
|
@@ -15,7 +15,7 @@ n_sentences = 10
|
|
| 15 |
|
| 16 |
async def evaluate():
|
| 17 |
# FIXME we should not need this for-loop, but it helps
|
| 18 |
-
for n_languages in range(
|
| 19 |
print(f"running evaluations for {n_languages} languages")
|
| 20 |
old_results = pd.read_json("results.json")
|
| 21 |
old_models = pd.read_json("models.json")
|
|
|
|
| 15 |
|
| 16 |
async def evaluate():
|
| 17 |
# FIXME we should not need this for-loop, but it helps
|
| 18 |
+
for n_languages in range(10, 101, 10):
|
| 19 |
print(f"running evaluations for {n_languages} languages")
|
| 20 |
old_results = pd.read_json("results.json")
|
| 21 |
old_models = pd.read_json("models.json")
|
evals/models.py
CHANGED
|
@@ -11,8 +11,7 @@ from elevenlabs import AsyncElevenLabs
|
|
| 11 |
from google.cloud import translate_v2 as translate
|
| 12 |
from huggingface_hub import AsyncInferenceClient, HfApi
|
| 13 |
from joblib.memory import Memory
|
| 14 |
-
from
|
| 15 |
-
from openai import AsyncOpenAI, PermissionDeniedError
|
| 16 |
from requests import HTTPError, get
|
| 17 |
|
| 18 |
# for development purposes, all languages will be evaluated on the fast models
|
|
@@ -23,12 +22,12 @@ important_models = [
|
|
| 23 |
"meta-llama/llama-3.1-70b-instruct", # 0.3$
|
| 24 |
"meta-llama/llama-3-70b-instruct", # 0.4$
|
| 25 |
# "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
|
| 26 |
-
|
| 27 |
"openai/gpt-4.1-mini", # 1.6$
|
| 28 |
"openai/gpt-4.1-nano", # 0.4$
|
| 29 |
"openai/gpt-4o-mini", # 0.6$
|
| 30 |
# "openai/gpt-4o-2024-11-20", # 10$
|
| 31 |
-
|
| 32 |
# "openai/gpt-3.5-turbo", # 1.5$
|
| 33 |
# "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
|
| 34 |
"mistralai/mistral-small-3.1-24b-instruct", # 0.3$
|
|
@@ -37,6 +36,9 @@ important_models = [
|
|
| 37 |
"google/gemini-2.5-flash", # 0.6$
|
| 38 |
"google/gemini-2.0-flash-lite-001", # 0.3$
|
| 39 |
"google/gemma-3-27b-it", # 0.2$
|
|
|
|
|
|
|
|
|
|
| 40 |
# "qwen/qwen-turbo", # 0.2$; recognizes "inappropriate content"
|
| 41 |
# "qwen/qwq-32b", # 0.2$
|
| 42 |
# "qwen/qwen-2.5-72b-instruct", # 0.39$
|
|
@@ -49,7 +51,6 @@ important_models = [
|
|
| 49 |
]
|
| 50 |
|
| 51 |
blocklist = [
|
| 52 |
-
"microsoft/wizardlm-2-8x22b", # temporarily rate-limited
|
| 53 |
"google/gemini-2.5-pro-preview",
|
| 54 |
"google/gemini-2.5-flash-preview",
|
| 55 |
"google/gemini-2.5-flash-lite-preview",
|
|
@@ -150,9 +151,10 @@ async def complete(**kwargs) -> str | None:
|
|
| 150 |
async with openrouter_rate_limit:
|
| 151 |
try:
|
| 152 |
response = await client.chat.completions.create(**kwargs)
|
| 153 |
-
except
|
| 154 |
-
|
| 155 |
-
|
|
|
|
| 156 |
if not response.choices:
|
| 157 |
raise Exception(response)
|
| 158 |
return response.choices[0].message.content.strip()
|
|
@@ -281,13 +283,9 @@ def load_models(date: date):
|
|
| 281 |
)
|
| 282 |
# models = models[models["cost"] <= 2.0].reset_index(drop=True)
|
| 283 |
models["tasks"] = [
|
| 284 |
-
["translation_from", "translation_to", "classification", "mmlu", "arc", "mgsm"]
|
| 285 |
] * len(models)
|
| 286 |
models = pd.concat([models, get_translation_models()])
|
| 287 |
-
models = models[ # temporary fix FIXME
|
| 288 |
-
(models["id"] != "google/gemini-2.5-pro")
|
| 289 |
-
& (models["id"] != "google/gemini-2.5-pro-preview")
|
| 290 |
-
]
|
| 291 |
return models
|
| 292 |
|
| 293 |
|
|
|
|
| 11 |
from google.cloud import translate_v2 as translate
|
| 12 |
from huggingface_hub import AsyncInferenceClient, HfApi
|
| 13 |
from joblib.memory import Memory
|
| 14 |
+
from openai import AsyncOpenAI, BadRequestError
|
|
|
|
| 15 |
from requests import HTTPError, get
|
| 16 |
|
| 17 |
# for development purposes, all languages will be evaluated on the fast models
|
|
|
|
| 22 |
"meta-llama/llama-3.1-70b-instruct", # 0.3$
|
| 23 |
"meta-llama/llama-3-70b-instruct", # 0.4$
|
| 24 |
# "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
|
| 25 |
+
"openai/gpt-4.1", # 8$
|
| 26 |
"openai/gpt-4.1-mini", # 1.6$
|
| 27 |
"openai/gpt-4.1-nano", # 0.4$
|
| 28 |
"openai/gpt-4o-mini", # 0.6$
|
| 29 |
# "openai/gpt-4o-2024-11-20", # 10$
|
| 30 |
+
"openai/gpt-3.5-turbo-0613", # 2$
|
| 31 |
# "openai/gpt-3.5-turbo", # 1.5$
|
| 32 |
# "anthropic/claude-3.5-haiku", # 4$ -> too expensive for dev
|
| 33 |
"mistralai/mistral-small-3.1-24b-instruct", # 0.3$
|
|
|
|
| 36 |
"google/gemini-2.5-flash", # 0.6$
|
| 37 |
"google/gemini-2.0-flash-lite-001", # 0.3$
|
| 38 |
"google/gemma-3-27b-it", # 0.2$
|
| 39 |
+
"qwen/qwen3-32b",
|
| 40 |
+
"qwen/qwen3-235b-a22b",
|
| 41 |
+
"qwen/qwen3-30b-a3b", # 0.29$
|
| 42 |
# "qwen/qwen-turbo", # 0.2$; recognizes "inappropriate content"
|
| 43 |
# "qwen/qwq-32b", # 0.2$
|
| 44 |
# "qwen/qwen-2.5-72b-instruct", # 0.39$
|
|
|
|
| 51 |
]
|
| 52 |
|
| 53 |
blocklist = [
|
|
|
|
| 54 |
"google/gemini-2.5-pro-preview",
|
| 55 |
"google/gemini-2.5-flash-preview",
|
| 56 |
"google/gemini-2.5-flash-lite-preview",
|
|
|
|
| 151 |
async with openrouter_rate_limit:
|
| 152 |
try:
|
| 153 |
response = await client.chat.completions.create(**kwargs)
|
| 154 |
+
except BadRequestError as e:
|
| 155 |
+
if "filtered" in e.message:
|
| 156 |
+
return None
|
| 157 |
+
raise e
|
| 158 |
if not response.choices:
|
| 159 |
raise Exception(response)
|
| 160 |
return response.choices[0].message.content.strip()
|
|
|
|
| 283 |
)
|
| 284 |
# models = models[models["cost"] <= 2.0].reset_index(drop=True)
|
| 285 |
models["tasks"] = [
|
| 286 |
+
["translation_from", "translation_to", "classification", "mmlu", "arc", "truthfulqa", "mgsm"]
|
| 287 |
] * len(models)
|
| 288 |
models = pd.concat([models, get_translation_models()])
|
|
|
|
|
|
|
|
|
|
|
|
|
| 289 |
return models
|
| 290 |
|
| 291 |
|
evals/tasks.py
CHANGED
|
@@ -9,6 +9,7 @@ from datasets_.flores import flores_sentences
|
|
| 9 |
from datasets_.mgsm import load_mgsm, parse_number
|
| 10 |
from datasets_.mmlu import load_mmlu
|
| 11 |
from datasets_.arc import load_uhura_arc_easy
|
|
|
|
| 12 |
from google.cloud import translate_v2 as translate
|
| 13 |
from langcodes import closest_supported_match
|
| 14 |
from languages import languages, script_name
|
|
@@ -224,6 +225,7 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
|
|
| 224 |
}
|
| 225 |
]
|
| 226 |
|
|
|
|
| 227 |
def format_multiple_choice(item):
|
| 228 |
return f"""{item["question"]}
|
| 229 |
|
|
@@ -234,6 +236,7 @@ def format_multiple_choice(item):
|
|
| 234 |
|
| 235 |
A|B|C|D?"""
|
| 236 |
|
|
|
|
| 237 |
async def mmlu_and_evaluate(model, language_bcp_47, nr):
|
| 238 |
ds_name, examples, task = load_mmlu(language_bcp_47, nr)
|
| 239 |
if not task:
|
|
@@ -253,7 +256,10 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
|
|
| 253 |
temperature=0,
|
| 254 |
max_tokens=1,
|
| 255 |
)
|
| 256 |
-
|
|
|
|
|
|
|
|
|
|
| 257 |
except Exception as e:
|
| 258 |
if "ResponsibleAIPolicyViolation" in str(e):
|
| 259 |
acc = 0
|
|
@@ -270,11 +276,12 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
|
|
| 270 |
}
|
| 271 |
]
|
| 272 |
|
|
|
|
| 273 |
async def arc_and_evaluate(model, language_bcp_47, nr):
|
| 274 |
ds_name, examples, task = load_uhura_arc_easy(language_bcp_47, nr)
|
| 275 |
if not task:
|
| 276 |
return []
|
| 277 |
-
|
| 278 |
messages = []
|
| 279 |
for example in examples:
|
| 280 |
messages += [
|
|
@@ -289,7 +296,10 @@ async def arc_and_evaluate(model, language_bcp_47, nr):
|
|
| 289 |
temperature=0,
|
| 290 |
max_tokens=1,
|
| 291 |
)
|
| 292 |
-
|
|
|
|
|
|
|
|
|
|
| 293 |
except Exception as e:
|
| 294 |
if "ResponsibleAIPolicyViolation" in str(e):
|
| 295 |
acc = 0
|
|
@@ -305,7 +315,68 @@ async def arc_and_evaluate(model, language_bcp_47, nr):
|
|
| 305 |
"sentence_nr": nr,
|
| 306 |
}
|
| 307 |
]
|
| 308 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 309 |
|
| 310 |
async def mgsm_and_evaluate(model, language_bcp_47, nr):
|
| 311 |
system_prompt = """
|
|
@@ -325,11 +396,9 @@ async def mgsm_and_evaluate(model, language_bcp_47, nr):
|
|
| 325 |
temperature=0,
|
| 326 |
max_tokens=1024,
|
| 327 |
)
|
| 328 |
-
|
| 329 |
-
|
| 330 |
-
accuracy = int(
|
| 331 |
-
parse_number(number[1].strip()) == parse_number(question["answer_number"])
|
| 332 |
-
)
|
| 333 |
else:
|
| 334 |
accuracy = 0
|
| 335 |
|
|
@@ -383,6 +452,7 @@ tasks = {
|
|
| 383 |
# "mlm": mlm_and_evaluate,
|
| 384 |
"mmlu": mmlu_and_evaluate,
|
| 385 |
"arc": arc_and_evaluate,
|
|
|
|
| 386 |
"mgsm": mgsm_and_evaluate,
|
| 387 |
# "asr": transcribe_and_evaluate,
|
| 388 |
}
|
|
|
|
| 9 |
from datasets_.mgsm import load_mgsm, parse_number
|
| 10 |
from datasets_.mmlu import load_mmlu
|
| 11 |
from datasets_.arc import load_uhura_arc_easy
|
| 12 |
+
from datasets_.truthfulqa import load_truthfulqa
|
| 13 |
from google.cloud import translate_v2 as translate
|
| 14 |
from langcodes import closest_supported_match
|
| 15 |
from languages import languages, script_name
|
|
|
|
| 225 |
}
|
| 226 |
]
|
| 227 |
|
| 228 |
+
|
| 229 |
def format_multiple_choice(item):
|
| 230 |
return f"""{item["question"]}
|
| 231 |
|
|
|
|
| 236 |
|
| 237 |
A|B|C|D?"""
|
| 238 |
|
| 239 |
+
|
| 240 |
async def mmlu_and_evaluate(model, language_bcp_47, nr):
|
| 241 |
ds_name, examples, task = load_mmlu(language_bcp_47, nr)
|
| 242 |
if not task:
|
|
|
|
| 256 |
temperature=0,
|
| 257 |
max_tokens=1,
|
| 258 |
)
|
| 259 |
+
if response:
|
| 260 |
+
acc = int(response[:1].strip() == task["answer"])
|
| 261 |
+
else:
|
| 262 |
+
acc = 0
|
| 263 |
except Exception as e:
|
| 264 |
if "ResponsibleAIPolicyViolation" in str(e):
|
| 265 |
acc = 0
|
|
|
|
| 276 |
}
|
| 277 |
]
|
| 278 |
|
| 279 |
+
|
| 280 |
async def arc_and_evaluate(model, language_bcp_47, nr):
|
| 281 |
ds_name, examples, task = load_uhura_arc_easy(language_bcp_47, nr)
|
| 282 |
if not task:
|
| 283 |
return []
|
| 284 |
+
|
| 285 |
messages = []
|
| 286 |
for example in examples:
|
| 287 |
messages += [
|
|
|
|
| 296 |
temperature=0,
|
| 297 |
max_tokens=1,
|
| 298 |
)
|
| 299 |
+
if response:
|
| 300 |
+
acc = int(response[:1].strip() == task["answer"])
|
| 301 |
+
else:
|
| 302 |
+
acc = 0
|
| 303 |
except Exception as e:
|
| 304 |
if "ResponsibleAIPolicyViolation" in str(e):
|
| 305 |
acc = 0
|
|
|
|
| 315 |
"sentence_nr": nr,
|
| 316 |
}
|
| 317 |
]
|
| 318 |
+
|
| 319 |
+
|
| 320 |
+
# Answer-option labels: the i-th choice of a question is presented as letters[i].
letters = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
|
| 321 |
+
|
| 322 |
+
|
| 323 |
+
def shuffle_choices_and_labels(item):
    """Randomly reorder ``item["choices"]`` and ``item["labels"]`` together.

    Both lists are permuted with the same random order, so each choice keeps
    its label. ``item`` is modified in place and returned.
    """
    order = list(range(len(item["choices"])))
    random.shuffle(order)
    for key in ("choices", "labels"):
        values = item[key]
        item[key] = [values[pos] for pos in order]
    return item
|
| 329 |
+
|
| 330 |
+
|
| 331 |
+
def format_multiple_choice_truthfulqa(item):
    """Render a question and its choices as a lettered multiple-choice prompt.

    The result is the question, a blank line, one ``"<letter>: <choice>"``
    line per choice, and a final line listing the usable letters separated by
    ``|`` and ending in ``?`` (for example ``A|B|C?``).
    """
    options = item["choices"]
    option_lines = [
        f"{letters[pos]}: {option}\n" for pos, option in enumerate(options)
    ]
    answer_menu = "|".join(letters[: len(options)]) + "?"
    return item["question"] + "\n\n" + "".join(option_lines) + answer_menu
|
| 337 |
+
|
| 338 |
+
|
| 339 |
+
async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
    """Score ``model`` on one TruthfulQA multiple-choice question.

    Question ``nr`` for ``language_bcp_47`` is fetched with
    ``load_truthfulqa`` and its answer options are shuffled. Every example
    returned by the loader is shuffled as well and added to the conversation
    as a user question followed by the correct letter as the assistant reply;
    the actual question is appended last.

    Returns:
        An empty list when the loader yields no task. Otherwise a one-element
        list with a result record whose ``score`` is 1 if the first character
        of the reply equals the correct letter and 0 otherwise (including an
        empty or missing reply).

    Raises:
        Exception: errors raised while requesting or scoring the reply are
            re-raised, unless their text contains
            "ResponsibleAIPolicyViolation", which is scored as 0.
    """
    _, examples, task = load_truthfulqa(language_bcp_47, nr)
    if not task:
        return []

    # The task is shuffled before the examples, and the correct letter is read
    # off after shuffling.
    task = shuffle_choices_and_labels(task)
    answer = letters[task["labels"].index(1)]

    messages = []
    for example in examples:
        shot = shuffle_choices_and_labels(example)
        messages.append(
            {"role": "user", "content": format_multiple_choice_truthfulqa(shot)}
        )
        messages.append(
            {"role": "assistant", "content": letters[shot["labels"].index(1)]}
        )
    messages.append(
        {"role": "user", "content": format_multiple_choice_truthfulqa(task)}
    )

    try:
        response = await complete(
            model=model,
            messages=messages,
            temperature=0,
            max_tokens=1,
        )
        # Only the first character of the reply is compared with the answer.
        acc = int(bool(response) and response[:1].strip() == answer)
    except Exception as e:
        if "ResponsibleAIPolicyViolation" not in str(e):
            raise e
        acc = 0

    result = {
        "model": model,
        "bcp_47": language_bcp_47,
        "task": "truthfulqa",
        "metric": "accuracy",
        "score": acc,
        "sentence_nr": nr,
    }
    return [result]
|
| 379 |
+
|
| 380 |
|
| 381 |
async def mgsm_and_evaluate(model, language_bcp_47, nr):
|
| 382 |
system_prompt = """
|
|
|
|
| 396 |
temperature=0,
|
| 397 |
max_tokens=1024,
|
| 398 |
)
|
| 399 |
+
if response and len(response.split("####")) == 2:
|
| 400 |
+
number = response.split("####")[1].strip()
|
| 401 |
+
accuracy = int(parse_number(number) == parse_number(question["answer_number"]))
|
|
|
|
|
|
|
| 402 |
else:
|
| 403 |
accuracy = 0
|
| 404 |
|
|
|
|
| 452 |
# "mlm": mlm_and_evaluate,
|
| 453 |
"mmlu": mmlu_and_evaluate,
|
| 454 |
"arc": arc_and_evaluate,
|
| 455 |
+
"truthfulqa": truthfulqa_and_evaluate,
|
| 456 |
"mgsm": mgsm_and_evaluate,
|
| 457 |
# "asr": transcribe_and_evaluate,
|
| 458 |
}
|
frontend/src/components/ScoreColumns.js
CHANGED
|
@@ -14,7 +14,7 @@ const ScoreColumns = [
|
|
| 14 |
<Column
|
| 15 |
field='average'
|
| 16 |
header='Proficiency'
|
| 17 |
-
headerTooltip='Language Proficiency Score (average
|
| 18 |
sortable
|
| 19 |
body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
|
| 20 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
|
|
|
| 14 |
<Column
|
| 15 |
field='average'
|
| 16 |
header='Proficiency'
|
| 17 |
+
headerTooltip='Language Proficiency Score (average of the scores for each task, after min-max normalization)'
|
| 18 |
sortable
|
| 19 |
body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
|
| 20 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
languages.json
CHANGED
|
@@ -79,7 +79,7 @@
|
|
| 79 |
"family":"Indo-European",
|
| 80 |
"flores_path":"fra_Latn",
|
| 81 |
"fleurs_tag":"fr_fr",
|
| 82 |
-
"commonvoice_hours":
|
| 83 |
"commonvoice_locale":"fr",
|
| 84 |
"in_benchmark":true
|
| 85 |
},
|
|
@@ -1375,7 +1375,7 @@
|
|
| 1375 |
"family":"Turkic",
|
| 1376 |
"flores_path":"uig_Arab",
|
| 1377 |
"fleurs_tag":null,
|
| 1378 |
-
"commonvoice_hours":
|
| 1379 |
"commonvoice_locale":"ug",
|
| 1380 |
"in_benchmark":true
|
| 1381 |
},
|
|
@@ -1747,7 +1747,7 @@
|
|
| 1747 |
"family":"Indo-European",
|
| 1748 |
"flores_path":"nob_Latn",
|
| 1749 |
"fleurs_tag":"nb_no",
|
| 1750 |
-
"commonvoice_hours":0.
|
| 1751 |
"commonvoice_locale":"nb-NO",
|
| 1752 |
"in_benchmark":true
|
| 1753 |
},
|
|
@@ -2323,7 +2323,7 @@
|
|
| 2323 |
"family":"Dravidian",
|
| 2324 |
"flores_path":null,
|
| 2325 |
"fleurs_tag":null,
|
| 2326 |
-
"commonvoice_hours":
|
| 2327 |
"commonvoice_locale":"brh",
|
| 2328 |
"in_benchmark":false
|
| 2329 |
},
|
|
@@ -2623,7 +2623,7 @@
|
|
| 2623 |
"family":"Indo-European",
|
| 2624 |
"flores_path":null,
|
| 2625 |
"fleurs_tag":null,
|
| 2626 |
-
"commonvoice_hours":0.
|
| 2627 |
"commonvoice_locale":"haz",
|
| 2628 |
"in_benchmark":false
|
| 2629 |
},
|
|
@@ -4651,7 +4651,7 @@
|
|
| 4651 |
"family":"Abkhaz-Adyge",
|
| 4652 |
"flores_path":null,
|
| 4653 |
"fleurs_tag":null,
|
| 4654 |
-
"commonvoice_hours":
|
| 4655 |
"commonvoice_locale":"ady",
|
| 4656 |
"in_benchmark":false
|
| 4657 |
},
|
|
|
|
| 79 |
"family":"Indo-European",
|
| 80 |
"flores_path":"fra_Latn",
|
| 81 |
"fleurs_tag":"fr_fr",
|
| 82 |
+
"commonvoice_hours":1065.0,
|
| 83 |
"commonvoice_locale":"fr",
|
| 84 |
"in_benchmark":true
|
| 85 |
},
|
|
|
|
| 1375 |
"family":"Turkic",
|
| 1376 |
"flores_path":"uig_Arab",
|
| 1377 |
"fleurs_tag":null,
|
| 1378 |
+
"commonvoice_hours":411.0,
|
| 1379 |
"commonvoice_locale":"ug",
|
| 1380 |
"in_benchmark":true
|
| 1381 |
},
|
|
|
|
| 1747 |
"family":"Indo-European",
|
| 1748 |
"flores_path":"nob_Latn",
|
| 1749 |
"fleurs_tag":"nb_no",
|
| 1750 |
+
"commonvoice_hours":0.5,
|
| 1751 |
"commonvoice_locale":"nb-NO",
|
| 1752 |
"in_benchmark":true
|
| 1753 |
},
|
|
|
|
| 2323 |
"family":"Dravidian",
|
| 2324 |
"flores_path":null,
|
| 2325 |
"fleurs_tag":null,
|
| 2326 |
+
"commonvoice_hours":1.2,
|
| 2327 |
"commonvoice_locale":"brh",
|
| 2328 |
"in_benchmark":false
|
| 2329 |
},
|
|
|
|
| 2623 |
"family":"Indo-European",
|
| 2624 |
"flores_path":null,
|
| 2625 |
"fleurs_tag":null,
|
| 2626 |
+
"commonvoice_hours":0.9,
|
| 2627 |
"commonvoice_locale":"haz",
|
| 2628 |
"in_benchmark":false
|
| 2629 |
},
|
|
|
|
| 4651 |
"family":"Abkhaz-Adyge",
|
| 4652 |
"flores_path":null,
|
| 4653 |
"fleurs_tag":null,
|
| 4654 |
+
"commonvoice_hours":30.0,
|
| 4655 |
"commonvoice_locale":"ady",
|
| 4656 |
"in_benchmark":false
|
| 4657 |
},
|
models.json
CHANGED
|
@@ -15,6 +15,7 @@
|
|
| 15 |
"classification",
|
| 16 |
"mmlu",
|
| 17 |
"arc",
|
|
|
|
| 18 |
"mgsm"
|
| 19 |
]
|
| 20 |
},
|
|
@@ -34,6 +35,7 @@
|
|
| 34 |
"classification",
|
| 35 |
"mmlu",
|
| 36 |
"arc",
|
|
|
|
| 37 |
"mgsm"
|
| 38 |
]
|
| 39 |
},
|
|
@@ -53,6 +55,7 @@
|
|
| 53 |
"classification",
|
| 54 |
"mmlu",
|
| 55 |
"arc",
|
|
|
|
| 56 |
"mgsm"
|
| 57 |
]
|
| 58 |
},
|
|
@@ -72,6 +75,7 @@
|
|
| 72 |
"classification",
|
| 73 |
"mmlu",
|
| 74 |
"arc",
|
|
|
|
| 75 |
"mgsm"
|
| 76 |
]
|
| 77 |
},
|
|
@@ -91,6 +95,7 @@
|
|
| 91 |
"classification",
|
| 92 |
"mmlu",
|
| 93 |
"arc",
|
|
|
|
| 94 |
"mgsm"
|
| 95 |
]
|
| 96 |
},
|
|
@@ -110,6 +115,7 @@
|
|
| 110 |
"classification",
|
| 111 |
"mmlu",
|
| 112 |
"arc",
|
|
|
|
| 113 |
"mgsm"
|
| 114 |
]
|
| 115 |
},
|
|
@@ -129,6 +135,7 @@
|
|
| 129 |
"classification",
|
| 130 |
"mmlu",
|
| 131 |
"arc",
|
|
|
|
| 132 |
"mgsm"
|
| 133 |
]
|
| 134 |
},
|
|
@@ -141,13 +148,14 @@
|
|
| 141 |
"size":684531386000.0,
|
| 142 |
"type":"open-source",
|
| 143 |
"license":"Mit",
|
| 144 |
-
"creation_date":1748390400000,
|
| 145 |
"tasks":[
|
| 146 |
"translation_from",
|
| 147 |
"translation_to",
|
| 148 |
"classification",
|
| 149 |
"mmlu",
|
| 150 |
"arc",
|
|
|
|
| 151 |
"mgsm"
|
| 152 |
]
|
| 153 |
},
|
|
@@ -167,6 +175,7 @@
|
|
| 167 |
"classification",
|
| 168 |
"mmlu",
|
| 169 |
"arc",
|
|
|
|
| 170 |
"mgsm"
|
| 171 |
]
|
| 172 |
},
|
|
@@ -186,6 +195,7 @@
|
|
| 186 |
"classification",
|
| 187 |
"mmlu",
|
| 188 |
"arc",
|
|
|
|
| 189 |
"mgsm"
|
| 190 |
]
|
| 191 |
},
|
|
@@ -205,6 +215,7 @@
|
|
| 205 |
"classification",
|
| 206 |
"mmlu",
|
| 207 |
"arc",
|
|
|
|
| 208 |
"mgsm"
|
| 209 |
]
|
| 210 |
},
|
|
@@ -271,12 +282,14 @@
|
|
| 271 |
"size":null,
|
| 272 |
"type":"closed-source",
|
| 273 |
"license":null,
|
| 274 |
-
"creation_date":1750118400000
|
| 275 |
"tasks":[
|
| 276 |
"translation_from",
|
| 277 |
"translation_to",
|
| 278 |
"classification",
|
| 279 |
"mmlu",
|
|
|
|
|
|
|
| 280 |
"mgsm"
|
| 281 |
]
|
| 282 |
},
|
|
@@ -332,6 +345,7 @@
|
|
| 332 |
"classification",
|
| 333 |
"mmlu",
|
| 334 |
"arc",
|
|
|
|
| 335 |
"mgsm"
|
| 336 |
]
|
| 337 |
},
|
|
@@ -351,6 +365,7 @@
|
|
| 351 |
"classification",
|
| 352 |
"mmlu",
|
| 353 |
"arc",
|
|
|
|
| 354 |
"mgsm"
|
| 355 |
]
|
| 356 |
},
|
|
@@ -370,6 +385,7 @@
|
|
| 370 |
"classification",
|
| 371 |
"mmlu",
|
| 372 |
"arc",
|
|
|
|
| 373 |
"mgsm"
|
| 374 |
]
|
| 375 |
},
|
|
@@ -404,6 +420,7 @@
|
|
| 404 |
"classification",
|
| 405 |
"mmlu",
|
| 406 |
"arc",
|
|
|
|
| 407 |
"mgsm"
|
| 408 |
]
|
| 409 |
},
|
|
@@ -423,6 +440,7 @@
|
|
| 423 |
"classification",
|
| 424 |
"mmlu",
|
| 425 |
"arc",
|
|
|
|
| 426 |
"mgsm"
|
| 427 |
]
|
| 428 |
},
|
|
@@ -442,6 +460,7 @@
|
|
| 442 |
"classification",
|
| 443 |
"mmlu",
|
| 444 |
"arc",
|
|
|
|
| 445 |
"mgsm"
|
| 446 |
]
|
| 447 |
},
|
|
@@ -485,6 +504,7 @@
|
|
| 485 |
"classification",
|
| 486 |
"mmlu",
|
| 487 |
"arc",
|
|
|
|
| 488 |
"mgsm"
|
| 489 |
]
|
| 490 |
},
|
|
@@ -504,6 +524,7 @@
|
|
| 504 |
"classification",
|
| 505 |
"mmlu",
|
| 506 |
"arc",
|
|
|
|
| 507 |
"mgsm"
|
| 508 |
]
|
| 509 |
},
|
|
@@ -523,6 +544,7 @@
|
|
| 523 |
"classification",
|
| 524 |
"mmlu",
|
| 525 |
"arc",
|
|
|
|
| 526 |
"mgsm"
|
| 527 |
]
|
| 528 |
},
|
|
@@ -542,6 +564,7 @@
|
|
| 542 |
"classification",
|
| 543 |
"mmlu",
|
| 544 |
"arc",
|
|
|
|
| 545 |
"mgsm"
|
| 546 |
]
|
| 547 |
},
|
|
@@ -561,6 +584,7 @@
|
|
| 561 |
"classification",
|
| 562 |
"mmlu",
|
| 563 |
"arc",
|
|
|
|
| 564 |
"mgsm"
|
| 565 |
]
|
| 566 |
},
|
|
@@ -580,6 +604,7 @@
|
|
| 580 |
"classification",
|
| 581 |
"mmlu",
|
| 582 |
"arc",
|
|
|
|
| 583 |
"mgsm"
|
| 584 |
]
|
| 585 |
},
|
|
@@ -599,6 +624,27 @@
|
|
| 599 |
"classification",
|
| 600 |
"mmlu",
|
| 601 |
"arc",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 602 |
"mgsm"
|
| 603 |
]
|
| 604 |
},
|
|
@@ -611,8 +657,16 @@
|
|
| 611 |
"size":null,
|
| 612 |
"type":"closed-source",
|
| 613 |
"license":null,
|
| 614 |
-
"creation_date":1744588800000
|
| 615 |
-
"tasks":
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 616 |
},
|
| 617 |
{
|
| 618 |
"id":"openai\/gpt-4.1-mini",
|
|
@@ -630,6 +684,7 @@
|
|
| 630 |
"classification",
|
| 631 |
"mmlu",
|
| 632 |
"arc",
|
|
|
|
| 633 |
"mgsm"
|
| 634 |
]
|
| 635 |
},
|
|
@@ -649,6 +704,7 @@
|
|
| 649 |
"classification",
|
| 650 |
"mmlu",
|
| 651 |
"arc",
|
|
|
|
| 652 |
"mgsm"
|
| 653 |
]
|
| 654 |
},
|
|
@@ -668,6 +724,67 @@
|
|
| 668 |
"classification",
|
| 669 |
"mmlu",
|
| 670 |
"arc",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 671 |
"mgsm"
|
| 672 |
]
|
| 673 |
}
|
|
|
|
| 15 |
"classification",
|
| 16 |
"mmlu",
|
| 17 |
"arc",
|
| 18 |
+
"truthfulqa",
|
| 19 |
"mgsm"
|
| 20 |
]
|
| 21 |
},
|
|
|
|
| 35 |
"classification",
|
| 36 |
"mmlu",
|
| 37 |
"arc",
|
| 38 |
+
"truthfulqa",
|
| 39 |
"mgsm"
|
| 40 |
]
|
| 41 |
},
|
|
|
|
| 55 |
"classification",
|
| 56 |
"mmlu",
|
| 57 |
"arc",
|
| 58 |
+
"truthfulqa",
|
| 59 |
"mgsm"
|
| 60 |
]
|
| 61 |
},
|
|
|
|
| 75 |
"classification",
|
| 76 |
"mmlu",
|
| 77 |
"arc",
|
| 78 |
+
"truthfulqa",
|
| 79 |
"mgsm"
|
| 80 |
]
|
| 81 |
},
|
|
|
|
| 95 |
"classification",
|
| 96 |
"mmlu",
|
| 97 |
"arc",
|
| 98 |
+
"truthfulqa",
|
| 99 |
"mgsm"
|
| 100 |
]
|
| 101 |
},
|
|
|
|
| 115 |
"classification",
|
| 116 |
"mmlu",
|
| 117 |
"arc",
|
| 118 |
+
"truthfulqa",
|
| 119 |
"mgsm"
|
| 120 |
]
|
| 121 |
},
|
|
|
|
| 135 |
"classification",
|
| 136 |
"mmlu",
|
| 137 |
"arc",
|
| 138 |
+
"truthfulqa",
|
| 139 |
"mgsm"
|
| 140 |
]
|
| 141 |
},
|
|
|
|
| 148 |
"size":684531386000.0,
|
| 149 |
"type":"open-source",
|
| 150 |
"license":"Mit",
|
| 151 |
+
"creation_date":1748390400000.0,
|
| 152 |
"tasks":[
|
| 153 |
"translation_from",
|
| 154 |
"translation_to",
|
| 155 |
"classification",
|
| 156 |
"mmlu",
|
| 157 |
"arc",
|
| 158 |
+
"truthfulqa",
|
| 159 |
"mgsm"
|
| 160 |
]
|
| 161 |
},
|
|
|
|
| 175 |
"classification",
|
| 176 |
"mmlu",
|
| 177 |
"arc",
|
| 178 |
+
"truthfulqa",
|
| 179 |
"mgsm"
|
| 180 |
]
|
| 181 |
},
|
|
|
|
| 195 |
"classification",
|
| 196 |
"mmlu",
|
| 197 |
"arc",
|
| 198 |
+
"truthfulqa",
|
| 199 |
"mgsm"
|
| 200 |
]
|
| 201 |
},
|
|
|
|
| 215 |
"classification",
|
| 216 |
"mmlu",
|
| 217 |
"arc",
|
| 218 |
+
"truthfulqa",
|
| 219 |
"mgsm"
|
| 220 |
]
|
| 221 |
},
|
|
|
|
| 282 |
"size":null,
|
| 283 |
"type":"closed-source",
|
| 284 |
"license":null,
|
| 285 |
+
"creation_date":1750118400000,
|
| 286 |
"tasks":[
|
| 287 |
"translation_from",
|
| 288 |
"translation_to",
|
| 289 |
"classification",
|
| 290 |
"mmlu",
|
| 291 |
+
"arc",
|
| 292 |
+
"truthfulqa",
|
| 293 |
"mgsm"
|
| 294 |
]
|
| 295 |
},
|
|
|
|
| 345 |
"classification",
|
| 346 |
"mmlu",
|
| 347 |
"arc",
|
| 348 |
+
"truthfulqa",
|
| 349 |
"mgsm"
|
| 350 |
]
|
| 351 |
},
|
|
|
|
| 365 |
"classification",
|
| 366 |
"mmlu",
|
| 367 |
"arc",
|
| 368 |
+
"truthfulqa",
|
| 369 |
"mgsm"
|
| 370 |
]
|
| 371 |
},
|
|
|
|
| 385 |
"classification",
|
| 386 |
"mmlu",
|
| 387 |
"arc",
|
| 388 |
+
"truthfulqa",
|
| 389 |
"mgsm"
|
| 390 |
]
|
| 391 |
},
|
|
|
|
| 420 |
"classification",
|
| 421 |
"mmlu",
|
| 422 |
"arc",
|
| 423 |
+
"truthfulqa",
|
| 424 |
"mgsm"
|
| 425 |
]
|
| 426 |
},
|
|
|
|
| 440 |
"classification",
|
| 441 |
"mmlu",
|
| 442 |
"arc",
|
| 443 |
+
"truthfulqa",
|
| 444 |
"mgsm"
|
| 445 |
]
|
| 446 |
},
|
|
|
|
| 460 |
"classification",
|
| 461 |
"mmlu",
|
| 462 |
"arc",
|
| 463 |
+
"truthfulqa",
|
| 464 |
"mgsm"
|
| 465 |
]
|
| 466 |
},
|
|
|
|
| 504 |
"classification",
|
| 505 |
"mmlu",
|
| 506 |
"arc",
|
| 507 |
+
"truthfulqa",
|
| 508 |
"mgsm"
|
| 509 |
]
|
| 510 |
},
|
|
|
|
| 524 |
"classification",
|
| 525 |
"mmlu",
|
| 526 |
"arc",
|
| 527 |
+
"truthfulqa",
|
| 528 |
"mgsm"
|
| 529 |
]
|
| 530 |
},
|
|
|
|
| 544 |
"classification",
|
| 545 |
"mmlu",
|
| 546 |
"arc",
|
| 547 |
+
"truthfulqa",
|
| 548 |
"mgsm"
|
| 549 |
]
|
| 550 |
},
|
|
|
|
| 564 |
"classification",
|
| 565 |
"mmlu",
|
| 566 |
"arc",
|
| 567 |
+
"truthfulqa",
|
| 568 |
"mgsm"
|
| 569 |
]
|
| 570 |
},
|
|
|
|
| 584 |
"classification",
|
| 585 |
"mmlu",
|
| 586 |
"arc",
|
| 587 |
+
"truthfulqa",
|
| 588 |
"mgsm"
|
| 589 |
]
|
| 590 |
},
|
|
|
|
| 604 |
"classification",
|
| 605 |
"mmlu",
|
| 606 |
"arc",
|
| 607 |
+
"truthfulqa",
|
| 608 |
"mgsm"
|
| 609 |
]
|
| 610 |
},
|
|
|
|
| 624 |
"classification",
|
| 625 |
"mmlu",
|
| 626 |
"arc",
|
| 627 |
+
"truthfulqa",
|
| 628 |
+
"mgsm"
|
| 629 |
+
]
|
| 630 |
+
},
|
| 631 |
+
{
|
| 632 |
+
"id":"openai\/gpt-3.5-turbo-0613",
|
| 633 |
+
"name":"GPT-3.5 Turbo (older v0613)",
|
| 634 |
+
"provider_name":"OpenAI",
|
| 635 |
+
"cost":2.0,
|
| 636 |
+
"hf_id":null,
|
| 637 |
+
"size":null,
|
| 638 |
+
"type":"closed-source",
|
| 639 |
+
"license":null,
|
| 640 |
+
"creation_date":1706140800000,
|
| 641 |
+
"tasks":[
|
| 642 |
+
"translation_from",
|
| 643 |
+
"translation_to",
|
| 644 |
+
"classification",
|
| 645 |
+
"mmlu",
|
| 646 |
+
"arc",
|
| 647 |
+
"truthfulqa",
|
| 648 |
"mgsm"
|
| 649 |
]
|
| 650 |
},
|
|
|
|
| 657 |
"size":null,
|
| 658 |
"type":"closed-source",
|
| 659 |
"license":null,
|
| 660 |
+
"creation_date":1744588800000,
|
| 661 |
+
"tasks":[
|
| 662 |
+
"translation_from",
|
| 663 |
+
"translation_to",
|
| 664 |
+
"classification",
|
| 665 |
+
"mmlu",
|
| 666 |
+
"arc",
|
| 667 |
+
"truthfulqa",
|
| 668 |
+
"mgsm"
|
| 669 |
+
]
|
| 670 |
},
|
| 671 |
{
|
| 672 |
"id":"openai\/gpt-4.1-mini",
|
|
|
|
| 684 |
"classification",
|
| 685 |
"mmlu",
|
| 686 |
"arc",
|
| 687 |
+
"truthfulqa",
|
| 688 |
"mgsm"
|
| 689 |
]
|
| 690 |
},
|
|
|
|
| 704 |
"classification",
|
| 705 |
"mmlu",
|
| 706 |
"arc",
|
| 707 |
+
"truthfulqa",
|
| 708 |
"mgsm"
|
| 709 |
]
|
| 710 |
},
|
|
|
|
| 724 |
"classification",
|
| 725 |
"mmlu",
|
| 726 |
"arc",
|
| 727 |
+
"truthfulqa",
|
| 728 |
+
"mgsm"
|
| 729 |
+
]
|
| 730 |
+
},
|
| 731 |
+
{
|
| 732 |
+
"id":"qwen\/qwen3-235b-a22b",
|
| 733 |
+
"name":"Qwen3 235B A22B",
|
| 734 |
+
"provider_name":"Qwen",
|
| 735 |
+
"cost":0.0,
|
| 736 |
+
"hf_id":"Qwen\/Qwen3-235B-A22B",
|
| 737 |
+
"size":235093634560.0,
|
| 738 |
+
"type":"open-source",
|
| 739 |
+
"license":"Apache 2.0",
|
| 740 |
+
"creation_date":1745712000000,
|
| 741 |
+
"tasks":[
|
| 742 |
+
"translation_from",
|
| 743 |
+
"translation_to",
|
| 744 |
+
"classification",
|
| 745 |
+
"mmlu",
|
| 746 |
+
"arc",
|
| 747 |
+
"truthfulqa",
|
| 748 |
+
"mgsm"
|
| 749 |
+
]
|
| 750 |
+
},
|
| 751 |
+
{
|
| 752 |
+
"id":"qwen\/qwen3-30b-a3b",
|
| 753 |
+
"name":"Qwen3 30B A3B",
|
| 754 |
+
"provider_name":"Qwen",
|
| 755 |
+
"cost":0.0,
|
| 756 |
+
"hf_id":"Qwen\/Qwen3-30B-A3B",
|
| 757 |
+
"size":30532122624.0,
|
| 758 |
+
"type":"open-source",
|
| 759 |
+
"license":"Apache 2.0",
|
| 760 |
+
"creation_date":1745712000000,
|
| 761 |
+
"tasks":[
|
| 762 |
+
"translation_from",
|
| 763 |
+
"translation_to",
|
| 764 |
+
"classification",
|
| 765 |
+
"mmlu",
|
| 766 |
+
"arc",
|
| 767 |
+
"truthfulqa",
|
| 768 |
+
"mgsm"
|
| 769 |
+
]
|
| 770 |
+
},
|
| 771 |
+
{
|
| 772 |
+
"id":"qwen\/qwen3-32b",
|
| 773 |
+
"name":"Qwen3 32B",
|
| 774 |
+
"provider_name":"Qwen",
|
| 775 |
+
"cost":0.0,
|
| 776 |
+
"hf_id":"Qwen\/Qwen3-32B",
|
| 777 |
+
"size":32762123264.0,
|
| 778 |
+
"type":"open-source",
|
| 779 |
+
"license":"Apache 2.0",
|
| 780 |
+
"creation_date":1745712000000,
|
| 781 |
+
"tasks":[
|
| 782 |
+
"translation_from",
|
| 783 |
+
"translation_to",
|
| 784 |
+
"classification",
|
| 785 |
+
"mmlu",
|
| 786 |
+
"arc",
|
| 787 |
+
"truthfulqa",
|
| 788 |
"mgsm"
|
| 789 |
]
|
| 790 |
}
|
results.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|