Spaces:
Running
Running
Upload from GitHub Actions: updated translation functions
Browse files
- evals/datasets_/mmlu.py +13 -4
- evals/datasets_/truthfulqa.py +52 -16
- evals/models.py +5 -1
evals/datasets_/mmlu.py
CHANGED
|
@@ -111,6 +111,7 @@ def print_datasets_analysis():
|
|
| 111 |
# MMLUX is translated using DeepL
|
| 112 |
# Therefore, the priority is: AfriMMLU, Global-MMLU, MMLUX, Okapi-MMLU
|
| 113 |
|
|
|
|
| 114 |
# print_datasets_analysis()
|
| 115 |
|
| 116 |
|
|
@@ -195,7 +196,13 @@ async def load_mmlu_translated(language_bcp_47, nr):
|
|
| 195 |
filtered = ds["test"].filter(lambda x: x["subject"] == category)
|
| 196 |
if len(filtered) == 0:
|
| 197 |
return None, None, None
|
| 198 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 199 |
|
| 200 |
# Translate question and choices
|
| 201 |
question_translated = await translate_google(task["question"], "en", language_bcp_47)
|
|
@@ -226,7 +233,7 @@ def translate_mmlu(languages):
|
|
| 226 |
for lang in languages["bcp_47"].values[:150]
|
| 227 |
if lang not in human_translated and lang in get_google_supported_languages()
|
| 228 |
]
|
| 229 |
-
n_samples =
|
| 230 |
|
| 231 |
slug = "fair-forward/mmlu-autotranslated"
|
| 232 |
for lang in tqdm(untranslated):
|
|
@@ -242,8 +249,10 @@ def translate_mmlu(languages):
|
|
| 242 |
if split == "dev":
|
| 243 |
samples.extend(ds.filter(lambda x: x["subject"] == category))
|
| 244 |
else:
|
| 245 |
-
|
| 246 |
-
|
|
|
|
|
|
|
| 247 |
samples.append(task)
|
| 248 |
questions_tr = [
|
| 249 |
translate_google(s["question"], "en", lang) for s in samples
|
|
|
|
| 111 |
# MMLUX is translated using DeepL
|
| 112 |
# Therefore, the priority is: AfriMMLU, Global-MMLU, MMLUX, Okapi-MMLU
|
| 113 |
|
| 114 |
+
|
| 115 |
# print_datasets_analysis()
|
| 116 |
|
| 117 |
|
|
|
|
| 196 |
filtered = ds["test"].filter(lambda x: x["subject"] == category)
|
| 197 |
if len(filtered) == 0:
|
| 198 |
return None, None, None
|
| 199 |
+
|
| 200 |
+
# Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
|
| 201 |
+
if nr < 20:
|
| 202 |
+
task = filtered[nr] # Direct mapping to same sample
|
| 203 |
+
else:
|
| 204 |
+
# Fallback for nr >= 20: wrap the index around the filtered subset length (modulo)
|
| 205 |
+
task = filtered[nr % len(filtered)]
|
| 206 |
|
| 207 |
# Translate question and choices
|
| 208 |
question_translated = await translate_google(task["question"], "en", language_bcp_47)
|
|
|
|
| 233 |
for lang in languages["bcp_47"].values[:150]
|
| 234 |
if lang not in human_translated and lang in get_google_supported_languages()
|
| 235 |
]
|
| 236 |
+
n_samples = 20
|
| 237 |
|
| 238 |
slug = "fair-forward/mmlu-autotranslated"
|
| 239 |
for lang in tqdm(untranslated):
|
|
|
|
| 249 |
if split == "dev":
|
| 250 |
samples.extend(ds.filter(lambda x: x["subject"] == category))
|
| 251 |
else:
|
| 252 |
+
# Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
|
| 253 |
+
filtered = ds.filter(lambda x: x["subject"] == category)
|
| 254 |
+
for i in range(min(n_samples, len(filtered))):
|
| 255 |
+
task = filtered[i]
|
| 256 |
samples.append(task)
|
| 257 |
questions_tr = [
|
| 258 |
translate_google(s["question"], "en", lang) for s in samples
|
evals/datasets_/truthfulqa.py
CHANGED
|
@@ -14,11 +14,21 @@ from models import translate_google, get_google_supported_languages
|
|
| 14 |
from datasets_.util import _get_dataset_config_names, _load_dataset
|
| 15 |
|
| 16 |
slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa"
|
|
|
|
|
|
|
| 17 |
tags_uhura_truthfulqa = {
|
| 18 |
standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_truthfulqa)
|
| 19 |
if a.endswith("multiple_choice")
|
| 20 |
}
|
| 21 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 22 |
|
| 23 |
def add_choices(row):
|
| 24 |
row["choices"] = row["mc1_targets"]["choices"]
|
|
@@ -34,6 +44,15 @@ async def load_truthfulqa(language_bcp_47, nr):
|
|
| 34 |
ds = ds.map(add_choices)
|
| 35 |
task = ds["test"][nr]
|
| 36 |
return "masakhane/uhura-truthfulqa", task, "human"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 37 |
else:
|
| 38 |
# Fallback to on-the-fly translation for missing languages/samples
|
| 39 |
return await load_truthfulqa_translated(language_bcp_47, nr)
|
|
@@ -52,7 +71,13 @@ async def load_truthfulqa_translated(language_bcp_47, nr):
|
|
| 52 |
# Load English TruthfulQA data
|
| 53 |
ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa["en"])
|
| 54 |
ds = ds.map(add_choices)
|
| 55 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 56 |
|
| 57 |
# Translate question and choices
|
| 58 |
question_translated = await translate_google(task["question"], "en", language_bcp_47)
|
|
@@ -84,6 +109,9 @@ def translate_truthfulqa(languages):
|
|
| 84 |
]
|
| 85 |
n_samples = 20
|
| 86 |
|
|
|
|
|
|
|
|
|
|
| 87 |
slug = "fair-forward/truthfulqa-autotranslated"
|
| 88 |
for lang in tqdm(untranslated):
|
| 89 |
# check if already exists on hub
|
|
@@ -97,32 +125,40 @@ def translate_truthfulqa(languages):
|
|
| 97 |
if split == "train":
|
| 98 |
samples.extend(ds)
|
| 99 |
else:
|
| 100 |
-
|
|
|
|
| 101 |
task = ds[i]
|
| 102 |
samples.append(task)
|
|
|
|
|
|
|
| 103 |
questions_tr = [
|
| 104 |
translate_google(s["question"], "en", lang) for s in samples
|
| 105 |
]
|
| 106 |
questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
|
| 107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
for s in samples:
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
choices_tr
|
| 118 |
-
|
|
|
|
|
|
|
| 119 |
|
| 120 |
ds_lang = Dataset.from_dict(
|
| 121 |
{
|
| 122 |
-
"subject": [s["subject"] for s in samples],
|
| 123 |
"question": questions_tr,
|
| 124 |
-
"choices":
|
| 125 |
-
"
|
| 126 |
}
|
| 127 |
)
|
| 128 |
ds_lang.push_to_hub(
|
|
|
|
| 14 |
from datasets_.util import _get_dataset_config_names, _load_dataset
|
| 15 |
|
| 16 |
slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa"
|
| 17 |
+
slug_truthfulqa_autotranslated = "fair-forward/truthfulqa-autotranslated"
|
| 18 |
+
|
| 19 |
tags_uhura_truthfulqa = {
|
| 20 |
standardize_tag(a.split("_")[0], macro=True): a for a in _get_dataset_config_names(slug_uhura_truthfulqa)
|
| 21 |
if a.endswith("multiple_choice")
|
| 22 |
}
|
| 23 |
|
| 24 |
+
# Get available auto-translated languages
|
| 25 |
+
try:
|
| 26 |
+
tags_truthfulqa_autotranslated = {
|
| 27 |
+
standardize_tag(a, macro=True): a for a in _get_dataset_config_names(slug_truthfulqa_autotranslated)
|
| 28 |
+
}
|
| 29 |
+
except Exception:
|
| 30 |
+
tags_truthfulqa_autotranslated = {}
|
| 31 |
+
|
| 32 |
|
| 33 |
def add_choices(row):
|
| 34 |
row["choices"] = row["mc1_targets"]["choices"]
|
|
|
|
| 44 |
ds = ds.map(add_choices)
|
| 45 |
task = ds["test"][nr]
|
| 46 |
return "masakhane/uhura-truthfulqa", task, "human"
|
| 47 |
+
elif language_bcp_47 in tags_truthfulqa_autotranslated.keys():
|
| 48 |
+
# Load from auto-translated dataset (same samples as translation)
|
| 49 |
+
ds = _load_dataset(slug_truthfulqa_autotranslated, language_bcp_47)
|
| 50 |
+
test_split = ds["test"] if "test" in ds else ds
|
| 51 |
+
if nr < len(test_split):
|
| 52 |
+
task = test_split[nr]
|
| 53 |
+
return slug_truthfulqa_autotranslated, task, "machine"
|
| 54 |
+
# If requested index exceeds stored sample count, fall back to on-the-fly
|
| 55 |
+
return await load_truthfulqa_translated(language_bcp_47, nr)
|
| 56 |
else:
|
| 57 |
# Fallback to on-the-fly translation for missing languages/samples
|
| 58 |
return await load_truthfulqa_translated(language_bcp_47, nr)
|
|
|
|
| 71 |
# Load English TruthfulQA data
|
| 72 |
ds = _load_dataset(slug_uhura_truthfulqa, tags_uhura_truthfulqa["en"])
|
| 73 |
ds = ds.map(add_choices)
|
| 74 |
+
|
| 75 |
+
# Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
|
| 76 |
+
if nr < 20:
|
| 77 |
+
task = ds["test"][nr] # Direct mapping to same sample
|
| 78 |
+
else:
|
| 79 |
+
# Fallback for nr >= 20: wrap the index around the test split length (modulo)
|
| 80 |
+
task = ds["test"][nr % len(ds["test"])]
|
| 81 |
|
| 82 |
# Translate question and choices
|
| 83 |
question_translated = await translate_google(task["question"], "en", language_bcp_47)
|
|
|
|
| 109 |
]
|
| 110 |
n_samples = 20
|
| 111 |
|
| 112 |
+
# Set fixed seed for consistent sample selection across all languages
|
| 113 |
+
random.seed(42)
|
| 114 |
+
|
| 115 |
slug = "fair-forward/truthfulqa-autotranslated"
|
| 116 |
for lang in tqdm(untranslated):
|
| 117 |
# check if already exists on hub
|
|
|
|
| 125 |
if split == "train":
|
| 126 |
samples.extend(ds)
|
| 127 |
else:
|
| 128 |
+
# Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
|
| 129 |
+
for i in range(min(n_samples, len(ds))):
|
| 130 |
task = ds[i]
|
| 131 |
samples.append(task)
|
| 132 |
+
|
| 133 |
+
# Translate questions
|
| 134 |
questions_tr = [
|
| 135 |
translate_google(s["question"], "en", lang) for s in samples
|
| 136 |
]
|
| 137 |
questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
|
| 138 |
+
|
| 139 |
+
# Translate choices for each sample
|
| 140 |
+
all_choices_tr = []
|
| 141 |
+
all_labels = []
|
| 142 |
+
|
| 143 |
for s in samples:
|
| 144 |
+
# Get choices from mc1_targets
|
| 145 |
+
choices = s["mc1_targets"]["choices"]
|
| 146 |
+
labels = s["mc1_targets"]["labels"]
|
| 147 |
+
|
| 148 |
+
# Translate choices
|
| 149 |
+
choices_tr = [
|
| 150 |
+
translate_google(choice, "en", lang) for choice in choices
|
| 151 |
+
]
|
| 152 |
+
choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
|
| 153 |
+
|
| 154 |
+
all_choices_tr.append(choices_tr)
|
| 155 |
+
all_labels.append(labels)
|
| 156 |
|
| 157 |
ds_lang = Dataset.from_dict(
|
| 158 |
{
|
|
|
|
| 159 |
"question": questions_tr,
|
| 160 |
+
"choices": all_choices_tr,
|
| 161 |
+
"labels": all_labels,
|
| 162 |
}
|
| 163 |
)
|
| 164 |
ds_lang.push_to_hub(
|
evals/models.py
CHANGED
|
@@ -8,7 +8,11 @@ from os import getenv
|
|
| 8 |
import pandas as pd
|
| 9 |
from aiolimiter import AsyncLimiter
|
| 10 |
from dotenv import load_dotenv
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
from google.cloud import translate_v2 as translate
|
| 13 |
from huggingface_hub import AsyncInferenceClient, HfApi
|
| 14 |
from joblib.memory import Memory
|
|
|
|
| 8 |
import pandas as pd
|
| 9 |
from aiolimiter import AsyncLimiter
|
| 10 |
from dotenv import load_dotenv
|
| 11 |
+
# Make ElevenLabs optional to avoid hard dependency when not using speech tasks
|
| 12 |
+
try:
|
| 13 |
+
from elevenlabs import AsyncElevenLabs
|
| 14 |
+
except Exception: # ImportError or other env-specific issues
|
| 15 |
+
AsyncElevenLabs = None
|
| 16 |
from google.cloud import translate_v2 as translate
|
| 17 |
from huggingface_hub import AsyncInferenceClient, HfApi
|
| 18 |
from joblib.memory import Memory
|