Upload from GitHub Actions: fix Norwegian
- evals/datasets_/arc.py  +3 -4
- evals/datasets_/mgsm.py  +6 -6
- evals/datasets_/mmlu.py  +6 -6
- evals/datasets_/util.py  +18 -0
evals/datasets_/arc.py
CHANGED

@@ -1,6 +1,5 @@
 import random
 
-from langcodes import standardize_tag
 from rich import print
 from models import translate_google, get_google_supported_languages
 from tqdm import tqdm
@@ -9,11 +8,11 @@ import asyncio
 from tqdm.asyncio import tqdm_asyncio
 import os
 
-from datasets_.util import _get_dataset_config_names, _load_dataset
+from datasets_.util import _get_dataset_config_names, _load_dataset, standardize_bcp47
 
 slug_uhura_arc_easy = "masakhane/uhura-arc-easy"
 tags_uhura_arc_easy = {
-    standardize_tag(a.split("_")[0], macro=True): a
+    standardize_bcp47(a.split("_")[0]): a
     for a in _get_dataset_config_names(slug_uhura_arc_easy)
     if not a.endswith("unmatched")
 }
@@ -35,7 +34,7 @@ random.shuffle(common_ids_test)
 
 slug_uhura_arc_easy_translated = "fair-forward/arc-easy-autotranslated"
 tags_uhura_arc_easy_translated = {
-    standardize_tag(a.split("_")[0], macro=True): a
+    standardize_bcp47(a.split("_")[0]): a
     for a in _get_dataset_config_names(slug_uhura_arc_easy_translated)
 }
 
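For illustration, here is what the swap changes for a single config name. A minimal sketch, assuming a hypothetical config name "nor_Latn" and standard langcodes normalization:

    from langcodes import standardize_tag
    from datasets_.util import standardize_bcp47

    config = "nor_Latn"  # hypothetical config name, for illustration only
    standardize_tag(config.split("_")[0], macro=True)  # "no", the macrolanguage
    standardize_bcp47(config.split("_")[0])            # "nb", mapped to Bokmål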
evals/datasets_/mgsm.py
CHANGED

@@ -3,8 +3,8 @@ import os
 import random
 
 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset, cache
-from langcodes import Language, standardize_tag
+from datasets_.util import _get_dataset_config_names, _load_dataset, cache, standardize_bcp47
+from langcodes import Language
 from models import get_google_supported_languages, translate_google
 from rich import print
 from tqdm import tqdm
@@ -12,20 +12,20 @@ from tqdm.asyncio import tqdm_asyncio
 
 slug_mgsm = "juletxara/mgsm"
 tags_mgsm = {
-    standardize_tag(a, macro=True): a for a in _get_dataset_config_names(slug_mgsm)
+    standardize_bcp47(a): a for a in _get_dataset_config_names(slug_mgsm)
 }
 slug_afrimgsm = "masakhane/afrimgsm"
 tags_afrimgsm = {
-    standardize_tag(a, macro=True): a for a in _get_dataset_config_names(slug_afrimgsm)
+    standardize_bcp47(a): a for a in _get_dataset_config_names(slug_afrimgsm)
 }
 slug_gsm8kx = "Eurolingua/gsm8kx"
 tags_gsm8kx = {
-    standardize_tag(a, macro=True): a
+    standardize_bcp47(a): a
     for a in _get_dataset_config_names(slug_gsm8kx, trust_remote_code=True)
 }
 slug_gsm_autotranslated = "fair-forward/gsm-autotranslated"
 tags_gsm_autotranslated = {
-    standardize_tag(a, macro=True): a
+    standardize_bcp47(a): a
     for a in _get_dataset_config_names(slug_gsm_autotranslated)
 }
 
evals/datasets_/mmlu.py
CHANGED

@@ -4,7 +4,7 @@ import random
 from collections import Counter, defaultdict
 
 from datasets import Dataset, load_dataset
-from datasets_.util import _get_dataset_config_names, _load_dataset, cache
+from datasets_.util import _get_dataset_config_names, _load_dataset, cache, standardize_bcp47
 from langcodes import Language, standardize_tag
 from models import get_google_supported_languages, translate_google
 from rich import print
@@ -24,7 +24,7 @@ def print_datasets_analysis():
     ds1 = _load_dataset(slug1, "eng")
     print_counts(slug1, ds1["dev"]["subject"], ds1["test"]["subject"])
     langs1 = _get_dataset_config_names(slug1)
-    langs1 = [standardize_tag(a, macro=True) for a in langs1]
+    langs1 = [standardize_bcp47(a) for a in langs1]
 
     slug2 = "openai/MMMLU"  # does not have dev set! – but: these languages are all also present in Global-MMLU
     ds2 = _load_dataset(slug2, "FR_FR")
@@ -37,7 +37,7 @@ def print_datasets_analysis():
     ds3 = _load_dataset(slug3, "en")
     print_counts(slug3, ds3["dev"]["subject"], ds3["test"]["subject"])
     langs3 = _get_dataset_config_names(slug3)
-    langs3 = [standardize_tag(a, macro=True) for a in langs3]
+    langs3 = [standardize_bcp47(a) for a in langs3]
 
     slug4 = "lighteval/okapi_mmlu"
     ds4 = _load_dataset(slug4, "ar", trust_remote_code=True)
@@ -132,11 +132,11 @@ def add_choices(row):
 
 
 tags_afrimmlu = {
-    standardize_tag(a, macro=True): a
+    standardize_bcp47(a): a
     for a in _get_dataset_config_names("masakhane/afrimmlu")
 }
 tags_global_mmlu = {
-    standardize_tag(a, macro=True): a
+    standardize_bcp47(a): a
     for a in _get_dataset_config_names("CohereForAI/Global-MMLU")
 }
 tags_okapi = _get_dataset_config_names("lighteval/okapi_mmlu")
@@ -145,7 +145,7 @@ tags_mmlux = set(
     for a in _get_dataset_config_names("Eurolingua/mmlux", trust_remote_code=True)
 )
 tags_mmlu_autotranslated = {
-    standardize_tag(a, macro=True): a
+    standardize_bcp47(a): a
     for a in _get_dataset_config_names("fair-forward/mmlu-autotranslated")
 }
 
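Because every tag dict is now keyed through the same helper, a single language code can index all benchmarks consistently. A hypothetical lookup, assuming the language actually appears in the respective dataset configs:

    lang = "nb"  # Norwegian Bokmål, as produced by standardize_bcp47
    for tags in (tags_afrimmlu, tags_global_mmlu, tags_mmlu_autotranslated):
        if lang in tags:
            print(tags[lang])  # dataset-specific config name for that language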
evals/datasets_/util.py
CHANGED

@@ -6,10 +6,28 @@ from datasets import Dataset, get_dataset_config_names, load_dataset
 from datasets.exceptions import DatasetNotFoundError
 from huggingface_hub.errors import RepositoryNotFoundError
 from joblib.memory import Memory
+from langcodes import standardize_tag
 
 cache = Memory(location=".cache", verbose=0).cache
 TOKEN = os.getenv("HUGGINGFACE_ACCESS_TOKEN")
 
+# Macrolanguage mappings: when standardize_tag returns a macrolanguage,
+# map it to the preferred specific variant for consistency across datasets.
+# This ensures results from different benchmarks use the same language code.
+MACROLANGUAGE_MAPPINGS = {
+    "no": "nb",  # Norwegian -> Norwegian Bokmål (most widely used variant)
+    # Add more mappings here as needed, e.g.:
+    # "ms": "zsm",  # Malay -> Standard Malay
+    # "ar": "arb",  # Arabic -> Standard Arabic
+}
+
+
+def standardize_bcp47(tag: str, macro: bool = True) -> str:
+    """Standardize a BCP-47 tag with consistent macrolanguage handling."""
+
+    standardized = standardize_tag(tag, macro=macro)
+    return MACROLANGUAGE_MAPPINGS.get(standardized, standardized)
+
+
 @cache
 def _get_dataset_config_names(dataset, **kwargs):