Upload from GitHub Actions: Merge pull request #22 from datenlabor-bmz/dev
- .github/workflows/nightly-evals.yml +9 -29
- .gitignore +5 -0
- Dockerfile +1 -1
- README.md +5 -0
- data/datasets.json +783 -0
- evals/__init__.py +0 -1
- evals/backend.py +88 -38
- evals/countries.py +10 -4
- evals/datasets_/arc.py +45 -28
- evals/datasets_/fleurs.py +2 -1
- evals/datasets_/mgsm.py +48 -24
- evals/datasets_/mmlu.py +57 -25
- evals/datasets_/truthfulqa.py +63 -27
- evals/datasets_/util.py +32 -1
- evals/download_data.py +33 -16
- evals/languages.py +3 -0
- evals/main.py +65 -47
- evals/models.py +192 -91
- evals/plots.py +75 -41
- evals/tasks.py +213 -256
- evals/translate.py +1 -1
- frontend/package-lock.json +0 -0
- frontend/package.json +7 -5
- frontend/public/sw.js +9 -0
- frontend/src/App.js +185 -77
- frontend/src/components/HistoryPlot.js +2 -2
- frontend/src/components/LanguageTable.js +1 -1
- frontend/src/components/ModelTable.js +31 -17
- frontend/src/components/ScoreColumns.js +23 -10
- frontend/src/components/ScoreField.js +2 -1
- frontend/src/components/SpeakerPlot.js +2 -2
- frontend/src/components/WorldMap.js +22 -7
- notes/system-architecture-diagram.md +177 -0
- pyproject.toml +2 -10
- uv.lock +0 -0
.github/workflows/nightly-evals.yml CHANGED

@@ -1,13 +1,15 @@
 name: Nightly Evaluation Run
 
 on:
-  schedule:
-    - cron: '0 3 * * *' # Run at 3am UTC every day
+  # schedule:
+  #   - cron: '0 3 * * *' # Run at 3am UTC every day
   workflow_dispatch: # Allow manual triggering
 
 jobs:
   run-evals:
    runs-on: ubuntu-latest
+    # checking if this is working in case eval runs take longer than 6h github actions allowance
+    timeout-minutes: 1440 # 24 hours timeout
    steps:
      - uses: actions/checkout@v3
 
@@ -25,38 +27,16 @@ jobs:
        env:
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
          HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
+          N_SENTENCES: 20
+          MAX_LANGUAGES: 150
        run: |
          uv run huggingface-cli login --token ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
          uv run evals/download_data.py
          uv run evals/main.py
 
-      - name: …
-        env:
-          GH_PAT: ${{ secrets.GH_PAT }}
-        run: |
-          git config --local user.email "github-actions[bot]@users.noreply.github.com"
-          git config --local user.name "github-actions[bot]"
-          git config --local --unset-all http.https://github.com/.extraheader
-          git remote set-url origin https://${GH_PAT}@github.com/datenlabor-bmz/ai-language-monitor.git
-          git add results.json models.json languages.json
-          git commit -m "Update evaluation results" || echo "No changes to commit"
-          git push origin HEAD:main
-
-      - name: Upload to Hugging Face
+      - name: Restart HuggingFace Space
        env:
          HUGGINGFACE_ACCESS_TOKEN: ${{ secrets.HUGGINGFACE_ACCESS_TOKEN }}
        run: |
-          …
-          import os
-          …
-          upload_folder(
-              folder_path=".",
-              path_in_repo="/",
-              allow_patterns=["results.json", "models.json", "languages.json"],
-              repo_id="fair-forward/evals-for-every-language",
-              repo_type="space",
-              token=os.environ["HUGGINGFACE_ACCESS_TOKEN"],
-              commit_message="Upload from nightly evaluation run",
-          )
-          '
+          curl -X POST "https://huggingface.co/api/spaces/fair-forward/evals-for-every-language/restart" \
+            -H "Authorization: Bearer $HUGGINGFACE_ACCESS_TOKEN"
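For reference, the new "Restart HuggingFace Space" step calls the public Spaces REST endpoint shown in the curl command above. A minimal Python equivalent, handy for checking the token locally, might look like this — a sketch: only the endpoint URL and the bearer-token header come from the workflow, the `requests` usage is an assumption:

import os

import requests

# Same endpoint the workflow's curl step posts to after a nightly run.
space = "fair-forward/evals-for-every-language"
resp = requests.post(
    f"https://huggingface.co/api/spaces/{space}/restart",
    headers={"Authorization": f"Bearer {os.environ['HUGGINGFACE_ACCESS_TOKEN']}"},
    timeout=30,
)
resp.raise_for_status()  # a non-2xx status means the token lacks write access to the Space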
.gitignore CHANGED

@@ -1,3 +1,4 @@
+results/
 data/translations/
 floresp-*
 fleurs
@@ -5,6 +6,8 @@ spbleu
 .cache
 .env
 *_credentials.json
+models_unfiltered.json
+**/*.DS_Store
 
 # Python-generated files
 __pycache__/
@@ -20,3 +23,5 @@ wheels/
 # folders and files to be ignored
 .specstory/
 .cursorindexingignore
+
+
Dockerfile CHANGED

@@ -14,7 +14,7 @@ ENV HOME=/home/user \
 RUN mkdir -p ${UV_CACHE_DIR} && chown -R user:user ${HOME}
 USER user
 WORKDIR $HOME/app
-COPY --chown=user pyproject.toml uv.lock ./
+COPY --chown=user pyproject.toml uv.lock README.md ./
 RUN uv sync --frozen --no-dev
 COPY --chown=user evals/ evals/
 COPY --chown=user --from=build /frontend/build /home/user/app/frontend/build
README.md CHANGED

@@ -45,6 +45,7 @@ _Tracking language proficiency of AI models for every language_
 
 ## Evaluate
 
+### Local Development
 ```bash
 uv run --extra dev evals/main.py
 ```
@@ -55,3 +56,7 @@ uv run --extra dev evals/main.py
 uv run evals/backend.py
 cd frontend && npm i && npm start
 ```
+
+## System Architecture
+
+See [notes/system-architecture-diagram.md](notes/system-architecture-diagram.md) for the complete system architecture diagram and component descriptions.
data/datasets.json ADDED

@@ -0,0 +1,783 @@
[
  {
    "name": "FLORES+",
    "author": "Meta",
    "author_url": "https://ai.meta.com",
    "url": "https://huggingface.co/datasets/openlanguagedata/flores_plus",
    "n_languages": 200,
    "tasks": ["translation"],
    "parallel": true,
    "translation": "human",
    "base": "FLORES",
    "implemented": true,
    "group": "Translation"
  },
  {
    "name": "SIB-200",
    "author": "Academic",
    "author_url": null,
    "url": "https://huggingface.co/datasets/Davlan/sib200",
    "n_languages": 200,
    "tasks": ["classification"],
    "parallel": true,
    "translation": "human",
    "base": "FLORES",
    "implemented": true,
    "group": "Translation"
  },
  {
    "name": "CCAligned",
    "author": "Meta",
    "author_url": "https://ai.meta.com",
    "url": "https://huggingface.co/datasets/ahelk/ccaligned_multilingual",
    "n_languages": 137,
    "tasks": ["translation"],
    "parallel": false,
    "group": "Translation"
  },
  {
    "name": "OPUS Collection",
    "author": "Helsinki NLP",
    "author_url": null,
    "url": "https://opus.nlpl.eu",
    "n_languages": 747,
    "tasks": ["translation"],
    "parallel": false,
    "group": "Translation"
  },
  {
    "name": "Global MMLU",
    "author": "Cohere",
    "author_url": "https://cohere.com",
    "url": "https://huggingface.co/datasets/CohereForAI/Global-MMLU",
    "n_languages": 42,
    "languages": ["am", "ar", "bn", "cs", "de", "el", "en", "es", "fa", "fil", "fr", "ha", "he", "hi", "id", "ig", "it", "ja", "ko", "ky", "lt", "mg", "ms", "ne", "nl", "ny", "pl", "pt", "ro", "ru", "si", "sn", "so", "sr", "sv", "sw", "te", "tr", "uk", "vi", "yo", "zh"],
    "tasks": ["question_answering"],
    "parallel": true,
    "translation": "mixed",
    "base": "MMLU",
    "implemented": true,
    "group": "Multitask Language Understanding"
  },
  {
    "name": "MMMLU",
    "author": "OpenAI",
    "author_url": "https://openai.com",
    "url": "https://huggingface.co/datasets/openai/MMMLU",
    "n_languages": "14",
    "languages": ["ar", "bn", "de", "es", "fr", "hi", "id", "it", "ja", "ko", "pt", "sw", "yo", "zh"],
    "tasks": ["question_answering"],
    "parallel": true,
    "translation": "human",
    "base": "MMLU",
    "implemented": true,
    "group": "Multitask Language Understanding"
  },
  {
    "name": "AfriMMLU",
    "author": "Masakhane",
    "author_url": "https://www.masakhane.io",
    "url": "https://huggingface.co/datasets/masakhane/afrimmlu",
    "n_languages": "17",
    "languages": ["am", "en", "ee", "fr", "ha", "ig", "rw", "ln", "lg", "om", "sn", "st", "sw", "tw", "wo", "xh", "yo", "zu"],
    "tasks": ["question_answering"],
    "parallel": true,
    "translation": "human",
    "base": "MMLU",
    "implemented": true,
    "group": "Multitask Language Understanding"
  },
  {
    "name": "Okapi MMLU",
    "author": "Academic",
    "author_url": null,
    "url": "https://huggingface.co/datasets/jon-tow/okapi_mmlu",
    "n_languages": 26,
    "languages": ["ar", "bn", "ca", "da", "de", "es", "eu", "fr", "gu", "hi", "hr", "hu", "hy", "id", "it", "kn", "ml", "mr", "ne", "nl", "pt", "ro", "ru", "sk", "sr", "sv", "ta", "te", "uk", "vi", "zh"],
    "tasks": ["question_answering"],
    "parallel": true,
    "translation": "machine",
    "base": "MMLU",
    "implemented": false,
    "group": "Multitask Language Understanding"
  },
  {
    "name": "MMLU-X",
    "author": "OpenGPT-X",
    "author_url": "https://opengpt-x.de",
    "url": "https://huggingface.co/datasets/openGPT-X/mmlux",
    "n_languages": 20,
    "languages": ["bg", "cs", "da", "de", "el", "es", "et", "fi", "fr", "hu", "it", "lt", "lv", "nl", "pl", "pt", "ro", "sk", "sl", "sv"],
    "tasks": ["question_answering"],
    "parallel": true,
    "translation": "machine",
    "base": "MMLU",
    "implemented": false,
    "group": "Multitask Language Understanding"
  },
  {
    "name": "MMLU Auto-Translated",
    "author": null,
    "author_url": null,
    "url": null,
    "n_languages": null,
    "tasks": ["question_answering"],
    "parallel": true,
    "translation": "machine",
    "base": "MMLU",
    "implemented": true,
    "group": "Multitask Language Understanding"
  },
  {
    "name": "MGSM",
    "author": "Google",
    "author_url": "https://google.com",
    "url": "https://huggingface.co/datasets/juletxara/mgsm",
    "n_languages": 10,
    "tasks": ["math"],
    "parallel": true,
    "base": "MGSM",
    "implemented": true,
    "group": "Grade School Math"
  },
  {
    "name": "AfriMGSM",
    "author": "Masakhane",
    "author_url": "https://www.masakhane.io",
    "url": "https://huggingface.co/datasets/masakhane/afrimgsm",
    "n_languages": 18,
    "tasks": ["math"],
    "parallel": true,
    "translation": "human",
    "base": "MGSM",
    "implemented": true,
    "group": "Grade School Math"
  },
  {
    "name": "GSM8K-X",
    "author": "OpenGPT-X",
    "author_url": "https://opengpt-x.de",
    "url": "https://huggingface.co/datasets/openGPT-X/gsm8kx",
    "n_languages": 20,
    "tasks": ["math"],
    "parallel": true,
    "translation": "machine",
    "base": "MGSM",
    "implemented": true,
    "group": "Grade School Math"
  },
  {
    "name": "GSM Auto-Translated",
    "author": null,
    "author_url": null,
    "url": null,
    "n_languages": 52,
    "tasks": ["math"],
    "parallel": true,
    "translation": "machine",
    "base": "MGSM",
    "implemented": true,
    "group": "Grade School Math"
  },
  {
    "name": "Uhuru ARC Easy",
    "author": "Masakhane",
    "author_url": "https://www.masakhane.io",
    "url": "https://huggingface.co/datasets/masakhane/uhura-arc-easy",
    "n_languages": 6,
    "tasks": ["question_answering"],
    "parallel": true,
    "translation": "human",
    "base": "AI2 ARC",
    "implemented": true,
    "group": "ARC Question Answering"
  },
  {
    "name": "Okapi ARC Challenge",
    "author": "Academic",
    "author_url": null,
    "url": "https://huggingface.co/datasets/jon-tow/okapi_arc_challenge",
    "n_languages": 31,
    "tasks": ["question_answering"],
    "parallel": true,
    "translation": "machine",
    "base": "AI2 ARC",
    "implemented": false,
    "group": "ARC Question Answering"
  },
  {
    "name": "Arc-X",
    "author": "OpenGPT-X",
    "author_url": "https://opengpt-x.de",
    "url": "https://huggingface.co/datasets/openGPT-X/arcx",
    "n_languages": 20,
    "tasks": ["question_answering"],
    "parallel": true,
    "translation": "machine",
    "base": "AI2 ARC",
    "implemented": false,
    "group": "ARC Question Answering"
  },
  {
    "name": "ARC-Easy Auto-Translated",
    "author": null,
    "author_url": null,
    "url": null,
    "n_languages": null,
    "tasks": ["question_answering"],
    "parallel": true,
    "translation": "machine",
    "base": "AI2 ARC",
    "implemented": true,
    "group": "ARC Question Answering"
  },
  {
    "name": "Uhura TruthfulQA",
    "author": "Masakhane",
    "author_url": "https://www.masakhane.io",
    "url": "https://huggingface.co/datasets/masakhane/uhura-truthfulqa",
    "n_languages": 6,
    "tasks": ["question_answering"],
    "parallel": true,
    "translation": "human",
    "base": "TruthfulQA",
    "implemented": true,
    "group": "Truthfulness"
  },
  {
    "name": "Okapi TruthfulQA",
    "author": "Academic",
    "author_url": null,
    "url": "https://huggingface.co/datasets/jon-tow/okapi_truthfulqa/tree/main/data",
    "n_languages": 31,
    "tasks": ["question_answering"],
    "parallel": true,
    "translation": "machine",
    "base": "TruthfulQA",
    "implemented": false,
    "group": "Truthfulness"
  },
  {
    "name": "TruthfulQA-X",
    "author": "OpenGPT-X",
    "author_url": "https://opengpt-x.de",
    "url": "https://huggingface.co/datasets/openGPT-X/truthfulqax",
    "n_languages": 20,
    "tasks": ["question_answering"],
    "parallel": true,
    "translation": "machine",
    "base": "TruthfulQA",
    "implemented": false,
    "group": "Truthfulness"
  },
  {
    "name": "TruthfulQA Auto-Translated",
    "author": null,
    "author_url": null,
    "url": null,
    "n_languages": null,
    "tasks": ["question_answering"],
    "parallel": true,
    "translation": "machine",
    "base": "TruthfulQA",
    "implemented": true,
    "group": "Truthfulness"
  },
  {
    "name": "FLEURS",
    "author": "Meta",
    "author_url": "https://ai.meta.com",
    "url": "https://huggingface.co/datasets/google/fleurs",
    "n_languages": 102,
    "tasks": ["speech_recognition"],
    "parallel": true,
    "translation": "human",
    "base": "FLORES",
    "implemented": false,
    "group": "Speech Recognition"
  },
  {
    "name": "CommonVoice",
    "author": "Mozilla",
    "author_url": "https://blog.mozilla.ai",
    "url": "https://huggingface.co/datasets/mozilla-foundation/common_voice_17_0",
    "n_languages": 124,
    "tasks": ["speech_recognition"],
    "parallel": null,
    "translation": "human",
    "group": "Speech Recognition"
  },
  {
    "name": "WorldCuisines",
    "author": "Academic",
    "author_url": "https://worldcuisines.github.io",
    "url": "https://huggingface.co/datasets/worldcuisines/vqa",
    "n_languages": 30,
    "tasks": ["visual_question_answering"],
    "parallel": null,
    "group": "Visual Question Answering"
  },
  {
    "name": "CVQA",
    "author": "Academic",
    "author_url": null,
    "url": "https://huggingface.co/datasets/afaji/cvqa",
    "n_languages": 39,
    "tasks": ["visual_question_answering"],
    "parallel": null,
    "group": "Visual Question Answering"
  },
  {
    "name": "XNLI",
    "author": "Meta",
    "author_url": "https://ai.meta.com",
    "url": "https://huggingface.co/datasets/facebook/xnli",
    "n_languages": 14,
    "tasks": ["classification", "logic"],
    "parallel": true,
    "base": "MNLI",
    "group": "Natural Language Inference"
  },
  {
    "name": "AfriXNLI",
    "author": "Masakhane",
    "author_url": "https://www.masakhane.io",
    "url": "https://huggingface.co/datasets/masakhane/afrixnli",
    "n_languages": 18,
    "tasks": ["classification", "logic"],
    "parallel": true,
    "translation": "human",
    "base": "MNLI",
    "implemented": false,
    "group": "Natural Language Inference"
  },
  {
    "name": "XGLUE",
    "author": "Microsoft",
    "author_url": "https://microsoft.ai",
    "url": "https://huggingface.co/datasets/microsoft/xglue",
    "n_languages": 18,
    "tasks": ["pos"],
    "parallel": null,
    "base": "GLUE",
    "group": "General Language Understanding"
  },
  {
    "name": "IndicGLUE",
    "author": "AI4Bharat",
    "author_url": "https://models.ai4bharat.org",
    "url": "https://huggingface.co/datasets/ai4bharat/indic_glue",
    "n_languages": 11,
    "tasks": ["question_answering"],
    "parallel": null,
    "base": "GLUE",
    "group": "General Language Understanding"
  },
  {
    "name": "Okapi HellaSwag",
    "author": "Academic",
    "author_url": null,
    "url": "https://huggingface.co/datasets/jon-tow/okapi_hellaswag",
    "n_languages": 31,
    "tasks": ["question_answering"],
    "parallel": true,
    "translation": "machine",
    "base": "HellaSwag",
    "implemented": false,
    "group": "Adversarial Language Modelling"
  },
  {
    "name": "HellaSwag-X",
    "author": "OpenGPT-X",
    "author_url": "https://opengpt-x.de",
    "url": "https://huggingface.co/datasets/openGPT-X/hellaswagx",
    "n_languages": 20,
    "tasks": ["question_answering"],
    "parallel": true,
    "translation": "machine",
    "base": "HellaSwag",
    "implemented": false,
    "group": "Adversarial Language Modelling"
  },
  {
    "name": "WikiANN / PAN-X",
    "author": "Academic",
    "author_url": null,
    "url": "https://huggingface.co/datasets/unimelb-nlp/wikiann",
    "n_languages": 176,
    "tasks": ["ner"],
    "parallel": false,
    "group": "Named Entity Recognition"
  },
  {
    "name": "MasakhaNER",
    "author": "Masakhane",
    "author_url": "https://www.masakhane.io",
    "url": "https://huggingface.co/datasets/masakhane/masakhaner",
    "n_languages": 10,
    "tasks": ["ner"],
    "parallel": null,
    "group": "Named Entity Recognition"
  },
  {
    "name": "Tülu 3 SFT Mixture",
    "author": "AllenAI",
    "author_url": "https://allenai.org",
    "url": "https://huggingface.co/datasets/allenai/tulu-3-sft-mixture",
    "n_languages": 70,
    "tasks": ["instruction_following"],
    "parallel": false,
    "group": "Instruction Following"
  },
  {
    "name": "xP3",
    "author": "BigScience",
    "author_url": "https://bigscience.huggingface.co",
    "url": "https://huggingface.co/datasets/bigscience/xP3",
    "n_languages": 46,
    "tasks": ["instruction_following"],
    "parallel": false,
    "group": "Instruction Following"
  },
  {
    "name": "Aya",
    "author": "Cohere",
    "author_url": "https://cohere.com",
    "url": "https://huggingface.co/datasets/CohereForAI/aya_dataset",
    "n_languages": 65,
    "tasks": ["instruction_following"],
    "parallel": null,
    "group": "Instruction Following"
  },
  {
    "name": "SEA-IFEVAL",
    "author": "AI Singapore",
    "author_url": "https://aisingapore.org",
    "url": "https://huggingface.co/datasets/aisingapore/instruction_following-ifeval",
    "n_languages": 7,
    "tasks": ["instruction_following"],
    "parallel": true,
    "base": "IFEVAL",
    "group": "Instruction Following"
  },
  {
    "name": "Babel-670",
    "author": "Academic",
    "author_url": null,
    "url": "https://github.com/UBC-NLP/Babel-670-Language-Identification",
    "n_languages": 670,
    "tasks": ["language_identification"],
    "parallel": false,
    "group": "Other Tasks"
  },
  {
    "name": "CulturaX",
    "author": "Academic",
    "author_url": null,
    "url": "https://huggingface.co/datasets/uonlp/CulturaX",
    "n_languages": 167,
    "tasks": ["language_modeling"],
    "parallel": false,
    "group": "Other Tasks"
  },
  {
    "name": "XTREME",
    "author": "Google",
    "author_url": "https://google.com",
    "url": "https://huggingface.co/datasets/google/xtreme",
    "n_languages": 40,
    "tasks": ["translation", "classification", "question_answering", "ner"],
    "parallel": null,
    "group": "Other Tasks"
  },
  {
    "name": "XLSUM",
    "author": "Academic",
    "author_url": null,
    "url": "https://huggingface.co/datasets/csebuetnlp/xlsum",
    "n_languages": 45,
    "tasks": ["summarization"],
    "parallel": true,
    "group": "Other Tasks"
  },
  {
    "name": "MSVAMP",
    "author": "Microsoft",
    "author_url": "https://microsoft.ai",
    "url": "https://huggingface.co/datasets/Mathoctopus/MSVAMP",
    "n_languages": 10,
    "tasks": ["math"],
    "parallel": true,
    "group": "Other Tasks"
  },
  {
    "name": "Multilingual Sentiments",
    "author": "Academic",
    "author_url": null,
    "url": "https://huggingface.co/datasets/tyqiangz/multilingual-sentiments",
    "n_languages": 12,
    "tasks": ["sentiment_analysis"],
    "parallel": null,
    "group": "Other Tasks"
  },
  {
    "name": "Lanfrica",
    "author": "Lanfrica",
    "author_url": "https://lanfrica.com",
    "url": "https://lanfrica.com/records?language=yor&task=machine%20translation",
    "n_languages": 2200,
    "tasks": ["datasets"],
    "parallel": null,
    "group": "Dataset Collections"
  },
  {
    "name": "HuggingFace Languages",
    "author": "HuggingFace",
    "author_url": "https://huggingface.co",
    "url": "https://huggingface.co/languages",
    "n_languages": 4680,
    "tasks": ["datasets", "models"],
    "parallel": null,
    "group": "Dataset Collections"
  },
  {
    "name": "HuggingFace Multilingual Datasets",
    "author": "HuggingFace",
    "author_url": "https://huggingface.co",
    "url": "https://huggingface.co/datasets?other=multilinguality:multilingual",
    "n_languages": 2012,
    "tasks": ["datasets"],
    "parallel": false,
    "group": "Dataset Collections"
  }
]
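This registry is what the backend serves as the dataset table: evals/backend.py (below) reads it with pd.read_json("data/datasets.json"). A quick sketch of querying it, for example to list the implemented MMLU variants and how each was translated:

import pandas as pd

# Same call the backend uses; yields one row per dataset entry.
datasets = pd.read_json("data/datasets.json")
mmlu = datasets[(datasets["base"] == "MMLU") & datasets["implemented"].eq(True)]
print(mmlu[["name", "translation", "n_languages"]])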
evals/__init__.py CHANGED

@@ -1 +0,0 @@
-
evals/backend.py CHANGED

@@ -4,16 +4,18 @@ import os
 import numpy as np
 import pandas as pd
 import uvicorn
+
 from countries import make_country_table
+from datasets_.util import load
 from fastapi import FastAPI, Request
 from fastapi.middleware.cors import CORSMiddleware
 from fastapi.middleware.gzip import GZipMiddleware
 from fastapi.responses import JSONResponse
 from fastapi.staticfiles import StaticFiles
 
-scores = …
-languages = …
-models = …
+scores = load("results")
+languages = load("languages")
+models = load("models")
 
 
 def mean(lst):
@@ -26,7 +28,7 @@ task_metrics = [
     "classification_accuracy",
     "mmlu_accuracy",
     "arc_accuracy",
-    …
+    "truthfulqa_accuracy",
     "mgsm_accuracy",
 ]
 
@@ -39,28 +41,58 @@ def compute_normalized_average(df, metrics):
         col_min = normalized_df[col].min()
         col_max = normalized_df[col].max()
         if col_max > col_min:  # Avoid division by zero
-            normalized_df[col] = (normalized_df[col] - col_min) / (col_max - col_min)
+            normalized_df[col] = (normalized_df[col] - col_min) / (
+                col_max - col_min
+            )
         else:
             normalized_df[col] = 0  # If all values are the same, set to 0
     return normalized_df.mean(axis=1, skipna=False)
 
 
-def make_model_table(…):
-    …
+def make_model_table(scores_df, models):
+    scores_df = scores_df.copy()
+    # Create a combined task_metric for origin
+    scores_df["task_metric_origin"] = (
+        scores_df["task"] + "_" + scores_df["metric"] + "_" + scores_df["origin"]
+    )
+
+    # Pivot to get scores for each origin-specific metric
+    scores_pivot = scores_df.pivot_table(
+        index="model",
+        columns="task_metric_origin",
+        values="score",
+        aggfunc="mean",
+    )
+
+    # Create the regular task_metric for the main average calculation
+    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
+    main_pivot = scores_df.pivot_table(
+        index="model", columns="task_metric", values="score", aggfunc="mean"
+    )
+
+    # Merge the two pivots
+    df = pd.merge(main_pivot, scores_pivot, on="model", how="outer")
+
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
+
     df["average"] = compute_normalized_average(df, task_metrics)
+
+    # Add flag if any machine-origin data was used
+    machine_presence = scores_df[scores_df["origin"] == "machine"].groupby(["model", "task_metric"]).size()
+    for metric in task_metrics:
+        df[f"{metric}_contains_machine"] = df.index.map(lambda m: (m, metric) in machine_presence.index)
     df = df.sort_values(by="average", ascending=False).reset_index()
     df = pd.merge(df, models, left_on="model", right_on="id", how="left")
     df["rank"] = df.index + 1
+
+    # Dynamically find all metric columns to include
+    final_cols = df.columns
+    metric_cols = [m for m in final_cols if any(tm in m for tm in task_metrics)]
+
+    df["creation_date"] = df["creation_date"].apply(lambda x: x.isoformat() if x else None)
+
     df = df[
         [
             "rank",
@@ -74,27 +106,41 @@ def make_model_table(df, models):
             "license",
             "cost",
             "average",
-            *…
+            *sorted(list(set(metric_cols))),
         ]
     ]
     return df
 
 
-def make_language_table(…):
-    …
+def make_language_table(scores_df, languages):
+    scores_df = scores_df.copy()
+    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
+
+    # Pivot scores
+    score_pivot = scores_df.pivot_table(
+        index="bcp_47", columns="task_metric", values="score", aggfunc="mean"
+    )
+
+    # Pivot origins (first origin since each task+lang combo has only one)
+    origin_pivot = scores_df.pivot_table(
+        index="bcp_47", columns="task_metric", values="origin", aggfunc="first"
+    )
+    origin_pivot = origin_pivot.add_suffix("_origin")
+
+    df = pd.merge(score_pivot, origin_pivot, on="bcp_47", how="outer")
+
     for metric in task_metrics:
         if metric not in df.columns:
             df[metric] = np.nan
+
     df["average"] = compute_normalized_average(df, task_metrics)
     df = pd.merge(languages, df, on="bcp_47", how="outer")
     df = df.sort_values(by="speakers", ascending=False)
+
+    # Dynamically find all metric columns to include
+    final_cols = df.columns
+    metric_cols = [m for m in final_cols if any(tm in m for tm in task_metrics)]
+
     df = df[
         [
             "bcp_47",
@@ -104,7 +150,7 @@ def make_language_table(df, languages):
             "family",
             "average",
             "in_benchmark",
-            *…
+            *sorted(list(set(metric_cols))),
         ]
     ]
     return df
@@ -125,35 +171,39 @@ async def data(request: Request):
     body = await request.body()
     data = json.loads(body)
     selected_languages = data.get("selectedLanguages", {})
-    …
-    # …
-    …
+
+    # Identify which metrics have machine translations available
+    machine_translated_metrics = {
+        f"{row['task']}_{row['metric']}"
+        for _, row in scores.iterrows()
+        if row["origin"] == "machine"
+    }
+
+    # Filter by selected languages if provided
+    df = scores[scores["bcp_47"].isin(lang["bcp_47"] for lang in selected_languages)] if selected_languages else scores
+
     if len(df) == 0:
         model_table = pd.DataFrame()
         countries = pd.DataFrame()
     else:
         model_table = make_model_table(df, models)
         countries = make_country_table(make_language_table(df, languages))
-    …
+
+    language_table = make_language_table(scores, languages)
+    datasets_df = pd.read_json("data/datasets.json")
+
+    return JSONResponse(content={
         "model_table": serialize(model_table),
         "language_table": serialize(language_table),
         "dataset_table": serialize(datasets_df),
         "countries": serialize(countries),
-        …
-    …
+        "machine_translated_metrics": list(machine_translated_metrics),
+    })
 
 
 # Only serve static files if build directory exists
 if os.path.exists("frontend/build"):
     app.mount("/", StaticFiles(directory="frontend/build", html=True), name="frontend")
-else:
-    print("🧪 Development mode: frontend/build directory not found")
-    print("🌐 Frontend should be running on http://localhost:3000")
-    print("📡 API available at http://localhost:8000/api/data")
 
 if __name__ == "__main__":
     uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 8000)))
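The key idea in compute_normalized_average above: each task metric is min-max-normalized across models before averaging, so tasks with different score ranges weigh equally, and skipna=False means a model missing any metric gets no average at all. A self-contained sketch of the same logic (the toy frame and its values are illustrative, not real results):

import pandas as pd

def normalized_average(df: pd.DataFrame, metrics: list[str]) -> pd.Series:
    """Min-max normalize each metric column across rows, then average per row.

    skipna=False mirrors the backend: a model missing any metric gets NaN,
    so partial task coverage cannot inflate its average.
    """
    norm = df[metrics].copy()
    for col in metrics:
        lo, hi = norm[col].min(), norm[col].max()
        norm[col] = (norm[col] - lo) / (hi - lo) if hi > lo else 0
    return norm.mean(axis=1, skipna=False)

scores = pd.DataFrame(
    {"mmlu_accuracy": [0.9, 0.5], "mgsm_accuracy": [0.2, 0.8]},
    index=["model-a", "model-b"],
)
print(normalized_average(scores, ["mmlu_accuracy", "mgsm_accuracy"]))  # both 0.5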
evals/countries.py CHANGED

@@ -15,6 +15,7 @@ def population(bcp_47):
     }
     return items
 
+
 @cache
 def make_country_table(language_table):
     countries = defaultdict(list)
@@ -30,10 +31,15 @@ def make_country_table(language_table):
         )
     for country, languages in countries.items():
         speaker_pop = sum(entry["population"] for entry in languages)
-        …
-        …
-        …
-        …
+
+        if speaker_pop < 1000:  # Grey out low-population countries
+            score = None  # This will make them appear grey on the map
+        else:
+            score = (
+                sum(entry["score"] * entry["population"] for entry in languages)
+                / speaker_pop
+            )
+
         countries[country] = {
             "score": score,
             "languages": languages,
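So a country's score is the population-weighted mean of its languages' scores, and countries with fewer than 1,000 tracked speakers are greyed out (score None). A worked example of the weighting, with made-up numbers:

# Hypothetical country with two language communities.
languages = [
    {"score": 0.8, "population": 9_000_000},  # majority language
    {"score": 0.4, "population": 1_000_000},  # minority language
]
speaker_pop = sum(e["population"] for e in languages)
score = sum(e["score"] * e["population"] for e in languages) / speaker_pop
print(score)  # (0.8*9e6 + 0.4*1e6) / 1e7 = 0.76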
evals/datasets_/arc.py CHANGED

@@ -1,11 +1,10 @@
 import random
-from collections import Counter, defaultdict
 
-from langcodes import …
+from langcodes import standardize_tag
 from rich import print
-from models import translate_google, …
+from models import translate_google, get_google_supported_languages
 from tqdm import tqdm
-from datasets import …
+from datasets import load_dataset, Dataset
 import asyncio
 from tqdm.asyncio import tqdm_asyncio
 import os
@@ -14,27 +13,33 @@ from datasets_.util import _get_dataset_config_names, _load_dataset
 
 slug_uhura_arc_easy = "masakhane/uhura-arc-easy"
 tags_uhura_arc_easy = {
-    standardize_tag(a.split("_")[0], macro=True): a …
+    standardize_tag(a.split("_")[0], macro=True): a
+    for a in _get_dataset_config_names(slug_uhura_arc_easy)
     if not a.endswith("unmatched")
 }
 
 
 random.seed(42)
-id_sets_train = [ …
+id_sets_train = [
+    set(_load_dataset(slug_uhura_arc_easy, tag, split="train")["id"])
+    for tag in tags_uhura_arc_easy.values()
+]
 common_ids_train = list(sorted(set.intersection(*id_sets_train)))
 random.shuffle(common_ids_train)
-id_sets_test = [ …
+id_sets_test = [
+    set(_load_dataset(slug_uhura_arc_easy, tag, split="test")["id"])
+    for tag in tags_uhura_arc_easy.values()
+]
 common_ids_test = list(sorted(set.intersection(*id_sets_test)))
 random.shuffle(common_ids_test)
 
 slug_uhura_arc_easy_translated = "fair-forward/arc-easy-autotranslated"
 tags_uhura_arc_easy_translated = {
-    standardize_tag(a.split("_")[0], macro=True): a …
+    standardize_tag(a.split("_")[0], macro=True): a
+    for a in _get_dataset_config_names(slug_uhura_arc_easy_translated)
 }
 
 
-
-
 def add_choices(row):
     row["choices"] = row["choices"]["text"]
     return row
@@ -45,37 +50,40 @@ def load_uhura_arc_easy(language_bcp_47, nr):
        ds = _load_dataset(slug_uhura_arc_easy, tags_uhura_arc_easy[language_bcp_47])
        ds = ds.map(add_choices)
        ds = ds.rename_column("answerKey", "answer")
-        train_ids = common_ids_train[nr:nr+3]
-        examples = ds["train"].filter(lambda x: x["id"] in train_ids)
        task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
-        return "masakhane/uhura-arc-easy", …
+        return "masakhane/uhura-arc-easy", task, "human"
    if language_bcp_47 in tags_uhura_arc_easy_translated.keys():
-        ds = _load_dataset(…
+        ds = _load_dataset(
+            slug_uhura_arc_easy_translated,
+            tags_uhura_arc_easy_translated[language_bcp_47],
+        )
        ds = ds.rename_column("answerKey", "answer")
-        train_ids = common_ids_train[nr:nr+3]
-        examples = ds["train"].filter(lambda x: x["id"] in train_ids)
-        # raise Exception(language_bcp_47)
        task = ds["test"].filter(lambda x: x["id"] == common_ids_test[nr])[0]
-        return "fair-forward/arc-easy-autotranslated", …
+        return "fair-forward/arc-easy-autotranslated", task, "machine"
    else:
        return None, None, None
 
+
 def translate_arc(languages):
    human_translated = tags_uhura_arc_easy.keys()
    untranslated = [
        lang
-        for lang in languages["bcp_47"].values
-        if lang not in human_translated and lang in …
+        for lang in languages["bcp_47"].values
+        if lang not in human_translated and lang in get_google_supported_languages()
    ]
    n_samples = 10
-    train_ids = common_ids_train[:n_samples+3]
-    en_train = _load_dataset(…
+    train_ids = common_ids_train[: n_samples + 3]
+    en_train = _load_dataset(
+        slug_uhura_arc_easy, subset=tags_uhura_arc_easy["en"], split="train"
+    )
    en_train = en_train.filter(lambda x: x["id"] in train_ids)
    test_ids = common_ids_test[:n_samples]
-    en_test = _load_dataset(…
+    en_test = _load_dataset(
+        slug_uhura_arc_easy, subset=tags_uhura_arc_easy["en"], split="test"
+    )
    en_test = en_test.filter(lambda x: x["id"] in test_ids)
    data = {"train": en_train, "test": en_test}
+
    slug = "fair-forward/arc-easy-autotranslated"
    for lang in tqdm(untranslated):
        # check if already exists on hub
@@ -84,16 +92,22 @@ def translate_arc(languages):
        except (ValueError, Exception):
            print(f"Translating {lang}...")
            for split, data_en in data.items():
-                questions_tr = [ …
+                questions_tr = [
+                    translate_google(q, "en", lang) for q in data_en["question"]
+                ]
                questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
                choices_texts_concatenated = []
                for choice in data_en["choices"]:
                    for option in choice["text"]:
                        choices_texts_concatenated.append(option)
-                choices_tr = [ …
+                choices_tr = [
+                    translate_google(c, "en", lang) for c in choices_texts_concatenated
+                ]
                choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))
                # group into chunks of 4
-                choices_tr = [ …
+                choices_tr = [
+                    choices_tr[i : i + 4] for i in range(0, len(choices_tr), 4)
+                ]
 
                ds_lang = Dataset.from_dict(
                    {
@@ -110,5 +124,8 @@ def translate_arc(languages):
                    token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
                )
                ds_lang.to_json(
-                    f"data/translations/arc/{lang}_{split}.json", …
+                    f"data/translations/arc/{lang}_{split}.json",
+                    lines=False,
+                    force_ascii=False,
+                    indent=2,
                )
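A small detail in translate_arc worth spelling out: the four answer options of every question are flattened into one list so each string is translated individually, then regrouped into per-question chunks of four with the slicing idiom from the diff:

# Flattened translations for 2 questions x 4 options each (toy data).
choices_tr = ["a1", "a2", "a3", "a4", "b1", "b2", "b3", "b4"]

# Same slicing idiom as in translate_arc: group back into per-question lists.
grouped = [choices_tr[i : i + 4] for i in range(0, len(choices_tr), 4)]
print(grouped)  # [['a1', 'a2', 'a3', 'a4'], ['b1', 'b2', 'b3', 'b4']]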
evals/datasets_/fleurs.py CHANGED

@@ -11,6 +11,7 @@ fleurs["bcp_47"] = fleurs["fleurs_tag"].apply(
     lambda x: standardize_tag(x.rsplit("_")[0], macro=True)
 )
 
+
 def download_file(url, path):
     response = requests.get(url)
     with open(path, "wb") as f:
@@ -34,4 +35,4 @@ def download_fleurs(transcription_langs_eval):
     if not tsv_path.exists():
         print(f"Downloading {tsv_url} to {tsv_path}")
         tsv_path.parent.mkdir(parents=True, exist_ok=True)
-        download_file(tsv_url, tsv_path)
+        download_file(tsv_url, tsv_path)
evals/datasets_/mgsm.py
CHANGED

@@ -1,10 +1,12 @@
import asyncio
import os

from datasets import Dataset, load_dataset
from datasets_.util import _get_dataset_config_names, _load_dataset
from langcodes import standardize_tag
from models import
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio

@@ -37,39 +39,58 @@ def parse_number(i):
    return None


def load_mgsm(language_bcp_47, nr):
    if language_bcp_47 in tags_mgsm.keys():
        return slug_mgsm,
    elif language_bcp_47 in tags_afrimgsm.keys():
        )
        return
    elif language_bcp_47 in tags_gsm_autotranslated.keys():
            slug_gsm_autotranslated,
        )
        return slug_gsm_autotranslated,
    elif language_bcp_47 in tags_gsm8kx.keys():
        row = _load_dataset(
            slug_gsm8kx,
            subset=tags_gsm8kx[language_bcp_47],
            split="test",
            trust_remote_code=True,
        )[nr]
        row["answer_number"] = row["answer"].split("####")[1].strip()
        return slug_gsm8kx, row
    else:
        return None, None


def translate_mgsm(languages):
    human_translated = [*tags_mgsm.keys(), *tags_afrimgsm.keys()]
    untranslated = [
        lang
        for lang in languages["bcp_47"].values
        if lang not in human_translated and lang in
    ]
    en = _load_dataset(slug_mgsm, subset=tags_mgsm["en"], split="test")
    slug = "fair-forward/gsm-autotranslated"

@@ -96,5 +117,8 @@ def translate_mgsm(languages):
        token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
    )
    ds_lang.to_json(
        f"data/translations/mgsm/{lang}.json",
    )

import asyncio
import os
import random

from datasets import Dataset, load_dataset
from datasets_.util import _get_dataset_config_names, _load_dataset, cache
from langcodes import Language, standardize_tag
from models import get_google_supported_languages, translate_google
from rich import print
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio

    return None


@cache
def _get_mgsm_item(dataset_slug, subset_tag, nr, trust_remote_code=False):
    """Cache individual MGSM items efficiently"""
    try:
        ds = _load_dataset(
            dataset_slug,
            subset=subset_tag,
            split="test",
            trust_remote_code=trust_remote_code,
        )
        if nr >= len(ds):
            return None

        row = ds[nr]

        # Post-process based on dataset type
        if dataset_slug == slug_gsm8kx:
            row["answer_number"] = row["answer"].split("####")[1].strip()

        return row
    except Exception:
        # Dataset doesn't exist or doesn't have test split
        return None


def load_mgsm(language_bcp_47, nr):
    if language_bcp_47 in tags_mgsm.keys():
        item = _get_mgsm_item(slug_mgsm, tags_mgsm[language_bcp_47], nr)
        return slug_mgsm, item, "human" if item else (None, None, None)
    elif language_bcp_47 in tags_afrimgsm.keys():
        item = _get_mgsm_item(slug_afrimgsm, tags_afrimgsm[language_bcp_47], nr)
        return slug_afrimgsm, item, "human" if item else (None, None, None)
    elif language_bcp_47 in tags_gsm8kx.keys():
        item = _get_mgsm_item(
            slug_gsm8kx, tags_gsm8kx[language_bcp_47], nr, trust_remote_code=True
        )
        return slug_gsm8kx, item, "machine" if item else (None, None, None)
    elif language_bcp_47 in tags_gsm_autotranslated.keys():
        item = _get_mgsm_item(
            slug_gsm_autotranslated, tags_gsm_autotranslated[language_bcp_47], nr
        )
        return slug_gsm_autotranslated, item, "machine" if item else (None, None, None)
    else:
        return None, None, None


def translate_mgsm(languages):
    human_translated = [*tags_mgsm.keys(), *tags_afrimgsm.keys()]
    untranslated = [
        lang
        for lang in languages["bcp_47"].values
        if lang not in human_translated and lang in get_google_supported_languages()
    ]
    en = _load_dataset(slug_mgsm, subset=tags_mgsm["en"], split="test")
    slug = "fair-forward/gsm-autotranslated"

        token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
    )
    ds_lang.to_json(
        f"data/translations/mgsm/{lang}.json",
        lines=False,
        force_ascii=False,
        indent=2,
    )
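Aside (not part of the diff): the item-level @cache above is the joblib disk-memoisation pattern from datasets_/util.py. A minimal standalone sketch of the same pattern, with a cheap stand-in for the dataset lookup:

from joblib import Memory

cache = Memory(location=".cache", verbose=0).cache

@cache
def get_item(dataset_slug: str, subset: str, nr: int):
    # stand-in for _load_dataset(...)[nr]; any expensive, deterministic lookup works
    rows = [f"{dataset_slug}/{subset}#{i}" for i in range(10)]
    return rows[nr] if nr < len(rows) else None

print(get_item("demo", "en", 3))  # computed once, served from .cache afterwards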
evals/datasets_/mmlu.py
CHANGED

@@ -4,9 +4,9 @@ import random
from collections import Counter, defaultdict

from datasets import Dataset, load_dataset
from datasets_.util import _get_dataset_config_names, _load_dataset
from langcodes import Language, standardize_tag
from models import
from rich import print
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio

@@ -111,6 +111,7 @@ def print_datasets_analysis():
# MMLUX is translated using DeepL
# Therefore, the priority is: AfriMMLU, Global-MMLU, MMLUX, Okapi-MMLU

# print_datasets_analysis()

@@ -143,32 +144,61 @@ tags_mmlux = set(
    a.rsplit("_", 1)[1].split("-")[0].lower()
    for a in _get_dataset_config_names("Eurolingua/mmlux", trust_remote_code=True)
)
tags_mmlu_autotranslated =

categories = sorted(

    category = categories[nr % len(categories)]
    if language_bcp_47 in tags_afrimmlu.keys():
        return "masakhane/afrimmlu", examples, task
    elif language_bcp_47 in tags_global_mmlu.keys():
    elif language_bcp_47 in tags_mmlu_autotranslated:
        return
    else:
        return None, None, None

@@ -177,10 +207,10 @@ def translate_mmlu(languages):
    human_translated = [*tags_afrimmlu.keys(), *tags_global_mmlu.keys()]
    untranslated = [
        lang
        for lang in languages["bcp_47"].values
        if lang not in human_translated and lang in
    ]
    n_samples =

    slug = "fair-forward/mmlu-autotranslated"
    for lang in tqdm(untranslated):

@@ -196,8 +226,10 @@ def translate_mmlu(languages):
            if split == "dev":
                samples.extend(ds.filter(lambda x: x["subject"] == category))
            else:
                samples.append(task)
            questions_tr = [
                translate_google(s["question"], "en", lang) for s in samples

from collections import Counter, defaultdict

from datasets import Dataset, load_dataset
from datasets_.util import _get_dataset_config_names, _load_dataset, cache
from langcodes import Language, standardize_tag
from models import get_google_supported_languages, translate_google
from rich import print
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio

# MMLUX is translated using DeepL
# Therefore, the priority is: AfriMMLU, Global-MMLU, MMLUX, Okapi-MMLU


# print_datasets_analysis()

    a.rsplit("_", 1)[1].split("-")[0].lower()
    for a in _get_dataset_config_names("Eurolingua/mmlux", trust_remote_code=True)
)
tags_mmlu_autotranslated = {
    standardize_tag(a, macro=True): a
    for a in _get_dataset_config_names("fair-forward/mmlu-autotranslated")
}

categories = sorted(
    list(set(_load_dataset("masakhane/afrimmlu", "eng")["dev"]["subject"]))
)


@cache
def _get_processed_mmlu_dataset(dataset_name, subset_tag):
    """Cache processed datasets to avoid reprocessing"""
    ds = _load_dataset(dataset_name, subset_tag)
    if dataset_name == "masakhane/afrimmlu":
        ds = ds.map(parse_choices)
    elif dataset_name == "CohereForAI/Global-MMLU":
        ds = ds.map(add_choices)
    return ds


@cache
def _get_mmlu_item(dataset_name, subset_tag, category, nr):
    """Cache individual MMLU items efficiently"""
    ds = _get_processed_mmlu_dataset(dataset_name, subset_tag)
    if dataset_name in ["masakhane/afrimmlu", "CohereForAI/Global-MMLU"]:
        filtered = ds["test"].filter(lambda x: x["subject"] == category)
        return filtered[nr] if nr < len(filtered) else None
    else:  # fair-forward/mmlu-autotranslated
        filtered = ds["test"].filter(lambda x: x["subject"] == category)
        return filtered[nr] if nr < len(filtered) else None


async def load_mmlu(language_bcp_47, nr):
    category = categories[nr % len(categories)]
    if language_bcp_47 in tags_afrimmlu.keys():
        task = _get_mmlu_item(
            "masakhane/afrimmlu", tags_afrimmlu[language_bcp_47], category, nr
        )
        return "masakhane/afrimmlu", task, "human" if task else (None, None, None)
    elif language_bcp_47 in tags_global_mmlu.keys():
        task = _get_mmlu_item(
            "CohereForAI/Global-MMLU", tags_global_mmlu[language_bcp_47], category, nr
        )
        return "CohereForAI/Global-MMLU", task, "human" if task else (None, None, None)
    # TODO: add in Okapi, MMLUX @Jonas
    elif language_bcp_47 in tags_mmlu_autotranslated:
        task = _get_mmlu_item(
            "fair-forward/mmlu-autotranslated", language_bcp_47, category, nr
        )
        return (
            "fair-forward/mmlu-autotranslated",
            task,
            "machine" if task else (None, None, None),
        )
    else:
        return None, None, None

    human_translated = [*tags_afrimmlu.keys(), *tags_global_mmlu.keys()]
    untranslated = [
        lang
        for lang in languages["bcp_47"].values
        if lang not in human_translated and lang in get_google_supported_languages()
    ]
    n_samples = 20

    slug = "fair-forward/mmlu-autotranslated"
    for lang in tqdm(untranslated):

            if split == "dev":
                samples.extend(ds.filter(lambda x: x["subject"] == category))
            else:
                # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
                filtered = ds.filter(lambda x: x["subject"] == category)
                for i in range(min(n_samples, len(filtered))):
                    task = filtered[i]
                    samples.append(task)
            questions_tr = [
                translate_google(s["question"], "en", lang) for s in samples
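Aside (not part of the diff): how `categories[nr % len(categories)]` spreads the question index across MMLU subjects. With five categories, items 0..9 cycle through each subject twice, so every language gets quizzed on the same balanced mix. A tiny runnable illustration with made-up category names:

categories = ["biology", "chemistry", "economics", "geography", "physics"]
for nr in range(10):
    print(nr, categories[nr % len(categories)])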
evals/datasets_/truthfulqa.py
CHANGED

@@ -9,16 +9,25 @@ from tqdm.asyncio import tqdm_asyncio
import os

from datasets import Dataset, load_dataset
from models import translate_google,

from datasets_.util import _get_dataset_config_names, _load_dataset

slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa"
tags_uhura_truthfulqa = {
    standardize_tag(a.split("_")[0], macro=True): a
    if a.endswith("multiple_choice")
}


def add_choices(row):
    row["choices"] = row["mc1_targets"]["choices"]

@@ -26,26 +35,43 @@ def add_choices(row):
    return row


def load_truthfulqa(language_bcp_47, nr):
    if language_bcp_47 in tags_uhura_truthfulqa.keys():
        ds = _load_dataset(
        ds = ds.map(add_choices)
        examples = ds["train"]
        task = ds["test"][nr]
    else:
        return None, None, None


def translate_truthfulqa(languages):
    human_translated = [*tags_uhura_truthfulqa.keys()]
    untranslated = [
        lang
        for lang in languages["bcp_47"].values[:
        if lang not in human_translated and lang in
    ]
    n_samples =

    slug = "fair-forward/truthfulqa-autotranslated"
    for lang in tqdm(untranslated):

@@ -55,37 +81,47 @@ def translate_truthfulqa(languages):
    except (ValueError, Exception):
        print(f"Translating {lang}...")
        for split in ["train", "test"]:
            ds = _load_dataset(
            samples = []
            if split == "train":
                samples.extend(ds)
            else:
                task = ds[i]
                samples.append(task)
            questions_tr = [
                translate_google(s["question"], "en", lang) for s in samples
            ]
            questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))
            for s in samples:
                choices_tr

            ds_lang = Dataset.from_dict(
                {
                    "subject": [s["subject"] for s in samples],
                    "question": questions_tr,
                    "choices":
                    "
                }
            )
            ds_lang.push_to_hub(

@@ -95,7 +131,7 @@ def translate_truthfulqa(languages):
                token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
            )
            ds_lang.to_json(
                f"data/translations/
                lines=False,
                force_ascii=False,
                indent=2,

import os

from datasets import Dataset, load_dataset
from models import translate_google, get_google_supported_languages

from datasets_.util import _get_dataset_config_names, _load_dataset

slug_uhura_truthfulqa = "masakhane/uhura-truthfulqa"
slug_truthfulqa_autotranslated = "fair-forward/truthfulqa-autotranslated"

tags_uhura_truthfulqa = {
    standardize_tag(a.split("_")[0], macro=True): a
    for a in _get_dataset_config_names(slug_uhura_truthfulqa)
    if a.endswith("multiple_choice")
}

tags_truthfulqa_autotranslated = {
    standardize_tag(a, macro=True): a
    for a in _get_dataset_config_names(slug_truthfulqa_autotranslated)
}
tags_truthfulqa_autotranslated = {}


def add_choices(row):
    row["choices"] = row["mc1_targets"]["choices"]

    return row


async def load_truthfulqa(language_bcp_47, nr):
    if language_bcp_47 in tags_uhura_truthfulqa.keys():
        ds = _load_dataset(
            slug_uhura_truthfulqa, tags_uhura_truthfulqa[language_bcp_47]
        )
        ds = ds.map(add_choices)
        task = ds["test"][nr]
        # Ensure there is a correct answer before returning the task
        if 1 not in task["labels"]:
            return None, None, None
        return "masakhane/uhura-truthfulqa", task, "human"
    # TODO check quality/completeness of autotranslated dataset
    # elif language_bcp_47 in tags_truthfulqa_autotranslated.keys():
    #     # Load from auto-translated dataset (same samples as translation)
    #     ds = _load_dataset(slug_truthfulqa_autotranslated, language_bcp_47)
    #     test_split = ds["test"] if "test" in ds else ds
    #     task = test_split[nr]
    #     # Ensure there is a correct answer before returning the task
    #     if 1 not in task.get("labels", []):
    #         return None, None, None
    #     return slug_truthfulqa_autotranslated, task, "machine"
    # TODO: add Okapi, TruthfulQA-X @Jonas
    else:
        return None, None, None


def translate_truthfulqa(languages):
    human_translated = [*tags_uhura_truthfulqa.keys()]
    untranslated = [
        lang
        for lang in languages["bcp_47"].values[:150]
        if lang not in human_translated and lang in get_google_supported_languages()
    ]
    n_samples = 20

    # Set fixed seed for consistent sample selection across all languages
    random.seed(42)

    slug = "fair-forward/truthfulqa-autotranslated"
    for lang in tqdm(untranslated):

    except (ValueError, Exception):
        print(f"Translating {lang}...")
        for split in ["train", "test"]:
            ds = _load_dataset(
                slug_uhura_truthfulqa, tags_uhura_truthfulqa["en"], split=split
            )
            samples = []
            if split == "train":
                samples.extend(ds)
            else:
                # Use the same 20 samples that the evaluation pipeline uses (indices 0-19)
                for i in range(min(n_samples, len(ds))):
                    task = ds[i]
                    samples.append(task)

            # Translate questions
            questions_tr = [
                translate_google(s["question"], "en", lang) for s in samples
            ]
            questions_tr = asyncio.run(tqdm_asyncio.gather(*questions_tr))

            # Translate choices for each sample
            all_choices_tr = []
            all_labels = []

            for s in samples:
                # Get choices from mc1_targets
                choices = s["mc1_targets"]["choices"]
                labels = s["mc1_targets"]["labels"]

                # Translate choices
                choices_tr = [
                    translate_google(choice, "en", lang) for choice in choices
                ]
                choices_tr = asyncio.run(tqdm_asyncio.gather(*choices_tr))

                all_choices_tr.append(choices_tr)
                all_labels.append(labels)

            ds_lang = Dataset.from_dict(
                {
                    "question": questions_tr,
                    "choices": all_choices_tr,
                    "labels": all_labels,
                }
            )
            ds_lang.push_to_hub(

                token=os.getenv("HUGGINGFACE_ACCESS_TOKEN"),
            )
            ds_lang.to_json(
                f"data/translations/truthfulqa/{lang}_{split}.json",
                lines=False,
                force_ascii=False,
                indent=2,
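Aside (not part of the diff): the mc1_targets guard used above in miniature. In TruthfulQA's multiple-choice format, labels mark the correct option with 1; an item whose labels contain no 1 has no answer key and must be skipped. The task dict below is made up:

task = {"question": "example", "choices": ["A", "B"], "labels": [0, 0]}
if 1 not in task["labels"]:
    print("skip: no correct answer recorded for this item")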
evals/datasets_/util.py
CHANGED

@@ -1,7 +1,14 @@
from joblib.memory import Memory

cache = Memory(location=".cache", verbose=0).cache


@cache

@@ -12,3 +19,27 @@ def _get_dataset_config_names(dataset, **kwargs):
@cache
def _load_dataset(dataset, subset, **kwargs):
    return load_dataset(dataset, subset, **kwargs)

import os
from pathlib import Path

import pandas as pd
from datasets import Dataset, get_dataset_config_names, load_dataset
from datasets.exceptions import DatasetNotFoundError
from huggingface_hub.errors import RepositoryNotFoundError
from joblib.memory import Memory

cache = Memory(location=".cache", verbose=0).cache
TOKEN = os.getenv("HUGGINGFACE_ACCESS_TOKEN")


@cache

@cache
def _load_dataset(dataset, subset, **kwargs):
    return load_dataset(dataset, subset, **kwargs)


# Cache individual dataset items to avoid reloading entire datasets
@cache
def _get_dataset_item(dataset, subset, split, index, **kwargs):
    """Load a single item from a dataset efficiently"""
    ds = load_dataset(dataset, subset, split=split, **kwargs)
    return ds[index] if index < len(ds) else None


def load(fname: str):
    try:
        ds = load_dataset(f"fair-forward/evals-for-every-language-{fname}", token=TOKEN)
        return ds["train"].to_pandas()
    except (DatasetNotFoundError, RepositoryNotFoundError, KeyError):
        return pd.DataFrame()


def save(df: pd.DataFrame, fname: str):
    df = df.drop(columns=["__index_level_0__"], errors="ignore")
    ds = Dataset.from_pandas(df)
    ds.push_to_hub(f"fair-forward/evals-for-every-language-{fname}", token=TOKEN)
    Path("results").mkdir(exist_ok=True)
    df.to_json(f"results/{fname}.json", orient="records", force_ascii=False, indent=2)
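Aside (not part of the diff): a sketch of round-tripping results through the Hub with the new load/save helpers. The dataset slugs are the real ones used above; the DataFrame content is made up, and save() needs a valid HUGGINGFACE_ACCESS_TOKEN to actually push.

import pandas as pd
from datasets_.util import load, save

results = load("results")  # empty DataFrame on first run
new = pd.DataFrame([{"model": "m", "bcp_47": "sw", "score": 0.5}])
save(pd.concat([results, new]), "results")  # pushes to the Hub and writes results/results.json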
evals/download_data.py
CHANGED

@@ -8,6 +8,7 @@ from pathlib import Path
import sys
import huggingface_hub
from datasets import load_dataset, DatasetDict
# Import fleurs DataFrame directly from its source module
from datasets_.fleurs import fleurs

@@ -24,22 +25,25 @@ DATA_DIR = project_root / "data"
FLEURS_BASE_URL = "https://huggingface.co/datasets/google/fleurs/resolve/main/data"
FLEURS_TARGET_DIR = DATA_DIR / "fleurs"

GLOTTOLOG_URL = "https://cdstar.shh.mpg.de/bitstreams/EAEA0-B44E-8CEC-EA65-0/glottolog_languoid.zip"
GLOTTOLOG_TARGET_DIR = DATA_DIR / "glottolog_languoid.csv"
GLOTTOLOG_CSV_NAME = "languoid.csv"

SCRIPTCODES_URL = "https://www.unicode.org/iso15924/iso15924-codes.html"
SCRIPTCODES_TARGET_FILE = DATA_DIR / "ScriptCodes.csv"

SPBLEU_SPM_URL = "https://tinyurl.com/flores200sacrebleuspm"
SPBLEU_TARGET_DIR = DATA_DIR / "spbleu"
SPBLEU_SPM_NAME = "flores200_sacrebleu_tokenizer_spm.model"
SPBLEU_DICT_URL =
SPBLEU_DICT_NAME = "dictionary.txt"


# --- Helper Functions ---

def download_file(url, path: Path):
    """Downloads a file from a URL to a local path."""
    print(f"Downloading {url} to {path}...")

@@ -84,11 +88,16 @@ def extract_zip(zip_content: bytes, extract_path: Path, target_filename: str):
                break

        if target_zip_path:
            with
                target.write(source.read())
            print(f"Successfully extracted {target_filename}.")
        else:
            print(

    except zipfile.BadZipFile:
        print("Error: Downloaded file is not a valid zip archive.")

@@ -98,13 +107,14 @@ def extract_zip(zip_content: bytes, extract_path: Path, target_filename: str):

# --- Download Functions ---

def download_fleurs_data():
    """Downloads Fleurs audio and text data."""
    print("\n--- Downloading Fleurs Data ---")
    FLEURS_TARGET_DIR.mkdir(parents=True, exist_ok=True)

    # Use the fleurs_tag column from the imported DataFrame
    fleurs_tags_list = fleurs[

    if not fleurs_tags_list:
        print("No Fleurs tags found in imported fleurs DataFrame. Skipping Fleurs.")

@@ -117,7 +127,9 @@ def download_fleurs_data():
        audio_dir = lang_dir / "audio"
        dev_tsv_path = lang_dir / "dev.tsv"
        dev_audio_archive_path = audio_dir / "dev.tar.gz"
        audio_extracted_marker =

        # Download TSV
        if not dev_tsv_path.exists():

@@ -129,15 +141,15 @@ def download_fleurs_data():
        # Download and Extract Audio
        if not audio_extracted_marker.exists():
            if not dev_audio_archive_path.exists():

            if dev_audio_archive_path.exists():
            else:
                print(f"Audio archive missing, cannot extract for {lang_tag}")
        else:


def download_glottolog_data():

@@ -165,7 +177,9 @@ def download_scriptcodes_data():
    # The URL points to an HTML page, not a direct CSV link.
    # Manual download is likely required for ScriptCodes.csv.
    print(f"Cannot automatically download from {SCRIPTCODES_URL}")
    print(
    print("from the Unicode website or related sources and save it as:")
    print(f"{SCRIPTCODES_TARGET_FILE}")
    if SCRIPTCODES_TARGET_FILE.exists():

@@ -196,21 +210,24 @@ def download_spbleu_data():

# --- Main Execution ---

def main():
    """Runs all download functions and the conversion step."""
    print("Starting data download process...")
    DATA_DIR.mkdir(exist_ok=True)

    #download_fleurs_data()
    download_glottolog_data()
    download_scriptcodes_data()
    download_spbleu_data()

    print("\nData download process finished.")
    print("Please verify downloads and manually obtain ScriptCodes.csv if needed.")
    print(
    print("in 'evals/datasets_/flores.py' to be read correctly.")


if __name__ == "__main__":
    main()

import sys
import huggingface_hub
from datasets import load_dataset, DatasetDict

# Import fleurs DataFrame directly from its source module
from datasets_.fleurs import fleurs

FLEURS_BASE_URL = "https://huggingface.co/datasets/google/fleurs/resolve/main/data"
FLEURS_TARGET_DIR = DATA_DIR / "fleurs"

GLOTTOLOG_URL = "https://cdstar.shh.mpg.de/bitstreams/EAEA0-B44E-8CEC-EA65-0/glottolog_languoid.zip"  # Assumed direct link from https://glottolog.org/meta/downloads
GLOTTOLOG_TARGET_DIR = DATA_DIR / "glottolog_languoid.csv"
GLOTTOLOG_CSV_NAME = "languoid.csv"

SCRIPTCODES_URL = "https://www.unicode.org/iso15924/iso15924-codes.html"  # This is HTML, need manual download or parsing
SCRIPTCODES_TARGET_FILE = DATA_DIR / "ScriptCodes.csv"

SPBLEU_SPM_URL = "https://tinyurl.com/flores200sacrebleuspm"  # Assumed direct link
SPBLEU_TARGET_DIR = DATA_DIR / "spbleu"
SPBLEU_SPM_NAME = "flores200_sacrebleu_tokenizer_spm.model"
SPBLEU_DICT_URL = (
    "https://dl.fbaipublicfiles.com/large_objects/nllb/models/spm_200/dictionary.txt"
)
SPBLEU_DICT_NAME = "dictionary.txt"


# --- Helper Functions ---


def download_file(url, path: Path):
    """Downloads a file from a URL to a local path."""
    print(f"Downloading {url} to {path}...")

                break

        if target_zip_path:
            with (
                z.open(target_zip_path) as source,
                open(extract_path / target_filename, "wb") as target,
            ):
                target.write(source.read())
            print(f"Successfully extracted {target_filename}.")
        else:
            print(
                f"Error: Could not find {target_filename} within the zip archive."
            )

    except zipfile.BadZipFile:
        print("Error: Downloaded file is not a valid zip archive.")

# --- Download Functions ---


def download_fleurs_data():
    """Downloads Fleurs audio and text data."""
    print("\n--- Downloading Fleurs Data ---")
    FLEURS_TARGET_DIR.mkdir(parents=True, exist_ok=True)

    # Use the fleurs_tag column from the imported DataFrame
    fleurs_tags_list = fleurs["fleurs_tag"].tolist()

    if not fleurs_tags_list:
        print("No Fleurs tags found in imported fleurs DataFrame. Skipping Fleurs.")

        audio_dir = lang_dir / "audio"
        dev_tsv_path = lang_dir / "dev.tsv"
        dev_audio_archive_path = audio_dir / "dev.tar.gz"
        audio_extracted_marker = (
            audio_dir / "dev"
        )  # Check if extraction likely happened

        # Download TSV
        if not dev_tsv_path.exists():

        # Download and Extract Audio
        if not audio_extracted_marker.exists():
            if not dev_audio_archive_path.exists():
                tar_url = f"{FLEURS_BASE_URL}/{lang_tag}/audio/dev.tar.gz"
                download_file(tar_url, dev_audio_archive_path)

            if dev_audio_archive_path.exists():
                extract_tar_gz(dev_audio_archive_path, audio_dir)
            else:
                print(f"Audio archive missing, cannot extract for {lang_tag}")
        else:
            print(f"Found extracted audio: {audio_extracted_marker}")


def download_glottolog_data():

    # The URL points to an HTML page, not a direct CSV link.
    # Manual download is likely required for ScriptCodes.csv.
    print(f"Cannot automatically download from {SCRIPTCODES_URL}")
    print(
        "Please manually download the ISO 15924 codes list (often available as a .txt file)"
    )
    print("from the Unicode website or related sources and save it as:")
    print(f"{SCRIPTCODES_TARGET_FILE}")
    if SCRIPTCODES_TARGET_FILE.exists():

# --- Main Execution ---


def main():
    """Runs all download functions and the conversion step."""
    print("Starting data download process...")
    DATA_DIR.mkdir(exist_ok=True)

    # download_fleurs_data()
    download_glottolog_data()
    download_scriptcodes_data()
    download_spbleu_data()

    print("\nData download process finished.")
    print("Please verify downloads and manually obtain ScriptCodes.csv if needed.")
    print(
        "Note: Flores+ was downloaded as parquet, which might require changes but has been processed as well"
    )
    print("in 'evals/datasets_/flores.py' to be read correctly.")


if __name__ == "__main__":
    main()
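Aside (not part of the diff): the reformatted extract_zip above uses parenthesized multi-context `with`, which is valid from Python 3.10. A minimal sketch of the same pattern, copying a single member out of a zip archive without extracting everything; the archive and member paths here are made up.

import zipfile

with zipfile.ZipFile("archive.zip") as z:
    with (
        z.open("inner/languoid.csv") as source,
        open("languoid.csv", "wb") as target,
    ):
        target.write(source.read())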
evals/languages.py
CHANGED

@@ -31,6 +31,7 @@ glottolog["bcp_47"] = glottolog["iso639P3code"].apply(
    lambda x: standardize_tag(x, macro=True) if not pd.isna(x) else None
)

@cache
def language_family(bcp_47):
    languoid = glottolog[glottolog["bcp_47"] == bcp_47].iloc[0]

@@ -39,6 +40,7 @@ def language_family(bcp_47):
    family = glottolog[glottolog["id"] == languoid["family_id"]].iloc[0]
    return family["name"]

languages["family"] = languages["bcp_47"].apply(language_family)

# load script codes and names

@@ -46,6 +48,7 @@ scripts = pd.read_csv("data/ScriptCodes.csv").rename(
    columns={"Code": "iso15924", "English Name": "script_name"}
)

def script_name(iso15924):
    return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]

    lambda x: standardize_tag(x, macro=True) if not pd.isna(x) else None
)


@cache
def language_family(bcp_47):
    languoid = glottolog[glottolog["bcp_47"] == bcp_47].iloc[0]

    family = glottolog[glottolog["id"] == languoid["family_id"]].iloc[0]
    return family["name"]


languages["family"] = languages["bcp_47"].apply(language_family)

# load script codes and names

    columns={"Code": "iso15924", "English Name": "script_name"}
)


def script_name(iso15924):
    return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
evals/main.py
CHANGED

@@ -1,62 +1,80 @@
import asyncio

import pandas as pd
from languages import languages
from models import models
from tasks import tasks
from tqdm.asyncio import tqdm_asyncio

#

    results = pd.concat([old_results, results])
    results = results.sort_values(by=["model", "bcp_47", "task", "metric"])
    results.to_json("results.json", **args)
    # save up-to-date info on models and languages
    all_models = pd.concat([pd.DataFrame(models), old_models])
    all_models = all_models.drop_duplicates(subset=["id"]).sort_values(by=["id"])
    all_models.to_json("models.json", **args)
    pd.DataFrame(languages).to_json("languages.json", **args)


if __name__ == "__main__":

import asyncio
import time
from datetime import timedelta
from os import environ

import pandas as pd
from languages import languages
from models import models
from rich import print
from tasks import tasks
from tqdm.asyncio import tqdm_asyncio
from datasets_.util import load, save
from tqdm import tqdm

n_sentences = int(environ.get("N_SENTENCES", 10))
n_languages = int(environ.get("N_LANGUAGES", 300))
n_models = int(environ.get("N_MODELS", 35))

async def evaluate():
    start_time = time.time()

    # Pre-compute model tasks to avoid O(n²) lookups
    model_tasks = models.set_index("id")["tasks"].to_dict()

    # get all combinations that need evaluation
    combis = [
        (task_name, model, lang.bcp_47, i)
        for i in range(n_sentences)
        for lang in languages.head(n_languages).itertuples()
        for task_name, task in tasks.items()
        for model in models.iloc[:n_models]["id"]
        if task_name in model_tasks[model]
    ]
    combis = pd.DataFrame(combis, columns=["task", "model", "bcp_47", "sentence_nr"])

    # Load cached results and filter out completed combinations
    old_results = load("results-detailed")
    if not old_results.empty:
        completed = set(old_results[["task", "model", "bcp_47", "sentence_nr"]].apply(tuple, axis=1))
        combis = combis[~combis.apply(lambda row: tuple(row) in completed, axis=1)]

    print(f"Running {len(combis)} evaluation tasks...")

    # batching (asyncio.gather + rate-limiting can in principle run everything at once, but in practice batching is more efficient / necessary)
    batch_size = 2000
    batch_results = [
        await tqdm_asyncio.gather(
            *[tasks[task_name](model, bcp_47, sentence_nr)
              for _, (task_name, model, bcp_47, sentence_nr) in batch.iterrows()]
        )
        for i in tqdm(range(0, len(combis), batch_size), colour='blue', desc='Batches')
        for batch in [combis[i:i + batch_size]]
    ]
    results = [r for batch in batch_results for result in batch for r in result]
    results = pd.DataFrame(results) if results else pd.DataFrame(columns=["task", "model", "bcp_47", "metric", "sentence_nr", "score", "origin"])

    # Merge with cached results (immutable log)
    all_results = pd.concat([old_results, results]).drop_duplicates(
        subset=["task", "model", "bcp_47", "metric", "sentence_nr"]
    ) if not old_results.empty else results

    # Filter to current models × languages and aggregate
    current_models = set(models.iloc[:n_models]["id"])
    current_languages = set(languages.head(n_languages)["bcp_47"])
    results_agg = (
        all_results[all_results["model"].isin(current_models) & all_results["bcp_47"].isin(current_languages)]
        .groupby(["model", "bcp_47", "task", "metric"])
        .agg({"score": "mean", "origin": "first"})
        .reset_index()
    )

    save(all_results, "results-detailed")
    save(results_agg, "results")
    save(models, "models")
    save(languages, "languages")
    elapsed = time.time() - start_time
    print(f"Evaluation completed in {str(timedelta(seconds=int(elapsed)))}")


if __name__ == "__main__":
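Aside (not part of the diff): the fixed-size batching pattern that evaluate() wraps around asyncio.gather, reduced to a runnable sketch with a dummy coroutine standing in for the rate-limited API calls.

import asyncio

async def fake_task(i: int) -> int:
    await asyncio.sleep(0)  # stand-in for a rate-limited API call
    return i * i

async def run_batched(items, batch_size=3):
    results = []
    for start in range(0, len(items), batch_size):
        batch = items[start : start + batch_size]
        # only batch_size coroutines are in flight at once
        results.extend(await asyncio.gather(*(fake_task(i) for i in batch)))
    return results

print(asyncio.run(run_batched(list(range(10)))))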
evals/models.py
CHANGED
|
@@ -1,13 +1,10 @@
|
|
| 1 |
-
import json
|
| 2 |
import re
|
| 3 |
-
from collections import defaultdict
|
| 4 |
from datetime import date
|
| 5 |
from os import getenv
|
| 6 |
|
| 7 |
import pandas as pd
|
| 8 |
from aiolimiter import AsyncLimiter
|
| 9 |
from dotenv import load_dotenv
|
| 10 |
-
from elevenlabs import AsyncElevenLabs
|
| 11 |
from google.cloud import translate_v2 as translate
|
| 12 |
from huggingface_hub import AsyncInferenceClient, HfApi
|
| 13 |
from joblib.memory import Memory
|
|
@@ -22,20 +19,30 @@ important_models = [
|
|
| 22 |
"meta-llama/llama-3.1-70b-instruct", # 0.3$
|
| 23 |
"meta-llama/llama-3-70b-instruct", # 0.4$
|
| 24 |
# "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
|
|
|
|
|
|
|
|
|
|
| 25 |
"openai/gpt-4.1", # 8$
|
| 26 |
-
"openai/gpt-
|
| 27 |
-
"openai/gpt-
|
| 28 |
-
"openai/gpt-
|
| 29 |
-
|
| 30 |
-
"
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
"
|
|
|
|
|
|
|
|
|
|
| 34 |
"mistralai/mistral-saba", # 0.6$
|
| 35 |
"mistralai/mistral-nemo", # 0.08$
|
|
|
|
| 36 |
"google/gemini-2.5-flash", # 0.6$
|
| 37 |
-
"google/gemini-2.
|
| 38 |
"google/gemma-3-27b-it", # 0.2$
|
|
|
|
|
|
|
|
|
|
| 39 |
"qwen/qwen3-32b",
|
| 40 |
"qwen/qwen3-235b-a22b",
|
| 41 |
"qwen/qwen3-30b-a3b", # 0.29$
|
|
@@ -43,15 +50,16 @@ important_models = [
|
|
| 43 |
# "qwen/qwq-32b", # 0.2$
|
| 44 |
# "qwen/qwen-2.5-72b-instruct", # 0.39$
|
| 45 |
# "qwen/qwen-2-72b-instruct", # 0.9$
|
| 46 |
-
"deepseek/deepseek-
|
| 47 |
-
"deepseek/deepseek-chat", # 0.89$
|
| 48 |
"microsoft/phi-4", # 0.07$
|
| 49 |
-
"
|
| 50 |
-
"
|
|
|
|
| 51 |
]
|
| 52 |
|
| 53 |
blocklist = [
|
| 54 |
"google/gemini-2.5-pro-preview",
|
|
|
|
| 55 |
"google/gemini-2.5-flash-preview",
|
| 56 |
"google/gemini-2.5-flash-lite-preview",
|
| 57 |
"google/gemini-2.5-flash-preview-04-17",
|
|
@@ -59,6 +67,11 @@ blocklist = [
|
|
| 59 |
"google/gemini-2.5-flash-lite-preview-06-17",
|
| 60 |
"google/gemini-2.5-pro-preview-06-05",
|
| 61 |
"google/gemini-2.5-pro-preview-05-06",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 62 |
]
|
| 63 |
|
| 64 |
transcription_models = [
|
|
@@ -72,49 +85,104 @@ cache = Memory(location=".cache", verbose=0).cache
|
|
| 72 |
|
| 73 |
|
| 74 |
@cache
|
| 75 |
-
def
|
| 76 |
return get("https://openrouter.ai/api/frontend/models").json()["data"]
|
| 77 |
|
| 78 |
|
| 79 |
-
def
|
| 80 |
-
models =
|
| 81 |
slugs = [
|
| 82 |
m
|
| 83 |
for m in models
|
| 84 |
-
if m["permaslug"] == permaslug
|
|
|
|
| 85 |
and m["endpoint"]
|
|
|
|
|
|
|
| 86 |
and not m["endpoint"]["is_free"]
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
]
|
| 88 |
if len(slugs) == 0:
|
| 89 |
-
|
| 90 |
-
print(f"no non-free model found for {permaslug}")
|
| 91 |
return slugs[0] if len(slugs) >= 1 else None
|
| 92 |
|
| 93 |
|
| 94 |
@cache
|
| 95 |
def get_historical_popular_models(date: date):
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
|
| 101 |
-
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
| 107 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 108 |
|
| 109 |
|
| 110 |
@cache
|
| 111 |
def get_current_popular_models(date: date):
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
|
| 120 |
def get_translation_models():
|
|
@@ -125,6 +193,7 @@ def get_translation_models():
|
|
| 125 |
"name": "Google Translate",
|
| 126 |
"provider_name": "Google",
|
| 127 |
"cost": 20.0,
|
|
|
|
| 128 |
"size": None,
|
| 129 |
"type": "closed-source",
|
| 130 |
"license": None,
|
|
@@ -161,7 +230,10 @@ async def complete(**kwargs) -> str | None:
|
|
| 161 |
|
| 162 |
|
| 163 |
translate_client = translate.Client()
|
| 164 |
-
|
|
|
|
|
|
|
|
|
|
| 165 |
|
| 166 |
|
| 167 |
@cache
|
|
@@ -173,42 +245,35 @@ async def translate_google(text, source_language, target_language):
|
|
| 173 |
return response["translatedText"]
|
| 174 |
|
| 175 |
|
| 176 |
-
@cache
|
| 177 |
-
async def transcribe_elevenlabs(path, model):
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
|
| 186 |
|
| 187 |
|
| 188 |
-
@cache
|
| 189 |
-
async def transcribe_huggingface(path, model):
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
|
| 195 |
|
| 196 |
-
async def transcribe(path, model="elevenlabs/scribe_v1"):
|
| 197 |
-
|
| 198 |
-
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
def get_or_metadata(id):
|
| 208 |
-
# get metadata from OpenRouter
|
| 209 |
-
models = get_models(date.today())
|
| 210 |
-
metadata = next((m for m in models if m["slug"] == id), None)
|
| 211 |
-
return metadata
|
| 212 |
|
| 213 |
|
| 214 |
api = HfApi()
|
|
@@ -231,12 +296,15 @@ def get_hf_metadata(row):
|
|
| 231 |
return empty
|
| 232 |
try:
|
| 233 |
info = api.model_info(id)
|
| 234 |
-
license =
|
| 235 |
-
|
| 236 |
-
.
|
| 237 |
-
.
|
| 238 |
-
.
|
| 239 |
-
)
|
|
|
|
|
|
|
|
|
|
| 240 |
return {
|
| 241 |
"hf_id": info.id,
|
| 242 |
"creation_date": info.created_at,
|
|
@@ -249,20 +317,39 @@ def get_hf_metadata(row):
|
|
| 249 |
|
| 250 |
|
| 251 |
def get_cost(row):
|
| 252 |
-
|
| 253 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 254 |
|
| 255 |
|
| 256 |
@cache
|
| 257 |
-
def load_models(date: date):
|
| 258 |
-
popular_models = (
|
| 259 |
-
|
| 260 |
-
|
| 261 |
-
)
|
|
|
|
| 262 |
popular_models = [m["slug"] for m in popular_models]
|
| 263 |
-
|
| 264 |
-
|
| 265 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
hf_metadata = or_metadata.apply(get_hf_metadata)
|
| 267 |
creation_date_hf = pd.to_datetime(hf_metadata.str["creation_date"]).dt.date
|
| 268 |
creation_date_or = pd.to_datetime(
|
|
@@ -274,16 +361,30 @@ def load_models(date: date):
|
|
| 274 |
.str.replace(" (free)", "")
|
| 275 |
.str.replace(" (self-moderated)", ""),
|
| 276 |
provider_name=or_metadata.str["name"].str.split(": ").str[0],
|
|
|
|
| 277 |
cost=or_metadata.apply(get_cost),
|
|
|
|
| 278 |
hf_id=hf_metadata.str["hf_id"],
|
| 279 |
size=hf_metadata.str["size"],
|
| 280 |
type=hf_metadata.str["type"],
|
| 281 |
license=hf_metadata.str["license"],
|
| 282 |
creation_date=creation_date_hf.combine_first(creation_date_or),
|
| 283 |
)
|
| 284 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 285 |
models["tasks"] = [
|
| 286 |
-
[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 287 |
] * len(models)
|
| 288 |
models = pd.concat([models, get_translation_models()])
|
| 289 |
return models
|
|
|
|
|
|
|
| 1 |
import re
|
|
|
|
| 2 |
from datetime import date
|
| 3 |
from os import getenv
|
| 4 |
|
| 5 |
import pandas as pd
|
| 6 |
from aiolimiter import AsyncLimiter
|
| 7 |
from dotenv import load_dotenv
|
|
|
|
| 8 |
from google.cloud import translate_v2 as translate
|
| 9 |
from huggingface_hub import AsyncInferenceClient, HfApi
|
| 10 |
from joblib.memory import Memory
|
|
|
|
| 19 |
"meta-llama/llama-3.1-70b-instruct", # 0.3$
|
| 20 |
"meta-llama/llama-3-70b-instruct", # 0.4$
|
| 21 |
# "meta-llama/llama-2-70b-chat", # 0.9$; not properly supported by OpenRouter
|
| 22 |
+
"openai/gpt-5",
|
| 23 |
+
"openai/gpt-5-mini",
|
| 24 |
+
"openai/gpt-5-nano",
|
| 25 |
"openai/gpt-4.1", # 8$
|
| 26 |
+
"openai/gpt-4o", # 10$
|
| 27 |
+
"openai/gpt-3.5-turbo", # $1.50
|
| 28 |
+
"openai/gpt-oss-120b",
|
| 29 |
+
"anthropic/claude-4.5-sonnet",
|
| 30 |
+
"anthropic/claude-4.5-haiku",
|
| 31 |
+
"anthropic/claude-opus-4.1", # 15$
|
| 32 |
+
"anthropic/claude-4-sonnet",
|
| 33 |
+
"anthropic/claude-3.7-sonnet", # 15$
|
| 34 |
+
"anthropic/claude-3.5-sonnet",
|
| 35 |
+
"mistralai/mistral-small-3.2-24b-instruct", # 0.3$
|
| 36 |
+
"mistralai/mistral-medium-3.1",
|
| 37 |
"mistralai/mistral-saba", # 0.6$
|
| 38 |
"mistralai/mistral-nemo", # 0.08$
|
| 39 |
+
"google/gemini-2.5-pro", # $10
|
| 40 |
"google/gemini-2.5-flash", # 0.6$
|
| 41 |
+
"google/gemini-2.5-flash-lite", # 0.3$
|
| 42 |
"google/gemma-3-27b-it", # 0.2$
|
| 43 |
+
# "x-ai/grok-4", # $15
|
| 44 |
+
# "x-ai/grok-3", # $15
|
| 45 |
+
"cohere/command-a",
|
| 46 |
"qwen/qwen3-32b",
|
| 47 |
"qwen/qwen3-235b-a22b",
|
| 48 |
"qwen/qwen3-30b-a3b", # 0.29$
|
|
|
|
| 50 |
# "qwen/qwq-32b", # 0.2$
|
| 51 |
# "qwen/qwen-2.5-72b-instruct", # 0.39$
|
| 52 |
# "qwen/qwen-2-72b-instruct", # 0.9$
|
| 53 |
+
"deepseek/deepseek-v3.2-exp",
|
|
|
|
| 54 |
"microsoft/phi-4", # 0.07$
|
| 55 |
+
"amazon/nova-pro-v1", # 0.09$
|
| 56 |
+
"moonshotai/kimi-k2", # 0.6$
|
| 57 |
+
"baidu/ernie-4.5-300b-a47b",
|
| 58 |
]
|
| 59 |
|
| 60 |
blocklist = [
|
| 61 |
"google/gemini-2.5-pro-preview",
|
| 62 |
+
# "google/gemini-2.5-pro",
|
| 63 |
"google/gemini-2.5-flash-preview",
|
| 64 |
"google/gemini-2.5-flash-lite-preview",
|
| 65 |
"google/gemini-2.5-flash-preview-04-17",
|
|
|
|
| 67 |
"google/gemini-2.5-flash-lite-preview-06-17",
|
| 68 |
"google/gemini-2.5-pro-preview-06-05",
|
| 69 |
"google/gemini-2.5-pro-preview-05-06",
|
| 70 |
+
"perplexity/sonar-deep-research",
|
| 71 |
+
"perplexity/sonar-reasoning",
|
| 72 |
+
"perplexity/sonar-reasoning-pro",
|
| 73 |
+
"qwen/qwen3-vl-30b-a3b-thinking",
|
| 74 |
+
"alpindale/goliath-120b"
|
| 75 |
]
|
| 76 |
|
| 77 |
transcription_models = [
|
|
|
|
| 85 |
|
| 86 |
|
| 87 |
@cache
|
| 88 |
+
def load_or_metadata(date: date):
|
| 89 |
return get("https://openrouter.ai/api/frontend/models").json()["data"]
|
| 90 |
|
| 91 |
|
| 92 |
+
def get_or_metadata(permaslug):
|
| 93 |
+
models = load_or_metadata(date.today())
|
| 94 |
slugs = [
|
| 95 |
m
|
| 96 |
for m in models
|
| 97 |
+
if (m["permaslug"] == permaslug or m["slug"] == permaslug)
|
| 98 |
+
# ensure that a provider endpoint is available
|
| 99 |
and m["endpoint"]
|
| 100 |
+
# exclude free models
|
| 101 |
+
# the problem is that free models typically have very high rate-limiting
|
| 102 |
and not m["endpoint"]["is_free"]
|
| 103 |
+
# exclude providers that train on user data
|
| 104 |
+
# this is crucial since we are submitting benchmark data
|
| 105 |
+
# make sure to additionally configure this in OpenRouter settings to avoid mistakes!
|
| 106 |
+
and m["endpoint"]["provider_info"]["dataPolicy"]["training"] is False
|
| 107 |
]
|
| 108 |
if len(slugs) == 0:
|
| 109 |
+
print(f"no appropriate model (not free and no user data training) found for {permaslug}")
|
|
|
|
| 110 |
return slugs[0] if len(slugs) >= 1 else None
|
| 111 |
|
| 112 |
|
| 113 |
@cache
|
| 114 |
def get_historical_popular_models(date: date):
|
| 115 |
+
# date parameter is used for daily caching
|
| 116 |
+
try:
|
| 117 |
+
raw = get("https://openrouter.ai/rankings").text
|
| 118 |
+
|
| 119 |
+
# Extract model data from rankingData using regex
|
| 120 |
+
# Find all count and model_permaslug pairs in the data
|
| 121 |
+
# Format: "count":number,"model_permaslug":"model/name"
|
| 122 |
+
pattern = r"\\\"count\\\":([\d.]+).*?\\\"model_permaslug\\\":\\\"([^\\\"]+)\\\""
|
| 123 |
+
matches = re.findall(pattern, raw)
|
| 124 |
+
|
| 125 |
+
if matches:
|
| 126 |
+
# Aggregate model counts
|
| 127 |
+
model_counts = {}
|
| 128 |
+
for count_str, model_slug in matches:
|
| 129 |
+
count = float(count_str)
|
| 130 |
+
if not model_slug.startswith("openrouter") and model_slug != "Others":
|
| 131 |
+
# Remove variant suffixes for aggregation
|
| 132 |
+
base_model = model_slug.split(":")[0]
|
| 133 |
+
model_counts[base_model] = model_counts.get(base_model, 0) + count
|
| 134 |
+
|
| 135 |
+
# Sort by popularity and return top models
|
| 136 |
+
sorted_models = sorted(
|
| 137 |
+
model_counts.items(), key=lambda x: x[1], reverse=True
|
| 138 |
+
)
|
| 139 |
+
result = []
|
| 140 |
+
for model_slug, count in sorted_models:
|
| 141 |
+
result.append({"slug": model_slug, "count": int(count)})
|
| 142 |
+
|
| 143 |
+
return result
|
| 144 |
+
else:
|
| 145 |
+
return []
|
| 146 |
+
|
| 147 |
+
except Exception as e:
|
| 148 |
+
return []
|
| 149 |
|
| 150 |
|
| 151 |
@cache
|
| 152 |
def get_current_popular_models(date: date):
|
| 153 |
+
# date parameter is used for daily caching
|
| 154 |
+
try:
|
| 155 |
+
raw = get("https://openrouter.ai/rankings?view=day").text
|
| 156 |
+
|
| 157 |
+
# Extract model data from daily rankings
|
| 158 |
+
# Find all count and model_permaslug pairs in the daily data
|
| 159 |
+
pattern = r"\\\"count\\\":([\d.]+).*?\\\"model_permaslug\\\":\\\"([^\\\"]+)\\\""
|
| 160 |
+
matches = re.findall(pattern, raw)
|
| 161 |
+
|
| 162 |
+
if matches:
|
| 163 |
+
# Aggregate model counts
|
| 164 |
+
model_counts = {}
|
| 165 |
+
for count_str, model_slug in matches:
|
| 166 |
+
count = float(count_str)
|
| 167 |
+
if not model_slug.startswith("openrouter") and model_slug != "Others":
|
| 168 |
+
# Remove variant suffixes for aggregation
|
| 169 |
+
base_model = model_slug.split(":")[0]
|
| 170 |
+
model_counts[base_model] = model_counts.get(base_model, 0) + count
|
| 171 |
+
|
| 172 |
+
# Sort by popularity and return top models
|
| 173 |
+
sorted_models = sorted(
|
| 174 |
+
model_counts.items(), key=lambda x: x[1], reverse=True
|
| 175 |
+
)
|
| 176 |
+
result = []
|
| 177 |
+
for model_slug, count in sorted_models:
|
| 178 |
+
result.append({"slug": model_slug, "count": int(count)})
|
| 179 |
+
|
| 180 |
+
return result
|
| 181 |
+
else:
|
| 182 |
+
return []
|
| 183 |
+
|
| 184 |
+
except Exception as e:
|
| 185 |
+
return []
|
| 186 |
|
| 187 |
|
| 188 |
def get_translation_models():
|
|
|
|
| 193 |
"name": "Google Translate",
|
| 194 |
"provider_name": "Google",
|
| 195 |
"cost": 20.0,
|
| 196 |
+
"train_on_prompts": False, # they don't do it in the API
|
| 197 |
"size": None,
|
| 198 |
"type": "closed-source",
|
| 199 |
"license": None,
|
|
|
|
| 230 |
|
| 231 |
|
| 232 |
translate_client = translate.Client()
|
| 233 |
+
|
| 234 |
+
|
| 235 |
+
def get_google_supported_languages():
|
| 236 |
+
return [l["language"] for l in translate_client.get_languages()]
|
| 237 |
|
| 238 |
|
| 239 |
@cache
|
|
|
|
| 245 |
return response["translatedText"]
|
| 246 |
|
| 247 |
|
| 248 |
+
# @cache
|
| 249 |
+
# async def transcribe_elevenlabs(path, model):
|
| 250 |
+
# modelname = model.split("/")[-1]
|
| 251 |
+
# client = AsyncElevenLabs(api_key=getenv("ELEVENLABS_API_KEY"))
|
| 252 |
+
# async with elevenlabs_rate_limit:
|
| 253 |
+
# with open(path, "rb") as file:
|
| 254 |
+
# response = await client.speech_to_text.convert(
|
| 255 |
+
# model_id=modelname, file=file
|
| 256 |
+
# )
|
| 257 |
+
# return response.text
|
| 258 |
|
| 259 |
|
| 260 |
+
# @cache
|
| 261 |
+
# async def transcribe_huggingface(path, model):
|
| 262 |
+
# client = AsyncInferenceClient(api_key=getenv("HUGGINGFACE_ACCESS_TOKEN"))
|
| 263 |
+
# async with huggingface_rate_limit:
|
| 264 |
+
# output = await client.automatic_speech_recognition(model=model, audio=path)
|
| 265 |
+
# return output.text
|
| 266 |
|
| 267 |
|
| 268 |
+
# async def transcribe(path, model="elevenlabs/scribe_v1"):
|
| 269 |
+
# provider, modelname = model.split("/")
|
| 270 |
+
# match provider:
|
| 271 |
+
# case "elevenlabs":
|
| 272 |
+
# return await transcribe_elevenlabs(path, modelname)
|
| 273 |
+
# case "openai" | "facebook":
|
| 274 |
+
# return await transcribe_huggingface(path, model)
|
| 275 |
+
# case _:
|
| 276 |
+
# raise ValueError(f"Model {model} not supported")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 277 |
|
| 278 |
|
| 279 |
api = HfApi()
|
|
|
|
| 296 |
return empty
|
| 297 |
try:
|
| 298 |
info = api.model_info(id)
|
| 299 |
+
license = ""
|
| 300 |
+
if (
|
| 301 |
+
info.card_data
|
| 302 |
+
and hasattr(info.card_data, "license")
|
| 303 |
+
and info.card_data.license
|
| 304 |
+
):
|
| 305 |
+
license = (
|
| 306 |
+
info.card_data.license.replace("-", " ").replace("mit", "MIT").title()
|
| 307 |
+
)
|
| 308 |
return {
|
| 309 |
"hf_id": info.id,
|
| 310 |
"creation_date": info.created_at,
|
|
|
|
| 317 |
|
| 318 |
|
| 319 |
def get_cost(row):
|
| 320 |
+
try:
|
| 321 |
+
cost = float(row["endpoint"]["pricing"]["completion"])
|
| 322 |
+
return round(cost * 1_000_000, 2)
|
| 323 |
+
except (TypeError, KeyError):
|
| 324 |
+
return None
|
| 325 |
+
|
| 326 |
+
|
| 327 |
+
def get_training_policy(row):
|
| 328 |
+
# get openrouter info whether the provider may train on prompts
|
| 329 |
+
# (this needs to be thoroughly avoided for our benchmark prompts!)
|
| 330 |
+
return row["endpoint"]["provider_info"]["dataPolicy"]["training"]
|
| 331 |
|
| 332 |
|
| 333 |
 @cache
+def load_models(date: date) -> pd.DataFrame:
+    # popular_models = (
+    #     get_historical_popular_models(date.today())[:20]
+    #     + get_current_popular_models(date.today())[:10]
+    # )
+    popular_models = []
     popular_models = [m["slug"] for m in popular_models]
+    all_model_candidates = set(important_models + popular_models) - set(blocklist)
+
+    # Validate models exist on OpenRouter before including them
+    valid_models = []
+
+    for model_id in all_model_candidates:
+        metadata = get_or_metadata(model_id)
+        if metadata is not None:
+            valid_models.append(model_id)
+
+    models = pd.DataFrame(sorted(valid_models), columns=["id"])
+    or_metadata = models["id"].apply(get_or_metadata)  # TODO this is double-doubled
     hf_metadata = or_metadata.apply(get_hf_metadata)
     creation_date_hf = pd.to_datetime(hf_metadata.str["creation_date"]).dt.date
     creation_date_or = pd.to_datetime(
…
         .str.replace(" (free)", "")
         .str.replace(" (self-moderated)", ""),
         provider_name=or_metadata.str["name"].str.split(": ").str[0],
+        # openrouter_metadata=or_metadata.astype(str),
         cost=or_metadata.apply(get_cost),
+        train_on_prompts=or_metadata.apply(get_training_policy),
         hf_id=hf_metadata.str["hf_id"],
         size=hf_metadata.str["size"],
         type=hf_metadata.str["type"],
         license=hf_metadata.str["license"],
         creation_date=creation_date_hf.combine_first(creation_date_or),
     )
+    models.to_json(
+        "models_unfiltered.json", orient="records", indent=2, force_ascii=False
+    )
+    # Filter out expensive models to keep costs reasonable
+    models = models[models["cost"] <= 15.0].reset_index(drop=True)
     models["tasks"] = [
+        [
+            "translation_from",
+            "translation_to",
+            "classification",
+            "mmlu",
+            "arc",
+            "truthfulqa",
+            "mgsm",
+        ]
     ] * len(models)
     models = pd.concat([models, get_translation_models()])
     return models

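The creation_date column above is assembled with combine_first, which keeps the Hugging Face date where present and falls back to the OpenRouter date per row. In miniature (dates invented):

import pandas as pd

hf = pd.Series([None, "2024-05-01"])
openrouter = pd.Series(["2024-04-30", "2024-04-28"])
print(hf.combine_first(openrouter).tolist())  # ['2024-04-30', '2024-05-01']
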
evals/plots.py
CHANGED

@@ -9,34 +9,33 @@ df = pd.read_json("../results.json")
 df = df[df["metric"] != "chrf"]
 df = df.groupby(["task", "metric", "bcp_47"]).agg({"score": "mean"}).reset_index()
+

 # Apply logit transformation to classification scores to reduce skewness
 def transform_classification_scores(row):
+    if row["task"] == "classification":
         # Avoid division by zero and infinite values by clipping
+        score = np.clip(row["score"], 0.001, 0.999)
         # Apply logit transformation (log(p/(1-p)))
         return logit(score)
     else:
+        return row["score"]
+

+df["score"] = df.apply(transform_classification_scores, axis=1)

 # Create a pivot table with tasks as columns and languages as rows
 pivot_df = df.pivot_table(
-    index='bcp_47',
-    columns='task',
-    aggfunc='mean'
+    values="score", index="bcp_47", columns="task", aggfunc="mean"
 )

 # Sort and filter tasks
 ordered_tasks = [
+    "translation_from",
+    "translation_to",
+    "classification",
+    "mmlu",
+    "arc",
+    "mgsm",
 ]
 # Drop 'truthfulqa' if present and reindex columns
 pivot_df = pivot_df[[task for task in ordered_tasks if task in pivot_df.columns]]
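The clipping inside transform_classification_scores is what keeps the transform finite: logit(p) = log(p / (1 - p)) diverges at p = 0 and p = 1. Assuming the logit in scope is scipy.special.logit (which matches the call signature), a quick check:

import numpy as np
from scipy.special import logit

print(logit(1.0))                         # inf -- a perfect score would poison the correlations
print(logit(np.clip(1.0, 0.001, 0.999)))  # ~6.91, finite after clipping
print(logit(0.5))                         # 0.0 -- chance level maps to zero
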
@@ -46,29 +45,29 @@ correlation_matrix = pivot_df.corr()

 # Create the correlation plot
 plt.figure(figsize=(8, 6))
 # Create mask for upper triangle including diagonal to show only lower triangle
 mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

 # Create a heatmap
 sns.heatmap(
+    correlation_matrix,
+    annot=True,
+    cmap="Blues",
     center=0,
     square=True,
     mask=mask,
-    cbar_kws={"shrink": .8},
+    cbar_kws={"shrink": 0.8},
+    fmt=".3f",
 )

+plt.xlabel("Tasks", fontsize=12)
+plt.ylabel("Tasks", fontsize=12)
+plt.xticks(rotation=45, ha="right")
 plt.yticks(rotation=0)
 plt.tight_layout()

 # Save the plot
+plt.savefig("task_correlation_matrix.png", dpi=300, bbox_inches="tight")
 plt.show()

 # Print correlation values for reference

@@ -77,56 +76,91 @@ print("Note: Classification scores have been logit-transformed to reduce skewness…")

 print(correlation_matrix.round(3))

 # Also create a scatter plot matrix for pairwise relationships with highlighted languages
+highlighted_languages = ["en", "zh", "hi", "es", "ar"]
+

 # Create color mapping
 def get_color_and_label(lang_code):
     if lang_code in highlighted_languages:
+        color_map = {
+            "en": "red",
+            "zh": "blue",
+            "hi": "green",
+            "es": "orange",
+            "ar": "purple",
+        }
         return color_map[lang_code], lang_code
     else:
+        return "lightgray", "Other"
+

 # Create custom scatter plot matrix
 tasks = pivot_df.columns.tolist()
 n_tasks = len(tasks)

 fig, axes = plt.subplots(n_tasks, n_tasks, figsize=(15, 12))
+fig.suptitle("Pairwise Task Performance", fontsize=16, fontweight="bold")

 # Create legend elements
 legend_elements = []
 for lang in highlighted_languages:
     color, _ = get_color_and_label(lang)
+    legend_elements.append(
+        plt.Line2D(
+            [0],
+            [0],
+            marker="o",
+            color="w",
+            markerfacecolor=color,
+            markersize=8,
+            label=lang,
+        )
+    )
+legend_elements.append(
+    plt.Line2D(
+        [0],
+        [0],
+        marker="o",
+        color="w",
+        markerfacecolor="lightgray",
+        markersize=8,
+        label="Other",
+    )
+)

 for i, task_y in enumerate(tasks):
     for j, task_x in enumerate(tasks):
         ax = axes[i, j]
+
         if i == j:
             # Diagonal: histogram
             task_data = pivot_df[task_y].dropna()
             colors = [get_color_and_label(lang)[0] for lang in task_data.index]
+            ax.hist(task_data, bins=20, alpha=0.7, color="skyblue", edgecolor="black")
+            ax.set_title(f"{task_y}", fontsize=10)
         else:
             # Off-diagonal: scatter plot
             for lang_code in pivot_df.index:
+                if pd.notna(pivot_df.loc[lang_code, task_x]) and pd.notna(
+                    pivot_df.loc[lang_code, task_y]
+                ):
                     color, _ = get_color_and_label(lang_code)
                     alpha = 0.8 if lang_code in highlighted_languages else 0.3
                     size = 50 if lang_code in highlighted_languages else 20
+                    ax.scatter(
+                        pivot_df.loc[lang_code, task_x],
+                        pivot_df.loc[lang_code, task_y],
+                        c=color,
+                        alpha=alpha,
+                        s=size,
+                    )
+
         # Set labels
         if i == n_tasks - 1:
             ax.set_xlabel(task_x, fontsize=10)
         if j == 0:
             ax.set_ylabel(task_y, fontsize=10)
+
         # Remove tick labels except for edges
         if i != n_tasks - 1:
             ax.set_xticklabels([])

@@ -136,15 +170,15 @@ for i, task_y in enumerate(tasks):

 # Add legend
 fig.legend(
     handles=legend_elements,
+    loc="lower center",
     bbox_to_anchor=(0.5, -0.05),
     ncol=len(legend_elements),
     frameon=False,
     fontsize=10,
     handletextpad=0.5,
+    columnspacing=1.0,
 )

 plt.tight_layout()
+plt.savefig("task_scatter_matrix.png", dpi=300, bbox_inches="tight")
 plt.show()
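A note on the heatmap mask used above: np.triu marks the upper triangle including the diagonal as True, and seaborn hides exactly the masked cells, so only the lower triangle of the symmetric correlation matrix is drawn. In miniature:

import numpy as np

m = np.zeros((3, 3))
print(np.triu(np.ones_like(m, dtype=bool)))
# [[ True  True  True]
#  [False  True  True]
#  [False False  True]]   -> the True cells are hidden, leaving the lower triangle
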
evals/tasks.py
CHANGED

@@ -1,19 +1,19 @@
 import random
+import re
 from functools import partial
 from textwrap import dedent

 import evaluate
-import pandas as pd
 import sentencepiece as spm
+from datasets_.arc import load_uhura_arc_easy
 from datasets_.flores import flores_sentences
 from datasets_.mgsm import load_mgsm, parse_number
 from datasets_.mmlu import load_mmlu
-from datasets_.arc import load_uhura_arc_easy
 from datasets_.truthfulqa import load_truthfulqa
 from google.cloud import translate_v2 as translate
 from langcodes import closest_supported_match
 from languages import languages, script_name
+from models import complete, translate_google

 bleu = evaluate.load("bleu")
 chrf = evaluate.load("chrf")

@@ -30,6 +30,58 @@ target_languages = languages[languages["in_benchmark"]].sample(
 translate_client = translate.Client()
 supported_languages = [l["language"] for l in translate_client.get_languages()]

+async def query(model, prompt):
+    # this is just for sharing config across tasks
+    try:
+        response = await complete(
+            model=model,
+            messages=[{"role": "user", "content": prompt}],
+            temperature=0,
+            max_tokens=1024,
+            extra_body=dict(
+                reasoning=dict(
+                    effort="low",  # Can be "high", "medium", or "low" (OpenAI-style)
+                    # max_tokens=1024,  # Specific token limit (Anthropic-style)
+                    # Optional: Default is false. All models support this.
+                    exclude=True,  # Set to true to exclude reasoning tokens from response
+                )
+            ),
+        )
+    except Exception as e:
+        print(f"exception for model {model}: {e}")
+        return None
+    # remove <think>...</think> sections (it's probably an OpenRouter bug that they are included)
+    response = re.sub(r"<think>.*</think>", "", response).strip()
+    # sometimes there's also a lone <think> at the start for some reason
+    response = re.sub(r"<think>", "", response).strip()
+    return response
+
+
+reasoning_template = (
+    "Response format:<reasoning>...</reasoning><final_answer>...</final_answer>"
+)
+
+
+def format_multiple_choice(item):
+    return dedent(f"""
+    {reasoning_template}
+
+    ---
+
+    {item["question"]}
+
+    A: {item["choices"][0]}
+    B: {item["choices"][1]}
+    C: {item["choices"][2]}
+    D: {item["choices"][3]}""")
+
+
+def extract_mc_response(response):
+    if not response:
+        return None
+    final_answer = re.search(r"\<final_answer\>(.*)\<\/final_answer\>", response)
+    return final_answer[1].strip() if final_answer else None
+

 async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     original_language = languages[languages["bcp_47"] == bcp_47].iloc[0]

@@ -47,31 +99,24 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     original_sentence = flores_sentences(original_language)["text"][sentence_nr].strip()
     target_sentence = flores_sentences(target_language)["text"][sentence_nr].strip()
     script = script_name(target_language.flores_path.split("_")[1])
+    translation_prompt = f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}"
     if model == "google/translate-v2":
         original_language = closest_supported_match(
-            original_language, supported_languages
+            original_language.bcp_47, supported_languages
+        )
+        target_language = closest_supported_match(
+            target_language.bcp_47, supported_languages
         )
-        target_language = closest_supported_match(target_language, supported_languages)
         if original_language == target_language:
             prediction = original_sentence
         elif original_language is None or target_language is None:
             prediction = None
         else:
             prediction = await translate_google(
-                original_sentence, original_language
+                original_sentence, original_language, target_language
             )
     else:
-        prediction = await complete(
-            model=model,
-            messages=[
-                {
-                    "role": "user",
-                    "content": f"Translate the following text to the {target_language.language_name} language; use the {script} script; reply only with the translation:\n\n{original_sentence}",
-                }
-            ],
-            temperature=0,
-            max_tokens=1024,
-        )
+        prediction = await query(model, translation_prompt)
     if prediction:
         bleu_score = bleu.compute(
             predictions=[prediction],

@@ -84,6 +129,7 @@ async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
     else:
         bleu_score = {"bleu": 0}
         chrf_score = {"score": 0}
+
     return [
         {
             "model": model,

@@ -91,7 +137,10 @@
             "task": f"translation_{mode}",
             "metric": metric,
             "score": score,
+            "origin": "human",  # FLORES+ is human-translated
             "sentence_nr": sentence_nr,
+            "prompt": translation_prompt,
+            "response": prediction,
         }
         for metric, score in (
             ("bleu", bleu_score["bleu"]),

@@ -112,57 +161,27 @@ async def classify_and_evaluate(model, bcp_47, nr):
     )
     top_topics = paragraphs.value_counts("topic").head(5).index
     paragraphs = paragraphs[paragraphs["topic"].isin(top_topics)]
-    …
-    messages += [
-        {"role": "user", "content": format_prompt(example.text)},
-        {"role": "assistant", "content": example.topic},
-    ]
-    # some models have poor tokenization for some languages, and the prompt for this task is relatively long, so it sometimes exceeds the context window
-    # this is not just to blame on the context window but mostly on the model's tokenization, so we assign 0 accuracy in this case
-    try:
-        pred = await complete(
-            model=model,
-            messages=[
-                *messages,
-                {
-                    "role": "user",
-                    "content": format_prompt(test_paragraph.text),
-                },
-            ],
-            temperature=0,
-            max_tokens=30,
-        )
-        true = test_paragraph.topic
-        others = [t for t in top_topics if t != true]
-        acc = (
-            int(
-                pred.startswith(true)
-                or (true in pred and not any(o in pred for o in others))
-            )
-            if pred
-            else 0
-        )
-    …
-    else:
-        raise e
+    test_paragraph = paragraphs.sample(n=1, random_state=nr).iloc[0]
+
+    prompt = f"""Classify the following text into one of these topics: {", ".join(top_topics)}.
+    Reply with only the topic name.
+
+    Text:
+    {test_paragraph.text}
+    """
+    response = await query(model, prompt)
+    pred = response.lower().strip() if response else ""
+    true = test_paragraph.topic.lower().strip()
+    others = [t for t in top_topics if t != true]
+    acc = (
+        int(
+            pred.startswith(true)
+            or (true in pred and not any(o in pred for o in others))
+        )
+        if pred
+        else 0
+    )
+
     return [
         {
             "model": model,

@@ -170,101 +189,74 @@ async def classify_and_evaluate(model, bcp_47, nr):
             "task": "classification",
             "metric": "accuracy",
             "score": acc,
+            "origin": "human",  # FLORES+ is human-translated
             "sentence_nr": nr,
+            "prompt": prompt,
+            "response": pred,
         }
     ]


-def corrupt_sentence(sentence):
-    …
-
-async def mlm_and_evaluate(model, language_bcp_47, nr):
-    …
+# def corrupt_sentence(sentence):
+#     # replace 5% of the sentence with <mask>
+#     mask_length = round(len(sentence) * 0.05)
+#     start = random.randint(0, len(sentence) - mask_length)
+#     end = start + mask_length
+#     return sentence[:start] + "<mask>" + sentence[end:]
+
+
+# async def mlm_and_evaluate(model, language_bcp_47, nr):
+#     language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
+#     sentences = flores_sentences(language)
+#     if sentences is None:
+#         return []
+#     sentences = pd.DataFrame(sentences, columns=["text"])
+#     sentences["corrupt_text"] = sentences["text"].apply(corrupt_sentence)
+#     examples = sentences.sample(n=10, random_state=42)
+#     test_sentences = sentences[~sentences["text"].isin(examples["text"])].sample(
+#         frac=1, random_state=42
+#     )
+#     test_sentence = test_sentences.iloc[nr]
+#     messages = []
+#     for example in examples.itertuples():
+#         messages += [
+#             {"role": "user", "content": example.corrupt_text},
+#             {"role": "assistant", "content": example.text},
+#         ]
+#     prediction = await complete(
+#         model=model,
+#         messages=[
+#             *messages,
+#             {
+#                 "role": "user",
+#                 "content": test_sentence.corrupt_text,
+#             },
+#         ],
+#         temperature=0,
+#         max_tokens=1024,
+#     )
+#     chrf_score = chrf.compute(predictions=[prediction], references=[test_sentence.text])
+#     return [
+#         {
+#             "model": model,
+#             "bcp_47": language["bcp_47"],
+#             "task": "language_modeling",
+#             "metric": "chrf",
+#             "score": chrf_score["score"] / 100,
+#             "sentence_nr": nr,
+#         }
+#     ]


-def format_multiple_choice(item):
-    return f"""{item["question"]}
-
-A: {item["choices"][0]}
-B: {item["choices"][1]}
-C: {item["choices"][2]}
-D: {item["choices"][3]}
-
-A|B|C|D?"""

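Before the multiple-choice evaluators below, it may help to see what the shared helpers defined above produce end to end. A sketch with an invented item and an invented model response:

item = {
    "question": "What is the capital of Kenya?",  # invented example
    "choices": ["Mombasa", "Nairobi", "Kisumu", "Nakuru"],
    "answer": "B",
}
prompt = format_multiple_choice(item)
# -> the reasoning_template header, a "---" separator, the question,
#    and the four lettered choices

fake_response = "<reasoning>Nairobi is the capital.</reasoning><final_answer>B</final_answer>"
print(extract_mc_response(fake_response))   # "B", compared against item["answer"]
print(extract_mc_response("no tags here"))  # None -> the evaluators score this as 0
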

 async def mmlu_and_evaluate(model, language_bcp_47, nr):
-    ds_name, …
+    ds_name, task, origin = await load_mmlu(language_bcp_47, nr)
     if not task:
         return []
-    messages = []
-    for example in examples:
-        messages += [
-            {"role": "user", "content": format_multiple_choice(example)},
-            {"role": "assistant", "content": example["answer"]},
-        ]
-    messages += [{"role": "user", "content": format_multiple_choice(task)}]
-    try:
-        response = await complete(
-            model=model,
-            messages=messages,
-            temperature=0,
-            max_tokens=1,
-        )
-        if response:
-            acc = int(response[:1].strip() == task["answer"])
-        else:
-            acc = 0
-    except Exception as e:
-        if "ResponsibleAIPolicyViolation" in str(e):
-            acc = 0
-        else:
-            raise e
+    prompt = f"""Solve the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.\n\n{format_multiple_choice(task)}"""
+    response = await query(model, prompt)
+    final_response = extract_mc_response(response)
+    acc = int(final_response == task["answer"]) if final_response else 0
     return [
         {
             "model": model,

@@ -272,39 +264,22 @@ async def mmlu_and_evaluate(model, language_bcp_47, nr):
             "task": "mmlu",
             "metric": "accuracy",
             "score": acc,
+            "origin": origin,
             "sentence_nr": nr,
+            "prompt": prompt,
+            "response": response,
         }
     ]


 async def arc_and_evaluate(model, language_bcp_47, nr):
-    ds_name, …
+    ds_name, task, origin = load_uhura_arc_easy(language_bcp_47, nr)
     if not task:
         return []
-    …
-        {"role": "user", "content": format_multiple_choice(example)},
-        {"role": "assistant", "content": example["answer"]},
-    ]
-    messages += [{"role": "user", "content": format_multiple_choice(task)}]
-    try:
-        response = await complete(
-            model=model,
-            messages=messages,
-            temperature=0,
-            max_tokens=1,
-        )
-        if response:
-            acc = int(response[:1].strip() == task["answer"])
-        else:
-            acc = 0
-    except Exception as e:
-        if "ResponsibleAIPolicyViolation" in str(e):
-            acc = 0
-        else:
-            raise e
+    prompt = f"Solve the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.\n\n{format_multiple_choice(task)}"
+    response = await query(model, prompt)
+    final_response = extract_mc_response(response)
+    acc = int(final_response == task["answer"]) if final_response else 0
     return [
         {
             "model": model,

@@ -312,7 +287,10 @@ async def arc_and_evaluate(model, language_bcp_47, nr):
             "task": "arc",
             "metric": "accuracy",
             "score": acc,
+            "origin": origin,
             "sentence_nr": nr,
+            "prompt": prompt,
+            "response": response,
         }
     ]

@@ -332,40 +310,19 @@ def format_multiple_choice_truthfulqa(item):
     text = item["question"] + "\n\n"
     for i, choice in enumerate(item["choices"]):
         text += f"{letters[i]}: {choice}\n"
-    text += "|".join(letters[: len(item["choices"])]) + "?"
     return text


 async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
-    ds_name, …
+    ds_name, task, origin = await load_truthfulqa(language_bcp_47, nr)
     if not task:
         return []
-    answer = letters[…
-    …
-        {"role": "user", "content": format_multiple_choice_truthfulqa(example)},
-        {"role": "assistant", "content": letters[example["labels"].index(1)]},
-    ]
-    messages += [{"role": "user", "content": format_multiple_choice_truthfulqa(task)}]
-    try:
-        response = await complete(
-            model=model,
-            messages=messages,
-            temperature=0,
-            max_tokens=1,
-        )
-        if response:
-            acc = int(response[:1].strip() == answer)
-        else:
-            acc = 0
-    except Exception as e:
-        if "ResponsibleAIPolicyViolation" in str(e):
-            acc = 0
-        else:
-            raise e
+    correct_choice_index = task["labels"].index(1)
+    answer = letters[correct_choice_index]
+    prompt = f"""Answer the following multiple choice question. Reason step-by-step and then write the final answer as a single letter.\n\n{format_multiple_choice_truthfulqa(task)}"""
+    response = await query(model, prompt)
+    final_response = extract_mc_response(response)
+    acc = int(final_response.upper() == answer) if final_response else 0
     return [
         {
             "model": model,

@@ -373,86 +330,86 @@ async def truthfulqa_and_evaluate(model, language_bcp_47, nr):
             "task": "truthfulqa",
             "metric": "accuracy",
             "score": acc,
+            "origin": origin,
             "sentence_nr": nr,
+            "prompt": prompt,
+            "response": response,
         }
     ]


 async def mgsm_and_evaluate(model, language_bcp_47, nr):
-    system_prompt = """
-    Solve the math problem. Use reasoning, and finally give the answer as a number.
-    Response format: <reasoning> #### <number>
-    """
-    system_prompt = dedent(system_prompt).strip()
-    ds_slug, question = load_mgsm(language_bcp_47, nr)
+    ds_slug, question, origin = load_mgsm(language_bcp_47, nr)
     if not question:
         return []
-    response = await complete(
-        model=model,
-        messages=[
-            {"role": "system", "content": system_prompt},
-            {"role": "user", "content": question["question"]},
-        ],
-        temperature=0,
-        max_tokens=1024,
-    )
-    if response and len(response.split("####")) == 2:
-        number = response.split("####")[1].strip()
-        accuracy = int(parse_number(number) == parse_number(question["answer_number"]))
-    else:
-        accuracy = 0
+
+    prompt = dedent(f"""
+    Solve the following math problem. Reason step-by-step and then write the final answer as a single number.
+
+    {reasoning_template}
+
+    ---
+
+    {question["question"]}""").strip()
+    response = await query(model, prompt)
+    number = extract_mc_response(response)
+    acc = (
+        int(parse_number(number) == parse_number(question["answer_number"]))
+        if number
+        else 0
+    )
     return [
         {
             "model": model,
             "bcp_47": language_bcp_47,
             "task": "mgsm",
             "metric": "accuracy",
-            "score": accuracy,
+            "score": acc,
+            "origin": origin,
             "sentence_nr": nr,
+            "prompt": prompt,
+            "response": response,
         }
     ]


-async def transcribe_and_evaluate(model, language_bcp_47, nr):
-    …
+# async def transcribe_and_evaluate(model, language_bcp_47, nr):
+#     language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
+#     fleurs = pd.read_csv(
+#         f"data/fleurs/{language.fleurs_tag}/dev.tsv",
+#         sep="\t",
+#         names=[
+#             "id",
+#             "fname",
+#             "raw_transcription",
+#             "transcription",
+#             "words",
+#             "id2",
+#             "gender",
+#         ],
+#     )
+#     item = fleurs.iloc[nr]
+#     path = f"data/fleurs/{language.fleurs_tag}/audio/dev/{item.fname}"
+#     pred = await transcribe(path, model=model)
+#     wer_score = wer.compute(predictions=[pred], references=[item.transcription])
+#     return [
+#         {
+#             "model": model,
+#             "bcp_47": language["bcp_47"],
+#             "task": "asr",
+#             "metric": "wer",
+#             "score": wer_score,
+#             "sentence_nr": nr,
+#         }
+#     ]


 tasks = {
     "translation_from": partial(translate_and_evaluate, mode="from"),
     "translation_to": partial(translate_and_evaluate, mode="to"),
     "classification": classify_and_evaluate,
-    # "mlm": mlm_and_evaluate,
     "mmlu": mmlu_and_evaluate,
     "arc": arc_and_evaluate,
     "truthfulqa": truthfulqa_and_evaluate,
     "mgsm": mgsm_and_evaluate,
-    # "asr": transcribe_and_evaluate,
 }

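Every evaluator registered in the tasks dict now returns rows of one shape, so downstream aggregation can treat all tasks alike. Schematically, with the field names taken from the code above and purely invented values:

result_row = {
    "model": "openai/gpt-4o-mini",  # invented values throughout
    "bcp_47": "sw",
    "task": "mmlu",
    "metric": "accuracy",
    "score": 1,
    "origin": "human",  # the new field distinguishing human- from machine-translated items
    "sentence_nr": 3,
    "prompt": "...",
    "response": "...",
}
# dispatch happens through the registry, e.g. (hypothetical call):
# rows = await tasks["mmlu"](model, language_bcp_47, nr)
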
evals/translate.py
CHANGED

@@ -6,4 +6,4 @@ from datasets_.mmlu import translate_mmlu
 if __name__ == "__main__":
     translate_mmlu(languages)
     translate_mgsm(languages)
-    translate_arc(languages)
+    translate_arc(languages)
frontend/package-lock.json
CHANGED

The diff for this file is too large to render. See raw diff.

frontend/package.json
CHANGED

@@ -6,13 +6,12 @@
   "@observablehq/plot": "^0.6.17",
   "@testing-library/dom": "^10.4.0",
   "@testing-library/jest-dom": "^6.6.3",
-  "@testing-library/react": "^…
+  "@testing-library/react": "^15.0.0",
   "@testing-library/user-event": "^13.5.0",
   "primeicons": "^7.0.0",
   "primereact": "^10.9.3",
-  "react": "^…
-  "react-dom": "^…
-  "react-scripts": "5.0.1",
+  "react": "^18.2.0",
+  "react-dom": "^18.2.0",
   "topojson-simplify": "^3.0.3",
   "web-vitals": "^2.1.4"
 },

@@ -41,5 +40,8 @@
     "last 1 safari version"
   ]
 },
-"proxy": "http://localhost:8000"
+"proxy": "http://localhost:8000",
+"devDependencies": {
+  "react-scripts": "^5.0.1"
+}
}
frontend/public/sw.js
ADDED

@@ -0,0 +1,9 @@
+// Unregister service worker
+self.addEventListener('install', () => {
+  self.skipWaiting();
+});
+
+self.addEventListener('activate', () => {
+  self.registration.unregister();
+});

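This new file is the usual kill-switch pattern for retiring a previously shipped service worker: skipWaiting() lets the replacement worker take over immediately, and unregistering during activate means returning visitors stop being served stale precached assets after their next reload.
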
frontend/src/App.js
CHANGED

@@ -16,12 +16,18 @@ import { Button } from 'primereact/button'

 function App () {
   const [data, setData] = useState(null)
+  const [baseData, setBaseData] = useState(null)
   const [loading, setLoading] = useState(true)
   const [error, setError] = useState(null)
   const [selectedLanguages, setSelectedLanguages] = useState([])
+  const [machineTranslatedMetrics, setMachineTranslatedMetrics] = useState([])
   const [dialogVisible, setDialogVisible] = useState(false)
   const [aboutVisible, setAboutVisible] = useState(false)
   const [contributeVisible, setContributeVisible] = useState(false)
+
+  // Add state for carousel items
+  const [carouselItems, setCarouselItems] = useState([])
+  const [fullScreenCarouselItems, setFullScreenCarouselItems] = useState([])

   useEffect(() => {
     fetch('/api/data', {

@@ -36,6 +42,8 @@ function App () {
     })
       .then(jsonData => {
         setData(jsonData)
+        setMachineTranslatedMetrics(jsonData.machine_translated_metrics || [])
+        if (!baseData) setBaseData(jsonData)
         setLoading(false)
       })
       .catch(err => {

@@ -44,8 +52,27 @@ function App () {
       })
   }, [selectedLanguages])

+  // Create carousel items when data is loaded
+  useEffect(() => {
+    if (data) {
+      // Add a small delay to ensure components are ready
+      const timer = setTimeout(() => {
+        setCarouselItems([
+          <WorldMap key="worldmap-0" data={(baseData || data).countries} allLanguages={(baseData || data).language_table} width={750} height={500} />,
+          <LanguagePlot key="langplot-1" data={data} width={750} height={500} />,
+          <SpeakerPlot key="speakerplot-2" data={data} width={750} height={500} />,
+          <HistoryPlot key="histplot-3" data={data} width={750} height={500} />,
+          <CostPlot key="costplot-4" data={data} width={750} height={500} />
+        ]);
+      }, 100);
+
+      return () => clearTimeout(timer);
+    }
+  }, [data, baseData])
+
   const [windowWidth, setWindowWidth] = useState(window.innerWidth)
   const [windowHeight, setWindowHeight] = useState(window.innerHeight)
+
   useEffect(() => {
     const handleResize = () => {
       setWindowWidth(window.innerWidth)

@@ -55,6 +82,44 @@ function App () {
     return () => window.removeEventListener('resize', handleResize)
   }, [])

+  // Create full-screen carousel items when data or window size changes
+  useEffect(() => {
+    if (data) {
+      const timer = setTimeout(() => {
+        setFullScreenCarouselItems([
+          <WorldMap
+            key="fs-worldmap-0"
+            data={(baseData || data).countries}
+            allLanguages={(baseData || data).language_table}
+            width={windowWidth * 0.7}
+            height={windowHeight * 0.6}
+          />,
+          <LanguagePlot
+            key="fs-langplot-1"
+            data={data}
+            width={windowWidth * 0.7}
+            height={windowHeight * 0.6}
+          />,
+          <SpeakerPlot
+            key="fs-speakerplot-2"
+            data={data}
+            width={windowWidth * 0.7}
+            height={windowHeight * 0.6}
+          />,
+          <HistoryPlot
+            key="fs-histplot-3"
+            data={data}
+            width={windowWidth * 0.7}
+            height={windowHeight * 0.6}
+          />,
+          <CostPlot key="fs-costplot-4" data={data} width={windowWidth * 0.7} height={windowHeight * 0.6} />
+        ]);
+      }, 100);
+
+      return () => clearTimeout(timer);
+    }
+  }, [data, baseData, windowWidth, windowHeight])
+
   return (
     <PrimeReactProvider>
       <div

@@ -69,35 +134,50 @@ function App () {
         style={{
           backgroundColor: '#fff3cd',
           color: '#856404',
-          padding: '…
+          padding: '1rem 1.5rem',
           marginBottom: '1rem',
           border: '1px solid #ffeeba',
           borderRadius: '0.25rem',
-          textAlign: 'center'
+          textAlign: 'center',
+          lineHeight: '1.5',
+          position: 'relative'
         }}
       >
         <strong>Work in Progress:</strong> This dashboard is currently under
-        active development. Evaluation results are not yet final.
+        active development. Evaluation results are not yet final. More extensive evaluation runs will be released later this year.
+      </div>
+      <div
+        style={{
+          display: 'flex',
+          justifyContent: 'flex-end',
+          padding: '0 1.5rem',
+          marginBottom: '1rem'
+        }}
+      >
        <a
          href='https://github.com/datenlabor-bmz/ai-language-monitor'
          target='_blank'
          rel='noopener noreferrer'
          style={{
            textDecoration: 'none',
-           color: '#…
+           color: '#6c757d',
+           fontSize: '1rem',
+           fontWeight: '500',
+           padding: '0.5rem 1rem',
+           borderRadius: '0.375rem',
+           backgroundColor: '#f8f9fa',
+           border: '1px solid #e9ecef',
+           display: 'flex',
+           alignItems: 'center',
+           gap: '0.5rem',
+           transition: 'all 0.2s ease',
+           ':hover': {
+             backgroundColor: '#e9ecef',
+             color: '#495057'
+           }
          }}
        >
-         <i
-           className='pi pi-github'
-           title='View on GitHub'
-           style={{ marginRight: '0.3rem' }}
-         />
+         <i className='pi pi-github' title='View on GitHub' />
          GitHub
        </a>
      </div>

@@ -149,39 +229,88 @@ function App () {
      <div
        style={{
          display: 'flex',
-         gap: '…
-         marginBottom: '…
+         gap: '0.75rem',
+         marginBottom: '2rem',
          flexWrap: 'wrap',
          justifyContent: 'center'
        }}
      >
-       <Button
-         label='π About this tool'
-         className='p-button-text'
+       <button
          onClick={() => setAboutVisible(true)}
          style={{
-           …
+           background: 'linear-gradient(135deg, #667eea 0%, #764ba2 100%)',
+           color: 'white',
+           border: 'none',
+           padding: '0.75rem 1.5rem',
+           borderRadius: '12px',
+           fontSize: '0.95rem',
+           fontWeight: '500',
+           cursor: 'pointer',
+           display: 'flex',
+           alignItems: 'center',
+           gap: '0.5rem',
+           boxShadow: '0 4px 15px rgba(102, 126, 234, 0.25)',
+           transition: 'all 0.3s ease',
+           ':hover': {
+             transform: 'translateY(-2px)',
+             boxShadow: '0 8px 25px rgba(102, 126, 234, 0.35)'
+           }
          }}
+         onMouseEnter={(e) => {
+           e.target.style.transform = 'translateY(-2px)';
+           e.target.style.boxShadow = '0 8px 25px rgba(102, 126, 234, 0.35)';
+         }}
+         onMouseLeave={(e) => {
+           e.target.style.transform = 'translateY(0)';
+           e.target.style.boxShadow = '0 4px 15px rgba(102, 126, 234, 0.25)';
+         }}
+       >
+         <span style={{ fontSize: '1.1rem' }}>π</span>
+         About this tool
+       </button>

-       <Button
-         label='π Add your model (soon)'
-         className='p-button-text'
+       <button
          onClick={() => setContributeVisible(true)}
+         title='This feature is on our roadmap and will be available soon.'
-         tooltipOptions={{ position: 'bottom' }}
          style={{
-           …
+           background: 'linear-gradient(135deg, #ff9a9e 0%, #fecfef 50%, #fecfef 100%)',
+           color: '#6b46c1',
+           border: 'none',
+           padding: '0.75rem 1.5rem',
+           borderRadius: '12px',
+           fontSize: '0.95rem',
+           fontWeight: '500',
+           cursor: 'pointer',
+           display: 'flex',
+           alignItems: 'center',
+           gap: '0.5rem',
+           boxShadow: '0 4px 15px rgba(255, 154, 158, 0.25)',
+           transition: 'all 0.3s ease',
+           position: 'relative',
+           overflow: 'hidden'
          }}
+         onMouseEnter={(e) => {
+           e.target.style.transform = 'translateY(-2px)';
+           e.target.style.boxShadow = '0 8px 25px rgba(255, 154, 158, 0.35)';
+         }}
+         onMouseLeave={(e) => {
+           e.target.style.transform = 'translateY(0)';
+           e.target.style.boxShadow = '0 4px 15px rgba(255, 154, 158, 0.25)';
+         }}
+       >
+         <span style={{ fontSize: '1.1rem' }}>π</span>
+         Add your model
+         <span style={{
+           fontSize: '0.75rem',
+           backgroundColor: 'rgba(107, 70, 193, 0.15)',
+           padding: '0.2rem 0.5rem',
+           borderRadius: '6px',
+           marginLeft: '0.5rem',
+           fontWeight: '600'
+         }}>
+           soon
+         </span>
+       </button>
      </div>

      {data && (

@@ -220,6 +349,7 @@ function App () {
            data={data.model_table}
            selectedLanguages={selectedLanguages}
            allLanguages={data.language_table || []}
+           machineTranslatedMetrics={machineTranslatedMetrics}
          />
          <LanguageTable
            data={data.language_table}

@@ -248,20 +378,18 @@ function App () {
              color: '#666'
            }}
          />
-         …
-         style={{ width: '100%', minHeight: '650px' }}
-         />
+         {carouselItems.length > 0 && (
+           <Carousel
+             key={`main-carousel-${carouselItems.length}-${Date.now()}`}
+             value={carouselItems}
+             numScroll={1}
+             numVisible={1}
+             itemTemplate={item => item}
+             circular={false}
+             activeIndex={0}
+             style={{ width: '100%', minHeight: '650px' }}
+           />
+         )}
        </div>
      </>
    )}

@@ -409,36 +537,16 @@ function App () {
          modal
          header={null}
        >
-         {…
+         {fullScreenCarouselItems.length > 0 && (
          <div style={{ width: '100%', height: '100%' }}>
            <Carousel
-             …
-               data={data.countries}
-               width={windowWidth * 0.7}
-               height={windowHeight * 0.6}
-             />,
-             <LanguagePlot
-               data={data}
-               width={windowWidth * 0.7}
-               height={windowHeight * 0.6}
-             />,
-             <SpeakerPlot
-               data={data}
-               width={windowWidth * 0.7}
-               height={windowHeight * 0.6}
-             />,
-             <HistoryPlot
-               data={data}
-               width={windowWidth * 0.7}
-               height={windowHeight * 0.6}
-             />,
-             <CostPlot data={data} />
-           ]}
+             key={`fs-carousel-${fullScreenCarouselItems.length}-${Date.now()}`}
+             value={fullScreenCarouselItems}
              numScroll={1}
              numVisible={1}
              itemTemplate={item => item}
-             circular
+             circular={false}
+             activeIndex={0}
              style={{ width: '100%', height: 'calc(90vh - 120px)' }}
            />
          </div>

@@ -449,4 +557,4 @@ function App () {
    )
  }

-export default App
+export default App
frontend/src/components/HistoryPlot.js
CHANGED

@@ -50,12 +50,12 @@ const HistoryPlot = ({ data, width = 750, height = 500 }) => {
       ...models.filter(d => d.newRecord),
       {
         creation_date: new Date(),
-        maxAverage: models[models.length - 1]
+        maxAverage: models[models.length - 1]?.maxAverage || 0
       }
     ],
     {
       x: d => d.creation_date,
-      y: d => d.maxAverage,
+      y: d => d.maxAverage || 0,
       curve: 'step-after',
       strokeOpacity: 0.3
     }
frontend/src/components/LanguageTable.js
CHANGED

@@ -172,7 +172,7 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages, totalMod…
       filterElement={familyRowFilterTemplate}
       style={{ minWidth: '10rem' }}
     />
-    {ScoreColumns}
+    {ScoreColumns()}
   </DataTable>
 )
 }
frontend/src/components/ModelTable.js
CHANGED
@@ -6,7 +6,7 @@ import { useState, useEffect } from 'react'
 import Medal from './Medal'
 import { Slider } from 'primereact/slider'
 import ScoreColumns from './ScoreColumns'
-const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
+const ModelTable = ({ data, selectedLanguages = [], allLanguages = [], machineTranslatedMetrics = [] }) => {
   const [filters, setFilters] = useState({
     type: { value: null, matchMode: FilterMatchMode.IN },
     size: { value: null, matchMode: FilterMatchMode.BETWEEN },
@@ -50,10 +50,10 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
   }

   const SliderWithLabel = ({ value, onChange, min, max }) => {
-    const p = 10
-    const start = value === null ? min : Math.log(value[0]) / Math.log(p)
-    const stop = value === null ? max : Math.log(value[1]) / Math.log(p)
-    const [_value, _setValue] = useState([start, stop])
+    const p = 10;
+    const start = value === null || value[0] === null ? min : Math.log(value[0]) / Math.log(p);
+    const stop = value === null || value[1] === null ? max : Math.log(value[1]) / Math.log(p);
+    const [_value, _setValue] = useState([start, stop]);
     useEffect(() => {
       const timer = setTimeout(() => {
         onChange({
@@ -61,11 +61,11 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
           // set to "no filter" when (almost) the whole range is selected
           _value[0] <= min + 0.1 && _value[1] >= max - 0.1
             ? null
-            : [p ** _value[0], p ** _value[1]]
-        })
-      }, 1000)
-      return () => clearTimeout(timer)
-    }, [_value, onChange, min, max])
+            : [p ** _value[0], p ** _value[1]],
+        });
+      }, 1000);
+      return () => clearTimeout(timer);
+    }, [_value, onChange, min, max]);
     return (
       <div style={{ minWidth: '20rem' }}>
         <div>{formatSize(p ** _value[0])}</div>
@@ -147,21 +147,35 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
   }

   const costBodyTemplate = rowData => {
-    return …
+    return (
+      <div style={{ textAlign: 'center' }}>
+        {rowData.cost === null ? 'n/a' : `$${rowData.cost.toFixed(2)}`}
+      </div>
+    )
   }

   const getHeaderText = () => {
-    // Count languages that have evaluation data (…
-    const evaluatedLanguagesCount = allLanguages.filter(lang =>
-      …
+    // Count languages that have any evaluation data (any task scores available)
+    const evaluatedLanguagesCount = allLanguages.filter(lang => {
+      // Check if language has any task scores (not just average)
+      const hasAnyScores = [
+        'translation_from_bleu',
+        'translation_to_bleu',
+        'classification_accuracy',
+        'mmlu_accuracy',
+        'arc_accuracy',
+        'truthfulqa_accuracy',
+        'mgsm_accuracy'
+      ].some(metric => lang[metric] !== null && lang[metric] !== undefined)
+      return hasAnyScores
+    }).length

     if (selectedLanguages.length === 0) {
       return (
         <span>
           <span style={{ fontWeight: 'bold', fontSize: '1.1em' }}>AI Models</span>
           <span style={{ fontSize: '0.85em', marginLeft: '0.5rem' }}>
-            …
+            Performance across {evaluatedLanguagesCount} evaluated languages
           </span>
         </span>
       )
@@ -245,7 +259,7 @@ const ModelTable = ({ data, selectedLanguages = [], allLanguages = [] }) => {
         body={costBodyTemplate}
         style={{ minWidth: '5rem' }}
       />
-      {ScoreColumns}
+      {ScoreColumns(machineTranslatedMetrics)}
     </DataTable>
   )
 }
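The "has any task score" check introduced here is duplicated verbatim in WorldMap.js below; if it is touched again, pulling it into a shared module would keep the metric list in one place. A possible extraction, in a hypothetical `frontend/src/metrics.js` that is not part of this PR:

```js
// metrics.js (hypothetical) - single source of truth for task metric keys.
export const TASK_METRICS = [
  'translation_from_bleu',
  'translation_to_bleu',
  'classification_accuracy',
  'mmlu_accuracy',
  'arc_accuracy',
  'truthfulqa_accuracy',
  'mgsm_accuracy'
]

// A language counts as evaluated if any task metric is present.
export const hasAnyScores = lang =>
  TASK_METRICS.some(m => lang[m] !== null && lang[m] !== undefined)

// In ModelTable.js / WorldMap.js:
//   const evaluatedLanguagesCount = allLanguages.filter(hasAnyScores).length
```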
frontend/src/components/ScoreColumns.js
CHANGED
@@ -2,21 +2,28 @@ import { Column } from 'primereact/column'
 import ScoreField from './ScoreField'

 const scoreBodyTemplate = (field, options = {}) => {
-  const { minScore = 0, maxScore = 1 } = options
+  const { minScore = 0, maxScore = 1, machineTranslatedMetrics = [] } = options

   return rowData => {
     const score = rowData[field]
-    …
+    // Prefer per-row flag if present (backend sets `<metric>_is_machine`),
+    // otherwise fall back to global list
+    const rowFlagKey = `${field}_is_machine`
+    const hasRowFlag = Object.prototype.hasOwnProperty.call(rowData, rowFlagKey)
+    const isMachineTranslated = hasRowFlag
+      ? !!rowData[rowFlagKey]
+      : machineTranslatedMetrics.includes(field)
+    return ScoreField(score, minScore, maxScore, isMachineTranslated)
   }
 }

-const ScoreColumns = [
+const ScoreColumns = (machineTranslatedMetrics = []) => [
   <Column
     field='average'
     header='Proficiency'
     headerTooltip='Language Proficiency Score (average of the scores for each task, after min-max normalization)'
     sortable
-    body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5 })}
+    body={scoreBodyTemplate('average', { minScore: 0.2, maxScore: 0.5, machineTranslatedMetrics })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
   <Column
@@ -26,7 +33,8 @@
     sortable
     body={scoreBodyTemplate('translation_from_bleu', {
       minScore: 0,
-      maxScore: 0.5
+      maxScore: 0.5,
+      machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
@@ -37,7 +45,8 @@
     sortable
     body={scoreBodyTemplate('translation_to_bleu', {
       minScore: 0,
-      maxScore: 0.5
+      maxScore: 0.5,
+      machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
@@ -48,7 +57,8 @@
     sortable
     body={scoreBodyTemplate('classification_accuracy', {
       minScore: 0,
-      maxScore: 0.5
+      maxScore: 0.5,
+      machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
@@ -69,7 +79,8 @@
     sortable
     body={scoreBodyTemplate('mmlu_accuracy', {
       minScore: 0,
-      maxScore: 1
+      maxScore: 1,
+      machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
@@ -80,7 +91,8 @@
     sortable
     body={scoreBodyTemplate('arc_accuracy', {
       minScore: 0,
-      maxScore: 1
+      maxScore: 1,
+      machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
@@ -91,7 +103,8 @@
     sortable
     body={scoreBodyTemplate('mgsm_accuracy', {
       minScore: 0,
-      maxScore: 1
+      maxScore: 1,
+      machineTranslatedMetrics
     })}
     style={{ minWidth: '5rem', maxWidth: '10rem' }}
   />,
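Two things happen in this file: `ScoreColumns` turns from a constant array into a factory (which is why the call sites change to `ScoreColumns()` and `ScoreColumns(machineTranslatedMetrics)`), and each cell resolves its machine-translation flag with a per-row override. The precedence logic, isolated as a runnable sketch with hypothetical row data:

```js
// Per-row `<metric>_is_machine` flag wins; otherwise the global
// machineTranslatedMetrics list decides.
const isMachineTranslated = (rowData, field, machineTranslatedMetrics = []) => {
  const rowFlagKey = `${field}_is_machine`
  return Object.prototype.hasOwnProperty.call(rowData, rowFlagKey)
    ? !!rowData[rowFlagKey]
    : machineTranslatedMetrics.includes(field)
}

// Row flag present: it takes precedence over the global list.
console.log(isMachineTranslated({ arc_accuracy: 0.6, arc_accuracy_is_machine: true }, 'arc_accuracy')) // true
console.log(isMachineTranslated({ arc_accuracy: 0.6, arc_accuracy_is_machine: false }, 'arc_accuracy', ['arc_accuracy'])) // false

// No row flag: fall back to the global list.
console.log(isMachineTranslated({ arc_accuracy: 0.6 }, 'arc_accuracy', ['arc_accuracy'])) // true
```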
frontend/src/components/ScoreField.js
CHANGED
@@ -1,4 +1,4 @@
-const ScoreField = (score, minScore, maxScore) => {
+const ScoreField = (score, minScore, maxScore, isMachineTranslated = false) => {
   let percentage = 100
   let barColor = "rgba(210, 106, 255, 0.1)" // light violet for missing data
   if (score !== null) {
@@ -50,6 +50,7 @@ const ScoreField = (score, minScore, maxScore) => {
       }}
     >
       {score !== null ? (score * 100).toFixed(1)+"%" : '–'}
+      {isMachineTranslated && score !== null && <span style={{color: '#666', fontSize: '0.8em'}}>*</span>}
     </span>
   </div>
 )
frontend/src/components/SpeakerPlot.js
CHANGED
@@ -73,10 +73,10 @@ const SpeakerPlot = ({ data, width = 750, height = 500 }) => {
       textStrokeOpacity: 0,
       textFillOpacity: 0
     }),
-    Plot.tip(['The 40 most spoken languages cover 80% of all speakers.'], {
+    ...(languages.length >= 40 ? [Plot.tip(['The 40 most spoken languages cover 80% of all speakers.'], {
       x: 40,
       y: languages[39].cumSpeakers / 1e6
-    })
+    })] : [])
   ]
 })
 containerRef.current.append(plot)
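The change wraps the tip mark in a conditional spread so that `languages[39]` is never read when fewer than 40 languages are loaded. The idiom in isolation, in plain JavaScript:

```js
// Conditional spread: contribute zero or more items to an array literal.
const marks = n => [
  'bar',
  'line',
  ...(n >= 40 ? [`tip at ${n}`] : []) // nothing is added when the guard fails
]

console.log(marks(10)) // ['bar', 'line']
console.log(marks(40)) // ['bar', 'line', 'tip at 40']
```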
frontend/src/components/WorldMap.js
CHANGED
@@ -26,13 +26,13 @@ const makeTitle = data => d => {
       a =>
         `${smoothProgressBar(a.population / pop)} ${
           a.name
-        } – ${a.score.toFixed(2)}`
+        } – ${a.score === null || a.score === undefined ? "n/a" : a.score.toFixed(2)}`
     )
     .join('\n\n') + (languages?.length > 10 ? `\n\n...` : '')
-  return `${d.properties.ADMIN} – ${cData?.score.toFixed(2)}\n\n${langstring}`
+  return `${d.properties.ADMIN} – ${cData?.score === null || cData?.score === undefined ? "n/a" : cData.score.toFixed(2)}\n\n${langstring}`
 }

-const WorldMap = ({ data, width = 750, height = 500 }) => {
+const WorldMap = ({ data, width = 750, height = 500, allLanguages = [] }) => {
   const containerRef = useRef()
   const [mapData, setMapData] = useState()

@@ -48,8 +48,22 @@ const WorldMap = ({ data, width = 750, height = 500 }) => {
       acc[country.iso2] = country
       return acc
     }, {})
+    // Count languages that have any evaluation data
+    const evaluatedLanguagesCount = allLanguages.filter(lang => {
+      const hasAnyScores = [
+        'translation_from_bleu',
+        'translation_to_bleu',
+        'classification_accuracy',
+        'mmlu_accuracy',
+        'arc_accuracy',
+        'truthfulqa_accuracy',
+        'mgsm_accuracy'
+      ].some(metric => lang[metric] !== null && lang[metric] !== undefined)
+      return hasAnyScores
+    }).length
+
     const plot = Plot.plot({
-      subtitle: …
+      subtitle: `Language Proficiency Score by Country (Coverage: ~${evaluatedLanguagesCount} languages evaluated)`,
       width: width,
       height: height,
       projection: 'equal-earth',
@@ -61,11 +75,12 @@ const WorldMap = ({ data, width = 750, height = 500 }) => {
         })
       ],
       color: {
-        scheme: '…',
-        unknown: '…',
+        scheme: 'RdYlGn',
+        unknown: '#d0d0d0',
         label: 'Score',
         legend: true,
-        domain: [0, 1]
+        domain: [0, 1],
+        pivot: 0.5
       },
       style: {
         fontFamily: 'monospace'
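The tooltip changes all funnel through the same null-safe formatting; factored out it would look like the following hypothetical helper, which is not in the diff itself:

```js
// Hypothetical helper mirroring makeTitle's null-safe score formatting.
const fmtScore = s => (s === null || s === undefined ? 'n/a' : s.toFixed(2))

console.log(fmtScore(0.4567))    // '0.46'
console.log(fmtScore(null))      // 'n/a'
console.log(fmtScore(undefined)) // 'n/a'
```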
notes/system-architecture-diagram.md
ADDED
@@ -0,0 +1,177 @@
# AI Language Monitor - System Architecture

[AI-generated, not 100% up-to-date]

This diagram shows the complete data flow from model discovery through evaluation to frontend visualization.

```mermaid
flowchart TD
    %% Model Sources
    A1["important_models<br/>Static Curated List"] --> D[load_models]
    A2["get_historical_popular_models<br/>Web Scraping - Top 20"] --> D
    A3["get_current_popular_models<br/>Web Scraping - Top 10"] --> D
    A4["blocklist<br/>Exclusions"] --> D

    %% Model Processing
    D --> |"Combine & Dedupe"| E["Dynamic Model List<br/>~40-50 models"]
    E --> |get_or_metadata| F["OpenRouter API<br/>Model Metadata"]
    F --> |get_hf_metadata| G["HuggingFace API<br/>Model Details"]
    G --> H["Enriched Model DataFrame"]
    H --> |Save| I[models.json]

    %% Model Validation & Cost Filtering
    H --> |"Validate Models<br/>Check API Availability"| H1["Valid Models Only<br/>Cost ≤ $20/1M tokens"]
    H1 --> |"Timeout Protection<br/>120s for Large Models"| H2["Robust Model List"]

    %% Language Data
    J["languages.py<br/>BCP-47 + Population"] --> K["Top 100 Languages"]

    %% Task Registry with Unified Prompting
    L["tasks.py<br/>7 Evaluation Tasks"] --> M["Task Functions<br/>Unified English Zero-Shot"]
    M --> M1["translation_from/to<br/>BLEU + ChrF"]
    M --> M2["classification<br/>Accuracy"]
    M --> M3["mmlu<br/>Accuracy"]
    M --> M4["arc<br/>Accuracy"]
    M --> M5["truthfulqa<br/>Accuracy"]
    M --> M6["mgsm<br/>Accuracy"]

    %% On-the-fly Translation with Origin Tagging
    subgraph OTF [On-the-fly Dataset Translation]
        direction LR
        DS_raw["Raw English Dataset<br/>"] --> Google_Translate["Google Translate API"]
        Google_Translate --> DS_translated["Translated Dataset<br/>(e.g., MGSM/ARC)<br/>Origin: 'machine'"]
        DS_native["Native Dataset<br/>(e.g., AfriMMLU/Global-MMLU)<br/>Origin: 'human'"]
    end

    %% Evaluation Pipeline
    H2 --> |"models ID"| N["main.py / main_gcs.py<br/>evaluate"]
    K --> |"languages bcp_47"| N
    L --> |"tasks.items"| N
    N --> |"Filter by model.tasks"| O["Valid Combinations<br/>Model × Language × Task"]
    O --> |"10 samples each"| P["Evaluation Execution<br/>Batch Processing"]

    %% Task Execution with Origin Tracking
    P --> Q1["translate_and_evaluate<br/>Origin: 'human'"]
    P --> Q2["classify_and_evaluate<br/>Origin: 'human'"]
    P --> Q3["mmlu_and_evaluate<br/>Origin: 'human' (no on-the-fly for missing; uses auto-translated dataset if available)"]
    P --> Q4["arc_and_evaluate<br/>Origin: 'human'/'machine'"]
    P --> Q5["truthfulqa_and_evaluate<br/>Origin: 'human' (no on-the-fly for missing; relies on available datasets)"]
    P --> Q6["mgsm_and_evaluate<br/>Origin: 'human'/'machine'"]

    %% API Calls with Error Handling
    Q1 --> |"complete() API<br/>Rate Limiting"| R["OpenRouter<br/>Model Inference"]
    Q2 --> |"complete() API<br/>Rate Limiting"| R
    Q3 --> |"complete() API<br/>Rate Limiting"| R
    Q4 --> |"complete() API<br/>Rate Limiting"| R
    Q5 --> |"complete() API<br/>Rate Limiting"| R
    Q6 --> |"complete() API<br/>Rate Limiting"| R

    %% Results Processing with Origin Aggregation
    R --> |Scores| S["Result Aggregation<br/>Mean by model+lang+task+origin"]
    S --> |Save| T[results.json]

    %% Backend & Frontend with Origin-Specific Metrics
    T --> |Read| U[backend.py]
    I --> |Read| U
    U --> |make_model_table| V["Model Rankings<br/>Origin-Specific Metrics"]
    U --> |make_country_table| W["Country Aggregation"]
    U --> |"API Endpoint"| X["FastAPI /api/data<br/>arc_accuracy_human<br/>arc_accuracy_machine"]
    X --> |"JSON Response"| Y["Frontend React App"]

    %% UI Components
    Y --> Z1["WorldMap.js<br/>Country Visualization"]
    Y --> Z2["ModelTable.js<br/>Model Rankings"]
    Y --> Z3["LanguageTable.js<br/>Language Coverage"]
    Y --> Z4["DatasetTable.js<br/>Task Performance"]

    %% Data Sources with Origin Information
    subgraph DS ["Data Sources"]
        DS1["Flores-200<br/>Translation Sentences<br/>Origin: 'human'"]
        DS2["MMLU/AfriMMLU/Global-MMLU<br/>Knowledge QA<br/>Origin: 'human' or 'machine' (HF auto-translated only)"]
        DS3["ARC<br/>Science Reasoning<br/>Origin: 'human'"]
        DS4["TruthfulQA<br/>Truthfulness<br/>Origin: 'human'"]
        DS5["MGSM<br/>Math Problems<br/>Origin: 'human'"]
    end

    DS1 --> Q1
    DS2 --> Q3
    DS3 --> Q4
    DS4 --> Q5
    DS5 --> Q6

    %% No on-the-fly DS_translated for MMLU anymore; only HF auto-translated used
    DS_translated --> Q4
    DS_translated --> Q5

    DS_native --> Q3
    DS_native --> Q4
    DS_native --> Q5

    %% Styling - Neutral colors that work in both dark and light modes
    classDef modelSource fill:#f8f9fa,stroke:#6c757d,color:#212529
    classDef evaluation fill:#e9ecef,stroke:#495057,color:#212529
    classDef api fill:#dee2e6,stroke:#6c757d,color:#212529
    classDef storage fill:#d1ecf1,stroke:#0c5460,color:#0c5460
    classDef frontend fill:#f8d7da,stroke:#721c24,color:#721c24
    classDef translation fill:#d4edda,stroke:#155724,color:#155724

    class A1,A2,A3,A4 modelSource
    class Q1,Q2,Q3,Q4,Q5,Q6,P evaluation
    class R,F,G,X api
    class T,I storage
    class Y,Z1,Z2,Z3,Z4 frontend
    class Google_Translate,DS_translated,DS_native translation
```

## Architecture Components

### Model Discovery (Light Gray)
- **Static Curated Models**: Handpicked important models for comprehensive evaluation
- **Dynamic Popular Models**: Real-time discovery of trending models via web scraping
- **Quality Control**: Blocklist for problematic or incompatible models
- **Model Validation**: API availability checks and cost filtering (≤ $20/1M tokens)
- **Timeout Protection**: 120s timeout for large/reasoning models, 60s for others
- **Metadata Enrichment**: Rich model information from OpenRouter and HuggingFace APIs

### Evaluation Pipeline (Medium Gray)
- **7 Active Tasks**: Translation (bidirectional), Classification, MMLU, ARC, TruthfulQA, MGSM
- **Unified English Zero-Shot Prompting**: All tasks use English instructions with target-language content
- **Origin Tagging**: Distinguishes between human-translated ('human') and machine-translated ('machine') data
- **Combinatorial Approach**: Systematic evaluation across Model × Language × Task combinations
- **Sample-based**: 10 evaluations per combination for statistical reliability
- **Batch Processing**: 50 tasks per batch with rate limiting and error resilience
- **Dual Deployment**: `main.py` for local/GitHub, `main_gcs.py` for Google Cloud with GCS storage

### API Integration (Light Gray)
- **OpenRouter**: Primary model inference API for all language model tasks
- **Rate Limiting**: Intelligent batching and delays to prevent API overload
- **Error Handling**: Graceful handling of timeouts, rate limits, and model unavailability
- **HuggingFace**: Model metadata and open-source model information
- **Google Translate**: Specialized translation API for on-the-fly dataset translation

### Data Storage (Cyan)
- **results.json**: Aggregated evaluation scores with origin-specific metrics
- **models.json**: Dynamic model list with metadata and validation status
- **languages.json**: Language information with population data

### Frontend Visualization (Light Red)
- **WorldMap**: Interactive country-level language proficiency visualization
- **ModelTable**: Ranked model performance leaderboard with origin-specific columns
- **LanguageTable**: Language coverage and speaker statistics
- **DatasetTable**: Task-specific performance breakdowns with human/machine distinction

### Translation & Origin Tracking (Light Green)
- **On-the-fly Translation**: Google Translate API for languages without native benchmarks
- **Origin Tagging**: Automatic classification of data sources (human vs. machine translated)
- **Separate Metrics**: Frontend displays distinct scores for human and machine-translated data

## Data Flow Summary

1. **Model Discovery**: Combine curated + trending models → validate API availability → enrich with metadata
2. **Evaluation Setup**: Generate all valid Model × Language × Task combinations with origin tracking
3. **Task Execution**: Run evaluations using unified English prompting and appropriate datasets
4. **Result Processing**: Aggregate scores by model+language+task+origin and save to JSON files
5. **Backend Serving**: FastAPI serves processed data with origin-specific metrics via REST API
6. **Frontend Display**: React app visualizes data through interactive components with transparency indicators

This architecture enables scalable, automated evaluation of AI language models across diverse languages and tasks while providing real-time insights through an intuitive web interface with methodological transparency.
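One practical note on the origin-specific fields the diagram mentions (`arc_accuracy_human` / `arc_accuracy_machine`): a frontend consumer has to fold each pair back into one display value plus a provenance flag. A minimal sketch, assuming that field-naming convention and a human-first preference order (both assumptions, not code from the repo):

```js
// Illustrative client-side folding of origin-specific metrics.
// Field naming (<metric>_human / <metric>_machine) follows the diagram above;
// preferring the human-translated score is an assumption for this sketch.
const foldOrigin = (row, metric) => {
  const human = row[`${metric}_human`]
  const machine = row[`${metric}_machine`]
  if (human !== null && human !== undefined) return { score: human, machine: false }
  if (machine !== null && machine !== undefined) return { score: machine, machine: true }
  return { score: null, machine: false }
}

// Example row as the backend might serve it:
const row = { arc_accuracy_human: null, arc_accuracy_machine: 0.62 }
console.log(foldOrigin(row, 'arc_accuracy')) // { score: 0.62, machine: true }
```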
pyproject.toml
CHANGED
@@ -13,7 +13,7 @@ dependencies = [
     "uvicorn>=0.34.2",
 ]

-[…]
+[dependency-groups]
 dev = [
     "aiolimiter>=1.2.1",
     "bert-score>=0.3.13",
@@ -26,7 +26,7 @@ dev = [
     "joblib>=1.5.0",
     "langcodes>=3.5.0",
     "language-data>=1.3.0",
-    "openai>=…",
+    "openai>=2.3.0",
     "protobuf>=6.30.2",
     "python-dotenv>=1.1.0",
     "rich>=14.0.0",
@@ -36,11 +36,3 @@ dev = [
     "tqdm>=4.67.1",
     "transformers>=4.51.3",
 ]
-
-[dependency-groups]
-dev = [
-    "ipython>=9.3.0",
-    "jupyter>=1.1.1",
-    "scipy>=1.16.0",
-    "seaborn>=0.13.2",
-]
uv.lock
CHANGED
The diff for this file is too large to render. See raw diff.