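"""Run every model × language × task × sentence evaluation combination,
skip combinations already present in the cached results, and save both
detailed and aggregated scores."""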
import asyncio
import time
from datetime import timedelta
from os import environ

import pandas as pd
from rich import print
from tqdm import tqdm
from tqdm.asyncio import tqdm_asyncio

from datasets_.util import get_valid_task_languages, load, save
from languages import languages
from models import models
from tasks import tasks
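
# Evaluation scope is configurable via environment variables
# (defaults: 10 sentences, 1000 languages, 40 models).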
n_sentences = int(environ.get("N_SENTENCES", 10))
n_languages = int(environ.get("N_LANGUAGES", 1000))
n_models = int(environ.get("N_MODELS", 40))


async def evaluate():
start_time = time.time()
# Pre-compute model tasks to avoid O(n²) lookups
model_tasks = models.set_index("id")["tasks"].to_dict()
# Pre-compute valid languages for each task
valid_task_langs = {task_name: get_valid_task_languages(task_name) for task_name in tasks}
# get all combinations that need evaluation (filtering invalid lang×task combos)
combis = [
(task_name, model, lang.bcp_47, i)
for i in range(n_sentences)
for lang in languages.head(n_languages).itertuples()
for task_name, task in tasks.items()
for model in models.iloc[:n_models]["id"]
if task_name in model_tasks[model] and lang.bcp_47 in valid_task_langs[task_name]
]
combis = pd.DataFrame(combis, columns=["task", "model", "bcp_47", "sentence_nr"])
# Load cached results and filter out completed combinations
old_results = load("results-detailed")
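    # on the first run the cache is empty, so every combination below is evaluated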
if not old_results.empty:
completed = set(old_results[["task", "model", "bcp_47", "sentence_nr"]].apply(tuple, axis=1))
combis = combis[~combis.apply(lambda row: tuple(row) in completed, axis=1)]
print(f"Running {len(combis)} evaluation tasks...")
# batching (asyncio.gather + rate-limiting can in principle run everything at once, but in practice batching is more efficient / necessary)
batch_size = 2000
    batch_results = []
    for i in tqdm(range(0, len(combis), batch_size), colour="blue", desc="Batches"):
        batch = combis.iloc[i : i + batch_size]
        batch_results.append(
            await tqdm_asyncio.gather(
                *[
                    tasks[task_name](model, bcp_47, sentence_nr)
                    for _, (task_name, model, bcp_47, sentence_nr) in batch.iterrows()
                ]
            )
        )
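    # flatten two levels: batches -> per-combination result lists -> individual records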
    results = [r for batch in batch_results for task_results in batch for r in task_results]
    results = (
        pd.DataFrame(results)
        if results
        else pd.DataFrame(columns=["task", "model", "bcp_47", "metric", "sentence_nr", "score", "origin"])
    )
# Merge with cached results (immutable log)
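    # drop_duplicates keeps the first occurrence, so previously cached scores take precedence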
    all_results = (
        pd.concat([old_results, results]).drop_duplicates(
            subset=["task", "model", "bcp_47", "metric", "sentence_nr"]
        )
        if not old_results.empty
        else results
    )
# Filter to current models × languages and aggregate
current_models = set(models.iloc[:n_models]["id"])
current_languages = set(languages.head(n_languages)["bcp_47"])
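    # average scores over sentences for each model × language × task × metric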
results_agg = (
all_results[all_results["model"].isin(current_models) & all_results["bcp_47"].isin(current_languages)]
.groupby(["model", "bcp_47", "task", "metric"])
.agg({"score": "mean", "origin": "first"})
.reset_index()
)
save(all_results, "results-detailed")
save(results_agg, "results")
save(models, "models")
save(languages, "languages")
elapsed = time.time() - start_time
print(f"Evaluation completed in {str(timedelta(seconds=int(elapsed)))}")


if __name__ == "__main__":
    asyncio.run(evaluate())