Spaces:
Running
Running
| import asyncio | |
| import json | |
| import numpy as np | |
| import pandas as pd | |
| from tqdm.asyncio import tqdm_asyncio | |
| from languages import languages | |
| from models import models | |
| from tasks import tasks | |
# ===== config =====
# Number of sentence indices evaluated per (task, language, model) combination.
n_sentences = 10
# ===== run evaluation and aggregate results =====
async def evaluate(max_languages=100):
    """Fan out every (task, sentence, language, model) combination concurrently.

    Args:
        max_languages: how many languages (taken from the top of the
            ``languages`` table) to evaluate. Defaults to 100, matching the
            previous hard-coded cap, so existing callers are unaffected.

    Returns:
        A list with one entry per scheduled coroutine, in creation order,
        as produced by ``tqdm_asyncio.gather``.
    """
    print("running evaluations")
    coroutines = [
        task(model, lang.bcp_47, i)
        for task in tasks
        for i in range(n_sentences)
        for lang in languages.iloc[:max_languages].itertuples()
        # the filter depends only on the language, so apply it before
        # fanning out across models (same results, fewer iterations)
        if lang.in_benchmark
        for model in models["id"]
    ]
    # miniters=1 keeps the progress bar updating on every completion
    return await tqdm_asyncio.gather(*coroutines, miniters=1)
def serialize(df):
    """Turn a DataFrame into a list of row dicts, mapping missing values to None.

    Both ``np.nan`` and ``pd.NA`` are normalized to ``None`` so the result is
    JSON-serializable.
    """
    cleaned = df.replace({np.nan: None, pd.NA: None})
    return cleaned.to_dict(orient="records")
async def main():
    """Run all evaluations and write languages, models and scores to results.json."""
    # JSON cannot encode datetime objects, so pre-format creation dates
    # as ISO-8601 strings before serialization.
    models["creation_date"] = models["creation_date"].apply(lambda x: x.isoformat())
    grouped_scores = await evaluate()
    # each gathered coroutine returns a group of score records; flatten them
    flat_scores = []
    for group in grouped_scores:
        flat_scores.extend(group)
    payload = {
        "languages": serialize(languages),
        "models": serialize(models),
        "scores": flat_scores,
    }
    with open("results.json", "w") as f:
        json.dump(payload, f, indent=2, ensure_ascii=False)
# Script entry point: drive the async pipeline with a fresh event loop.
if __name__ == "__main__":
    asyncio.run(main())