# languagebench/evals/models.py
import re
from datetime import date
from os import getenv

import pandas as pd
from aiolimiter import AsyncLimiter
from dotenv import load_dotenv
from google.cloud import translate_v2 as translate
from huggingface_hub import AsyncInferenceClient, HfApi  # AsyncInferenceClient is used by the disabled transcription code below
from joblib.memory import Memory
from openai import AsyncOpenAI, BadRequestError
from requests import HTTPError, get
# For development purposes, all languages are evaluated on the fast models,
# and only a sample of languages is evaluated on all models.
# Inline comments are rough cost notes in $ (see get_cost below for the exact
# per-million-token completion price used for filtering).
important_models = [
    "meta-llama/llama-4-maverick",  # $0.60
    "meta-llama/llama-3.3-70b-instruct",  # $0.30
    "meta-llama/llama-3.1-70b-instruct",  # $0.30
    "meta-llama/llama-3-70b-instruct",  # $0.40
    # "meta-llama/llama-2-70b-chat",  # $0.90; not properly supported by OpenRouter
    "openai/gpt-5",
    "openai/gpt-5-mini",
    "openai/gpt-5-nano",
    "openai/gpt-4.1",  # $8
    "openai/gpt-4o",  # $10
    "openai/gpt-3.5-turbo",  # $1.50
    "openai/gpt-oss-120b",
    "anthropic/claude-sonnet-4.5",
    "anthropic/claude-haiku-4.5",
    "anthropic/claude-opus-4.1",  # $15
    "anthropic/claude-sonnet-4",
    "anthropic/claude-3.7-sonnet",  # $15
    "anthropic/claude-3.5-sonnet",
    "mistralai/mistral-small-3.2-24b-instruct",  # $0.30
    "mistralai/mistral-medium-3.1",
    "mistralai/mistral-saba",  # $0.60
    "mistralai/mistral-nemo",  # $0.08
    "google/gemini-2.5-pro",  # $10
    "google/gemini-2.5-flash",  # $0.60
    "google/gemini-2.5-flash-lite",  # $0.30
    "google/gemma-3-27b-it",  # $0.20
    # "x-ai/grok-4",  # $15
    "x-ai/grok-4-fast",
    # "x-ai/grok-3",  # $15
    "cohere/command-a",
    "qwen/qwen3-32b",
    "qwen/qwen3-235b-a22b",
    "qwen/qwen3-30b-a3b",  # $0.29
    # "qwen/qwen-turbo",  # $0.20; recognizes "inappropriate content"
    # "qwen/qwq-32b",  # $0.20
    # "qwen/qwen-2.5-72b-instruct",  # $0.39
    # "qwen/qwen-2-72b-instruct",  # $0.90
    "deepseek/deepseek-v3.2-exp",
    "microsoft/phi-4",  # $0.07
    "amazon/nova-premier-v1",  # $12.50
    "amazon/nova-pro-v1",  # $0.09
    "moonshotai/kimi-k2",  # $0.60
    # "moonshotai/kimi-k2-thinking",  # $2.50
    "baidu/ernie-4.5-300b-a47b",
    # "baidu/ernie-4.5-21b-a3b-thinking",
    "z-ai/glm-4.6",  # $1.75
]
blocklist = [
    "google/gemini-2.5-pro-preview",
    # "google/gemini-2.5-pro",
    "google/gemini-2.5-flash-preview",
    "google/gemini-2.5-flash-lite-preview",
    "google/gemini-2.5-flash-preview-04-17",
    "google/gemini-2.5-flash-preview-05-20",
    "google/gemini-2.5-flash-lite-preview-06-17",
    "google/gemini-2.5-pro-preview-06-05",
    "google/gemini-2.5-pro-preview-05-06",
    "perplexity/sonar-deep-research",
    "perplexity/sonar-reasoning",
    "perplexity/sonar-reasoning-pro",
    "qwen/qwen3-vl-30b-a3b-thinking",
    "alpindale/goliath-120b",
]
transcription_models = [
    "elevenlabs/scribe_v1",
    "openai/whisper-large-v3",
    # "openai/whisper-small",
    # "facebook/seamless-m4t-v2-large",
]
cache = Memory(location=".cache", verbose=0).cache

@cache
def load_or_metadata(date: date):
    # the date argument only serves as a joblib cache key, so the raw model
    # list is re-fetched from OpenRouter at most once per day
    return get("https://openrouter.ai/api/frontend/models").json()["data"]
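
# Usage sketch (hits the live OpenRouter frontend API):
#   raw_models = load_or_metadata(date.today())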

def get_or_metadata(permaslug):
    models = load_or_metadata(date.today())
    candidates = [
        m
        for m in models
        if (m["permaslug"] == permaslug or m["slug"] == permaslug)
        # ensure that a provider endpoint is available
        and m["endpoint"]
        # exclude free models, since they typically have very strict rate limits
        and not m["endpoint"]["is_free"]
        # exclude providers that train on user data
        # this is crucial since we are submitting benchmark data
        # make sure to additionally configure this in the OpenRouter settings to avoid mistakes!
        and m["endpoint"]["provider_info"]["dataPolicy"]["training"] is False
    ]
    if not candidates:
        print(f"no appropriate model (non-free, no training on user data) found for {permaslug}")
        return None
    return candidates[0]
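
# Usage sketch (the exact metadata shape depends on the live OpenRouter
# frontend API; "mistralai/mistral-nemo" is just an illustrative slug):
#   meta = get_or_metadata("mistralai/mistral-nemo")
#   if meta is not None:
#       assert meta["endpoint"]["provider_info"]["dataPolicy"]["training"] is False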

def _scrape_popular_models(url: str) -> list[dict]:
    # scrape an OpenRouter rankings page and aggregate usage counts per model
    # the page embeds escaped JSON of the form:
    #   \"count\":number,\"model_permaslug\":\"model/name\"
    try:
        raw = get(url).text
        pattern = r"\\\"count\\\":([\d.]+).*?\\\"model_permaslug\\\":\\\"([^\\\"]+)\\\""
        matches = re.findall(pattern, raw)
        model_counts = {}
        for count_str, model_slug in matches:
            # skip aggregate/meta entries
            if model_slug.startswith("openrouter") or model_slug == "Others":
                continue
            # remove variant suffixes (e.g. ":free") for aggregation
            base_model = model_slug.split(":")[0]
            model_counts[base_model] = model_counts.get(base_model, 0) + float(count_str)
        # sort by popularity, most used first
        sorted_models = sorted(model_counts.items(), key=lambda x: x[1], reverse=True)
        return [{"slug": slug, "count": int(count)} for slug, count in sorted_models]
    except Exception:
        return []


@cache
def get_historical_popular_models(date: date):
    # all-time rankings; the date parameter is only used for daily caching
    return _scrape_popular_models("https://openrouter.ai/rankings")


@cache
def get_current_popular_models(date: date):
    # today's rankings; the date parameter is only used for daily caching
    return _scrape_popular_models("https://openrouter.ai/rankings?view=day")
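
# Sanity check of the scraping regex on a synthetic, escaped payload
# (hypothetical data, not a real rankings page):
#   sample = '\\"count\\":12.0,\\"model_permaslug\\":\\"foo/bar:free\\"'
#   re.findall(r"\\\"count\\\":([\d.]+).*?\\\"model_permaslug\\\":\\\"([^\\\"]+)\\\"", sample)
#   # -> [('12.0', 'foo/bar:free')]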

def get_translation_models():
    return pd.DataFrame(
        [
            {
                "id": "google/translate-v2",
                "name": "Google Translate",
                "provider_name": "Google",
                "cost": 20.0,  # $ per million characters rather than tokens
                "train_on_prompts": False,  # they don't do it in the API
                "size": None,
                "type": "closed-source",
                "license": None,
                "tasks": ["translation_from", "translation_to"],
            }
        ]
    )
load_dotenv()
client = AsyncOpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=getenv("OPENROUTER_API_KEY"),
)
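
# client-side rate limits, expressed as requests per second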
openrouter_rate_limit = AsyncLimiter(max_rate=20, time_period=1)
elevenlabs_rate_limit = AsyncLimiter(max_rate=2, time_period=1)
huggingface_rate_limit = AsyncLimiter(max_rate=5, time_period=1)
google_rate_limit = AsyncLimiter(max_rate=10, time_period=1)

@cache
async def complete(**kwargs) -> str | None:
    async with openrouter_rate_limit:
        try:
            response = await client.chat.completions.create(**kwargs)
        except BadRequestError as e:
            # treat content-filter rejections as "no answer" rather than crashing
            if "filtered" in e.message:
                return None
            raise
    if not response.choices:
        raise Exception(response)
    # guard against empty message content before stripping
    content = response.choices[0].message.content
    return content.strip() if content else None
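
# Usage sketch (needs a running event loop and a valid OPENROUTER_API_KEY;
# the model and prompt are illustrative):
#   answer = await complete(
#       model="meta-llama/llama-3.3-70b-instruct",
#       messages=[{"role": "user", "content": "Translate 'hello' to Swahili."}],
#       temperature=0,
#   )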

translate_client = translate.Client()


def get_google_supported_languages():
    return [l["language"] for l in translate_client.get_languages()]


@cache
async def translate_google(text, source_language, target_language):
    # note: the google client is synchronous, so this call briefly blocks the event loop
    async with google_rate_limit:
        response = translate_client.translate(
            text, source_language=source_language, target_language=target_language
        )
    return response["translatedText"]
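
# Usage sketch (requires Google Cloud credentials; language codes are the
# short codes accepted by the Translate v2 API):
#   english = await translate_google("Habari", source_language="sw", target_language="en")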

# @cache
# async def transcribe_elevenlabs(path, model):
#     modelname = model.split("/")[-1]
#     client = AsyncElevenLabs(api_key=getenv("ELEVENLABS_API_KEY"))
#     async with elevenlabs_rate_limit:
#         with open(path, "rb") as file:
#             response = await client.speech_to_text.convert(
#                 model_id=modelname, file=file
#             )
#     return response.text


# @cache
# async def transcribe_huggingface(path, model):
#     client = AsyncInferenceClient(api_key=getenv("HUGGINGFACE_ACCESS_TOKEN"))
#     async with huggingface_rate_limit:
#         output = await client.automatic_speech_recognition(model=model, audio=path)
#     return output.text


# async def transcribe(path, model="elevenlabs/scribe_v1"):
#     provider, modelname = model.split("/")
#     match provider:
#         case "elevenlabs":
#             return await transcribe_elevenlabs(path, modelname)
#         case "openai" | "facebook":
#             return await transcribe_huggingface(path, model)
#         case _:
#             raise ValueError(f"Model {model} not supported")

api = HfApi()


@cache
def get_hf_metadata(row):
    # get metadata from the HuggingFace API
    empty = {
        "hf_id": None,
        "creation_date": None,
        "size": None,
        "type": "closed-source",
        "license": None,
    }
    if not row:
        return empty
    model_id = row["hf_slug"] or row["slug"].split(":")[0]
    if not model_id:
        return empty
    try:
        info = api.model_info(model_id)
        license = ""
        if (
            info.card_data
            and hasattr(info.card_data, "license")
            and info.card_data.license
        ):
            license = (
                info.card_data.license.replace("-", " ").replace("mit", "MIT").title()
            )
        return {
            "hf_id": info.id,
            "creation_date": info.created_at,
            "size": info.safetensors.total if info.safetensors else None,
            "type": "open-source",
            "license": license,
        }
    except HTTPError:
        return empty
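
# Usage sketch (hypothetical row mirroring the OpenRouter metadata shape):
#   get_hf_metadata({"hf_slug": "microsoft/phi-4", "slug": "microsoft/phi-4"})
#   # -> {"hf_id": "microsoft/phi-4", "type": "open-source", ...} if found on the Hub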

def get_cost(row):
    # openrouter reports the completion price in $ per token; convert to $ per million tokens
    try:
        cost = float(row["endpoint"]["pricing"]["completion"])
        return round(cost * 1_000_000, 2)
    except (TypeError, KeyError):
        return None
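
# Worked example: a completion price of $0.0000006 per token
#   get_cost({"endpoint": {"pricing": {"completion": "0.0000006"}}})  # -> 0.6 ($/M tokens)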

def get_training_policy(row):
    # get openrouter info on whether the provider may train on prompts
    # (this needs to be thoroughly avoided for our benchmark prompts!)
    return row["endpoint"]["provider_info"]["dataPolicy"]["training"]

@cache
def load_models(date: date) -> pd.DataFrame:
    # popular_models = (
    #     get_historical_popular_models(date.today())[:20]
    #     + get_current_popular_models(date.today())[:10]
    # )
    # popular_models = [m["slug"] for m in popular_models]
    popular_models: list[str] = []  # popularity-based selection is currently disabled
    all_model_candidates = set(important_models + popular_models) - set(blocklist)
    # validate that models exist on OpenRouter before including them
    valid_models = []
    for model_id in all_model_candidates:
        metadata = get_or_metadata(model_id)
        if metadata is not None:
            valid_models.append(model_id)
    models = pd.DataFrame(sorted(valid_models), columns=["id"])
    # TODO: this re-fetches metadata already retrieved during validation above
    or_metadata = models["id"].apply(get_or_metadata)
    hf_metadata = or_metadata.apply(get_hf_metadata)
    creation_date_hf = pd.to_datetime(hf_metadata.str["creation_date"]).dt.date
    creation_date_or = pd.to_datetime(
        or_metadata.str["created_at"].str.split("T").str[0]
    ).dt.date
    models = models.assign(
        name=or_metadata.str["short_name"]
        .str.replace(" (free)", "")
        .str.replace(" (self-moderated)", "")
        .str.replace(r"\s*\([^)]*\)\s*$", "", regex=True),
        provider_name=or_metadata.str["name"].str.split(": ").str[0],
        # openrouter_metadata=or_metadata.astype(str),
        cost=or_metadata.apply(get_cost),
        train_on_prompts=or_metadata.apply(get_training_policy),
        hf_id=hf_metadata.str["hf_id"],
        size=hf_metadata.str["size"],
        type=hf_metadata.str["type"],
        license=hf_metadata.str["license"],
        creation_date=creation_date_hf.combine_first(creation_date_or),
    )
    models.to_json(
        "models_unfiltered.json", orient="records", indent=2, force_ascii=False
    )
    # filter out expensive models to keep costs reasonable
    models = models[models["cost"] <= 15.0].reset_index(drop=True)
    models["tasks"] = [
        [
            "translation_from",
            "translation_to",
            "classification",
            "mmlu",
            "arc",
            "truthfulqa",
            "mgsm",
        ]
    ] * len(models)
    models = pd.concat([models, get_translation_models()])
    return models
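
# The resulting frame has one row per model with columns: id, name,
# provider_name, cost, train_on_prompts, hf_id, size, type, license,
# creation_date, tasks.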
models = load_models(date.today())