import re
from datetime import date
from os import getenv

import pandas as pd
from aiolimiter import AsyncLimiter
from dotenv import load_dotenv
from google.cloud import translate_v2 as translate
from huggingface_hub import AsyncInferenceClient, HfApi
from joblib.memory import Memory
from openai import AsyncOpenAI, BadRequestError
from requests import HTTPError, get
# for development purposes, all languages will be evaluated on the fast models
# and only a sample of languages will be evaluated on all models
# price annotations are completion costs in $ per million tokens (see get_cost below)
important_models = [
    "meta-llama/llama-4-maverick",  # $0.60
    "meta-llama/llama-3.3-70b-instruct",  # $0.30
    "meta-llama/llama-3.1-70b-instruct",  # $0.30
    "meta-llama/llama-3-70b-instruct",  # $0.40
    # "meta-llama/llama-2-70b-chat",  # $0.90; not properly supported by OpenRouter
    "openai/gpt-5",
    "openai/gpt-5-mini",
    "openai/gpt-5-nano",
    "openai/gpt-4.1",  # $8
    "openai/gpt-4o",  # $10
    "openai/gpt-3.5-turbo",  # $1.50
    "openai/gpt-oss-120b",
    "anthropic/claude-sonnet-4.5",
    "anthropic/claude-haiku-4.5",
    "anthropic/claude-opus-4.1",  # $15
    "anthropic/claude-sonnet-4",
    "anthropic/claude-3.7-sonnet",  # $15
    "anthropic/claude-3.5-sonnet",
    "mistralai/mistral-small-3.2-24b-instruct",  # $0.30
    "mistralai/mistral-medium-3.1",
    "mistralai/mistral-saba",  # $0.60
    "mistralai/mistral-nemo",  # $0.08
    "google/gemini-2.5-pro",  # $10
    "google/gemini-2.5-flash",  # $0.60
    "google/gemini-2.5-flash-lite",  # $0.30
    "google/gemma-3-27b-it",  # $0.20
    # "x-ai/grok-4",  # $15
    "x-ai/grok-4-fast",
    # "x-ai/grok-3",  # $15
    "cohere/command-a",
    "qwen/qwen3-32b",
    "qwen/qwen3-235b-a22b",
    "qwen/qwen3-30b-a3b",  # $0.29
    # "qwen/qwen-turbo",  # $0.20; recognizes "inappropriate content"
    # "qwen/qwq-32b",  # $0.20
    # "qwen/qwen-2.5-72b-instruct",  # $0.39
    # "qwen/qwen-2-72b-instruct",  # $0.90
    "deepseek/deepseek-v3.2-exp",
    "microsoft/phi-4",  # $0.07
    "amazon/nova-premier-v1",  # $12.50
    "amazon/nova-pro-v1",  # $0.09
    "moonshotai/kimi-k2",  # $0.60
    # "moonshotai/kimi-k2-thinking",  # $2.50
    "baidu/ernie-4.5-300b-a47b",
    # "baidu/ernie-4.5-21b-a3b-thinking",
    "z-ai/glm-4.6",  # $1.75
]

blocklist = [
    "google/gemini-2.5-pro-preview",
    # "google/gemini-2.5-pro",
    "google/gemini-2.5-flash-preview",
    "google/gemini-2.5-flash-lite-preview",
    "google/gemini-2.5-flash-preview-04-17",
    "google/gemini-2.5-flash-preview-05-20",
    "google/gemini-2.5-flash-lite-preview-06-17",
    "google/gemini-2.5-pro-preview-06-05",
    "google/gemini-2.5-pro-preview-05-06",
    "perplexity/sonar-deep-research",
    "perplexity/sonar-reasoning",
    "perplexity/sonar-reasoning-pro",
    "qwen/qwen3-vl-30b-a3b-thinking",
    "alpindale/goliath-120b",
]

transcription_models = [
    "elevenlabs/scribe_v1",
    "openai/whisper-large-v3",
    # "openai/whisper-small",
    # "facebook/seamless-m4t-v2-large",
]

cache = Memory(location=".cache", verbose=0).cache

@cache
def load_or_metadata(date: date):
    # the date parameter only serves to key the daily cache
    return get("https://openrouter.ai/api/frontend/models").json()["data"]

def get_or_metadata(permaslug):
    # get the OpenRouter endpoint metadata for a model slug
    models = load_or_metadata(date.today())
    slugs = [
        m
        for m in models
        if (m["permaslug"] == permaslug or m["slug"] == permaslug)
        # ensure that a provider endpoint is available
        and m["endpoint"]
        # exclude free models
        # the problem is that free models typically have very high rate-limiting
        and not m["endpoint"]["is_free"]
        # exclude providers that train on user data
        # this is crucial since we are submitting benchmark data
        # make sure to additionally configure this in OpenRouter settings to avoid mistakes!
        and m["endpoint"]["provider_info"]["dataPolicy"]["training"] is False
    ]
    if len(slugs) == 0:
        print(f"no appropriate model (not free and no user data training) found for {permaslug}")
    return slugs[0] if len(slugs) >= 1 else None
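
# Usage sketch (slug is a placeholder; see get_cost below for the pricing unit):
# >>> meta = get_or_metadata("openai/gpt-4o")
# >>> meta["endpoint"]["pricing"]["completion"]  # per-token completion price
# '0.00001'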

@cache
def get_historical_popular_models(date: date):
    # date parameter is used for daily caching
    try:
        raw = get("https://openrouter.ai/rankings").text
        # Extract model data from rankingData using regex
        # Find all count and model_permaslug pairs in the data
        # Format: "count":number,"model_permaslug":"model/name"
        pattern = r"\\\"count\\\":([\d.]+).*?\\\"model_permaslug\\\":\\\"([^\\\"]+)\\\""
        matches = re.findall(pattern, raw)
        if not matches:
            return []
        # Aggregate model counts
        model_counts = {}
        for count_str, model_slug in matches:
            count = float(count_str)
            if not model_slug.startswith("openrouter") and model_slug != "Others":
                # Remove variant suffixes (e.g. ":free") for aggregation
                base_model = model_slug.split(":")[0]
                model_counts[base_model] = model_counts.get(base_model, 0) + count
        # Sort by popularity and return the models with their counts
        sorted_models = sorted(model_counts.items(), key=lambda x: x[1], reverse=True)
        return [{"slug": slug, "count": int(count)} for slug, count in sorted_models]
    except Exception:
        return []
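
# What the regex above matches: the rankings page embeds serialized JSON with
# escaped quotes, so the raw HTML contains fragments like
#     \"count\":12345.0, ... \"model_permaslug\":\"openai/gpt-4o\"
# and the function returns, with placeholder slugs and counts:
#     [{"slug": "openai/gpt-4o", "count": 12345}, ...]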

@cache
def get_current_popular_models(date: date):
    # date parameter is used for daily caching
    # same scraping logic as get_historical_popular_models, but for the daily view
    try:
        raw = get("https://openrouter.ai/rankings?view=day").text
        # Extract count and model_permaslug pairs from the daily rankings
        pattern = r"\\\"count\\\":([\d.]+).*?\\\"model_permaslug\\\":\\\"([^\\\"]+)\\\""
        matches = re.findall(pattern, raw)
        if not matches:
            return []
        # Aggregate model counts
        model_counts = {}
        for count_str, model_slug in matches:
            count = float(count_str)
            if not model_slug.startswith("openrouter") and model_slug != "Others":
                # Remove variant suffixes (e.g. ":free") for aggregation
                base_model = model_slug.split(":")[0]
                model_counts[base_model] = model_counts.get(base_model, 0) + count
        # Sort by popularity and return the models with their counts
        sorted_models = sorted(model_counts.items(), key=lambda x: x[1], reverse=True)
        return [{"slug": slug, "count": int(count)} for slug, count in sorted_models]
    except Exception:
        return []

def get_translation_models():
    return pd.DataFrame(
        [
            {
                "id": "google/translate-v2",
                "name": "Google Translate",
                "provider_name": "Google",
                "cost": 20.0,
                "train_on_prompts": False,  # they don't do it in the API
                "size": None,
                "type": "closed-source",
                "license": None,
                "tasks": ["translation_from", "translation_to"],
            }
        ]
    )

load_dotenv()
client = AsyncOpenAI(
    base_url="https://openrouter.ai/api/v1",
    api_key=getenv("OPENROUTER_API_KEY"),
)

# rate limits in requests per second
openrouter_rate_limit = AsyncLimiter(max_rate=20, time_period=1)
elevenlabs_rate_limit = AsyncLimiter(max_rate=2, time_period=1)
huggingface_rate_limit = AsyncLimiter(max_rate=5, time_period=1)
google_rate_limit = AsyncLimiter(max_rate=10, time_period=1)

async def complete(**kwargs) -> str | None:
    # chat completion via OpenRouter; returns None if the prompt was filtered
    async with openrouter_rate_limit:
        try:
            response = await client.chat.completions.create(**kwargs)
        except BadRequestError as e:
            if "filtered" in e.message:
                return None
            raise
    if not response.choices:
        raise Exception(response)
    return response.choices[0].message.content.strip()
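
# Usage sketch (model and prompt are placeholders; kwargs are passed through
# unchanged to the OpenAI-compatible chat completions endpoint):
# >>> await complete(
# ...     model="meta-llama/llama-3.3-70b-instruct",
# ...     messages=[{"role": "user", "content": "Say hi."}],
# ... )
# 'Hi!'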

translate_client = translate.Client()

def get_google_supported_languages():
    return [lang["language"] for lang in translate_client.get_languages()]

async def translate_google(text, source_language, target_language):
    # note: the Google client is synchronous, so this call briefly blocks the event loop
    async with google_rate_limit:
        response = translate_client.translate(
            text, source_language=source_language, target_language=target_language
        )
    return response["translatedText"]
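
# Usage sketch (placeholder strings; codes must be among get_google_supported_languages()):
# >>> await translate_google("Hello", source_language="en", target_language="de")
# 'Hallo'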

# @cache
# async def transcribe_elevenlabs(path, model):
#     modelname = model.split("/")[-1]
#     client = AsyncElevenLabs(api_key=getenv("ELEVENLABS_API_KEY"))
#     async with elevenlabs_rate_limit:
#         with open(path, "rb") as file:
#             response = await client.speech_to_text.convert(
#                 model_id=modelname, file=file
#             )
#     return response.text

# @cache
# async def transcribe_huggingface(path, model):
#     client = AsyncInferenceClient(api_key=getenv("HUGGINGFACE_ACCESS_TOKEN"))
#     async with huggingface_rate_limit:
#         output = await client.automatic_speech_recognition(model=model, audio=path)
#     return output.text

# async def transcribe(path, model="elevenlabs/scribe_v1"):
#     provider, modelname = model.split("/")
#     match provider:
#         case "elevenlabs":
#             return await transcribe_elevenlabs(path, modelname)
#         case "openai" | "facebook":
#             return await transcribe_huggingface(path, model)
#         case _:
#             raise ValueError(f"Model {model} not supported")

api = HfApi()

def get_hf_metadata(row):
    # get metadata from the HuggingFace API
    empty = {
        "hf_id": None,
        "creation_date": None,
        "size": None,
        "type": "closed-source",
        "license": None,
    }
    if not row:
        return empty
    id = row["hf_slug"] or row["slug"].split(":")[0]
    if not id:
        return empty
    try:
        info = api.model_info(id)
        license = ""
        if (
            info.card_data
            and hasattr(info.card_data, "license")
            and info.card_data.license
        ):
            license = (
                info.card_data.license.replace("-", " ").replace("mit", "MIT").title()
            )
        return {
            "hf_id": info.id,
            "creation_date": info.created_at,
            "size": info.safetensors.total if info.safetensors else None,
            "type": "open-source",
            "license": license,
        }
    except HTTPError:
        return empty
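
# Example of the returned shape for an open-weights model (values illustrative;
# "size" is the safetensors parameter count):
# {"hf_id": "mistralai/Mistral-Nemo-Instruct-2407", "creation_date": datetime(...),
#  "size": 12_000_000_000, "type": "open-source", "license": "Apache 2.0"}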

def get_cost(row):
    # completion price in $ per million tokens
    try:
        cost = float(row["endpoint"]["pricing"]["completion"])
        return round(cost * 1_000_000, 2)
    except (TypeError, KeyError):
        return None
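
# Worked example: OpenRouter reports the completion price per single token, so
# "0.0000006" becomes round(0.0000006 * 1_000_000, 2) == 0.6, i.e. $0.60 per
# million output tokens (matching the annotations on important_models above).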

def get_training_policy(row):
    # get openrouter info on whether the provider may train on prompts
    # (this needs to be thoroughly avoided for our benchmark prompts!)
    return row["endpoint"]["provider_info"]["dataPolicy"]["training"]

def load_models(date: date) -> pd.DataFrame:
    # popular_models = (
    #     get_historical_popular_models(date.today())[:20]
    #     + get_current_popular_models(date.today())[:10]
    # )
    popular_models = []  # popularity-based selection temporarily disabled (see above)
    popular_models = [m["slug"] for m in popular_models]
    all_model_candidates = set(important_models + popular_models) - set(blocklist)
    # Validate that models exist on OpenRouter before including them
    valid_models = []
    for model_id in all_model_candidates:
        metadata = get_or_metadata(model_id)
        if metadata is not None:
            valid_models.append(model_id)
    models = pd.DataFrame(sorted(valid_models), columns=["id"])
    # TODO: this re-fetches metadata already retrieved in the validation loop above
    or_metadata = models["id"].apply(get_or_metadata)
    hf_metadata = or_metadata.apply(get_hf_metadata)
    creation_date_hf = pd.to_datetime(hf_metadata.str["creation_date"]).dt.date
    creation_date_or = pd.to_datetime(
        or_metadata.str["created_at"].str.split("T").str[0]
    ).dt.date
    models = models.assign(
        name=or_metadata.str["short_name"]
        .str.replace(" (free)", "")
        .str.replace(" (self-moderated)", "")
        .str.replace(r"\s*\([^)]*\)\s*$", "", regex=True),
        provider_name=or_metadata.str["name"].str.split(": ").str[0],
        # openrouter_metadata=or_metadata.astype(str),
        cost=or_metadata.apply(get_cost),
        train_on_prompts=or_metadata.apply(get_training_policy),
        hf_id=hf_metadata.str["hf_id"],
        size=hf_metadata.str["size"],
        type=hf_metadata.str["type"],
        license=hf_metadata.str["license"],
        creation_date=creation_date_hf.combine_first(creation_date_or),
    )
    models.to_json(
        "models_unfiltered.json", orient="records", indent=2, force_ascii=False
    )
    # Filter out expensive models (> $15 per million tokens) to keep costs reasonable
    models = models[models["cost"] <= 15.0].reset_index(drop=True)
    models["tasks"] = [
        [
            "translation_from",
            "translation_to",
            "classification",
            "mmlu",
            "arc",
            "truthfulqa",
            "mgsm",
        ]
    ] * len(models)
    models = pd.concat([models, get_translation_models()])
    return models

models = load_models(date.today())
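
# The resulting frame has one row per model; the columns follow from the
# assignments in load_models:
# >>> models.columns.tolist()
# ['id', 'name', 'provider_name', 'cost', 'train_on_prompts', 'hf_id', 'size',
#  'type', 'license', 'creation_date', 'tasks']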