import json
import os

import numpy as np
import pandas as pd
import uvicorn
from countries import make_country_table
from datasets_.util import load
from fastapi import FastAPI, Request
from fastapi.middleware.cors import CORSMiddleware
from fastapi.middleware.gzip import GZipMiddleware
from fastapi.responses import JSONResponse
from fastapi.staticfiles import StaticFiles
from joblib.memory import Memory

cache = Memory(location=".cache", verbose=0).cache

scores = load("results")
scores_detailed = load("results-detailed")
languages = load("languages")
models = load("models")
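
# Small helper: returns None for empty input instead of raising ZeroDivisionError.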
def mean(lst):
    return sum(lst) / len(lst) if lst else None
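
# Task/metric pairs shown in the leaderboard; each entry is "<task>_<metric>",
# matching the task_metric keys built in the table functions below.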
task_metrics = [
    "translation_from_bleu",
    "translation_to_bleu",
    "classification_accuracy",
    "mmlu_accuracy",
    "arc_accuracy",
    "mgsm_accuracy",
]

def compute_normalized_average(df, metrics):
    """Plain row-wise mean across the metric columns (no normalization is
    applied; skipna=False means a row missing any metric averages to NaN)."""
    return df[metrics].mean(axis=1, skipna=False)
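
# Illustrative example (not executed by the app): with skipna=False, a row
# missing any metric gets NaN instead of a mean over the metrics it does have.
#
#   >>> demo = pd.DataFrame({"a": [0.5, 0.75], "b": [0.75, np.nan]})
#   >>> compute_normalized_average(demo, ["a", "b"]).tolist()
#   [0.625, nan]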

@cache
def compute_bootstrap_ci(
    data_hash, group_cols_tuple, n_bootstrap=1000, ci_level=0.95, seed=42
):
    """Compute bootstrap CIs for grouped data. Cached based on the data hash."""
    # The actual DataFrame is passed out-of-band via _ci_cache (keyed by
    # data_hash), so joblib only has to hash the small key, not the frame.
    df, group_cols = _ci_cache[data_hash]
    np.random.seed(seed)
    percentiles = [(1 - ci_level) / 2 * 100, (1 + ci_level) / 2 * 100]

    def bootstrap_group(group):
        scores = group["score"].values
        if len(scores) == 0:
            return pd.Series({"ci_lower": None, "ci_upper": None})
        bootstrap_means = [
            np.random.choice(scores, len(scores), replace=True).mean()
            for _ in range(n_bootstrap)
        ]
        ci_lower, ci_upper = np.percentile(bootstrap_means, percentiles)
        return pd.Series({"ci_lower": ci_lower, "ci_upper": ci_upper})

    result = df.groupby(group_cols, as_index=False).apply(
        bootstrap_group, include_groups=False
    )
    result.columns = group_cols + ["ci_lower", "ci_upper"]
    return result

# Module-level side channel for handing DataFrames to the cached function;
# keyed by the same data_hash that compute_bootstrap_ci receives.
_ci_cache = {}
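
# Usage pattern (sketch; names illustrative): fingerprint the grouped data,
# stash the frame under that key, then call the cached function with the key.
#
#   key = hash(("accuracy", "model", len(frame)))
#   _ci_cache[key] = (frame, ["model"])
#   ci = compute_bootstrap_ci(key, ("model",))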

def add_confidence_intervals(df, scores_df_detailed, group_col, metrics):
    """DRY helper to add CI columns for metrics and average to a dataframe."""
    if scores_df_detailed is None or scores_df_detailed.empty:
        return df
    detailed = scores_df_detailed.copy()
    detailed["task_metric"] = detailed["task"] + "_" + detailed["metric"]
    # Add CI for each metric
    for metric in metrics:
        metric_data = detailed[detailed["task_metric"] == metric]
        if not metric_data.empty:
            # Create hash based on data shape, groups, and statistics
            group_stats = (
                metric_data.groupby(group_col)["score"]
                .agg(["count", "mean", "std"])
                .round(6)
            )
            data_hash = hash(
                (
                    metric,
                    group_col,
                    len(metric_data),
                    tuple(group_stats.index),
                    tuple(map(tuple, group_stats.values)),
                )
            )
            _ci_cache[data_hash] = (metric_data, [group_col])
            ci_df = compute_bootstrap_ci(data_hash, (group_col,))
            ci_df = ci_df.rename(
                columns={
                    "ci_lower": f"{metric}_ci_lower",
                    "ci_upper": f"{metric}_ci_upper",
                }
            )
            df = pd.merge(df, ci_df, on=group_col, how="left")
    # Add CI for the average
    avg_data = detailed[detailed["task_metric"].isin(metrics)]
    if not avg_data.empty:
        # Create hash based on data shape, groups, and statistics
        group_stats = (
            avg_data.groupby(group_col)["score"].agg(["count", "mean", "std"]).round(6)
        )
        data_hash = hash(
            (
                "average",
                group_col,
                len(avg_data),
                tuple(group_stats.index),
                tuple(map(tuple, group_stats.values)),
            )
        )
        _ci_cache[data_hash] = (avg_data, [group_col])
        avg_ci_df = compute_bootstrap_ci(data_hash, (group_col,))
        avg_ci_df = avg_ci_df.rename(
            columns={"ci_lower": "average_ci_lower", "ci_upper": "average_ci_upper"}
        )
        df = pd.merge(df, avg_ci_df, on=group_col, how="left")
    return df
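
# Usage sketch (columns illustrative): given per-sample rows with "model",
# "task", "metric", and "score", this attaches e.g. mmlu_accuracy_ci_lower /
# mmlu_accuracy_ci_upper plus average_ci_lower / average_ci_upper:
#
#   table = add_confidence_intervals(table, detailed_scores, "model", task_metrics)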

def make_model_table(scores_df, models, scores_df_detailed=None):
    scores_df = scores_df.copy()
    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
    scores_df["task_metric_origin"] = (
        scores_df["task_metric"] + "_" + scores_df["origin"]
    )
    # Pivot scores
    main_pivot = scores_df.pivot_table(
        index="model", columns="task_metric", values="score", aggfunc="mean"
    )
    scores_pivot = scores_df.pivot_table(
        index="model", columns="task_metric_origin", values="score", aggfunc="mean"
    )
    df = pd.merge(main_pivot, scores_pivot, on="model", how="outer")
    # Fill missing metrics and compute average
    for metric in task_metrics:
        df[metric] = df.get(metric, np.nan)
    df["average"] = compute_normalized_average(df, task_metrics)
    df = add_confidence_intervals(df, scores_df_detailed, "model", task_metrics)
    # Add machine-origin flags
    machine_presence = (
        scores_df[scores_df["origin"] == "machine"]
        .groupby(["model", "task_metric"])
        .size()
    )
    for metric in task_metrics:
        df[f"{metric}_contains_machine"] = df.index.map(
            lambda m: (m, metric) in machine_presence.index
        )
    # Sort and add metadata
    df = df.sort_values(by="average", ascending=False).reset_index()
    df = pd.merge(df, models, left_on="model", right_on="id", how="left")
    df["rank"] = df.index + 1
| df["creation_date"] = df["creation_date"].apply( | |
| lambda x: x.isoformat() if x else None | |
| ) | |
    # Select columns dynamically
    metric_cols = [m for m in df.columns if any(tm in m for tm in task_metrics)]
    avg_ci_cols = [
        c for c in df.columns if c in ["average_ci_lower", "average_ci_upper"]
    ]
    return df[
        [
            "rank",
            "model",
            "name",
            "provider_name",
            "hf_id",
            "creation_date",
            "size",
            "type",
            "license",
            "cost",
            "average",
            *avg_ci_cols,
            *sorted(set(metric_cols)),
        ]
    ]

def make_language_table(scores_df, languages, scores_df_detailed=None):
    scores_df = scores_df.copy()
    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
    # Pivot scores and origins
    score_pivot = scores_df.pivot_table(
        index="bcp_47", columns="task_metric", values="score", aggfunc="mean"
    )
    origin_pivot = scores_df.pivot_table(
        index="bcp_47", columns="task_metric", values="origin", aggfunc="first"
    )
    origin_pivot = origin_pivot.add_suffix("_origin")
    df = pd.merge(score_pivot, origin_pivot, on="bcp_47", how="outer")
    # Fill missing metrics and compute average
    for metric in task_metrics:
        df[metric] = df.get(metric, np.nan)
    df["average"] = compute_normalized_average(df, task_metrics)
    # Recompute language scores from the detailed data so the point estimates
    # match the CI calculation (CIs are computed over all samples, so the
    # scores should be too).
    if scores_df_detailed is not None and not scores_df_detailed.empty:
        detailed = scores_df_detailed.copy()
        detailed["task_metric"] = detailed["task"] + "_" + detailed["metric"]
        detailed_pivot = detailed.pivot_table(
            index="bcp_47", columns="task_metric", values="score", aggfunc="mean"
        )
        for metric in task_metrics:
            if metric in detailed_pivot.columns:
                df[metric] = detailed_pivot[metric]
        df["average"] = compute_normalized_average(df, task_metrics)
    df = add_confidence_intervals(df, scores_df_detailed, "bcp_47", task_metrics)
    # Merge with language metadata and sort
    df = pd.merge(languages, df, on="bcp_47", how="outer").sort_values(
        by="speakers", ascending=False
    )
    # Select columns dynamically
    metric_cols = [m for m in df.columns if any(tm in m for tm in task_metrics)]
    avg_ci_cols = [
        c for c in df.columns if c in ["average_ci_lower", "average_ci_upper"]
    ]
    return df[
        [
            "bcp_47",
            "language_name",
            "autonym",
            "speakers",
            "family",
            "average",
            *avg_ci_cols,
            "in_benchmark",
            *sorted(set(metric_cols)),
        ]
    ]

def make_language_tier_history(scores_df, languages, models):
    ranked_langs = languages.sort_values(by="speakers", ascending=False).reset_index(
        drop=True
    )
    tier_ranges = {"Top 1": (0, 1), "Top 2-20": (1, 20), "Top 20-200": (19, 200)}
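    # Note: iloc slices are half-open, so (1, 20) covers speaker ranks 2-20
    # and (19, 200) covers ranks 20-200 (rank 20 falls into both tiers).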
    # Calculate model-language overall scores
    scores_df = scores_df.copy()
    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
    pivot = scores_df.pivot_table(
        index=["model", "bcp_47"], columns="task_metric", values="score", aggfunc="mean"
    )
    for metric in task_metrics:
        pivot[metric] = pivot.get(metric, np.nan)
    pivot["proficiency_score"] = compute_normalized_average(pivot, task_metrics)
    pivot = pivot.reset_index()
    # Aggregate by tier
    tier_scores = pd.concat(
        [
            pivot[pivot["bcp_47"].isin(ranked_langs.iloc[start:end]["bcp_47"])]
            .groupby("model")["proficiency_score"]
            .mean()
            .reset_index()
            .assign(tier=tier_name)
            for tier_name, (start, end) in tier_ranges.items()
        ],
        ignore_index=True,
    )
    tier_scores = pd.merge(
        tier_scores, models, left_on="model", right_on="id", how="left"
    )
| tier_scores["creation_date"] = tier_scores["creation_date"].apply( | |
| lambda x: x.isoformat() if x else None | |
| ) | |
    return tier_scores[
        [
            "model",
            "name",
            "provider_name",
            "creation_date",
            "size",
            "tier",
            "proficiency_score",
        ]
    ]
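
# Each tier contributes one row per model: the model's proficiency_score
# averaged (unweighted) over that tier's languages.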

def make_license_history(scores_df, models):
    scores_df = scores_df.copy()
    scores_df["task_metric"] = scores_df["task"] + "_" + scores_df["metric"]
    # Pivot and compute overall score
    pivot = scores_df.pivot_table(
        index="model", columns="task_metric", values="score", aggfunc="mean"
    )
    for metric in task_metrics:
        pivot[metric] = pivot.get(metric, np.nan)
    pivot["proficiency_score"] = compute_normalized_average(pivot, task_metrics)
    # Merge and classify
    df = pd.merge(
        pivot.reset_index(), models, left_on="model", right_on="id", how="left"
    )
    df["license_type"] = df["type"].apply(
        lambda x: "Open-source" if x == "open-source" else "Commercial"
    )
| df["creation_date"] = df["creation_date"].apply( | |
| lambda x: x.isoformat() if x else None | |
| ) | |
    return df[
        [
            "model",
            "name",
            "provider_name",
            "creation_date",
            "size",
            "license_type",
            "proficiency_score",
        ]
    ]

app = FastAPI()
app.add_middleware(CORSMiddleware, allow_origins=["*"])
app.add_middleware(GZipMiddleware, minimum_size=1000)
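
# NaN is not valid JSON, so serialize() maps it to None (rendered as null).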
def serialize(df):
    return df.replace({np.nan: None}).to_dict(orient="records")

# NOTE: the route path below is an assumption; adjust it to match wherever the
# frontend posts its {"selectedLanguages": [...]} payload.
@app.post("/api/data")
async def data(request: Request):
    body = await request.body()
    payload = json.loads(body)
    selected_languages = payload.get("selectedLanguages", [])
    # Identify which metrics have machine translations available
    machine_rows = scores[scores["origin"] == "machine"]
    machine_translated_metrics = set(
        machine_rows["task"] + "_" + machine_rows["metric"]
    )
    # Filter by selected languages if provided
    df = (
        scores[scores["bcp_47"].isin(lang["bcp_47"] for lang in selected_languages)]
        if selected_languages
        else scores
    )
    df_detailed = (
        scores_detailed[
            scores_detailed["bcp_47"].isin(
                lang["bcp_47"] for lang in selected_languages
            )
        ]
        if selected_languages
        else scores_detailed
    )
    if len(df) == 0:
        model_table = pd.DataFrame()
        countries = pd.DataFrame()
    else:
        model_table = make_model_table(df, models, df_detailed)
        countries = make_country_table(make_language_table(df, languages, df_detailed))
    language_table = make_language_table(scores, languages, scores_detailed)
    language_tier_history = make_language_tier_history(scores, languages, models)
    license_history = make_license_history(scores, models)
    datasets_df = pd.read_json("data/datasets.json")
    return JSONResponse(
        content={
            "model_table": serialize(model_table),
            "language_table": serialize(language_table),
            "dataset_table": serialize(datasets_df),
            "countries": serialize(countries),
            "machine_translated_metrics": list(machine_translated_metrics),
            "language_tier_history": serialize(language_tier_history),
            "license_history": serialize(license_history),
        }
    )
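
# Example request (illustrative; the path must match the route above):
#
#   curl -X POST http://localhost:8000/api/data \
#        -H "Content-Type: application/json" \
#        -d '{"selectedLanguages": [{"bcp_47": "sw"}]}'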

# Only serve static files if build directory exists
if os.path.exists("frontend/build"):
    app.mount("/", StaticFiles(directory="frontend/build", html=True), name="frontend")

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=int(os.environ.get("PORT", 8000)))