David Pomerenke committed
Commit df383f6 · 1 Parent(s): ed78196

Add links to OpenRouter
app.py
CHANGED
@@ -4,11 +4,14 @@ import gradio as gr
 import pandas as pd
 import plotly.graph_objects as go
 
-# Load and process results
 with open("results.json") as f:
     results = json.load(f)
 
 
+def mean(lst):
+    return sum(lst) / len(lst)
+
+
 def create_leaderboard_df(results):
     # Sort languages by average BLEU to determine resource categories
     langs_with_bleu = [lang for lang in results if lang["bleu"] is not None]
@@ -33,33 +36,31 @@ def create_leaderboard_df(results):
         )
 
         for score in lang["scores"]:
-            …
-            if …
-                model_scores[…
+            model = score["model"]
+            if model not in model_scores:
+                model_scores[model] = {
                     "High-Resource": [],
                     "Mid-Resource": [],
                     "Low-Resource": [],
                 }
-            model_scores[…
+            model_scores[model][category].append(score["bleu"])
 
     # Calculate average scores and create DataFrame
     leaderboard_data = []
     for model, categories in model_scores.items():
         # Calculate averages for each category
         high_avg = (
-            round(
-                sum(categories["High-Resource"]) / len(categories["High-Resource"]), 3
-            )
+            round(mean(categories["High-Resource"]), 3)
             if categories["High-Resource"]
             else 0
         )
         mid_avg = (
-            round(…
+            round(mean(categories["Mid-Resource"]), 3)
             if categories["Mid-Resource"]
             else 0
         )
         low_avg = (
-            round(…
+            round(mean(categories["Low-Resource"]), 3)
             if categories["Low-Resource"]
             else 0
         )
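Note: the new mean() helper replaces the hand-written sum(...) / len(...) expressions used for the three category averages. A quick sanity check (values chosen so the average is exact in floating point); the surrounding "if categories[...] else 0" guards still cover the empty-list case, which mean() itself would turn into a ZeroDivisionError:

def mean(lst):
    return sum(lst) / len(lst)

assert mean([0.25, 0.5, 0.75]) == 0.5
assert round(mean([0.25, 0.5, 0.75]), 3) == 0.5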
@@ -72,9 +73,10 @@ def create_leaderboard_df(results):
         )
         overall_avg = round(sum(all_scores) / len(all_scores), 3)
 
+        model_name = model.split("/")[-1]
         leaderboard_data.append(
             {
-                "Model": model,
+                "Model": f"[{model_name}](https://openrouter.ai/{model})",
                 "Overall BLEU": overall_avg,
                 "High-Resource BLEU": high_avg,
                 "Mid-Resource BLEU": mid_avg,
@@ -106,7 +108,20 @@ def create_leaderboard_df(results):
         ]
     ]
 
-    return …
+    return gr.DataFrame(
+        value=df,
+        label="Model Leaderboard",
+        show_search=False,
+        datatype=[
+            "number",
+            "markdown",
+            "number",
+            "number",
+            "number",
+            "number",
+            "number",
+        ],
+    )
 
 
 def create_model_comparison_plot(results):
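Note: the leaderboard's "Model" cell is now a markdown link to the model's page on OpenRouter, and it renders as a link because that column is declared "markdown" in the datatype list above. A small illustration of how the cell text is assembled; the model id below is only an example, not taken from results.json:

model = "mistralai/mistral-7b-instruct"  # example OpenRouter-style id
model_name = model.split("/")[-1]        # "mistral-7b-instruct"
cell = f"[{model_name}](https://openrouter.ai/{model})"
# -> "[mistral-7b-instruct](https://openrouter.ai/mistralai/mistral-7b-instruct)"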
@@ -160,23 +175,30 @@ def create_language_stats_df(results):
             lang["scores"] or [{"bleu": None, "model": None}], key=lambda x: x["bleu"]
         )
 
+        model = best_score['model']
+        model_name = model.split('/')[-1] if model else "N/A"
+        model_link = f"<a href='https://openrouter.ai/{model}' style='text-decoration: none; color: inherit;'>{model_name}</a>" if model else "N/A"
         row = {
-            "Language": lang[…
+            "Language": f"**{lang['language_name']}**",
             "Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
             "Models Tested": len(lang["scores"]),
             "Average BLEU": round(lang["bleu"], 3)
             if lang["bleu"] is not None
             else "N/A",
-            "Best Model": …
-            if best_score["model"] is not None
-            else "N/A",
+            "Best Model": model_link,
             "Best Model BLEU": round(best_score["bleu"], 3)
             if best_score["bleu"] is not None
             else "N/A",
         }
         flat_data.append(row)
 
-    …
+    df = pd.DataFrame(flat_data)
+    return gr.DataFrame(
+        value=df,
+        label="Language Results",
+        show_search="search",
+        datatype=["markdown", "number", "number", "number", "markdown", "number"],
+    )
 
 
 def create_scatter_plot(results):
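Note: the per-language table links the best model with a raw HTML anchor instead of a markdown link, presumably so the inline style (no underline, inherited color) keeps the cell looking like plain text; the "Best Model" column is still typed "markdown" so the HTML is rendered. A sketch of the fallback path for a language with no scores, where best_score["model"] is None:

best_score = {"model": None, "bleu": None}  # hypothetical language with no scores
model = best_score["model"]
model_name = model.split("/")[-1] if model else "N/A"
model_link = (
    f"<a href='https://openrouter.ai/{model}' "
    f"style='text-decoration: none; color: inherit;'>{model_name}</a>"
    if model
    else "N/A"
)
assert model_link == "N/A"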
@@ -220,14 +242,12 @@ with gr.Blocks(title="AI Language Translation Benchmark") as demo:
         "Comparing translation performance across different AI models and languages"
     )
 
-    df = create_language_stats_df(results)
-    leaderboard_df = create_leaderboard_df(results)
     bar_plot = create_model_comparison_plot(results)
     scatter_plot = create_scatter_plot(results)
 
-    …
+    create_leaderboard_df(results)
     gr.Plot(value=bar_plot, label="Model Comparison")
-    …
+    create_language_stats_df(results)
     gr.Plot(value=scatter_plot, label="Language Coverage")
 
     gr.Markdown(
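Note: since both helper functions now return gr.DataFrame components, calling them directly inside the with gr.Blocks() context is enough to place the tables in the layout, so the old local variables are no longer needed. A minimal standalone sketch of that pattern; the names here are illustrative, not from app.py:

import gradio as gr
import pandas as pd


def make_table():
    # A component created inside a Blocks context is attached to the layout.
    return gr.DataFrame(value=pd.DataFrame({"a": [1, 2]}), label="Demo table")


with gr.Blocks() as demo:
    make_table()  # the returned DataFrame is rendered here
    gr.Markdown("Text below the table")

# demo.launch()  # uncomment to serve locally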