David Pomerenke committed · Commit 63fd3b1 · Parent: 4f572a5

Improve methodology
app.py
CHANGED
@@ -1,4 +1,5 @@
 import json
+from functools import partial

 import gradio as gr
 import pandas as pd
@@ -68,8 +69,12 @@ def mean(lst):

 def create_leaderboard_df(metric):
     # Sort languages by average BLEU to determine resource categories
+    langs_with_score = [
+        lang for lang in results if lang[metric["field_name"]] is not None
+    ]
+    sorted_langs = sorted(
+        langs_with_score, key=lambda x: x[metric["field_name"]], reverse=True
+    )
     n_langs = len(sorted_langs)
     high_cutoff = n_langs // 4  # top 25%
     low_cutoff = n_langs - n_langs // 4  # bottom 25%
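The hunk above only introduces the sorted `langs_with_score` list; the surrounding function then splits it into resource tiers at the 25% cutoffs shown. A minimal self-contained sketch of that split, with hypothetical `results` records and metric dict standing in for the structures defined elsewhere in app.py:

```python
# Sketch: split languages into resource tiers by a metric (hypothetical data).
results = [
    {"language_name": "A", "overall_score": 0.9},
    {"language_name": "B", "overall_score": 0.7},
    {"language_name": "C", "overall_score": 0.4},
    {"language_name": "D", "overall_score": None},  # languages without a score are skipped
]
metric = {"field_name": "overall_score"}  # hypothetical metric entry

langs_with_score = [lang for lang in results if lang[metric["field_name"]] is not None]
sorted_langs = sorted(langs_with_score, key=lambda x: x[metric["field_name"]], reverse=True)

n_langs = len(sorted_langs)
high_cutoff = n_langs // 4            # top 25%  -> "High-Resource"
low_cutoff = n_langs - n_langs // 4   # bottom 25% -> "Low-Resource"
high = sorted_langs[:high_cutoff]
mid = sorted_langs[high_cutoff:low_cutoff]
low = sorted_langs[low_cutoff:]
print(len(high), len(mid), len(low))  # 0 3 0 for this tiny sample
```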
@@ -97,7 +102,7 @@ def create_leaderboard_df(metric):
                     "Mid-Resource": [],
                     "Low-Resource": [],
                 }
+            model_scores[model][category].append(score[metric["field_name"]])

     # Calculate average scores and create DataFrame
     leaderboard_data = []
@@ -183,14 +188,14 @@ def create_model_comparison_plot(metric):

     # Create appropriate title and y-axis label based on metric
     title = f"{metric['display_name']} by Model and Language"
+    y_label = metric["label"]

     # Flatten the data for the selected metric
     scores_flat = []
     for lang in top_languages:
         for score in lang["scores"]:
             # Get the value directly using the field name
+            value = score[metric["field_name"]]
             if value is not None:
                 scores_flat.append(
                     {
@@ -292,9 +297,9 @@ def create_scatter_plot(metric):
     for lang in filtered_results:
         # Calculate average score for this metric across all models
         scores = [
+            score[metric["field_name"]]
             for score in lang["scores"]
+            if score[metric["field_name"]] is not None
         ]
         if scores:  # Only include if we have valid scores
             avg_score = sum(scores) / len(scores)
@@ -332,7 +337,7 @@ def create_scatter_plot(metric):
     fig.update_layout(
         title=None,
         xaxis_title="Number of Speakers (Millions)",
+        yaxis_title=metric["label"],
         height=500,
         showlegend=False,
     )
@@ -368,6 +373,7 @@ def get_population_data():
         data[t_code] = t_population
     return data

+
 # Helper functions for visualization
 def make_black_bar(value, max_width=10):
     filled = int(value * max_width)
@@ -396,13 +402,14 @@ def make_colored_bar(score, max_width=10):
     else:
         return "🟥" * filled + "⬜" * empty

+
 def create_world_map(metric):
     # Collect all country data
     population_data = get_population_data()
     country_data = {}
     for lang in results:
         # Skip languages without the required data
+        if "population" not in lang or lang[metric["field_name"]] is None:
             continue

         for country_code, speakers in lang["population"].items():
@@ -423,13 +430,13 @@ def create_world_map(metric):

             country_data[iso3_code]["total_speakers"] += speakers
             country_data[iso3_code]["weighted_score_sum"] += (
+                speakers * lang[metric["field_name"]]
             )
             country_data[iso3_code]["languages"].append(
                 {
                     "name": lang["language_name"],
                     "speakers": speakers,
+                    "score": lang[metric["field_name"]],
                 }
             )
         except (KeyError, AttributeError):
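The accumulation above builds toward a speaker-weighted average score per country (presumably divided out later in app.py, outside this hunk). A tiny sketch of that aggregation with made-up numbers:

```python
# Sketch: speaker-weighted average score for one country (hypothetical numbers).
entries = [
    {"speakers": 40_000_000, "score": 0.8},  # e.g. language X speakers in country Y
    {"speakers": 10_000_000, "score": 0.4},  # e.g. language Z speakers in country Y
]

total_speakers = sum(e["speakers"] for e in entries)
weighted_score_sum = sum(e["speakers"] * e["score"] for e in entries)
avg_score = weighted_score_sum / total_speakers
print(round(avg_score, 2))  # 0.72
```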
@@ -506,7 +513,7 @@ def create_world_map(metric):
             hoverinfo="text",
             colorscale=[[0, "#ff9999"], [1, "#99ccff"]],
             colorbar=dict(
+                title=metric["label"],
                 orientation="h",  # horizontal orientation
                 y=-0.2,  # position below map
                 yanchor="bottom",
@@ -519,7 +526,9 @@ def create_world_map(metric):
     )

     fig.update_layout(
+        title=dict(
+            text=f"{metric['display_name']} by Country", x=0.5, xanchor="center"
+        ),
         geo=dict(
             showframe=True,
             showcoastlines=True,
@@ -540,23 +549,19 @@ def create_world_map(metric):

     return fig

+
 def create_metric_explanation(metric):
+    return gr.Markdown(metric["explanation"])


 # Create the visualization components
 with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
     gr.Markdown("# AI Language Proficiency Benchmark")
+    gr.Markdown("Comparing language proficiency across different models and languages.")
     start_metric = METRICS["overall_performance"]

     metric = gr.Dropdown(
+        choices=[metric_info["display_name"] for metric_info in METRICS.values()],
         value=start_metric["display_name"],
         label="Select Metric",
         interactive=True,
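The keys accessed throughout this commit (`display_name`, `field_name`, `label`, `explanation`) imply a `METRICS` registry shaped roughly as below; the key names and the `overall_performance` entry come from the code itself, while the concrete values are illustrative assumptions:

```python
# Assumed shape of the METRICS registry used by the dropdown, plots, and map.
# Key names are taken from the diff; the example values are hypothetical.
METRICS = {
    "overall_performance": {
        "display_name": "Overall Performance",  # shown in the dropdown
        "field_name": "overall_score",          # field looked up in each result record
        "label": "Overall Score",               # axis / colorbar label
        "explanation": "Average of all task metrics for each language.",
    },
    # ... one entry per metric (BLEU, ChrF, accuracy, ...)
}
```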
@@ -586,38 +591,58 @@ with gr.Blocks(title="AI Language Proficiency Benchmark") as demo:
     gr.Markdown(
         """
         ## Methodology
+
+        ### Benchmark Data
+        We use the [FLORES+](https://huggingface.co/datasets/openlanguagedata/flores_plus) dataset for evaluation, which contains parallel text in over 200 languages, as well as topic labels for each sentence. Where FLORES+ includes multiple scripts for one language, we use only the most common one.
+
+        Population and speaker data and language code resolution are from Unicode [CLDR](https://github.com/unicode-org/cldr) via the [langcodes](https://github.com/rspeer/langcodes) package.
+
+        ### AI Models
+        We use [OpenRouter](https://openrouter.ai/) to access all relevant AI models via a unified API.
+
+        ### Evaluation Tasks
+        Our benchmark includes three core tasks to assess different aspects of language understanding:
+
+        1. **Machine Translation**: Models translate text _from_ the evaluated language _to_ a fixed set of target languages. The set of target languages is representative of global speaker populations. Performance is measured using:
+            - [BLEU Score](https://huggingface.co/metrics/bleu): Measures n-gram precision with a brevity penalty
+            - [ChrF Score](https://huggingface.co/metrics/chrf): Character-level F-score that better captures morphological variations
+
+        2. **Text Classification**: Models classify text into predefined topics after being shown examples. We:
+            - Group sentences by URL into paragraphs with the same topic
+            - Use the 5 most common topics, encoded as numbers rather than English labels
+            - Provide 5 examples of each topic as few-shot examples
+            - Test the model's ability to classify new text
+            - Report accuracy as the primary metric
+
+        3. **Masked Language Modeling**: Models predict missing portions of text (marked with `<mask>`). We:
+            - Mask approximately 5% of each sentence at a random position
+            - Provide 10 examples of complete sentences paired with masked versions in a few-shot setting
+            - Evaluate predictions using ChrF score against the original text
+
+        The overall performance score combines metrics from all tasks to provide a holistic assessment of model capabilities across languages.
         """,
         container=True,
     )
+
     def update_component(fn, metric_choice):
         metric = [m for m in METRICS.values() if m["display_name"] == metric_choice][0]
         return fn(metric)

+
+    metric.change(
+        fn=partial(update_component, create_metric_explanation),
+        inputs=metric,
+        outputs=metric_explanation,
+    )
     metric.change(
+        fn=partial(update_component, create_model_comparison_plot),
+        inputs=metric,
+        outputs=model_comparison_plot,
     )
     metric.change(
+        fn=partial(update_component, create_scatter_plot),
+        inputs=metric,
+        outputs=scatter_plot,
     )
     metric.change(
         fn=partial(update_component, create_world_map), inputs=metric, outputs=world_map