```python
import gradio as gr
import json
import pandas as pd
import plotly.graph_objects as go

# Load and process results
with open("results.json") as f:
    results = json.load(f)
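
# The code below assumes results.json is a list of per-language entries shaped
# roughly like this (schema inferred from the fields accessed later; the
# concrete values and model IDs are only illustrative):
# [
#   {
#     "language_name": "Example Language",
#     "speakers": 71000000,
#     "bleu": 0.421,                          # average BLEU across models
#     "scores": [
#       {"model": "example-org/model-a", "bleu": 0.451},
#       {"model": "example-org/model-b", "bleu": 0.391}
#     ]
#   },
#   ...
# ]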

def create_model_comparison_plot(results):
    # Extract all unique models
    models = set()
    for lang in results:
        for score in lang["scores"]:
            models.add(score["model"])
    models = list(models)

    # Create traces for each model
    traces = []
    for model in models:
        x_vals = []  # languages
        y_vals = []  # BLEU scores

        for lang in results:
            model_score = next(
                (s["bleu"] for s in lang["scores"] if s["model"] == model), None
            )
            if model_score is not None:
                x_vals.append(lang["language_name"])
                y_vals.append(model_score)

        traces.append(
            go.Bar(
                name=model.split("/")[-1],
                x=x_vals,
                y=y_vals,
            )
        )

    fig = go.Figure(data=traces)
    fig.update_layout(
        title="BLEU Scores by Model and Language",
        xaxis_title="Language",
        yaxis_title="BLEU Score",
        barmode="group",
        height=500,
    )
    return fig

def create_scatter_plot(results):
    fig = go.Figure()

    x_vals = [lang["speakers"] / 1_000_000 for lang in results]  # Convert to millions
    y_vals = [lang["bleu"] for lang in results]
    labels = [lang["language_name"] for lang in results]

    fig.add_trace(
        go.Scatter(
            x=x_vals,
            y=y_vals,
            mode="markers+text",
            text=labels,
            textposition="top center",
            hovertemplate="<b>%{text}</b><br>"
            + "Speakers: %{x:.1f}M<br>"
            + "BLEU Score: %{y:.3f}<extra></extra>",
        )
    )

    fig.update_layout(
        title="Language Coverage: Speakers vs BLEU Score",
        xaxis_title="Number of Speakers (Millions)",
        yaxis_title="Average BLEU Score",
        height=500,
        showlegend=False,
    )

    # Use log scale for x-axis since speaker numbers vary widely
    fig.update_xaxes(type="log")
    return fig

def create_results_df(results):
    # Create a list to store flattened data
    flat_data = []
    for lang in results:
        # Find the best model and its BLEU score
        best_score = max(
            lang["scores"] or [{"bleu": None, "model": None}],
            key=lambda x: x["bleu"],
        )
        row = {
            "Language": lang["language_name"],
            "Speakers (M)": round(lang["speakers"] / 1_000_000, 1),
            "Models Tested": len(lang["scores"]),
            "Average BLEU": round(lang["bleu"], 3) if lang["bleu"] is not None else "N/A",
            "Best Model": best_score["model"] if best_score["model"] is not None else "N/A",
            "Best Model BLEU": round(best_score["bleu"], 3) if best_score["bleu"] is not None else "N/A",
        }
        flat_data.append(row)

    return pd.DataFrame(flat_data)

def create_leaderboard_df(results):
    # Sort languages by average BLEU to determine resource categories
    langs_with_bleu = [lang for lang in results if lang["bleu"] is not None]
    sorted_langs = sorted(langs_with_bleu, key=lambda x: x["bleu"], reverse=True)
    n_langs = len(sorted_langs)
    high_cutoff = n_langs // 4  # top 25%
    low_cutoff = n_langs - n_langs // 4  # bottom 25%

    # Create sets of languages for each category
    high_resource = {lang["language_name"] for lang in sorted_langs[:high_cutoff]}
    low_resource = {lang["language_name"] for lang in sorted_langs[low_cutoff:]}
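
    # Example of the cutoff arithmetic: with 12 ranked languages,
    # high_cutoff = 12 // 4 = 3 and low_cutoff = 12 - 3 = 9, so the top 3
    # are High-Resource, the middle 6 are Mid-Resource, and the bottom 3
    # are Low-Resource.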

    # Get all model scores with categorization
    model_scores = {}
    for lang in results:
        category = ("High-Resource" if lang["language_name"] in high_resource else
                    "Low-Resource" if lang["language_name"] in low_resource else
                    "Mid-Resource")
        for score in lang["scores"]:
            model_name = score["model"].split("/")[-1]
            if model_name not in model_scores:
                model_scores[model_name] = {
                    "High-Resource": [],
                    "Mid-Resource": [],
                    "Low-Resource": [],
                }
            model_scores[model_name][category].append(score["bleu"])

    # Calculate average scores and create DataFrame
    leaderboard_data = []
    for model, categories in model_scores.items():
        # Calculate averages for each category
        high_avg = round(sum(categories["High-Resource"]) / len(categories["High-Resource"]), 3) if categories["High-Resource"] else 0
        mid_avg = round(sum(categories["Mid-Resource"]) / len(categories["Mid-Resource"]), 3) if categories["Mid-Resource"] else 0
        low_avg = round(sum(categories["Low-Resource"]) / len(categories["Low-Resource"]), 3) if categories["Low-Resource"] else 0

        # Calculate overall average
        all_scores = (categories["High-Resource"] +
                      categories["Mid-Resource"] +
                      categories["Low-Resource"])
        overall_avg = round(sum(all_scores) / len(all_scores), 3)

        leaderboard_data.append({
            "Model": model,
            "Overall BLEU": overall_avg,
            "High-Resource BLEU": high_avg,
            "Mid-Resource BLEU": mid_avg,
            "Low-Resource BLEU": low_avg,
            "Languages Tested": len(all_scores),
        })

    # Sort by overall BLEU
    df = pd.DataFrame(leaderboard_data)
    df = df.sort_values("Overall BLEU", ascending=False)

    # Add rank and medals
    df["Rank"] = range(1, len(df) + 1)
    df["Rank"] = df["Rank"].apply(
        lambda x: "🥇" if x == 1 else "🥈" if x == 2 else "🥉" if x == 3 else str(x)
    )

    # Reorder columns
    df = df[["Rank", "Model", "Overall BLEU", "High-Resource BLEU",
             "Mid-Resource BLEU", "Low-Resource BLEU", "Languages Tested"]]
    return df

# Create the visualization components
with gr.Blocks(title="AI Language Translation Benchmark") as demo:
    gr.Markdown("# AI Language Translation Benchmark")
    gr.Markdown(
        "Comparing translation performance across different AI models and languages"
    )

    df = create_results_df(results)
    leaderboard_df = create_leaderboard_df(results)
    bar_plot = create_model_comparison_plot(results)
    scatter_plot = create_scatter_plot(results)

    gr.DataFrame(value=leaderboard_df, label="Model Leaderboard", show_search=False)
    gr.Plot(value=bar_plot, label="Model Comparison")
    gr.DataFrame(value=df, label="Language Results", show_search="search")
    gr.Plot(value=scatter_plot, label="Language Coverage")

    gr.Markdown("""
## Methodology

### Dataset
- Uses the [FLORES-200](https://huggingface.co/datasets/openlanguagedata/flores_plus) evaluation set, a high-quality, human-translated benchmark covering 200 languages
- Each language is tested with the same 100 sentences
- All translations go from the evaluated language into a fixed set of representative target languages, sampled by number of speakers
- Language statistics are sourced from Ethnologue and Wikidata

### Models & Evaluation
- Models are accessed through [OpenRouter](https://openrouter.ai/), including the fast models of all major labs, open and closed
- **BLEU Score**: Translations are evaluated with the BLEU metric, which measures how similar the model's translation is to a human reference translation; higher is better

### Language Categories
Languages are divided into three tiers based on translation difficulty:
- High-Resource: Top 25% of languages by BLEU score (easiest to translate)
- Mid-Resource: Middle 50% of languages
- Low-Resource: Bottom 25% of languages (hardest to translate)
""", container=True)

demo.launch()
```
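
The app only visualizes precomputed scores; `results.json` comes from a separate evaluation step that is not shown here. As a rough, hypothetical sketch of how a single language entry could be produced with the BLEU metric described in the methodology, the snippet below uses the `sacrebleu` package. The sentences, model IDs, speaker count, and the normalization to a 0-1 score scale are illustrative assumptions, not details taken from the actual benchmark.

```python
# Hypothetical sketch: build one results.json-style entry for a single language.
# Assumes translations have already been generated per model and that
# sacrebleu is installed (pip install sacrebleu).
import json
import sacrebleu

# Reference translations and model outputs (illustrative only)
references = ["The cat sits on the mat.", "It is raining heavily today."]
hypotheses_by_model = {
    "example-org/model-a": ["The cat sits on the mat.", "It rains a lot today."],
    "example-org/model-b": ["A cat is on the mat.", "Today it is raining hard."],
}

scores = []
for model, hypotheses in hypotheses_by_model.items():
    # corpus_bleu takes the hypotheses plus a list of reference streams;
    # its .score is on a 0-100 scale, normalized here to 0-1.
    bleu = sacrebleu.corpus_bleu(hypotheses, [references]).score / 100
    scores.append({"model": model, "bleu": round(bleu, 3)})

entry = {
    "language_name": "Example Language",
    "speakers": 1_000_000,  # illustrative speaker count
    "bleu": round(sum(s["bleu"] for s in scores) / len(scores), 3),
    "scores": scores,
}
print(json.dumps(entry, indent=2))
```

To serve the dashboard itself, run the script with Python; `demo.launch()` starts a local Gradio server.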