Commit 7fc657e · David Pomerenke committed
Parent(s): 0c05388

Classification evaluation

Files changed:
- app.py (+3, -0)
- evals.py (+89, -10)
- results.json (+0, -0)
app.py CHANGED

@@ -187,6 +187,9 @@ def create_language_stats_df(results):
             if best_score["bleu"] is not None
             else "N/A",
             "CommonVoice Hours": commonvoice_link,
+            "Accuracy": round(lang["accuracy"], 3)
+            if lang["accuracy"] is not None
+            else "N/A",
         }
         flat_data.append(row)

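The new "Accuracy" column mirrors the formatting already used for BLEU: round to three decimals when a score is present, fall back to "N/A" otherwise (the value is presumably the per-language accuracy that evals.py now writes into results.json). A minimal sketch of that round-or-"N/A" pattern; the helper name and the example dicts are hypothetical and not part of app.py:

    # Hypothetical helper showing the rounding/fallback used for the new column.
    def format_accuracy(lang):
        return round(lang["accuracy"], 3) if lang["accuracy"] is not None else "N/A"

    print(format_accuracy({"accuracy": 0.71428}))  # 0.714
    print(format_accuracy({"accuracy": None}))     # N/A
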
evals.py CHANGED

@@ -15,13 +15,14 @@ from langcodes import Language, standardize_tag
 from language_data.population_data import LANGUAGE_SPEAKING_POPULATION
 from openai import AsyncOpenAI
 from requests import get
+from rich import print
 from tqdm.asyncio import tqdm_asyncio
 from transformers import NllbTokenizer
 
 # config
 models = [
     "openai/gpt-4o-mini", # 0.6$/M tokens
-    # "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive
+    # "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive for dev
     "meta-llama/llama-3.3-70b-instruct", # 0.3$/M tokens
     "mistralai/mistral-small-24b-instruct-2501", # 0.14$/M tokens
     "google/gemini-2.0-flash-001", # 0.4$/M tokens

@@ -138,14 +139,14 @@ languages = pd.merge(
 ) # "left" because keep it simple for now
 languages["in_benchmark"] = languages["bcp_47"].isin(benchmark_languages["bcp_47"])
 
-languages = languages.sort_values(by="speakers", ascending=False)
+languages = languages.sort_values(by="speakers", ascending=False).iloc[:10]
 
 # sample languages to translate to
 target_languages = languages[languages["in_benchmark"]].sample(
     n=n_sentences, weights="speakers", replace=True, random_state=42
 )
 # sample languages to analyze with all models
-detailed_languages = languages[languages["in_benchmark"]].sample(n=
+detailed_languages = languages[languages["in_benchmark"]].sample(n=1, random_state=42)
 
 
 # utils

@@ -213,13 +214,71 @@ async def translate_and_evaluate(model, original_language_bcp_47, sentence_nr):
     }
 
 
+metadata = pd.read_csv("data/floresp-v2.0-rc.3/metadata_dev.tsv", sep="\t")
+
+@cache
+async def classify_and_evaluate(model, language_bcp_47, nr):
+    language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
+    sentences = pd.DataFrame(load_sentences(language), columns=["text"])
+    sentences = pd.concat([metadata, sentences], axis=1)
+    sentences = sentences.dropna(subset=["topic"])
+    sentences["topic"] = sentences["topic"].str.lower()
+    paragraphs = (
+        sentences.groupby("URL").agg({"text": " ".join, "topic": "first"}).reset_index()
+    )
+    top_topics = paragraphs.value_counts("topic").head(5).index
+    paragraphs = paragraphs[paragraphs["topic"].isin(top_topics)]
+    examples = pd.concat(
+        [
+            paragraphs[paragraphs["topic"] == t].sample(n=5, random_state=42)
+            for t in top_topics
+        ]
+    ).sample(frac=1, random_state=42)
+    test_paragraphs = paragraphs[~paragraphs["URL"].isin(examples["URL"])].sample(
+        frac=1, random_state=42
+    )
+    test_paragraph = test_paragraphs.iloc[nr]
+    messages = [
+        {
+            "role": "system",
+            "content": f"Categories: {'; '.join(examples['topic'].drop_duplicates())}.",
+        }
+    ]
+    for example in examples.itertuples():
+        messages += [
+            {"role": "user", "content": example.text},
+            {"role": "assistant", "content": example.topic},
+        ]
+    reply = await complete(
+        model=model,
+        messages=[
+            *messages,
+            {
+                "role": "user",
+                "content": test_paragraph.text,
+            },
+        ],
+        temperature=0,
+        max_tokens=1024,
+    )
+    prediction = reply.choices[0].message.content.strip()
+    return {
+        "model": model,
+        "bcp_47": language["bcp_47"],
+        "true": test_paragraph.topic,
+        "pred": prediction,
+        "sentence_nr": nr,
+    }
+
+
 def mean(lst):
     return sum(lst) / len(lst) if lst else 0
 
 
 # evaluation!
 async def main():
-
+    print("evaluate translation")
+    translation_scores = [
         translate_and_evaluate(model, original_language.bcp_47, i)
         for i in range(n_sentences)
         for original_language in languages.itertuples()

@@ -230,22 +289,41 @@ async def main():
             or original_language.bcp_47 in detailed_languages.bcp_47.values
         )
     ]
-
+    translation_scores = await tqdm_asyncio.gather(*translation_scores, miniters=1)
+    print("evaluate classification")
+    classification_scores = [
+        classify_and_evaluate(model, language.bcp_47, i)
+        for i in range(n_sentences)
+        for language in languages.itertuples()
+        for model in models
+        if language.in_benchmark
+        and (model == fast_model or language.bcp_47 in detailed_languages.bcp_47.values)
+    ]
+    classification_scores = await tqdm_asyncio.gather(
+        *classification_scores, miniters=1
+    )
     results = []
     for language in languages.itertuples():
         results_for_language = []
         for model in models:
-
+            translations_for_model = [
+                score
+                for score in translation_scores
+                if score["bcp_47"] == language.bcp_47 and score["model"] == model
+            ]
+            classifications_for_model = [
                 score
-                for score in
+                for score in classification_scores
                 if score["bcp_47"] == language.bcp_47 and score["model"] == model
             ]
-
+            accuracy = mean([s["true"] == s["pred"] for s in classifications_for_model])
+            if translations_for_model:
                results_for_language.append(
                    {
                        "model": model,
-                        "bleu": mean([s["bleu"] for s in
-                        "chrf": mean([s["chrf"] for s in
+                        "bleu": mean([s["bleu"] for s in translations_for_model]),
+                        "chrf": mean([s["chrf"] for s in translations_for_model]),
+                        "accuracy": accuracy,
                    }
                )
         if results_for_language:

@@ -257,6 +335,7 @@ async def main():
                 "scores": results_for_language,
                 "bleu": mean([s["bleu"] for s in results_for_language]),
                 "chrf": mean([s["chrf"] for s in results_for_language]),
+                "accuracy": mean([s["accuracy"] for s in results_for_language]),
                 "commonvoice_hours": language.commonvoice_hours
                 if not pd.isna(language.commonvoice_hours)
                 else None,
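The new classify_and_evaluate builds a few-shot topic-classification task from the FLORES+ dev metadata: sentences are grouped into paragraphs by URL, restricted to the five most frequent topics, and five example paragraphs per topic are presented as user/assistant turns before the test paragraph. main() then scores each model by exact-match accuracy between the predicted and gold topic. A self-contained sketch of that aggregation step, using made-up score dicts in the shape returned by classify_and_evaluate:

    # Made-up classification results of the form {"true": gold_topic, "pred": model_reply}.
    classifications_for_model = [
        {"true": "politics", "pred": "politics"},
        {"true": "health", "pred": "science"},
        {"true": "sports", "pred": "sports"},
    ]

    def mean(lst):
        return sum(lst) / len(lst) if lst else 0

    # Exact-match accuracy: share of paragraphs where the prediction equals the gold label.
    accuracy = mean([s["true"] == s["pred"] for s in classifications_for_model])
    print(accuracy)  # 0.666...

Since the comparison is plain string equality, the few-shot examples double as output-formatting hints: they show the model that a valid reply is the bare lowercase topic name, and anything more verbose counts as a miss.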
results.json CHANGED

The diff for this file is too large to render. See raw diff.