Spaces:
Running
Running
David Pomerenke
committed on
Commit
·
8274634
1
Parent(s):
9051509
Run on 100 languages, adjust display
Browse files
- evals/backend.py +1 -2
- evals/main.py +5 -13
- evals/models.py +8 -8
- evals/tasks.py +1 -1
- frontend/src/components/LanguageTable.js +8 -8
- frontend/src/components/ModelTable.js +32 -16
- results.json +0 -0
evals/backend.py
CHANGED
|
@@ -20,6 +20,7 @@ models = pd.DataFrame(results["models"])
|
|
| 20 |
def mean(lst):
|
| 21 |
return sum(lst) / len(lst) if lst else None
|
| 22 |
|
|
|
|
| 23 |
|
| 24 |
def make_model_table(df, models):
|
| 25 |
df = (
|
|
@@ -29,7 +30,6 @@ def make_model_table(df, models):
|
|
| 29 |
)
|
| 30 |
df["task_metric"] = df["task"] + "_" + df["metric"]
|
| 31 |
df = df.drop(columns=["task", "metric"])
|
| 32 |
-
task_metrics = df["task_metric"].unique()
|
| 33 |
df = df.pivot(index="model", columns="task_metric", values="score").fillna(0)
|
| 34 |
df["average"] = df[task_metrics].mean(axis=1)
|
| 35 |
df = df.sort_values(by="average", ascending=False).reset_index()
|
|
@@ -59,7 +59,6 @@ def make_language_table(df, languages):
|
|
| 59 |
)
|
| 60 |
df["task_metric"] = df["task"] + "_" + df["metric"]
|
| 61 |
df = df.drop(columns=["task", "metric"])
|
| 62 |
-
task_metrics = df["task_metric"].unique()
|
| 63 |
df = (
|
| 64 |
df.pivot(index="bcp_47", columns="task_metric", values="score")
|
| 65 |
.fillna(0)
|
|
|
|
| 20 |
def mean(lst):
|
| 21 |
return sum(lst) / len(lst) if lst else None
|
| 22 |
|
| 23 |
+
task_metrics = ["translation_bleu", "classification_accuracy"]
|
| 24 |
|
| 25 |
def make_model_table(df, models):
|
| 26 |
df = (
|
|
|
|
| 30 |
)
|
| 31 |
df["task_metric"] = df["task"] + "_" + df["metric"]
|
| 32 |
df = df.drop(columns=["task", "metric"])
|
|
|
|
| 33 |
df = df.pivot(index="model", columns="task_metric", values="score").fillna(0)
|
| 34 |
df["average"] = df[task_metrics].mean(axis=1)
|
| 35 |
df = df.sort_values(by="average", ascending=False).reset_index()
|
|
|
|
| 59 |
)
|
| 60 |
df["task_metric"] = df["task"] + "_" + df["metric"]
|
| 61 |
df = df.drop(columns=["task", "metric"])
|
|
|
|
| 62 |
df = (
|
| 63 |
df.pivot(index="bcp_47", columns="task_metric", values="score")
|
| 64 |
.fillna(0)
|
evals/main.py
CHANGED
|
@@ -6,16 +6,12 @@ import pandas as pd
|
|
| 6 |
from tqdm.asyncio import tqdm_asyncio
|
| 7 |
|
| 8 |
from languages import languages
|
| 9 |
-
from models import
|
| 10 |
from tasks import tasks
|
| 11 |
|
| 12 |
# ===== config =====
|
| 13 |
|
| 14 |
-
n_sentences =
|
| 15 |
-
langs_eval = languages.iloc[:30]
|
| 16 |
-
langs_eval_detailed = languages.iloc[:2]
|
| 17 |
-
transcription_langs_eval = languages.iloc[:10]
|
| 18 |
-
transcription_langs_eval_detailed = languages.iloc[:5]
|
| 19 |
|
| 20 |
# ===== run evaluation and aggregate results =====
|
| 21 |
|
|
@@ -23,16 +19,12 @@ transcription_langs_eval_detailed = languages.iloc[:5]
|
|
| 23 |
async def evaluate():
|
| 24 |
print("running evaluations")
|
| 25 |
results = [
|
| 26 |
-
task(model,
|
| 27 |
for task in tasks
|
| 28 |
for i in range(n_sentences)
|
| 29 |
-
for
|
| 30 |
for model in models["id"]
|
| 31 |
-
if
|
| 32 |
-
and (
|
| 33 |
-
model == model_fast
|
| 34 |
-
or original_language.bcp_47 in langs_eval_detailed.bcp_47.values
|
| 35 |
-
)
|
| 36 |
]
|
| 37 |
return await tqdm_asyncio.gather(*results, miniters=1)
|
| 38 |
|
|
|
|
| 6 |
from tqdm.asyncio import tqdm_asyncio
|
| 7 |
|
| 8 |
from languages import languages
|
| 9 |
+
from models import models
|
| 10 |
from tasks import tasks
|
| 11 |
|
| 12 |
# ===== config =====
|
| 13 |
|
| 14 |
+
n_sentences = 10
|
|
|
|
|
|
|
|
|
|
|
|
|
| 15 |
|
| 16 |
# ===== run evaluation and aggregate results =====
|
| 17 |
|
|
|
|
| 19 |
async def evaluate():
|
| 20 |
print("running evaluations")
|
| 21 |
results = [
|
| 22 |
+
task(model, lang.bcp_47, i)
|
| 23 |
for task in tasks
|
| 24 |
for i in range(n_sentences)
|
| 25 |
+
for lang in languages.iloc[:100].itertuples()
|
| 26 |
for model in models["id"]
|
| 27 |
+
if lang.in_benchmark
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
]
|
| 29 |
return await tqdm_asyncio.gather(*results, miniters=1)
|
| 30 |
|
evals/models.py
CHANGED
|
@@ -14,23 +14,24 @@ from requests import HTTPError
|
|
| 14 |
models = [
|
| 15 |
"openai/gpt-4o-mini", # 0.6$/M tokens
|
| 16 |
# "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive for dev
|
|
|
|
| 17 |
"meta-llama/llama-3.3-70b-instruct", # 0.3$/M tokens
|
| 18 |
"meta-llama/llama-3.1-70b-instruct", # 0.3$/M tokens
|
| 19 |
"meta-llama/llama-3-70b-instruct", # 0.4$/M tokens
|
| 20 |
-
"mistralai/mistral-small-24b-instruct
|
| 21 |
-
"mistralai/mistral-
|
|
|
|
| 22 |
"google/gemini-2.0-flash-001", # 0.4$/M tokens
|
| 23 |
-
"google/gemini-2.0-flash-lite-001", # 0.3$/M tokens
|
| 24 |
"google/gemma-3-27b-it", # 0.2$/M tokens
|
| 25 |
# "qwen/qwen-turbo", # 0.2$/M tokens; recognizes "inappropriate content"
|
| 26 |
-
"qwen/qwq-32b",
|
| 27 |
-
|
| 28 |
# "microsoft/phi-4", # 0.07$/M tokens; only 16k tokens context
|
| 29 |
-
"microsoft/phi-4-multimodal-instruct",
|
| 30 |
"amazon/nova-micro-v1", # 0.09$/M tokens
|
| 31 |
# "openGPT-X/Teuken-7B-instruct-research-v0.4", # not on OpenRouter
|
| 32 |
]
|
| 33 |
-
model_fast = "meta-llama/llama-3.3-70b-instruct"
|
| 34 |
|
| 35 |
transcription_models = [
|
| 36 |
"elevenlabs/scribe_v1",
|
|
@@ -38,7 +39,6 @@ transcription_models = [
|
|
| 38 |
# "openai/whisper-small",
|
| 39 |
# "facebook/seamless-m4t-v2-large",
|
| 40 |
]
|
| 41 |
-
transcription_model_fast = "elevenlabs/scribe_v1"
|
| 42 |
|
| 43 |
load_dotenv()
|
| 44 |
client = AsyncOpenAI(
|
|
|
|
| 14 |
models = [
|
| 15 |
"openai/gpt-4o-mini", # 0.6$/M tokens
|
| 16 |
# "anthropic/claude-3.5-haiku", # 4$/M tokens -> too expensive for dev
|
| 17 |
+
"meta-llama/llama-4-maverick", # 0.6$/M tokens
|
| 18 |
"meta-llama/llama-3.3-70b-instruct", # 0.3$/M tokens
|
| 19 |
"meta-llama/llama-3.1-70b-instruct", # 0.3$/M tokens
|
| 20 |
"meta-llama/llama-3-70b-instruct", # 0.4$/M tokens
|
| 21 |
+
"mistralai/mistral-small-3.1-24b-instruct", # 0.3$/M tokens
|
| 22 |
+
# "mistralai/mistral-saba", # 0.6$/M tokens
|
| 23 |
+
# "mistralai/mistral-nemo", # 0.08$/M tokens
|
| 24 |
"google/gemini-2.0-flash-001", # 0.4$/M tokens
|
| 25 |
+
# "google/gemini-2.0-flash-lite-001", # 0.3$/M tokens
|
| 26 |
"google/gemma-3-27b-it", # 0.2$/M tokens
|
| 27 |
# "qwen/qwen-turbo", # 0.2$/M tokens; recognizes "inappropriate content"
|
| 28 |
+
"qwen/qwq-32b", # 0.2$/M tokens
|
| 29 |
+
"deepseek/deepseek-chat-v3-0324", # 1.1$/M tokens
|
| 30 |
# "microsoft/phi-4", # 0.07$/M tokens; only 16k tokens context
|
| 31 |
+
"microsoft/phi-4-multimodal-instruct", # 0.1$/M tokens
|
| 32 |
"amazon/nova-micro-v1", # 0.09$/M tokens
|
| 33 |
# "openGPT-X/Teuken-7B-instruct-research-v0.4", # not on OpenRouter
|
| 34 |
]
|
|
|
|
| 35 |
|
| 36 |
transcription_models = [
|
| 37 |
"elevenlabs/scribe_v1",
|
|
|
|
| 39 |
# "openai/whisper-small",
|
| 40 |
# "facebook/seamless-m4t-v2-large",
|
| 41 |
]
|
|
|
|
| 42 |
|
| 43 |
load_dotenv()
|
| 44 |
client = AsyncOpenAI(
|
evals/tasks.py
CHANGED
|
@@ -212,6 +212,6 @@ async def transcribe_and_evaluate(model, language_bcp_47, nr):
|
|
| 212 |
tasks = [
|
| 213 |
translate_and_evaluate,
|
| 214 |
classify_and_evaluate,
|
| 215 |
-
mlm_and_evaluate,
|
| 216 |
# transcribe_and_evaluate,
|
| 217 |
]
|
|
|
|
| 212 |
tasks = [
|
| 213 |
translate_and_evaluate,
|
| 214 |
classify_and_evaluate,
|
| 215 |
+
# mlm_and_evaluate,
|
| 216 |
# transcribe_and_evaluate,
|
| 217 |
]
|
frontend/src/components/LanguageTable.js
CHANGED
|
@@ -174,12 +174,12 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages }) => {
|
|
| 174 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 175 |
/>
|
| 176 |
<Column
|
| 177 |
-
field='
|
| 178 |
header='Translation'
|
| 179 |
sortable
|
| 180 |
-
body={scoreBodyTemplate('
|
| 181 |
-
minScore: 0
|
| 182 |
-
maxScore: 0.
|
| 183 |
})}
|
| 184 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 185 |
/>
|
|
@@ -188,12 +188,12 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages }) => {
|
|
| 188 |
header='Classification'
|
| 189 |
sortable
|
| 190 |
body={scoreBodyTemplate('classification_accuracy', {
|
| 191 |
-
minScore: 0
|
| 192 |
-
maxScore: 0.
|
| 193 |
})}
|
| 194 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 195 |
/>
|
| 196 |
-
<Column
|
| 197 |
field='language_modeling_chrf'
|
| 198 |
header='Language Modeling'
|
| 199 |
sortable
|
|
@@ -202,7 +202,7 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages }) => {
|
|
| 202 |
maxScore: 1
|
| 203 |
})}
|
| 204 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 205 |
-
/>
|
| 206 |
</DataTable>
|
| 207 |
)
|
| 208 |
}
|
|
|
|
| 174 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 175 |
/>
|
| 176 |
<Column
|
| 177 |
+
field='translation_bleu'
|
| 178 |
header='Translation'
|
| 179 |
sortable
|
| 180 |
+
body={scoreBodyTemplate('translation_bleu', {
|
| 181 |
+
minScore: 0,
|
| 182 |
+
maxScore: 0.5
|
| 183 |
})}
|
| 184 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 185 |
/>
|
|
|
|
| 188 |
header='Classification'
|
| 189 |
sortable
|
| 190 |
body={scoreBodyTemplate('classification_accuracy', {
|
| 191 |
+
minScore: 0,
|
| 192 |
+
maxScore: 0.5
|
| 193 |
})}
|
| 194 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 195 |
/>
|
| 196 |
+
{/* <Column
|
| 197 |
field='language_modeling_chrf'
|
| 198 |
header='Language Modeling'
|
| 199 |
sortable
|
|
|
|
| 202 |
maxScore: 1
|
| 203 |
})}
|
| 204 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 205 |
+
/> */}
|
| 206 |
</DataTable>
|
| 207 |
)
|
| 208 |
}
|
frontend/src/components/ModelTable.js
CHANGED
|
@@ -103,18 +103,29 @@ const ModelTable = ({ data }) => {
|
|
| 103 |
return <div style={{ textAlign: 'center' }}>{sizeStr}</div>
|
| 104 |
}
|
| 105 |
|
| 106 |
-
const capitalize = s =>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
const providerBodyTemplate = rowData => {
|
| 109 |
-
const providerName = rowData.model
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
return providerName
|
| 111 |
}
|
| 112 |
|
| 113 |
const modelBodyTemplate = rowData => {
|
| 114 |
-
const modelName = rowData.model
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
|
|
|
|
|
|
| 118 |
}
|
| 119 |
|
| 120 |
const typeBodyTemplate = rowData => {
|
|
@@ -148,7 +159,12 @@ const ModelTable = ({ data }) => {
|
|
| 148 |
style={{ width: '800px', minHeight: '650px' }}
|
| 149 |
>
|
| 150 |
<Column field='rank' body={rankBodyTemplate} />
|
| 151 |
-
<Column
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 152 |
<Column
|
| 153 |
field='model'
|
| 154 |
header='Model'
|
|
@@ -178,16 +194,16 @@ const ModelTable = ({ data }) => {
|
|
| 178 |
field='average'
|
| 179 |
header='Average'
|
| 180 |
sortable
|
| 181 |
-
body={scoreBodyTemplate('average', { minScore: 0
|
| 182 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 183 |
/>
|
| 184 |
<Column
|
| 185 |
-
field='
|
| 186 |
header='Translation'
|
| 187 |
sortable
|
| 188 |
-
body={scoreBodyTemplate('
|
| 189 |
-
minScore: 0
|
| 190 |
-
maxScore: 0.
|
| 191 |
})}
|
| 192 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 193 |
/>
|
|
@@ -196,12 +212,12 @@ const ModelTable = ({ data }) => {
|
|
| 196 |
header='Classification'
|
| 197 |
sortable
|
| 198 |
body={scoreBodyTemplate('classification_accuracy', {
|
| 199 |
-
minScore: 0
|
| 200 |
-
maxScore: 0.
|
| 201 |
})}
|
| 202 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 203 |
/>
|
| 204 |
-
<Column
|
| 205 |
field='language_modeling_chrf'
|
| 206 |
header='Language Modeling'
|
| 207 |
sortable
|
|
@@ -210,7 +226,7 @@ const ModelTable = ({ data }) => {
|
|
| 210 |
maxScore: 1
|
| 211 |
})}
|
| 212 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 213 |
-
/>
|
| 214 |
</DataTable>
|
| 215 |
)
|
| 216 |
}
|
|
|
|
| 103 |
return <div style={{ textAlign: 'center' }}>{sizeStr}</div>
|
| 104 |
}
|
| 105 |
|
| 106 |
+
const capitalize = s =>
|
| 107 |
+
(String(s).charAt(0).toUpperCase() + String(s).slice(1))
|
| 108 |
+
.replace(/gpt/i, 'GPT')
|
| 109 |
+
.replace(/qwq/i, 'QwQ')
|
| 110 |
+
.replace(/deepseek/i, 'DeepSeek')
|
| 111 |
+
.replace(/openai/i, 'OpenAI')
|
| 112 |
|
| 113 |
const providerBodyTemplate = rowData => {
|
| 114 |
+
const providerName = rowData.model
|
| 115 |
+
.split('/')[0]
|
| 116 |
+
.split('-')
|
| 117 |
+
.map(capitalize)
|
| 118 |
+
.join(' ')
|
| 119 |
return providerName
|
| 120 |
}
|
| 121 |
|
| 122 |
const modelBodyTemplate = rowData => {
|
| 123 |
+
const modelName = rowData.model
|
| 124 |
+
.split('/')[1]
|
| 125 |
+
.split('-')
|
| 126 |
+
.map(capitalize)
|
| 127 |
+
.join(' ')
|
| 128 |
+
return <div style={{ fontWeight: 'bold', height: '100%' }}>{modelName}</div>
|
| 129 |
}
|
| 130 |
|
| 131 |
const typeBodyTemplate = rowData => {
|
|
|
|
| 159 |
style={{ width: '800px', minHeight: '650px' }}
|
| 160 |
>
|
| 161 |
<Column field='rank' body={rankBodyTemplate} />
|
| 162 |
+
<Column
|
| 163 |
+
field='provider'
|
| 164 |
+
header='Provider'
|
| 165 |
+
style={{ minWidth: '7rem' }}
|
| 166 |
+
body={providerBodyTemplate}
|
| 167 |
+
/>
|
| 168 |
<Column
|
| 169 |
field='model'
|
| 170 |
header='Model'
|
|
|
|
| 194 |
field='average'
|
| 195 |
header='Average'
|
| 196 |
sortable
|
| 197 |
+
body={scoreBodyTemplate('average', { minScore: 0, maxScore: 0.6 })}
|
| 198 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 199 |
/>
|
| 200 |
<Column
|
| 201 |
+
field='translation_bleu'
|
| 202 |
header='Translation'
|
| 203 |
sortable
|
| 204 |
+
body={scoreBodyTemplate('translation_bleu', {
|
| 205 |
+
minScore: 0,
|
| 206 |
+
maxScore: 0.3
|
| 207 |
})}
|
| 208 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 209 |
/>
|
|
|
|
| 212 |
header='Classification'
|
| 213 |
sortable
|
| 214 |
body={scoreBodyTemplate('classification_accuracy', {
|
| 215 |
+
minScore: 0,
|
| 216 |
+
maxScore: 0.9
|
| 217 |
})}
|
| 218 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 219 |
/>
|
| 220 |
+
{/* <Column
|
| 221 |
field='language_modeling_chrf'
|
| 222 |
header='Language Modeling'
|
| 223 |
sortable
|
|
|
|
| 226 |
maxScore: 1
|
| 227 |
})}
|
| 228 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 229 |
+
/> */}
|
| 230 |
</DataTable>
|
| 231 |
)
|
| 232 |
}
|
results.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|