Spaces:
Running
Running
David Pomerenke
committed on
Commit
·
731eddd
1
Parent(s):
60d1364
Translation both from and to
Browse files
- evals/backend.py +1 -1
- evals/models.py +1 -2
- evals/tasks.py +26 -16
- frontend/src/components/LanguageTable.js +15 -3
- frontend/src/components/ModelTable.js +16 -4
- results.json +0 -0
evals/backend.py
CHANGED
|
@@ -22,7 +22,7 @@ def mean(lst):
|
|
| 22 |
return sum(lst) / len(lst) if lst else None
|
| 23 |
|
| 24 |
|
| 25 |
-
task_metrics = ["
|
| 26 |
|
| 27 |
|
| 28 |
def make_model_table(df, models):
|
|
|
|
| 22 |
return sum(lst) / len(lst) if lst else None
|
| 23 |
|
| 24 |
|
| 25 |
+
task_metrics = ["translation_from_bleu", "translation_to_bleu", "classification_accuracy"]
|
| 26 |
|
| 27 |
|
| 28 |
def make_model_table(df, models):
|
evals/models.py
CHANGED
|
@@ -116,7 +116,7 @@ async def transcribe(path, model="elevenlabs/scribe_v1"):
|
|
| 116 |
raise ValueError(f"Model {model} not supported")
|
| 117 |
|
| 118 |
|
| 119 |
-
models = pd.DataFrame(models, columns=["id"])
|
| 120 |
|
| 121 |
|
| 122 |
@cache
|
|
@@ -144,7 +144,6 @@ def get_hf_metadata(row):
|
|
| 144 |
if not row:
|
| 145 |
return empty
|
| 146 |
id = row["hf_slug"] or row["slug"].split(":")[0]
|
| 147 |
-
print(id)
|
| 148 |
if not id:
|
| 149 |
return empty
|
| 150 |
try:
|
|
|
|
| 116 |
raise ValueError(f"Model {model} not supported")
|
| 117 |
|
| 118 |
|
| 119 |
+
models = pd.DataFrame(models, columns=["id"]).iloc[:3]
|
| 120 |
|
| 121 |
|
| 122 |
@cache
|
|
|
|
| 144 |
if not row:
|
| 145 |
return empty
|
| 146 |
id = row["hf_slug"] or row["slug"].split(":")[0]
|
|
|
|
| 147 |
if not id:
|
| 148 |
return empty
|
| 149 |
try:
|
evals/tasks.py
CHANGED
|
@@ -1,30 +1,37 @@
|
|
| 1 |
import random
|
|
|
|
| 2 |
|
| 3 |
import evaluate
|
| 4 |
import pandas as pd
|
|
|
|
|
|
|
| 5 |
from joblib.memory import Memory
|
| 6 |
from languages import languages, script_name
|
| 7 |
-
from datasets_.flores import flores_sentences
|
| 8 |
from models import complete, transcribe
|
| 9 |
-
import sentencepiece as spm
|
| 10 |
|
| 11 |
cache = Memory(location=".cache", verbose=0).cache
|
| 12 |
bleu = evaluate.load("bleu")
|
| 13 |
chrf = evaluate.load("chrf")
|
| 14 |
wer = evaluate.load("wer")
|
| 15 |
-
tokenizer = spm.SentencePieceProcessor(
|
|
|
|
|
|
|
| 16 |
|
| 17 |
# sample languages to translate to
|
| 18 |
target_languages = languages[languages["in_benchmark"]].sample(
|
| 19 |
frac=1, weights="speakers", replace=True, random_state=42
|
| 20 |
)
|
| 21 |
|
|
|
|
| 22 |
@cache
|
| 23 |
-
async def translate_and_evaluate(model,
|
| 24 |
-
original_language = languages[languages["bcp_47"] ==
|
| 25 |
-
0
|
| 26 |
-
]
|
| 27 |
target_language = target_languages.iloc[sentence_nr]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
original_sentence = flores_sentences(original_language)[sentence_nr].strip()
|
| 29 |
target_sentence = flores_sentences(target_language)[sentence_nr].strip()
|
| 30 |
script = script_name(target_language.flores_path.split("_")[1])
|
|
@@ -52,14 +59,15 @@ async def translate_and_evaluate(model, original_language_bcp_47, sentence_nr):
|
|
| 52 |
return [
|
| 53 |
{
|
| 54 |
"model": model,
|
| 55 |
-
"bcp_47":
|
| 56 |
-
"task": "
|
| 57 |
"metric": metric,
|
| 58 |
"score": score,
|
| 59 |
"sentence_nr": sentence_nr,
|
| 60 |
}
|
| 61 |
-
for metric, score in
|
| 62 |
-
|
|
|
|
| 63 |
)
|
| 64 |
]
|
| 65 |
|
|
@@ -68,8 +76,8 @@ metadata = pd.read_csv("data/floresp-v2.0-rc.3/metadata_dev.tsv", sep="\t")
|
|
| 68 |
|
| 69 |
|
| 70 |
@cache
|
| 71 |
-
async def classify_and_evaluate(model,
|
| 72 |
-
language = languages[languages["bcp_47"] ==
|
| 73 |
sentences = pd.DataFrame(flores_sentences(language), columns=["text"])
|
| 74 |
sentences = pd.concat([metadata, sentences], axis=1)
|
| 75 |
sentences = sentences.dropna(subset=["topic"])
|
|
@@ -119,7 +127,7 @@ async def classify_and_evaluate(model, language_bcp_47, nr):
|
|
| 119 |
return [
|
| 120 |
{
|
| 121 |
"model": model,
|
| 122 |
-
"bcp_47":
|
| 123 |
"task": "classification",
|
| 124 |
"metric": "accuracy",
|
| 125 |
"score": int(pred == true),
|
|
@@ -177,6 +185,7 @@ async def mlm_and_evaluate(model, language_bcp_47, nr):
|
|
| 177 |
}
|
| 178 |
]
|
| 179 |
|
|
|
|
| 180 |
@cache
|
| 181 |
async def transcribe_and_evaluate(model, language_bcp_47, nr):
|
| 182 |
language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
|
|
@@ -210,8 +219,9 @@ async def transcribe_and_evaluate(model, language_bcp_47, nr):
|
|
| 210 |
|
| 211 |
|
| 212 |
tasks = [
|
| 213 |
-
translate_and_evaluate,
|
|
|
|
| 214 |
classify_and_evaluate,
|
| 215 |
# mlm_and_evaluate,
|
| 216 |
# transcribe_and_evaluate,
|
| 217 |
-
]
|
|
|
|
| 1 |
import random
|
| 2 |
+
from functools import partial
|
| 3 |
|
| 4 |
import evaluate
|
| 5 |
import pandas as pd
|
| 6 |
+
import sentencepiece as spm
|
| 7 |
+
from datasets_.flores import flores_sentences
|
| 8 |
from joblib.memory import Memory
|
| 9 |
from languages import languages, script_name
|
|
|
|
| 10 |
from models import complete, transcribe
|
|
|
|
| 11 |
|
| 12 |
cache = Memory(location=".cache", verbose=0).cache
|
| 13 |
bleu = evaluate.load("bleu")
|
| 14 |
chrf = evaluate.load("chrf")
|
| 15 |
wer = evaluate.load("wer")
|
| 16 |
+
tokenizer = spm.SentencePieceProcessor(
|
| 17 |
+
model_file="data/spbleu/flores200_sacrebleu_tokenizer_spm.model"
|
| 18 |
+
)
|
| 19 |
|
| 20 |
# sample languages to translate to
|
| 21 |
target_languages = languages[languages["in_benchmark"]].sample(
|
| 22 |
frac=1, weights="speakers", replace=True, random_state=42
|
| 23 |
)
|
| 24 |
|
| 25 |
+
|
| 26 |
@cache
|
| 27 |
+
async def translate_and_evaluate(model, bcp_47, sentence_nr, mode="from"):
|
| 28 |
+
original_language = languages[languages["bcp_47"] == bcp_47].iloc[0]
|
|
|
|
|
|
|
| 29 |
target_language = target_languages.iloc[sentence_nr]
|
| 30 |
+
match mode:
|
| 31 |
+
case "from":
|
| 32 |
+
pass
|
| 33 |
+
case "to":
|
| 34 |
+
original_language, target_language = target_language, original_language
|
| 35 |
original_sentence = flores_sentences(original_language)[sentence_nr].strip()
|
| 36 |
target_sentence = flores_sentences(target_language)[sentence_nr].strip()
|
| 37 |
script = script_name(target_language.flores_path.split("_")[1])
|
|
|
|
| 59 |
return [
|
| 60 |
{
|
| 61 |
"model": model,
|
| 62 |
+
"bcp_47": bcp_47,
|
| 63 |
+
"task": f"translation_{mode}",
|
| 64 |
"metric": metric,
|
| 65 |
"score": score,
|
| 66 |
"sentence_nr": sentence_nr,
|
| 67 |
}
|
| 68 |
+
for metric, score in (
|
| 69 |
+
("bleu", bleu_score["bleu"]),
|
| 70 |
+
("chrf", chrf_score["score"] / 100),
|
| 71 |
)
|
| 72 |
]
|
| 73 |
|
|
|
|
| 76 |
|
| 77 |
|
| 78 |
@cache
|
| 79 |
+
async def classify_and_evaluate(model, bcp_47, nr):
|
| 80 |
+
language = languages[languages["bcp_47"] == bcp_47].iloc[0]
|
| 81 |
sentences = pd.DataFrame(flores_sentences(language), columns=["text"])
|
| 82 |
sentences = pd.concat([metadata, sentences], axis=1)
|
| 83 |
sentences = sentences.dropna(subset=["topic"])
|
|
|
|
| 127 |
return [
|
| 128 |
{
|
| 129 |
"model": model,
|
| 130 |
+
"bcp_47": bcp_47,
|
| 131 |
"task": "classification",
|
| 132 |
"metric": "accuracy",
|
| 133 |
"score": int(pred == true),
|
|
|
|
| 185 |
}
|
| 186 |
]
|
| 187 |
|
| 188 |
+
|
| 189 |
@cache
|
| 190 |
async def transcribe_and_evaluate(model, language_bcp_47, nr):
|
| 191 |
language = languages[languages["bcp_47"] == language_bcp_47].iloc[0]
|
|
|
|
| 219 |
|
| 220 |
|
| 221 |
tasks = [
|
| 222 |
+
partial(translate_and_evaluate, mode="from"),
|
| 223 |
+
partial(translate_and_evaluate, mode="to"),
|
| 224 |
classify_and_evaluate,
|
| 225 |
# mlm_and_evaluate,
|
| 226 |
# transcribe_and_evaluate,
|
| 227 |
+
]
|
frontend/src/components/LanguageTable.js
CHANGED
|
@@ -174,10 +174,22 @@ const LanguageTable = ({ data, selectedLanguages, setSelectedLanguages }) => {
|
|
| 174 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 175 |
/>
|
| 176 |
<Column
|
| 177 |
-
field='
|
| 178 |
-
header=
|
|
|
|
| 179 |
sortable
|
| 180 |
-
body={scoreBodyTemplate('
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 181 |
minScore: 0,
|
| 182 |
maxScore: 0.5
|
| 183 |
})}
|
|
|
|
| 174 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 175 |
/>
|
| 176 |
<Column
|
| 177 |
+
field='translation_from_bleu'
|
| 178 |
+
header="Translation (from)"
|
| 179 |
+
headerTooltip='Translation performance from a language to all other languages (spBLEU score)'
|
| 180 |
sortable
|
| 181 |
+
body={scoreBodyTemplate('translation_from_bleu', {
|
| 182 |
+
minScore: 0,
|
| 183 |
+
maxScore: 0.5
|
| 184 |
+
})}
|
| 185 |
+
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 186 |
+
/>
|
| 187 |
+
<Column
|
| 188 |
+
field='translation_to_bleu'
|
| 189 |
+
header="Translation (to)"
|
| 190 |
+
headerTooltip='Translation performance from all other languages to a language (spBLEU score)'
|
| 191 |
+
sortable
|
| 192 |
+
body={scoreBodyTemplate('translation_to_bleu', {
|
| 193 |
minScore: 0,
|
| 194 |
maxScore: 0.5
|
| 195 |
})}
|
frontend/src/components/ModelTable.js
CHANGED
|
@@ -224,12 +224,24 @@ const ModelTable = ({ data }) => {
|
|
| 224 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 225 |
/>
|
| 226 |
<Column
|
| 227 |
-
field='
|
| 228 |
-
header=
|
|
|
|
| 229 |
sortable
|
| 230 |
-
body={scoreBodyTemplate('
|
| 231 |
minScore: 0,
|
| 232 |
-
maxScore: 0.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 233 |
})}
|
| 234 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 235 |
/>
|
|
|
|
| 224 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 225 |
/>
|
| 226 |
<Column
|
| 227 |
+
field='translation_from_bleu'
|
| 228 |
+
header="Translation (from)"
|
| 229 |
+
headerTooltip='Translation performance from a language to all other languages (spBLEU score)'
|
| 230 |
sortable
|
| 231 |
+
body={scoreBodyTemplate('translation_from_bleu', {
|
| 232 |
minScore: 0,
|
| 233 |
+
maxScore: 0.5
|
| 234 |
+
})}
|
| 235 |
+
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 236 |
+
/>
|
| 237 |
+
<Column
|
| 238 |
+
field='translation_to_bleu'
|
| 239 |
+
header="Translation (to)"
|
| 240 |
+
headerTooltip='Translation performance from all other languages to a language (spBLEU score)'
|
| 241 |
+
sortable
|
| 242 |
+
body={scoreBodyTemplate('translation_to_bleu', {
|
| 243 |
+
minScore: 0,
|
| 244 |
+
maxScore: 0.5
|
| 245 |
})}
|
| 246 |
style={{ minWidth: '5rem', maxWidth: '10rem' }}
|
| 247 |
/>
|
results.json
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|