Spaces:
Running
Running
David Pomerenke
committed on
Commit
·
08735bb
1
Parent(s):
63fd3b1
Add language families
Browse files
- .gitignore +1 -6
- data/data.txt +1 -0
- evals.py +15 -6
- pyproject.toml +1 -0
- results.json +38 -19
- uv.lock +0 -0
.gitignore
CHANGED
|
@@ -1,15 +1,10 @@
|
|
| 1 |
floresp-*
|
|
|
|
| 2 |
LanguageCodes.tab
|
| 3 |
ScriptCodes.csv
|
| 4 |
.cache
|
| 5 |
.env
|
| 6 |
|
| 7 |
-
# Observable
|
| 8 |
-
.DS_Store
|
| 9 |
-
/dist/
|
| 10 |
-
node_modules/
|
| 11 |
-
yarn-error.log
|
| 12 |
-
|
| 13 |
# Python-generated files
|
| 14 |
__pycache__/
|
| 15 |
*.py[oc]
|
|
|
|
| 1 |
floresp-*
|
| 2 |
+
glottolog-*
|
| 3 |
LanguageCodes.tab
|
| 4 |
ScriptCodes.csv
|
| 5 |
.cache
|
| 6 |
.env
|
| 7 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
# Python-generated files
|
| 9 |
__pycache__/
|
| 10 |
*.py[oc]
|
data/data.txt
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
floresp-v2.0-rc.3: https://github.com/openlanguagedata/flores
|
|
|
|
| 2 |
languages.csv: generated from https://query.wikidata.org/ using the languages.rq query
|
| 3 |
LanguageCodes.tab: https://www.ethnologue.com/
|
| 4 |
ScriptCodes.csv: https://www.unicode.org/iso15924/iso15924-codes.html
|
|
|
|
| 1 |
floresp-v2.0-rc.3: https://github.com/openlanguagedata/flores
|
| 2 |
+
glottolog-5.1: https://github.com/glottolog/glottolog
|
| 3 |
languages.csv: generated from https://query.wikidata.org/ using the languages.rq query
|
| 4 |
LanguageCodes.tab: https://www.ethnologue.com/
|
| 5 |
ScriptCodes.csv: https://www.unicode.org/iso15924/iso15924-codes.html
|
evals.py
CHANGED
|
@@ -19,6 +19,7 @@ from requests import get
|
|
| 19 |
from rich import print
|
| 20 |
from tqdm.asyncio import tqdm_asyncio
|
| 21 |
from transformers import NllbTokenizer
|
|
|
|
| 22 |
|
| 23 |
# config
|
| 24 |
models = [
|
|
@@ -73,6 +74,15 @@ def population(bcp_47):
|
|
| 73 |
return items
|
| 74 |
|
| 75 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
def script_name(iso15924):
|
| 77 |
return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
|
| 78 |
|
|
@@ -406,12 +416,8 @@ async def main():
|
|
| 406 |
"scores": results,
|
| 407 |
"mt_bleu": mean([s["mt_bleu"] for s in results]),
|
| 408 |
"mt_chrf": mean([s["mt_chrf"] for s in results]),
|
| 409 |
-
"cls_acc": mean(
|
| 410 |
-
|
| 411 |
-
),
|
| 412 |
-
"mlm_chrf": mean(
|
| 413 |
-
[s["mlm_chrf"] for s in results]
|
| 414 |
-
),
|
| 415 |
"overall_score": mean([s["overall_score"] for s in results]),
|
| 416 |
"commonvoice_hours": language.commonvoice_hours
|
| 417 |
if not pd.isna(language.commonvoice_hours)
|
|
@@ -420,6 +426,9 @@ async def main():
|
|
| 420 |
if not pd.isna(language.commonvoice_locale)
|
| 421 |
else None,
|
| 422 |
"population": population(language.bcp_47),
|
|
|
|
|
|
|
|
|
|
| 423 |
}
|
| 424 |
)
|
| 425 |
with open("results.json", "w") as f:
|
|
|
|
| 19 |
from rich import print
|
| 20 |
from tqdm.asyncio import tqdm_asyncio
|
| 21 |
from transformers import NllbTokenizer
|
| 22 |
+
from pyglottolog import Glottolog
|
| 23 |
|
| 24 |
# config
|
| 25 |
models = [
|
|
|
|
| 74 |
return items
|
| 75 |
|
| 76 |
|
| 77 |
+
glottolog = Glottolog("data/glottolog-5.1")
|
| 78 |
+
|
| 79 |
+
|
| 80 |
+
@cache
def language_family(iso_639_3):
    """Return the name of the language family for a language code, or None.

    The code is passed unchanged to ``glottolog.languoid``; results are
    memoized with ``@cache`` so each code is looked up at most once.

    Args:
        iso_639_3: Language code to look up (the caller passes the part of
            ``flores_path`` before the first underscore).

    Returns:
        The ``name`` of the languoid's ``family``, or ``None`` when the code
        is not found or the languoid has no family.
    """
    languoid = glottolog.languoid(iso_639_3)
    if not languoid:
        return None
    # `family` is None for top-level languoids (isolates, or a family node
    # itself), so dereferencing `.name` unconditionally would raise
    # AttributeError for those languages.
    family = languoid.family
    return family.name if family else None
|
| 84 |
+
|
| 85 |
+
|
| 86 |
def script_name(iso15924):
|
| 87 |
return scripts[scripts["iso15924"] == iso15924]["script_name"].values[0]
|
| 88 |
|
|
|
|
| 416 |
"scores": results,
|
| 417 |
"mt_bleu": mean([s["mt_bleu"] for s in results]),
|
| 418 |
"mt_chrf": mean([s["mt_chrf"] for s in results]),
|
| 419 |
+
"cls_acc": mean([s["cls_acc"] for s in results]),
|
| 420 |
+
"mlm_chrf": mean([s["mlm_chrf"] for s in results]),
|
|
|
|
|
|
|
|
|
|
|
|
|
| 421 |
"overall_score": mean([s["overall_score"] for s in results]),
|
| 422 |
"commonvoice_hours": language.commonvoice_hours
|
| 423 |
if not pd.isna(language.commonvoice_hours)
|
|
|
|
| 426 |
if not pd.isna(language.commonvoice_locale)
|
| 427 |
else None,
|
| 428 |
"population": population(language.bcp_47),
|
| 429 |
+
"language_family": language_family(
|
| 430 |
+
language.flores_path.split("_")[0]
|
| 431 |
+
),
|
| 432 |
}
|
| 433 |
)
|
| 434 |
with open("results.json", "w") as f:
|
pyproject.toml
CHANGED
|
@@ -20,6 +20,7 @@ dev-dependencies = [
|
|
| 20 |
"langcodes>=3.5.0",
|
| 21 |
"openai>=1.52.2",
|
| 22 |
"protobuf>=5.28.3",
|
|
|
|
| 23 |
"python-dotenv>=1.0.1",
|
| 24 |
"sacrebleu>=2.4.3",
|
| 25 |
"sentencepiece>=0.2.0",
|
|
|
|
| 20 |
"langcodes>=3.5.0",
|
| 21 |
"openai>=1.52.2",
|
| 22 |
"protobuf>=5.28.3",
|
| 23 |
+
"pyglottolog>=3.14.0",
|
| 24 |
"python-dotenv>=1.0.1",
|
| 25 |
"sacrebleu>=2.4.3",
|
| 26 |
"sentencepiece>=0.2.0",
|
results.json
CHANGED
|
@@ -208,7 +208,8 @@
|
|
| 208 |
"ZA": 17503716,
|
| 209 |
"ZM": 2788256,
|
| 210 |
"ZW": 6109446
|
| 211 |
-
}
|
|
|
|
| 212 |
},
|
| 213 |
{
|
| 214 |
"language_name": "Chinese",
|
|
@@ -252,7 +253,8 @@
|
|
| 252 |
"TW": 22422850,
|
| 253 |
"US": 2295209,
|
| 254 |
"VN": 1085934
|
| 255 |
-
}
|
|
|
|
| 256 |
},
|
| 257 |
{
|
| 258 |
"language_name": "Hindi",
|
|
@@ -282,7 +284,8 @@
|
|
| 282 |
"NP": 127377,
|
| 283 |
"UG": 2206,
|
| 284 |
"ZA": 1129272
|
| 285 |
-
}
|
|
|
|
| 286 |
},
|
| 287 |
{
|
| 288 |
"language_name": "Spanish",
|
|
@@ -345,7 +348,8 @@
|
|
| 345 |
"US": 31933344,
|
| 346 |
"UY": 2981097,
|
| 347 |
"VE": 23488572
|
| 348 |
-
}
|
|
|
|
| 349 |
},
|
| 350 |
{
|
| 351 |
"language_name": "Arabic",
|
|
@@ -407,7 +411,8 @@
|
|
| 407 |
"TN": 10549080,
|
| 408 |
"TR": 459298,
|
| 409 |
"YE": 22114456
|
| 410 |
-
}
|
|
|
|
| 411 |
},
|
| 412 |
{
|
| 413 |
"language_name": "Urdu",
|
|
@@ -436,7 +441,8 @@
|
|
| 436 |
"IN": 66304500,
|
| 437 |
"MU": 71727,
|
| 438 |
"PK": 221825950
|
| 439 |
-
}
|
|
|
|
| 440 |
},
|
| 441 |
{
|
| 442 |
"language_name": "French",
|
|
@@ -522,7 +528,8 @@
|
|
| 522 |
"VU": 149166,
|
| 523 |
"WF": 7610,
|
| 524 |
"YT": 110580
|
| 525 |
-
}
|
|
|
|
| 526 |
},
|
| 527 |
{
|
| 528 |
"language_name": "Bangla",
|
|
@@ -551,7 +558,8 @@
|
|
| 551 |
"GB": 263044,
|
| 552 |
"IN": 107413290,
|
| 553 |
"NP": 28508
|
| 554 |
-
}
|
|
|
|
| 555 |
},
|
| 556 |
{
|
| 557 |
"language_name": "Portuguese",
|
|
@@ -591,7 +599,8 @@
|
|
| 591 |
"PT": 9890592,
|
| 592 |
"ST": 179454,
|
| 593 |
"TL": 816395
|
| 594 |
-
}
|
|
|
|
| 595 |
},
|
| 596 |
{
|
| 597 |
"language_name": "Punjabi",
|
|
@@ -621,7 +630,8 @@
|
|
| 621 |
"KE": 10170,
|
| 622 |
"PK": 163450700,
|
| 623 |
"SG": 9314
|
| 624 |
-
}
|
|
|
|
| 625 |
},
|
| 626 |
{
|
| 627 |
"language_name": "Russian",
|
|
@@ -668,7 +678,8 @@
|
|
| 668 |
"UA": 20204534,
|
| 669 |
"US": 798334,
|
| 670 |
"UZ": 4279156
|
| 671 |
-
}
|
|
|
|
| 672 |
},
|
| 673 |
{
|
| 674 |
"language_name": "Swahili",
|
|
@@ -701,7 +712,8 @@
|
|
| 701 |
"UG": 32439750,
|
| 702 |
"YT": 2716,
|
| 703 |
"ZA": 1016
|
| 704 |
-
}
|
|
|
|
| 705 |
},
|
| 706 |
{
|
| 707 |
"language_name": "Indonesian",
|
|
@@ -727,7 +739,8 @@
|
|
| 727 |
"population": {
|
| 728 |
"ID": 170896640,
|
| 729 |
"NL": 311047
|
| 730 |
-
}
|
|
|
|
| 731 |
},
|
| 732 |
{
|
| 733 |
"language_name": "German",
|
|
@@ -778,7 +791,8 @@
|
|
| 778 |
"SI": 883126,
|
| 779 |
"SK": 1196932,
|
| 780 |
"US": 1563403
|
| 781 |
-
}
|
|
|
|
| 782 |
},
|
| 783 |
{
|
| 784 |
"language_name": "Japanese",
|
|
@@ -805,7 +819,8 @@
|
|
| 805 |
"BR": 444604,
|
| 806 |
"CA": 52772,
|
| 807 |
"JP": 119231650
|
| 808 |
-
}
|
|
|
|
| 809 |
},
|
| 810 |
{
|
| 811 |
"language_name": "Telugu",
|
|
@@ -830,7 +845,8 @@
|
|
| 830 |
"commonvoice_locale": "te",
|
| 831 |
"population": {
|
| 832 |
"IN": 95478480
|
| 833 |
-
}
|
|
|
|
| 834 |
},
|
| 835 |
{
|
| 836 |
"language_name": "Marathi",
|
|
@@ -855,7 +871,8 @@
|
|
| 855 |
"commonvoice_locale": "mr",
|
| 856 |
"population": {
|
| 857 |
"IN": 92826300
|
| 858 |
-
}
|
|
|
|
| 859 |
},
|
| 860 |
{
|
| 861 |
"language_name": "Javanese",
|
|
@@ -881,7 +898,8 @@
|
|
| 881 |
"population": {
|
| 882 |
"ID": 90788840,
|
| 883 |
"MY": 391825
|
| 884 |
-
}
|
|
|
|
| 885 |
},
|
| 886 |
{
|
| 887 |
"language_name": "Vietnamese",
|
|
@@ -909,6 +927,7 @@
|
|
| 909 |
"CN": 6970,
|
| 910 |
"US": 1130973,
|
| 911 |
"VN": 84900318
|
| 912 |
-
}
|
|
|
|
| 913 |
}
|
| 914 |
]
|
|
|
|
| 208 |
"ZA": 17503716,
|
| 209 |
"ZM": 2788256,
|
| 210 |
"ZW": 6109446
|
| 211 |
+
},
|
| 212 |
+
"language_family": "Indo-European"
|
| 213 |
},
|
| 214 |
{
|
| 215 |
"language_name": "Chinese",
|
|
|
|
| 253 |
"TW": 22422850,
|
| 254 |
"US": 2295209,
|
| 255 |
"VN": 1085934
|
| 256 |
+
},
|
| 257 |
+
"language_family": "Sino-Tibetan"
|
| 258 |
},
|
| 259 |
{
|
| 260 |
"language_name": "Hindi",
|
|
|
|
| 284 |
"NP": 127377,
|
| 285 |
"UG": 2206,
|
| 286 |
"ZA": 1129272
|
| 287 |
+
},
|
| 288 |
+
"language_family": "Indo-European"
|
| 289 |
},
|
| 290 |
{
|
| 291 |
"language_name": "Spanish",
|
|
|
|
| 348 |
"US": 31933344,
|
| 349 |
"UY": 2981097,
|
| 350 |
"VE": 23488572
|
| 351 |
+
},
|
| 352 |
+
"language_family": "Indo-European"
|
| 353 |
},
|
| 354 |
{
|
| 355 |
"language_name": "Arabic",
|
|
|
|
| 411 |
"TN": 10549080,
|
| 412 |
"TR": 459298,
|
| 413 |
"YE": 22114456
|
| 414 |
+
},
|
| 415 |
+
"language_family": "Afro-Asiatic"
|
| 416 |
},
|
| 417 |
{
|
| 418 |
"language_name": "Urdu",
|
|
|
|
| 441 |
"IN": 66304500,
|
| 442 |
"MU": 71727,
|
| 443 |
"PK": 221825950
|
| 444 |
+
},
|
| 445 |
+
"language_family": "Indo-European"
|
| 446 |
},
|
| 447 |
{
|
| 448 |
"language_name": "French",
|
|
|
|
| 528 |
"VU": 149166,
|
| 529 |
"WF": 7610,
|
| 530 |
"YT": 110580
|
| 531 |
+
},
|
| 532 |
+
"language_family": "Indo-European"
|
| 533 |
},
|
| 534 |
{
|
| 535 |
"language_name": "Bangla",
|
|
|
|
| 558 |
"GB": 263044,
|
| 559 |
"IN": 107413290,
|
| 560 |
"NP": 28508
|
| 561 |
+
},
|
| 562 |
+
"language_family": "Indo-European"
|
| 563 |
},
|
| 564 |
{
|
| 565 |
"language_name": "Portuguese",
|
|
|
|
| 599 |
"PT": 9890592,
|
| 600 |
"ST": 179454,
|
| 601 |
"TL": 816395
|
| 602 |
+
},
|
| 603 |
+
"language_family": "Indo-European"
|
| 604 |
},
|
| 605 |
{
|
| 606 |
"language_name": "Punjabi",
|
|
|
|
| 630 |
"KE": 10170,
|
| 631 |
"PK": 163450700,
|
| 632 |
"SG": 9314
|
| 633 |
+
},
|
| 634 |
+
"language_family": "Indo-European"
|
| 635 |
},
|
| 636 |
{
|
| 637 |
"language_name": "Russian",
|
|
|
|
| 678 |
"UA": 20204534,
|
| 679 |
"US": 798334,
|
| 680 |
"UZ": 4279156
|
| 681 |
+
},
|
| 682 |
+
"language_family": "Indo-European"
|
| 683 |
},
|
| 684 |
{
|
| 685 |
"language_name": "Swahili",
|
|
|
|
| 712 |
"UG": 32439750,
|
| 713 |
"YT": 2716,
|
| 714 |
"ZA": 1016
|
| 715 |
+
},
|
| 716 |
+
"language_family": "Atlantic-Congo"
|
| 717 |
},
|
| 718 |
{
|
| 719 |
"language_name": "Indonesian",
|
|
|
|
| 739 |
"population": {
|
| 740 |
"ID": 170896640,
|
| 741 |
"NL": 311047
|
| 742 |
+
},
|
| 743 |
+
"language_family": "Austronesian"
|
| 744 |
},
|
| 745 |
{
|
| 746 |
"language_name": "German",
|
|
|
|
| 791 |
"SI": 883126,
|
| 792 |
"SK": 1196932,
|
| 793 |
"US": 1563403
|
| 794 |
+
},
|
| 795 |
+
"language_family": "Indo-European"
|
| 796 |
},
|
| 797 |
{
|
| 798 |
"language_name": "Japanese",
|
|
|
|
| 819 |
"BR": 444604,
|
| 820 |
"CA": 52772,
|
| 821 |
"JP": 119231650
|
| 822 |
+
},
|
| 823 |
+
"language_family": "Japonic"
|
| 824 |
},
|
| 825 |
{
|
| 826 |
"language_name": "Telugu",
|
|
|
|
| 845 |
"commonvoice_locale": "te",
|
| 846 |
"population": {
|
| 847 |
"IN": 95478480
|
| 848 |
+
},
|
| 849 |
+
"language_family": "Dravidian"
|
| 850 |
},
|
| 851 |
{
|
| 852 |
"language_name": "Marathi",
|
|
|
|
| 871 |
"commonvoice_locale": "mr",
|
| 872 |
"population": {
|
| 873 |
"IN": 92826300
|
| 874 |
+
},
|
| 875 |
+
"language_family": "Indo-European"
|
| 876 |
},
|
| 877 |
{
|
| 878 |
"language_name": "Javanese",
|
|
|
|
| 898 |
"population": {
|
| 899 |
"ID": 90788840,
|
| 900 |
"MY": 391825
|
| 901 |
+
},
|
| 902 |
+
"language_family": "Austronesian"
|
| 903 |
},
|
| 904 |
{
|
| 905 |
"language_name": "Vietnamese",
|
|
|
|
| 927 |
"CN": 6970,
|
| 928 |
"US": 1130973,
|
| 929 |
"VN": 84900318
|
| 930 |
+
},
|
| 931 |
+
"language_family": "Austroasiatic"
|
| 932 |
}
|
| 933 |
]
|
uv.lock
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|