File size: 2,384 Bytes
da6e1bc
 
 
b311dd5
 
 
6f0e312
da6e1bc
6f0e312
da6e1bc
 
 
 
 
 
 
 
 
 
 
6f0e312
 
 
 
da6e1bc
 
 
33469f2
 
 
da6e1bc
 
 
 
 
6f0e312
da6e1bc
 
2cdada4
da6e1bc
 
 
 
 
 
 
 
2cdada4
d1a7111
 
 
 
 
 
 
2cdada4
da6e1bc
 
 
 
 
 
 
 
 
 
52abc5b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
import re

import pandas as pd
from datasets_.commonvoice import commonvoice
from datasets_.fleurs import fleurs
from datasets_.flores import flores
from datasets_.util import standardize_bcp47
from joblib.memory import Memory
from langcodes import Language
from language_data.population_data import LANGUAGE_SPEAKING_POPULATION

# Decorator that memoizes function results on disk under ".cache".
cache = Memory(location=".cache", verbose=0).cache

# load general language data
# Steps of the pipeline below:
#   1. keep every (tag, speakers) pair whose tag does NOT end in "-" followed
#      by two uppercase letters (such a suffix looks like a region subtag),
#   2. standardize language codes to avoid duplicates
#      (e.g., "no" and "nb" for Norwegian),
#   3. merge any duplicates by summing their speakers,
#   4. add the display name and the title-cased autonym of each language.
languages = (
    pd.DataFrame(
        [
            (tag, speakers)
            for tag, speakers in LANGUAGE_SPEAKING_POPULATION.items()
            if re.match(r".*-[A-Z]{2}$", tag) is None
        ],
        columns=["bcp_47", "speakers"],
    )
    .assign(bcp_47=lambda df: df["bcp_47"].apply(standardize_bcp47))
    .groupby("bcp_47", as_index=False)
    .agg({"speakers": "sum"})
    .assign(
        language_name=lambda df: df["bcp_47"].apply(
            lambda tag: Language.get(tag).display_name()
        ),
        autonym=lambda df: df["bcp_47"].apply(
            lambda tag: Language.get(tag).autonym().title()
        ),
    )
)

# Glottolog languoid table.
# Default NA parsing is disabled and only the empty string counts as missing,
# so that the literal value "nan" is kept as text: Min _Nan_ Chinese is not N/A!
glottolog = pd.read_csv(
    "data/glottolog_languoid.csv/languoid.csv",
    keep_default_na=False,
    na_values=[""],
)
# Derive a tag from the ISO 639-3 column; rows without a code get None.
glottolog["bcp_47"] = glottolog["iso639P3code"].apply(
    lambda code: None if pd.isna(code) else standardize_bcp47(code, macro=True)
)


@cache
def language_family(bcp_47):
    """Return the name of the Glottolog family a language belongs to.

    The first row of ``glottolog`` whose ``bcp_47`` column equals the argument
    is used; its ``family_id`` is then resolved to the ``name`` of the row
    with that ``id``.

    Args:
        bcp_47: Language tag to look up in the ``bcp_47`` column of
            ``glottolog``.

    Returns:
        The family name, or ``None`` when the tag has no row in ``glottolog``,
        when the matching row has no ``family_id``, or when no row carries
        that ``family_id`` as its ``id``.
    """
    # NOTE(review): results are memoized through the module-level `cache`;
    # presumably changes to the Glottolog CSV are not picked up until the
    # ".cache" directory is cleared - confirm.
    candidates = glottolog[glottolog["bcp_47"] == bcp_47]
    if candidates.empty:
        # Unknown tag: report "no family" instead of raising IndexError.
        return None
    family_id = candidates.iloc[0]["family_id"]
    if pd.isna(family_id):
        return None
    family_rows = glottolog[glottolog["id"] == family_id]
    if family_rows.empty:
        # Dangling family reference: same treatment as a missing family.
        return None
    return family_rows.iloc[0]["name"]


# Attach the family name of every language (None where there is none).
languages = languages.assign(
    family=lambda df: df["bcp_47"].apply(language_family)
)

# load script codes and names
scripts = pd.read_csv("data/ScriptCodes.csv")
scripts = scripts.rename(
    columns={"Code": "iso15924", "English Name": "script_name"}
)


def script_name(iso15924):
    """Return the ``script_name`` of the first ``scripts`` row with this code.

    Args:
        iso15924: Value to match against the ``iso15924`` column of ``scripts``.

    Raises:
        IndexError: If no row of ``scripts`` has the given code.
    """
    matching_names = scripts.loc[scripts["iso15924"] == iso15924, "script_name"]
    return matching_names.values[0]


# merge data
# Every join is a "left" join to keep things simple for now: all rows of
# `languages` are kept whether or not a dataset covers the language.
# NOTE(review): both sort keys are descending, so languages with equal
# speaker counts are ordered Z-A by name - confirm this is intended.
languages = (
    languages.merge(flores, on="bcp_47", how="left")
    .merge(fleurs, on="bcp_47", how="left")
    .merge(commonvoice, on="bcp_47", how="left")
    # Flag languages whose tag appears in the flores table.
    .assign(in_benchmark=lambda df: df["bcp_47"].isin(flores["bcp_47"]))
    .sort_values(by=["speakers", "language_name"], ascending=[False, False])
)