File size: 5,251 Bytes
2a275ae
 
1f70be8
 
 
a938b8a
1f70be8
cb8646a
 
2a275ae
 
 
 
1f70be8
cb8646a
 
 
a938b8a
 
 
 
 
cb8646a
 
 
 
 
 
 
 
 
2a275ae
cb8646a
 
 
 
 
 
1f70be8
 
 
 
6c8936b
 
 
 
2a275ae
6c8936b
 
1f70be8
6c8936b
 
 
 
a938b8a
6c8936b
 
2a275ae
1f70be8
 
cb8646a
a938b8a
 
1f70be8
 
 
 
cb8646a
a938b8a
 
6c8936b
 
 
1f70be8
 
cb8646a
a938b8a
 
6c8936b
 
 
1f70be8
cb8646a
a938b8a
 
6c8936b
 
 
1f70be8
 
a938b8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2a275ae
cb8646a
2a275ae
 
6c8936b
 
 
 
a938b8a
6c8936b
 
 
 
 
1f70be8
6c8936b
 
1f70be8
7cb31c4
6c8936b
 
 
 
 
 
 
7cb31c4
 
1f70be8
 
2a275ae
a938b8a
cb8646a
2a275ae
 
1f70be8
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
import subprocess
import sys

import gradio as gr


def run_scripts():
    """Regenerate the dataset plots and report the outcome to the UI.

    Runs ``hub_datasets_by_language.py`` in a child process, using the same
    interpreter that is running this app so the script sees the same
    installed packages.

    Returns:
        A 5-tuple ``(status_message, img1, img2, img3, img4)`` matching the
        outputs wired to the update button. On success the four image
        entries are the paths of the regenerated plots. On failure they are
        no-op ``gr.update()`` values so the images already displayed stay
        on screen (returning ``None`` would clear them).
    """

    def _keep_current_images():
        # A value-less update tells Gradio to leave the component untouched.
        return tuple(gr.update() for _ in range(4))

    # Fall back to PATH lookup only if the interpreter path is unavailable.
    interpreter = sys.executable or "python"

    try:
        # Execute datasets script
        subprocess.run(
            [interpreter, "hub_datasets_by_language.py"],
            capture_output=True,
            text=True,
            check=True,
        )
        # TODO: also run "hub_models_by_language.py" (same arguments) once
        # the models script is ready.
    except subprocess.CalledProcessError as e:
        # Prefer the script's own diagnostics over the generic exit-code text.
        error_msg = e.stderr or e.stdout or str(e)
        return (f"❌ Failed to execute scripts: {error_msg}", *_keep_current_images())
    except Exception as e:
        # UI boundary: surface anything else (e.g. interpreter not found)
        # as a status message instead of crashing the event handler.
        return (f"❌ Unexpected error: {str(e)}", *_keep_current_images())

    # Success message and the paths of the freshly written plots.
    return (
        "✅ Scripts executed successfully! All plots have been updated.",
        "plots/datasets_bar_plot_horizontal.png",
        "plots/datasets_stack_area_en_es.png",
        "plots/datasets_bar_plot_vertical.png",
        "plots/datasets_time_series.png",
    )


def create_app():
    """Assemble the Gradio Blocks UI and return it (without launching).

    Layout, top to bottom: an introduction, a note on monolingual datasets,
    a two-column grid of four dataset plots, a row with the update button
    and its status label, a note on adapting to other languages, and a
    collapsed citation section. Clicking the button calls ``run_scripts``
    and routes its 5-tuple to the status label and the four images.
    """
    # Options shared by every plot image.
    image_options = dict(
        show_label=True,
        show_download_button=True,
        show_share_button=True,
    )

    # (file path, label) pairs, grouped by the column they are shown in.
    left_column_plots = (
        (
            "plots/datasets_bar_plot_horizontal.png",
            "Distribution of Datasets by Year (Horizontal)",
        ),
        (
            "plots/datasets_stack_area_en_es.png",
            "Cumulative Growth of Datasets (Stacked)",
        ),
    )
    right_column_plots = (
        (
            "plots/datasets_bar_plot_vertical.png",
            "Distribution of Datasets by Year (Vertical)",
        ),
        (
            "plots/datasets_time_series.png",
            "Cumulative Growth of Datasets (Line)",
        ),
    )

    with gr.Blocks() as demo:
        gr.Markdown(
            value="""
            # Visualizing The Language Gap In The Hugging Face Hub
                    
            The open-source community is creating more and more resources in languages other than English but there is still a huge gap. This Space showcases plots that can help visualize this gap in the case of Spanish and can easily be adapted to other languages.
        """
        )

        gr.Markdown(
            value="""
            ## English vs Spanish Monolingual Datasets
                    
            Note: We consider only **monolingual** resources in these plots, i.e. datasets and models that only contain data in one language. This is because *most* of the multilingual resources are usually machine-translated and we want to focus on original data.        
        """
        )

        # Images are collected in creation order (left column first) so they
        # line up with the order of paths returned by run_scripts.
        plot_images = []
        with gr.Row():
            for column_plots in (left_column_plots, right_column_plots):
                with gr.Column():
                    for plot_path, plot_label in column_plots:
                        plot_images.append(
                            gr.Image(
                                value=plot_path,
                                label=plot_label,
                                **image_options,
                            )
                        )

        # TODO: once the models script is ready, add a section headed
        # "## English vs Spanish Models" with a row holding one image,
        # "plots/models_stack_area_en_es.png", labelled
        # "Cumulative Growth of Models", using the same image options.

        with gr.Row():
            refresh_button = gr.Button(value="Update plots with latest data (5 mins)")
            status_label = gr.Label()

        gr.Markdown(
            value="""
            ## Adapt to other languages

            This Space is WIP and more languages and visuals will be included shortly. Meanwhile, you can clone the Space, adapt the code in the scripts and run it to generate plots for other languages.
            """
        )

        gr.Markdown(value="## Citation")
        with gr.Accordion("Citation information", open=False):
            gr.Markdown(
                value=r"""
                If you use these plots or the code please cite:

                ```
                @misc{grandury2024gaphf,
                    author = {María Grandury},
                    title = {Visualizing The Language Gap In The Hugging Face Hub},
                    year = {2024},
                    publisher = {Hugging Face},
                    howpublished = {\url{https://huggingface.co/spaces/mariagrandury/language-gap-in-hf-hub}},
                }
                ```
                """
            )

        # Status label first, then the four images, mirroring the tuple
        # returned by run_scripts.
        refresh_button.click(
            fn=run_scripts,
            outputs=[status_label, *plot_images],
        )

    return demo


# Build the Blocks UI once at module load and start serving it immediately.
# NOTE(review): launch() runs unconditionally, so importing this module also
# starts the app; presumably this file is the Space's entry script — confirm
# before importing it from other code.
app = create_app()
app.launch()