File size: 18,542 Bytes
8628943
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7532d9b
8628943
 
 
72e68df
7532d9b
8628943
7532d9b
 
8628943
 
 
3917981
 
 
 
 
 
 
 
 
 
 
 
387857c
 
3917981
 
 
8628943
 
 
 
 
445a7b6
 
 
 
 
8628943
 
445a7b6
8628943
 
 
 
 
 
 
445a7b6
8628943
 
 
 
 
 
 
 
 
445a7b6
8628943
 
445a7b6
 
8628943
 
 
 
 
 
 
 
 
 
 
445a7b6
8628943
 
 
 
445a7b6
8628943
 
 
 
 
 
 
 
445a7b6
 
7532d9b
8628943
7532d9b
8628943
 
 
 
 
 
 
 
 
 
 
 
 
 
445a7b6
8628943
 
 
 
 
 
 
 
445a7b6
 
8628943
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7532d9b
 
8628943
 
7532d9b
290947d
8628943
 
3917981
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445a7b6
8628943
 
 
445a7b6
8628943
 
 
 
 
 
290947d
 
8628943
 
 
3917981
 
8628943
 
 
445a7b6
8628943
445a7b6
8628943
290947d
8628943
 
290947d
 
7532d9b
290947d
 
b7fe2d8
 
 
 
290947d
 
 
8628943
 
 
290947d
8628943
3917981
8628943
 
290947d
3917981
b7fe2d8
 
290947d
 
 
8628943
 
290947d
8628943
 
290947d
 
 
 
 
 
 
 
 
 
 
b7fe2d8
 
 
 
 
290947d
 
 
 
 
 
 
8628943
290947d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b7fe2d8
 
290947d
 
b7fe2d8
 
290947d
 
 
 
 
 
 
8628943
 
 
 
 
445a7b6
8628943
ca0d805
 
 
 
8628943
 
445a7b6
8628943
 
 
ca0d805
 
 
 
8628943
 
445a7b6
8628943
445a7b6
8628943
ca0d805
 
 
290947d
ca0d805
8628943
 
290947d
 
 
ca0d805
 
 
 
 
 
 
 
8628943
290947d
 
 
3917981
290947d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ca0d805
290947d
ca0d805
8628943
290947d
 
ca0d805
 
 
8628943
3917981
ca0d805
 
8628943
 
 
7532d9b
 
 
 
 
 
 
 
 
 
 
 
 
8628943
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
"""
Gradio dashboard to explore Lighteval tasks.

Scans `src/lighteval/tasks/tasks` and `src/lighteval/tasks/multilingual/tasks`
for module-level docstrings with this format:

name: <task display name>
dataset: <dataset id(s)>
abstract: <free text>
languages: <comma/newline separated language codes or names>
tags: <comma/newline separated tags>
paper: <url>

This file stays outside the lighteval src tree, per request.
"""

import ast
import os
import re
from collections import Counter
from dataclasses import dataclass

import gradio as gr


# Directories scanned for task modules. The lighteval repo is cloned/pulled
# into ./lighteval by the __main__ entry point below, so both paths point
# inside that checkout.
REPO_ROOT = "."
TASK_DIRS = [
    os.path.join(REPO_ROOT, "lighteval", "src", "lighteval", "tasks", "tasks"),
    os.path.join(REPO_ROOT, "lighteval", "src", "lighteval", "tasks", "multilingual", "tasks"),
]


# Curated benchmarks highlighted with a star and sorted first in the UI.
# Entries are matched fuzzily (case-, space-, underscore- and colon-insensitive,
# plus substring matching) by is_starred_benchmark, so mixed spellings here
# (e.g. "MMLU pro" vs "mmlu_pro") are intentional.
star_benchmarks = [
    "aime",
    "mmlu_pro",
    "gpqa:diamond",
    "hle",
    "arc_agi_2",
    "ifbench",
    "ifeval",
    "live code bench",
    "math 500",
    "mix_eval",
    "musr",
    "simpleqa",
    "MMLU pro"
]


@dataclass
class TaskDoc:
    """Parsed metadata for one task module, extracted from its docstring."""

    file_path: str  # path to the source .py file as discovered on disk
    module: str  # path relative to REPO_ROOT; used for display and source links
    abstract: str  # free-text description (may be multi-line)
    languages: list[str]  # lowercased language tokens
    tags: list[str]  # lowercased tag tokens
    paper: str | None  # paper URL, if the docstring provided one
    dataset: str | None  # comma-separated dataset id(s), if provided
    name: str | None = None  # display-name override from the docstring


def read_file_text(path: str) -> str | None:
    try:
        with open(path, "r", encoding="utf-8") as f:
            return f.read()
    except Exception:
        return None


def parse_module_docstring(text: str) -> str | None:
    try:
        mod = ast.parse(text)
        return ast.get_docstring(mod)
    except Exception:
        # Fallback: naive regex for triple-quoted string at top
        m = re.match(r"^\s*([\'\"])\1\1([\s\S]*?)\1\1\1", text)
        return m.group(2).strip() if m else None


def parse_sections(doc: str) -> dict[str, str]:
    """Split a task docstring into its known sections.

    Recognized keys: name, dataset, abstract, languages, tags, paper.
    A section starts on a line that is either a bare ``key:`` header or an
    inline ``key: value`` pair — the latter is the format documented in this
    module's docstring (``name: <task display name>``) but was previously not
    captured at all. Continuation lines are appended to the current section
    (joined with newlines) until the next recognized key appears.

    Returns a dict with every key present; missing sections are "".
    """
    keys = ("name", "dataset", "abstract", "languages", "tags", "paper")
    out: dict[str, str] = {k: "" for k in keys}
    current_key: str | None = None
    for raw_line in doc.splitlines():
        line = raw_line.rstrip()
        prefix, sep, rest = line.strip().partition(":")
        if sep and prefix.lower() in keys:
            # New section; an inline value (if any) seeds its content.
            current_key = prefix.lower()
            rest = rest.strip()
            if rest:
                out[current_key] = rest
            continue
        if current_key is not None:
            # Preserve paragraphs; values are normalized later by callers.
            out[current_key] = (out[current_key] + ("\n" if out[current_key] else "") + line).strip()
    return out


def split_list_field(value: str) -> list[str]:
    """Turn a comma- or newline-separated field into a list of clean tokens.

    Blank pieces are dropped; an empty input yields an empty list.
    """
    if not value:
        return []
    pieces = (piece.strip() for piece in re.split(r"[\n,]", value))
    return [piece for piece in pieces if piece]


def discover_task_files() -> list[str]:
    """Collect candidate task module paths from every configured task dir.

    Picks up the top-level ``*.py`` files in each TASK_DIRS entry plus any
    ``main.py`` nested in a subdirectory, drops duplicates, and returns the
    paths sorted. Directories that do not exist are silently skipped.
    """
    print(f"Discovering task files in: {TASK_DIRS}")
    candidates: list[str] = []
    for base in TASK_DIRS:
        print(f"Discovering task files in: {base}")
        if not os.path.isdir(base):
            continue
        # Top-level python files in the directory.
        candidates.extend(
            os.path.join(base, entry) for entry in os.listdir(base) if entry.endswith(".py")
        )
        # Package-style tasks: a main.py inside a subdirectory.
        for dirpath, _dirnames, filenames in os.walk(base):
            if dirpath != base and "main.py" in filenames:
                candidates.append(os.path.join(dirpath, "main.py"))
    # dict.fromkeys deduplicates while preserving order; sorted() fixes output.
    return sorted(dict.fromkeys(candidates))


def index_tasks() -> tuple[list[TaskDoc], list[str], list[str]]:
    """Scan every discovered task file and build the searchable index.

    Files without readable text or without a module docstring are skipped.
    Returns (task docs, languages ordered by frequency then name, sorted tags).
    """
    docs: list[TaskDoc] = []
    lang_counter: Counter = Counter()
    all_tags: set = set()
    for path in discover_task_files():
        text = read_file_text(path)
        if not text:
            continue
        docstring = parse_module_docstring(text)
        if not docstring:
            continue
        fields = parse_sections(docstring)
        languages = [item.lower() for item in split_list_field(fields.get("languages", ""))]
        tags = [item.lower() for item in split_list_field(fields.get("tags", ""))]
        lang_counter.update(languages)
        all_tags.update(tags)
        docs.append(
            TaskDoc(
                file_path=path,
                module=os.path.relpath(path, REPO_ROOT),
                abstract=fields.get("abstract", "").strip(),
                languages=languages,
                tags=tags,
                paper=fields.get("paper", "").strip() or None,
                dataset=fields.get("dataset", "").strip() or None,
                name=fields.get("name", "").strip() or None,
            )
        )
    # Most frequent languages first; alphabetical tie-break.
    by_frequency = sorted(lang_counter.items(), key=lambda kv: (-kv[1], kv[0]))
    return docs, [lang for lang, _ in by_frequency], sorted(all_tags)


def build_index() -> tuple[list[TaskDoc], list[str], list[str]]:
    """Build the task index; thin alias over index_tasks() for call-site readability."""
    return index_tasks()


# Index every task once at import time; all UI handlers filter this in-memory list.
ALL_TASKS, ALL_LANGS, ALL_TAGS = build_index()
TOP_LANGS = ALL_LANGS[:8]  # language choices shown before "Show all languages" is ticked


def normalize_name_for_matching(name: str) -> str:
    """Canonicalize a benchmark name for fuzzy comparison.

    Lowercases the input and removes underscores, whitespace and colons so
    that e.g. "MMLU Pro", "mmlu_pro" and "mmlu:pro" all collapse to "mmlupro".
    """
    lowered = name.lower()
    return re.sub(r"[_\s:]+", "", lowered)


def is_starred_benchmark(td: TaskDoc) -> bool:
    """Return True when *td* matches one of the curated ``star_benchmarks``.

    Several identifiers are tried (docstring name, humanized display name,
    module file stem, dataset id) with both exact and substring matching, so
    the star survives naming inconsistencies between task files.
    """
    path_parts = td.module.replace("\\", "/").split("/")
    file_stem = path_parts[-1].rsplit(".", 1)[0]
    # Package-style tasks live in <dir>/main.py; use the directory name then.
    if file_stem == "main" and len(path_parts) >= 2:
        fallback = path_parts[-2]
    else:
        fallback = file_stem

    raw_name = (td.name or "").lower().strip()
    display_name = (td.name or fallback).replace("_", " ").lower().strip()
    norm_display = normalize_name_for_matching(display_name)
    norm_module = normalize_name_for_matching(file_stem)
    norm_raw = normalize_name_for_matching(raw_name)
    norm_dataset = normalize_name_for_matching(td.dataset or "")

    for star in star_benchmarks:
        norm_star = normalize_name_for_matching(star)
        if norm_star in (norm_display, norm_module, norm_raw):
            return True
        if norm_star in norm_display or norm_star in norm_module:
            return True
        if norm_dataset and norm_star in norm_dataset:
            return True
        if star.lower() in display_name or star.lower() in file_stem.lower():
            return True
    return False


def filter_tasks(languages: list[str], tags: list[str], search: str) -> list[TaskDoc]:
    """Return ALL_TASKS narrowed by language, tag and free-text filters.

    Within a filter the values are OR-ed; across filters they are AND-ed.
    The search term matches case-insensitively against module path, abstract,
    joined tags and dataset. Starred benchmarks sort first, then by display
    name (falling back to the module path).
    """
    wanted_langs = {lang.lower() for lang in (languages or [])}
    wanted_tags = {tag.lower() for tag in (tags or [])}
    needle = (search or "").strip().lower()

    matches: list[TaskDoc] = []
    for task in ALL_TASKS:
        if wanted_langs and wanted_langs.isdisjoint(task.languages):
            continue
        if wanted_tags and wanted_tags.isdisjoint(task.tags):
            continue
        if needle:
            haystack = " ".join(
                [task.module, task.abstract, ", ".join(task.tags), task.dataset or ""]
            ).lower()
            if needle not in haystack:
                continue
        matches.append(task)
    matches.sort(key=lambda task: (not is_starred_benchmark(task), (task.name or task.module).lower()))
    return matches


def render_cards(tasks: list[TaskDoc]) -> str:
    """Render *tasks* as a self-contained HTML string: an inline <style> block
    (light + dark mode) followed by a responsive grid of cards, one per task.
    No external CSS/JS is required.
    """
    # Responsive grid of pretty cards; show all details without clicks
    items: list[str] = []
    for t in tasks:
        parts = t.module.replace("\\", "/").split("/")
        base_no_ext = parts[-1].rsplit(".", 1)[0]
        # Package-style tasks (<dir>/main.py) display the directory name instead.
        fallback_name = parts[-2] if base_no_ext == "main" and len(parts) >= 2 else base_no_ext
        task_name = (t.name or fallback_name).replace("_", " ").title()
        mod_path = t.module.replace("\\", "/")
        # Drop the leading checkout directory ("lighteval/") to form the GitHub
        # blob path. NOTE(review): assumes module always contains a "/" — a
        # path without one would raise IndexError here; confirm with callers.
        mod_path = mod_path.split("/", 1)[1]
        source_html = f'<a href="https://github.com/huggingface/lighteval/blob/main/{mod_path}" target="_blank" rel="noopener">source</a>'
        paper_html = f'<a href="{t.paper}" target="_blank" rel="noopener">paper</a>' if t.paper else ""
        tags_html = " ".join([f'<span class=\"chip\" title=\"tag: {tag}\">{tag}</span>' for tag in t.tags]) if t.tags else ""
        langs_html = " ".join([f'<span class=\"chip chip-lang\" title=\"language: {lang}\">{lang}</span>' for lang in t.languages]) if t.languages else ""
        chips_tags_html = f'<div class="chips chips-tags">{tags_html}</div>' if tags_html else ""
        chips_langs_html = f'<div class="chips chips-langs">{langs_html}</div>' if langs_html else ""
        abstract_html = (t.abstract or "-").replace("\n", "<br/>")
        sep_html = ' <span class="sep">|</span> ' if paper_html else ""
        links_html = f"{source_html}{sep_html}{paper_html}"
        # Each comma-separated dataset id becomes a link to its HF dataset page.
        dataset_links = []
        if t.dataset:
            for ds in [d.strip() for d in t.dataset.split(",") if d.strip()]:
                dataset_links.append(f'<a class="dataset" href="https://huggingface.co/datasets/{ds}" target="_blank" rel="noopener">{ds}</a>')
        dataset_html = " ".join(dataset_links) if dataset_links else ""
        star_icon = "⭐ " if is_starred_benchmark(t) else ""
        items.append(
            f"""
            <article class="card" tabindex="0" aria-label="Task {task_name}">
              <div class="title"><span class="title-text">{star_icon}{task_name}</span> <span class="dataset-inline">{dataset_html}</span></div>
              {chips_tags_html}
              {chips_langs_html}
              <div class="abstract">{abstract_html}</div>
              <div class="links">{links_html}</div>
            </article>
            """
        )
    # CSS includes light and dark mode support
    style = """
    <style>
      /* layout */
      .cards-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(280px, 1fr)); gap: 16px; margin-top: 10px; }

      /* card base */
      .card { border-radius: 12px; padding: 14px; transition: box-shadow 160ms ease, transform 120ms ease, border-color 120ms ease; outline: none; }
      .card:hover, .card:focus { transform: translateY(-4px); box-shadow: 0 10px 30px rgba(2,6,23,0.08); }

      .title { display:flex; align-items:center; gap:8px; flex-wrap:wrap; }
      .title-text { font-weight: 600; font-size: 16px; font-family: ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial; }
      .dataset-inline { font-size: 12px; }

      .chips { margin: 6px 0 4px 0; display:flex; gap:2px; flex-wrap:wrap; }
      .chips-tags { margin: 6px 0 2px 0; }
      .chips-langs { margin: 0 0 4px 0; }
      .chip { display:inline-block; padding:1px 2px; border-radius:999px; font-size:12px; background: #e6f2ff; color: #1e3a8a; }
      .chip-lang { background: #e8f5e9; color: #166534; }

      .abstract { color: #475569; font-size: 13.5px; line-height: 1.35; margin-top: 6px; min-height: 48px; }
      .links { margin-top: 10px; font-size:12px; }
      .links a { text-decoration: none; font-weight: 600; }

      .sep { color: #94a3b8; margin: 0 8px; }

      .dataset { margin-left: 8px; font-size: 12px; color: #0ea5e9; background: #ecfeff; padding: 2px 6px; border-radius: 6px; text-decoration: none; }

      /* Light mode */
      :root {
        --bg: #f8fafc;
        --card-bg: #ffffff;
        --card-border: #e6f2ff;
        --title-color: #1e3a8a;
        --text-color: #0f172a;
        --muted: #475569;
        --link: #2563eb;
      }

      /* Dark mode overrides */
      @media (prefers-color-scheme: dark) {
        :root {
          --bg: #0b1220;
          --card-bg: #071022;
          --card-border: #0f2a44;
          --title-color: #93c5fd;
          --text-color: #e6eef8;
          --muted: #cbd5e1;
          --link: #6ea8ff;
        }
      }

      /* apply */
      body { background: var(--bg); color: var(--text-color); }
      .card { background: var(--card-bg); border: 1px solid var(--card-border); color: var(--text-color); }
      .title-text { color: var(--title-color); }
      .abstract { color: var(--muted); }
      .links a { color: var(--link); }
      .chips-tags .chip { background: #e6f2ff; color: #1e3a8a; }
      .chips-langs .chip { background: #e8f5e9; color: #166534; }
      /* tweak chips for dark mode for better contrast */
      @media (prefers-color-scheme: dark) {
        .chips-tags .chip { background: rgba(29,78,216,0.35); color: #e6eef8; border: 1px solid rgba(148,163,184,0.15); }
        .chips-langs .chip { background: rgba(22,101,52,0.35); color: #e6eef8; border: 1px solid rgba(148,163,184,0.15); }
      }

      /* small screens adjustments */
      @media (max-width: 520px) {
        .cards-grid { gap: 10px; }
        .title-text { font-size: 15px; }
      }
    </style>
    """
    return style + "<div class=\"cards-grid\">" + "\n".join(items) + "</div>"


def on_filter(languages: list[str], tags: list[str], search: str):
    """Gradio handler: recompute the counter markdown and card HTML for the
    current language/tag/search selections.
    """
    matching = filter_tasks(languages, tags, search)
    shown, total = len(matching), len(ALL_TASKS)
    if shown == total:
        counter = f"**{total} tasks**"
    else:
        counter = f"**Showing {shown} of {total} tasks**"
    return counter, render_cards(matching)


def on_toggle_language_choices(show_all: bool, selected_langs: list[str], tags: list[str], search: str):
    """Swap the language checkbox list between the top-N and the full set.

    Selections absent from the new choice list are dropped, and the results
    are re-filtered with whatever selections survive.
    """
    choices = ALL_LANGS if show_all else TOP_LANGS
    surviving = [lang for lang in (selected_langs or []) if lang in choices]
    matching = filter_tasks(surviving, tags, search)
    shown, total = len(matching), len(ALL_TASKS)
    counter = f"**Showing {shown} of {total} tasks**" if shown != total else f"**{total} tasks**"
    return gr.update(choices=choices, value=surviving), counter, render_cards(matching)


def on_toggle_tags_visibility(show: bool, selected_tags: list[str], languages: list[str], search: str):
    """Show or hide the tag checkbox group without dropping its selections.

    Hidden selections keep participating in filtering, so toggling visibility
    alone never changes which cards are displayed.
    """
    kept_tags: list[str] = selected_tags or []
    matching = filter_tasks(languages, kept_tags, search)
    shown, total = len(matching), len(ALL_TASKS)
    counter = f"**Showing {shown} of {total} tasks**" if shown != total else f"**{total} tasks**"
    # Keep the value even while hidden so the filter still applies it.
    return gr.update(visible=show, value=kept_tags), counter, render_cards(matching)


# UI definition: two-column layout (filters on the left, cards on the right),
# with live re-filtering wired to every control change.
with gr.Blocks(title="Lighteval Tasks Explorer", css=None) as demo:
    # Header / hero
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                """
                <h2 style="margin:6px 0 2px 0;"><a href="https://huggingface.co/huggingface/lighteval">Lighteval</a> Tasks Explorer</h2>
                <p style="margin:0 0 12px 0; color:var(--muted);">Browse tasks by language, tags and search the task descriptions.</p>
                """
            )
            # Live "N tasks" counter; every handler below returns an update for it.
            task_counter = gr.Markdown(f"**{len(ALL_TASKS)} tasks**")

    # Controls and results in two columns (left: controls, right: cards)
    with gr.Row(equal_height=False):
        with gr.Column(scale=2):
            gr.Markdown("⭐⭐⭐ Recommended benchmarks are marked with a star icon.")
            # Search with interactive debounce
            search_tb = gr.Textbox(label="Search", placeholder="Search in module path, tags, abstract…", value="", interactive=True)
            # We want debounce behavior: use .change with every character by setting interactive=True and triggering on input
            # Filters
            with gr.Group():
                gr.Markdown("**Languages**")
                show_all_langs = gr.Checkbox(label="Show all languages", value=False)
                lang_dd = gr.CheckboxGroup(choices=TOP_LANGS, value=[])  # default none selected
            with gr.Group():
                gr.Markdown("**Benchmark type**")
                show_tags_filters = gr.Checkbox(label="Show tag checkboxes", value=False)
                tag_dd = gr.CheckboxGroup(choices=ALL_TAGS, value=[], visible=False)
            # small hint
            gr.Markdown("Tip: use the filters and search together. Results update live.")

        with gr.Column(scale=5):
            cards = gr.HTML()
            # put an initially visible loading placeholder
            cards.value = "<div style='padding:18px'>Loading tasks…</div>"

    # Wire interactions
    # Toggle expand/collapse language choices
    show_all_langs.change(on_toggle_language_choices, inputs=[show_all_langs, lang_dd, tag_dd, search_tb], outputs=[lang_dd, task_counter, cards])
    # Toggle tag filter visibility (keeps values)
    show_tags_filters.change(on_toggle_tags_visibility, inputs=[show_tags_filters, tag_dd, lang_dd, search_tb], outputs=[tag_dd, task_counter, cards])

    # Live filtering: wire change events on controls to update cards.
    # Textbox: trigger on every change (interactive True). If Gradio runtime has debounce param, it's used internally.
    search_tb.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
    lang_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
    tag_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])

    # Initial load: display all tasks (starred benchmarks first)
    initial_tasks = filter_tasks([], [], "")
    cards.value = render_cards(initial_tasks)


if __name__ == "__main__":
    # GitPython is only needed when run as a script, hence the local import.
    from git import Repo  # pip install gitpython

    git_url = "https://github.com/huggingface/lighteval.git"
    repo_dir = "./lighteval"
    
    # NOTE(review): ALL_TASKS is built at module import time, i.e. BEFORE this
    # clone/pull runs — on a fresh checkout the first launch will show zero
    # tasks until the process is restarted. Worth confirming and reordering.
    if os.path.exists(repo_dir) and os.path.isdir(os.path.join(repo_dir, ".git")):
        print(f"Pulling latest changes from {git_url}...")
        repo = Repo(repo_dir)
        repo.remotes.origin.pull()
    else:
        print(f"Cloning {git_url} to {repo_dir}...")
        Repo.clone_from(git_url, repo_dir)
    
    # Run with `python benchmark_finder/app.py`
    demo.launch()