| """ | |
| Gradio dashboard to explore Lighteval tasks. | |
| Scans `src/lighteval/tasks/tasks` and `src/lighteval/tasks/multilingual/tasks` | |
| for module-level docstrings with this format: | |
| name: <task display name> | |
| dataset: <dataset id(s)> | |
| abstract: <free text> | |
| languages: <comma/newline separated language codes or names> | |
| tags: <comma/newline separated tags> | |
| paper: <url> | |
| This file stays outside the lighteval src tree, per request. | |
| """ | |
import ast
import os
import re
from collections import Counter
from dataclasses import dataclass

import gradio as gr
from git import Repo  # pip install gitpython

git_url = "https://github.com/huggingface/lighteval.git"
repo_dir = "./lighteval"

if os.path.exists(repo_dir) and os.path.isdir(os.path.join(repo_dir, ".git")):
    print(f"Pulling latest changes from {git_url}...")
    repo = Repo(repo_dir)
    repo.remotes.origin.pull()
else:
    print(f"Cloning {git_url} to {repo_dir}...")
    Repo.clone_from(git_url, repo_dir)

REPO_ROOT = "."
TASK_DIRS = [
    os.path.join(REPO_ROOT, "lighteval", "src", "lighteval", "tasks", "tasks"),
    os.path.join(REPO_ROOT, "lighteval", "src", "lighteval", "tasks", "multilingual", "tasks"),
]

star_benchmarks = [
    "aime",
    "mmlu_pro",
    "gpqa:diamond",
    "hle",
    "arc_agi_2",
    "ifbench",
    "ifeval",
    "live code bench",
    "math 500",
    "mix_eval",
    "musr",
    "simpleqa",
    "MMLU pro",
]


@dataclass
class TaskDoc:
    file_path: str
    module: str
    abstract: str
    languages: list[str]
    tags: list[str]
    paper: str | None
    dataset: str | None
    name: str | None = None

def read_file_text(path: str) -> str | None:
    try:
        with open(path, "r", encoding="utf-8") as f:
            return f.read()
    except Exception:
        return None


def parse_module_docstring(text: str) -> str | None:
    try:
        mod = ast.parse(text)
        return ast.get_docstring(mod)
    except Exception:
        # Fallback: naive regex for a triple-quoted string at the top of the file
        m = re.match(r"^\s*([\'\"])\1\1([\s\S]*?)\1\1\1", text)
        return m.group(2).strip() if m else None


def parse_sections(doc: str) -> dict[str, str]:
    # Very simple section parser keyed by lines ending with ':' on their own.
    # Expected keys: name, dataset, abstract, languages, tags, paper
    out: dict[str, str] = {"name": "", "dataset": "", "abstract": "", "languages": "", "tags": "", "paper": ""}
    current_key: str | None = None
    for raw_line in doc.splitlines():
        line = raw_line.rstrip()
        if line.endswith(":") and line.strip().lower() in {"name:", "dataset:", "abstract:", "languages:", "tags:", "paper:"}:
            current_key = line[:-1].strip().lower()
            continue
        if current_key is not None:
            # Preserve paragraphs; we will normalize later
            out[current_key] = (out[current_key] + ("\n" if out[current_key] else "") + line).strip()
    return out

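# For illustration (hypothetical docstring): parse_sections("name:\nGSM8K\n\ntags:\nmath, reasoning")
# returns {"name": "GSM8K", "tags": "math, reasoning", ...} with the remaining keys left empty.

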
def split_list_field(value: str) -> list[str]:
    if not value:
        return []
    # Support comma- and newline-separated values
    parts = re.split(r"[\n,]", value)
    cleaned: list[str] = []
    for p in parts:
        token = p.strip()
        if not token:
            continue
        cleaned.append(token)
    return cleaned

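# Example of the splitting behaviour: split_list_field("math, reasoning\ncode") -> ["math", "reasoning", "code"]

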
def discover_task_files() -> list[str]:
    files: list[str] = []
    print(f"Discovering task files in: {TASK_DIRS}")
    for base in TASK_DIRS:
        print(f"Discovering task files in: {base}")
        if not os.path.isdir(base):
            continue
        # Top-level python files in the directory
        for name in os.listdir(base):
            if name.endswith(".py"):
                files.append(os.path.join(base, name))
        # Also include subdirectory main.py files
        for dirpath, dirnames, filenames in os.walk(base):
            if dirpath == base:
                continue
            if "main.py" in filenames:
                files.append(os.path.join(dirpath, "main.py"))
    # Deduplicate while preserving order
    seen: set[str] = set()
    unique_files: list[str] = []
    for p in files:
        if p in seen:
            continue
        seen.add(p)
        unique_files.append(p)
    return sorted(unique_files)

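# For illustration, assuming a hypothetical layout under one of the TASK_DIRS:
#   gsm8k.py        -> included (top-level .py file)
#   gpqa/main.py    -> included (subdirectory main.py)
#   gpqa/utils.py   -> skipped (only main.py is taken from subdirectories)

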
def index_tasks() -> tuple[list[TaskDoc], list[str], list[str]]:
    docs: list[TaskDoc] = []
    language_counts: Counter = Counter()
    tag_set: set[str] = set()
    for path in discover_task_files():
        text = read_file_text(path)
        if not text:
            continue
        doc = parse_module_docstring(text)
        if not doc:
            continue
        sections = parse_sections(doc)
        abstract = sections.get("abstract", "").strip()
        langs = [lang.lower() for lang in split_list_field(sections.get("languages", ""))]
        tgs = [t.lower() for t in split_list_field(sections.get("tags", ""))]
        paper = sections.get("paper", "").strip() or None
        dataset = sections.get("dataset", "").strip() or None
        name = sections.get("name", "").strip() or None
        for lang in langs:
            language_counts[lang] += 1
        for t in tgs:
            tag_set.add(t)
        module = os.path.relpath(path, REPO_ROOT)
        docs.append(
            TaskDoc(
                file_path=path,
                module=module,
                abstract=abstract,
                languages=langs,
                tags=tgs,
                paper=paper,
                dataset=dataset,
                name=name,
            )
        )
    # Languages sorted by frequency (desc), then alphabetically; tags alphabetically
    languages_sorted = [
        lang for lang, _ in sorted(language_counts.items(), key=lambda kv: (-kv[1], kv[0]))
    ]
    tags_sorted = sorted(tag_set)
    return docs, languages_sorted, tags_sorted


def build_index() -> tuple[list[TaskDoc], list[str], list[str]]:
    return index_tasks()


ALL_TASKS, ALL_LANGS, ALL_TAGS = build_index()
TOP_LANGS = ALL_LANGS[:8]  # most common languages shown by default; the rest sit behind "Show all languages"


def normalize_name_for_matching(name: str) -> str:
    # Normalize for comparison: lowercase, remove underscores/spaces/colons
    return re.sub(r"[_\s:]+", "", name.lower())

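# e.g. normalize_name_for_matching("GPQA: Diamond") -> "gpqadiamond",
# the same string that the "gpqa:diamond" entry in star_benchmarks normalizes to.

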
def is_starred_benchmark(td: TaskDoc) -> bool:
    # Check multiple possible identifiers
    parts = td.module.replace("\\", "/").split("/")
    base_no_ext = parts[-1].rsplit(".", 1)[0]
    fallback_name = parts[-2] if base_no_ext == "main" and len(parts) >= 2 else base_no_ext
    # Normalize all possible identifiers
    task_name_raw = (td.name or "").lower().strip()
    task_name_display = (td.name or fallback_name).replace("_", " ").lower().strip()
    normalized_task_display = normalize_name_for_matching(task_name_display)
    normalized_module = normalize_name_for_matching(base_no_ext)
    normalized_name = normalize_name_for_matching(task_name_raw)
    # Also check the dataset id if available
    normalized_dataset = normalize_name_for_matching(td.dataset or "")
    # Check against the star_benchmarks list, trying multiple matching strategies
    for star_name in star_benchmarks:
        normalized_star = normalize_name_for_matching(star_name)
        # Exact match or substring match on the various fields
        if (
            normalized_star == normalized_task_display
            or normalized_star == normalized_module
            or normalized_star == normalized_name
            or normalized_star in normalized_task_display
            or normalized_star in normalized_module
            or (normalized_dataset and normalized_star in normalized_dataset)
            or star_name.lower() in task_name_display
            or star_name.lower() in base_no_ext.lower()
        ):
            return True
    return False

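# For illustration: a hypothetical task file named ifeval.py with no "name:" section
# is still starred, because its module name normalizes to "ifeval", matching the
# "ifeval" entry in star_benchmarks.

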
def filter_tasks(languages: list[str], tags: list[str], search: str) -> list[TaskDoc]:
    selected_langs = [lang.lower() for lang in (languages or [])]
    selected_tags = [t.lower() for t in (tags or [])]
    search_lc = (search or "").strip().lower()
    out: list[TaskDoc] = []
    for td in ALL_TASKS:
        if selected_langs and not any(lang in td.languages for lang in selected_langs):
            continue
        if selected_tags and not any(t in td.tags for t in selected_tags):
            continue
        if search_lc:
            # Search module path, abstract, tags, and dataset names
            hay = " ".join([td.module, td.abstract, ", ".join(td.tags), (td.dataset or "")]).lower()
            if search_lc not in hay:
                continue
        out.append(td)
    # Sort: starred benchmarks first, then by name
    out.sort(key=lambda td: (not is_starred_benchmark(td), (td.name or td.module).lower()))
    return out

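# Hypothetical call: filter_tasks(["french"], ["math"], "reasoning") keeps tasks whose
# languages include "french", whose tags include "math", and whose module path, abstract,
# tags, or dataset mention "reasoning", with starred benchmarks sorted first.

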
def render_cards(tasks: list[TaskDoc]) -> str:
    # Responsive grid of pretty cards; show all details without clicks
    items: list[str] = []
    for t in tasks:
        parts = t.module.replace("\\", "/").split("/")
        base_no_ext = parts[-1].rsplit(".", 1)[0]
        fallback_name = parts[-2] if base_no_ext == "main" and len(parts) >= 2 else base_no_ext
        task_name = (t.name or fallback_name).replace("_", " ").title()
        mod_path = t.module.replace("\\", "/")
        mod_path = mod_path.split("/", 1)[1]
        source_html = f'<a href="https://github.com/huggingface/lighteval/blob/main/{mod_path}" target="_blank" rel="noopener">source</a>'
        paper_html = f'<a href="{t.paper}" target="_blank" rel="noopener">paper</a>' if t.paper else ""
        tags_html = " ".join([f'<span class="chip" title="tag: {tag}">{tag}</span>' for tag in t.tags]) if t.tags else ""
        langs_html = " ".join([f'<span class="chip chip-lang" title="language: {lang}">{lang}</span>' for lang in t.languages]) if t.languages else ""
        chips_tags_html = f'<div class="chips chips-tags">{tags_html}</div>' if tags_html else ""
        chips_langs_html = f'<div class="chips chips-langs">{langs_html}</div>' if langs_html else ""
        abstract_html = (t.abstract or "-").replace("\n", "<br/>")
        sep_html = ' <span class="sep">|</span> ' if paper_html else ""
        links_html = f"{source_html}{sep_html}{paper_html}"
        dataset_links = []
        if t.dataset:
            for ds in [d.strip() for d in t.dataset.split(",") if d.strip()]:
                dataset_links.append(f'<a class="dataset" href="https://huggingface.co/datasets/{ds}" target="_blank" rel="noopener">{ds}</a>')
        dataset_html = " ".join(dataset_links) if dataset_links else ""
        star_icon = "⭐ " if is_starred_benchmark(t) else ""
        items.append(
            f"""
            <article class="card" tabindex="0" aria-label="Task {task_name}">
              <div class="title"><span class="title-text">{star_icon}{task_name}</span> <span class="dataset-inline">{dataset_html}</span></div>
              {chips_tags_html}
              {chips_langs_html}
              <div class="abstract">{abstract_html}</div>
              <div class="links">{links_html}</div>
            </article>
            """
        )

    # CSS includes light and dark mode support
    style = """
    <style>
      /* layout */
      .cards-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(280px, 1fr)); gap: 16px; margin-top: 10px; }

      /* card base */
      .card { border-radius: 12px; padding: 14px; transition: box-shadow 160ms ease, transform 120ms ease, border-color 120ms ease; outline: none; }
      .card:hover, .card:focus { transform: translateY(-4px); box-shadow: 0 10px 30px rgba(2,6,23,0.08); }
      .title { display:flex; align-items:center; gap:8px; flex-wrap:wrap; }
      .title-text { font-weight: 600; font-size: 16px; font-family: ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial; }
      .dataset-inline { font-size: 12px; }
      .chips { margin: 6px 0 4px 0; display:flex; gap:2px; flex-wrap:wrap; }
      .chips-tags { margin: 6px 0 2px 0; }
      .chips-langs { margin: 0 0 4px 0; }
      .chip { display:inline-block; padding:1px 2px; border-radius:999px; font-size:12px; background: #e6f2ff; color: #1e3a8a; }
      .chip-lang { background: #e8f5e9; color: #166534; }
      .abstract { color: #475569; font-size: 13.5px; line-height: 1.35; margin-top: 6px; min-height: 48px; }
      .links { margin-top: 10px; font-size:12px; }
      .links a { text-decoration: none; font-weight: 600; }
      .sep { color: #94a3b8; margin: 0 8px; }
      .dataset { margin-left: 8px; font-size: 12px; color: #0ea5e9; background: #ecfeff; padding: 2px 6px; border-radius: 6px; text-decoration: none; }

      /* Light mode */
      :root {
        --bg: #f8fafc;
        --card-bg: #ffffff;
        --card-border: #e6f2ff;
        --title-color: #1e3a8a;
        --text-color: #0f172a;
        --muted: #475569;
        --link: #2563eb;
      }

      /* Dark mode overrides */
      @media (prefers-color-scheme: dark) {
        :root {
          --bg: #0b1220;
          --card-bg: #071022;
          --card-border: #0f2a44;
          --title-color: #93c5fd;
          --text-color: #e6eef8;
          --muted: #cbd5e1;
          --link: #6ea8ff;
        }
      }

      /* apply */
      body { background: var(--bg); color: var(--text-color); }
      .card { background: var(--card-bg); border: 1px solid var(--card-border); color: var(--text-color); }
      .title-text { color: var(--title-color); }
      .abstract { color: var(--muted); }
      .links a { color: var(--link); }
      .chips-tags .chip { background: #e6f2ff; color: #1e3a8a; }
      .chips-langs .chip { background: #e8f5e9; color: #166534; }

      /* tweak chips for dark mode for better contrast */
      @media (prefers-color-scheme: dark) {
        .chips-tags .chip { background: rgba(29,78,216,0.35); color: #e6eef8; border: 1px solid rgba(148,163,184,0.15); }
        .chips-langs .chip { background: rgba(22,101,52,0.35); color: #e6eef8; border: 1px solid rgba(148,163,184,0.15); }
      }

      /* small screens adjustments */
      @media (max-width: 520px) {
        .cards-grid { gap: 10px; }
        .title-text { font-size: 15px; }
      }
    </style>
    """
    return style + '<div class="cards-grid">' + "\n".join(items) + "</div>"


def on_filter(languages: list[str], tags: list[str], search: str):
    tasks = filter_tasks(languages, tags, search)
    count = len(tasks)
    total = len(ALL_TASKS)
    counter_text = f"**Showing {count} of {total} tasks**" if count != total else f"**{total} tasks**"
    return counter_text, render_cards(tasks)


def on_toggle_language_choices(show_all: bool, selected_langs: list[str], tags: list[str], search: str):
    choices = ALL_LANGS if show_all else TOP_LANGS
    kept = [lang for lang in (selected_langs or []) if lang in choices]
    tasks = filter_tasks(kept, tags, search)
    count = len(tasks)
    total = len(ALL_TASKS)
    counter_text = f"**Showing {count} of {total} tasks**" if count != total else f"**{total} tasks**"
    return gr.update(choices=choices, value=kept), counter_text, render_cards(tasks)


def on_toggle_tags_visibility(show: bool, selected_tags: list[str], languages: list[str], search: str):
    # Only toggle visibility; preserve the current tag selections so they stay active in filtering
    tags_value: list[str] = selected_tags or []
    tasks = filter_tasks(languages, tags_value, search)
    count = len(tasks)
    total = len(ALL_TASKS)
    counter_text = f"**Showing {count} of {total} tasks**" if count != total else f"**{total} tasks**"
    # The value is kept even while the component is hidden, so filtering still uses it
    return gr.update(visible=show, value=tags_value), counter_text, render_cards(tasks)


with gr.Blocks(title="Lighteval Tasks Explorer", css=None) as demo:
    # Header / hero
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                """
                <h2 style="margin:6px 0 2px 0;"><a href="https://huggingface.co/huggingface/lighteval">Lighteval</a> Tasks Explorer</h2>
                <p style="margin:0 0 12px 0; color:var(--muted);">Browse tasks by language and tag, and search the task descriptions.</p>
                """
            )
            task_counter = gr.Markdown(f"**{len(ALL_TASKS)} tasks**")

    # Controls and results in two columns (left: controls, right: cards)
    with gr.Row(equal_height=False):
        with gr.Column(scale=2):
            gr.Markdown("⭐⭐⭐ Recommended benchmarks are marked with a star icon.")
            # Search box; live filtering is wired to its change event below
            search_tb = gr.Textbox(label="Search", placeholder="Search in module path, tags, abstract…", value="", interactive=True)

            # Filters
            with gr.Group():
                gr.Markdown("**Languages**")
                show_all_langs = gr.Checkbox(label="Show all languages", value=False)
                lang_dd = gr.CheckboxGroup(choices=TOP_LANGS, value=[])  # none selected by default
            with gr.Group():
                gr.Markdown("**Benchmark type**")
                show_tags_filters = gr.Checkbox(label="Show tag checkboxes", value=False)
                tag_dd = gr.CheckboxGroup(choices=ALL_TAGS, value=[], visible=False)

            # Small hint
            gr.Markdown("Tip: use the filters and search together. Results update live.")
        with gr.Column(scale=5):
            cards = gr.HTML()
            # Initially visible loading placeholder, replaced with real cards before launch
            cards.value = "<div style='padding:18px'>Loading tasks…</div>"

    # Wire interactions
    # Toggle expand/collapse of the language choices
    show_all_langs.change(on_toggle_language_choices, inputs=[show_all_langs, lang_dd, tag_dd, search_tb], outputs=[lang_dd, task_counter, cards])
    # Toggle tag filter visibility (keeps the selected values)
    show_tags_filters.change(on_toggle_tags_visibility, inputs=[show_tags_filters, tag_dd, lang_dd, search_tb], outputs=[tag_dd, task_counter, cards])
    # Live filtering: the change event on each control updates the counter and cards
    search_tb.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
    lang_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
    tag_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])

    # Initial load: display all tasks (starred benchmarks first)
    initial_tasks = filter_tasks([], [], "")
    cards.value = render_cards(initial_tasks)

# Run with `python benchmark_finder/app.py`
demo.launch()