File size: 18,542 Bytes
8628943
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7532d9b
8628943
 
 
72e68df
7532d9b
8628943
7532d9b
 
8628943
 
 
3917981
 
 
 
 
 
 
 
 
 
 
 
387857c
 
3917981
 
 
8628943
 
 
 
 
445a7b6
 
 
 
 
8628943
 
445a7b6
8628943
 
 
 
 
 
 
445a7b6
8628943
 
 
 
 
 
 
 
 
445a7b6
8628943
 
445a7b6
 
8628943
 
 
 
 
 
 
 
 
 
 
445a7b6
8628943
 
 
 
445a7b6
8628943
 
 
 
 
 
 
 
445a7b6
 
7532d9b
8628943
7532d9b
8628943
 
 
 
 
 
 
 
 
 
 
 
 
 
445a7b6
8628943
 
 
 
 
 
 
 
445a7b6
 
8628943
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7532d9b
 
8628943
 
7532d9b
290947d
8628943
 
3917981
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
445a7b6
8628943
 
 
445a7b6
8628943
 
 
 
 
 
290947d
 
8628943
 
 
3917981
 
8628943
 
 
445a7b6
8628943
445a7b6
8628943
290947d
8628943
 
290947d
 
7532d9b
290947d
 
b7fe2d8
 
 
 
290947d
 
 
8628943
 
 
290947d
8628943
3917981
8628943
 
290947d
3917981
b7fe2d8
 
290947d
 
 
8628943
 
290947d
8628943
 
290947d
 
 
 
 
 
 
 
 
 
 
b7fe2d8
 
 
 
 
290947d
 
 
 
 
 
 
8628943
290947d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b7fe2d8
 
290947d
 
b7fe2d8
 
290947d
 
 
 
 
 
 
8628943
 
 
 
 
445a7b6
8628943
ca0d805
 
 
 
8628943
 
445a7b6
8628943
 
 
ca0d805
 
 
 
8628943
 
445a7b6
8628943
445a7b6
8628943
ca0d805
 
 
290947d
ca0d805
8628943
 
290947d
 
 
ca0d805
 
 
 
 
 
 
 
8628943
290947d
 
 
3917981
290947d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ca0d805
290947d
ca0d805
8628943
290947d
 
ca0d805
 
 
8628943
3917981
ca0d805
 
8628943
 
 
7532d9b
 
 
 
 
 
 
 
 
 
 
 
 
8628943
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
"""
Gradio dashboard to explore Lighteval tasks.

Scans `src/lighteval/tasks/tasks` and `src/lighteval/tasks/multilingual/tasks`
for module-level docstrings with this format:

name: <task display name>
dataset: <dataset id(s)>
abstract: <free text>
languages: <comma/newline separated language codes or names>
tags: <comma/newline separated tags>
paper: <url>

This file stays outside the lighteval src tree, per request.
"""

import ast
import os
import re
from collections import Counter
from dataclasses import dataclass

import gradio as gr


# Directories scanned for task modules. The lighteval repo is cloned/pulled
# into ./lighteval by the __main__ entry point below, so both paths point
# inside that checkout.
REPO_ROOT = "."
TASK_DIRS = [
    os.path.join(REPO_ROOT, "lighteval", "src", "lighteval", "tasks", "tasks"),
    os.path.join(REPO_ROOT, "lighteval", "src", "lighteval", "tasks", "multilingual", "tasks"),
]


# Curated benchmarks highlighted with a star and sorted first in the UI.
# Entries are matched fuzzily (case-, space-, underscore- and colon-insensitive,
# plus substring matching) by is_starred_benchmark, so mixed spellings here
# (e.g. "MMLU pro" vs "mmlu_pro") are intentional.
star_benchmarks = [
    "aime",
    "mmlu_pro",
    "gpqa:diamond",
    "hle",
    "arc_agi_2",
    "ifbench",
    "ifeval",
    "live code bench",
    "math 500",
    "mix_eval",
    "musr",
    "simpleqa",
    "MMLU pro"
]


@dataclass
class TaskDoc:
    """Parsed metadata for one task module, extracted from its docstring."""

    file_path: str  # path to the source .py file as discovered on disk
    module: str  # path relative to REPO_ROOT; used for display and source links
    abstract: str  # free-text description (may be multi-line)
    languages: list[str]  # lowercased language tokens
    tags: list[str]  # lowercased tag tokens
    paper: str | None  # paper URL, if the docstring provided one
    dataset: str | None  # comma-separated dataset id(s), if provided
    name: str | None = None  # display-name override from the docstring


def read_file_text(path: str) -> str | None:
    try:
        with open(path, "r", encoding="utf-8") as f:
            return f.read()
    except Exception:
        return None


def parse_module_docstring(text: str) -> str | None:
    try:
        mod = ast.parse(text)
        return ast.get_docstring(mod)
    except Exception:
        # Fallback: naive regex for triple-quoted string at top
        m = re.match(r"^\s*([\'\"])\1\1([\s\S]*?)\1\1\1", text)
        return m.group(2).strip() if m else None


def parse_sections(doc: str) -> dict[str, str]:
    """Split a task docstring into its known sections.

    Recognized keys: name, dataset, abstract, languages, tags, paper.
    A section starts on a line that is either a bare ``key:`` header or an
    inline ``key: value`` pair — the latter is the format documented in this
    module's docstring (``name: <task display name>``) but was previously not
    captured at all. Continuation lines are appended to the current section
    (joined with newlines) until the next recognized key appears.

    Returns a dict with every key present; missing sections are "".
    """
    keys = ("name", "dataset", "abstract", "languages", "tags", "paper")
    out: dict[str, str] = {k: "" for k in keys}
    current_key: str | None = None
    for raw_line in doc.splitlines():
        line = raw_line.rstrip()
        prefix, sep, rest = line.strip().partition(":")
        if sep and prefix.lower() in keys:
            # New section; an inline value (if any) seeds its content.
            current_key = prefix.lower()
            rest = rest.strip()
            if rest:
                out[current_key] = rest
            continue
        if current_key is not None:
            # Preserve paragraphs; values are normalized later by callers.
            out[current_key] = (out[current_key] + ("\n" if out[current_key] else "") + line).strip()
    return out


def split_list_field(value: str) -> list[str]:
    """Turn a comma- or newline-separated field into a list of clean tokens.

    Blank pieces are dropped; an empty input yields an empty list.
    """
    if not value:
        return []
    pieces = (piece.strip() for piece in re.split(r"[\n,]", value))
    return [piece for piece in pieces if piece]


def discover_task_files() -> list[str]:
    """Collect candidate task module paths from every configured task dir.

    Picks up the top-level ``*.py`` files in each TASK_DIRS entry plus any
    ``main.py`` nested in a subdirectory, drops duplicates, and returns the
    paths sorted. Directories that do not exist are silently skipped.
    """
    print(f"Discovering task files in: {TASK_DIRS}")
    candidates: list[str] = []
    for base in TASK_DIRS:
        print(f"Discovering task files in: {base}")
        if not os.path.isdir(base):
            continue
        # Top-level python files in the directory.
        candidates.extend(
            os.path.join(base, entry) for entry in os.listdir(base) if entry.endswith(".py")
        )
        # Package-style tasks: a main.py inside a subdirectory.
        for dirpath, _dirnames, filenames in os.walk(base):
            if dirpath != base and "main.py" in filenames:
                candidates.append(os.path.join(dirpath, "main.py"))
    # dict.fromkeys deduplicates while preserving order; sorted() fixes output.
    return sorted(dict.fromkeys(candidates))


def index_tasks() -> tuple[list[TaskDoc], list[str], list[str]]:
    """Scan every discovered task file and build the searchable index.

    Files without readable text or without a module docstring are skipped.
    Returns (task docs, languages ordered by frequency then name, sorted tags).
    """
    docs: list[TaskDoc] = []
    lang_counter: Counter = Counter()
    all_tags: set = set()
    for path in discover_task_files():
        text = read_file_text(path)
        if not text:
            continue
        docstring = parse_module_docstring(text)
        if not docstring:
            continue
        fields = parse_sections(docstring)
        languages = [item.lower() for item in split_list_field(fields.get("languages", ""))]
        tags = [item.lower() for item in split_list_field(fields.get("tags", ""))]
        lang_counter.update(languages)
        all_tags.update(tags)
        docs.append(
            TaskDoc(
                file_path=path,
                module=os.path.relpath(path, REPO_ROOT),
                abstract=fields.get("abstract", "").strip(),
                languages=languages,
                tags=tags,
                paper=fields.get("paper", "").strip() or None,
                dataset=fields.get("dataset", "").strip() or None,
                name=fields.get("name", "").strip() or None,
            )
        )
    # Most frequent languages first; alphabetical tie-break.
    by_frequency = sorted(lang_counter.items(), key=lambda kv: (-kv[1], kv[0]))
    return docs, [lang for lang, _ in by_frequency], sorted(all_tags)


def build_index() -> tuple[list[TaskDoc], list[str], list[str]]:
    """Build the task index; thin alias over index_tasks() for call-site readability."""
    return index_tasks()


# Index every task once at import time; all UI handlers filter this in-memory list.
ALL_TASKS, ALL_LANGS, ALL_TAGS = build_index()
TOP_LANGS = ALL_LANGS[:8]  # language choices shown before "Show all languages" is ticked


def normalize_name_for_matching(name: str) -> str:
    """Canonicalize a benchmark name for fuzzy comparison.

    Lowercases the input and removes underscores, whitespace and colons so
    that e.g. "MMLU Pro", "mmlu_pro" and "mmlu:pro" all collapse to "mmlupro".
    """
    lowered = name.lower()
    return re.sub(r"[_\s:]+", "", lowered)


def is_starred_benchmark(td: TaskDoc) -> bool:
    """Return True when *td* matches one of the curated ``star_benchmarks``.

    Several identifiers are tried (docstring name, humanized display name,
    module file stem, dataset id) with both exact and substring matching, so
    the star survives naming inconsistencies between task files.
    """
    path_parts = td.module.replace("\\", "/").split("/")
    file_stem = path_parts[-1].rsplit(".", 1)[0]
    # Package-style tasks live in <dir>/main.py; use the directory name then.
    if file_stem == "main" and len(path_parts) >= 2:
        fallback = path_parts[-2]
    else:
        fallback = file_stem

    raw_name = (td.name or "").lower().strip()
    display_name = (td.name or fallback).replace("_", " ").lower().strip()
    norm_display = normalize_name_for_matching(display_name)
    norm_module = normalize_name_for_matching(file_stem)
    norm_raw = normalize_name_for_matching(raw_name)
    norm_dataset = normalize_name_for_matching(td.dataset or "")

    for star in star_benchmarks:
        norm_star = normalize_name_for_matching(star)
        if norm_star in (norm_display, norm_module, norm_raw):
            return True
        if norm_star in norm_display or norm_star in norm_module:
            return True
        if norm_dataset and norm_star in norm_dataset:
            return True
        if star.lower() in display_name or star.lower() in file_stem.lower():
            return True
    return False


def filter_tasks(languages: list[str], tags: list[str], search: str) -> list[TaskDoc]:
    """Return ALL_TASKS narrowed by language, tag and free-text filters.

    Within a filter the values are OR-ed; across filters they are AND-ed.
    The search term matches case-insensitively against module path, abstract,
    joined tags and dataset. Starred benchmarks sort first, then by display
    name (falling back to the module path).
    """
    wanted_langs = {lang.lower() for lang in (languages or [])}
    wanted_tags = {tag.lower() for tag in (tags or [])}
    needle = (search or "").strip().lower()

    matches: list[TaskDoc] = []
    for task in ALL_TASKS:
        if wanted_langs and wanted_langs.isdisjoint(task.languages):
            continue
        if wanted_tags and wanted_tags.isdisjoint(task.tags):
            continue
        if needle:
            haystack = " ".join(
                [task.module, task.abstract, ", ".join(task.tags), task.dataset or ""]
            ).lower()
            if needle not in haystack:
                continue
        matches.append(task)
    matches.sort(key=lambda task: (not is_starred_benchmark(task), (task.name or task.module).lower()))
    return matches


def render_cards(tasks: list[TaskDoc]) -> str:
    """Render *tasks* as a self-contained HTML string: an inline <style> block
    (light + dark mode) followed by a responsive grid of cards, one per task.
    No external CSS/JS is required.
    """
    # Responsive grid of pretty cards; show all details without clicks
    items: list[str] = []
    for t in tasks:
        parts = t.module.replace("\\", "/").split("/")
        base_no_ext = parts[-1].rsplit(".", 1)[0]
        # Package-style tasks (<dir>/main.py) display the directory name instead.
        fallback_name = parts[-2] if base_no_ext == "main" and len(parts) >= 2 else base_no_ext
        task_name = (t.name or fallback_name).replace("_", " ").title()
        mod_path = t.module.replace("\\", "/")
        # Drop the leading checkout directory ("lighteval/") to form the GitHub
        # blob path. NOTE(review): assumes module always contains a "/" — a
        # path without one would raise IndexError here; confirm with callers.
        mod_path = mod_path.split("/", 1)[1]
        source_html = f'<a href="https://github.com/huggingface/lighteval/blob/main/{mod_path}" target="_blank" rel="noopener">source</a>'
        paper_html = f'<a href="{t.paper}" target="_blank" rel="noopener">paper</a>' if t.paper else ""
        tags_html = " ".join([f'<span class=\"chip\" title=\"tag: {tag}\">{tag}</span>' for tag in t.tags]) if t.tags else ""
        langs_html = " ".join([f'<span class=\"chip chip-lang\" title=\"language: {lang}\">{lang}</span>' for lang in t.languages]) if t.languages else ""
        chips_tags_html = f'<div class="chips chips-tags">{tags_html}</div>' if tags_html else ""
        chips_langs_html = f'<div class="chips chips-langs">{langs_html}</div>' if langs_html else ""
        abstract_html = (t.abstract or "-").replace("\n", "<br/>")
        sep_html = ' <span class="sep">|</span> ' if paper_html else ""
        links_html = f"{source_html}{sep_html}{paper_html}"
        # Each comma-separated dataset id becomes a link to its HF dataset page.
        dataset_links = []
        if t.dataset:
            for ds in [d.strip() for d in t.dataset.split(",") if d.strip()]:
                dataset_links.append(f'<a class="dataset" href="https://huggingface.co/datasets/{ds}" target="_blank" rel="noopener">{ds}</a>')
        dataset_html = " ".join(dataset_links) if dataset_links else ""
        star_icon = "⭐ " if is_starred_benchmark(t) else ""
        items.append(
            f"""
            <article class="card" tabindex="0" aria-label="Task {task_name}">
              <div class="title"><span class="title-text">{star_icon}{task_name}</span> <span class="dataset-inline">{dataset_html}</span></div>
              {chips_tags_html}
              {chips_langs_html}
              <div class="abstract">{abstract_html}</div>
              <div class="links">{links_html}</div>
            </article>
            """
        )
    # CSS includes light and dark mode support
    style = """
    <style>
      /* layout */
      .cards-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(280px, 1fr)); gap: 16px; margin-top: 10px; }

      /* card base */
      .card { border-radius: 12px; padding: 14px; transition: box-shadow 160ms ease, transform 120ms ease, border-color 120ms ease; outline: none; }
      .card:hover, .card:focus { transform: translateY(-4px); box-shadow: 0 10px 30px rgba(2,6,23,0.08); }

      .title { display:flex; align-items:center; gap:8px; flex-wrap:wrap; }
      .title-text { font-weight: 600; font-size: 16px; font-family: ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial; }
      .dataset-inline { font-size: 12px; }

      .chips { margin: 6px 0 4px 0; display:flex; gap:2px; flex-wrap:wrap; }
      .chips-tags { margin: 6px 0 2px 0; }
      .chips-langs { margin: 0 0 4px 0; }
      .chip { display:inline-block; padding:1px 2px; border-radius:999px; font-size:12px; background: #e6f2ff; color: #1e3a8a; }
      .chip-lang { background: #e8f5e9; color: #166534; }

      .abstract { color: #475569; font-size: 13.5px; line-height: 1.35; margin-top: 6px; min-height: 48px; }
      .links { margin-top: 10px; font-size:12px; }
      .links a { text-decoration: none; font-weight: 600; }

      .sep { color: #94a3b8; margin: 0 8px; }

      .dataset { margin-left: 8px; font-size: 12px; color: #0ea5e9; background: #ecfeff; padding: 2px 6px; border-radius: 6px; text-decoration: none; }

      /* Light mode */
      :root {
        --bg: #f8fafc;
        --card-bg: #ffffff;
        --card-border: #e6f2ff;
        --title-color: #1e3a8a;
        --text-color: #0f172a;
        --muted: #475569;
        --link: #2563eb;
      }

      /* Dark mode overrides */
      @media (prefers-color-scheme: dark) {
        :root {
          --bg: #0b1220;
          --card-bg: #071022;
          --card-border: #0f2a44;
          --title-color: #93c5fd;
          --text-color: #e6eef8;
          --muted: #cbd5e1;
          --link: #6ea8ff;
        }
      }

      /* apply */
      body { background: var(--bg); color: var(--text-color); }
      .card { background: var(--card-bg); border: 1px solid var(--card-border); color: var(--text-color); }
      .title-text { color: var(--title-color); }
      .abstract { color: var(--muted); }
      .links a { color: var(--link); }
      .chips-tags .chip { background: #e6f2ff; color: #1e3a8a; }
      .chips-langs .chip { background: #e8f5e9; color: #166534; }
      /* tweak chips for dark mode for better contrast */
      @media (prefers-color-scheme: dark) {
        .chips-tags .chip { background: rgba(29,78,216,0.35); color: #e6eef8; border: 1px solid rgba(148,163,184,0.15); }
        .chips-langs .chip { background: rgba(22,101,52,0.35); color: #e6eef8; border: 1px solid rgba(148,163,184,0.15); }
      }

      /* small screens adjustments */
      @media (max-width: 520px) {
        .cards-grid { gap: 10px; }
        .title-text { font-size: 15px; }
      }
    </style>
    """
    return style + "<div class=\"cards-grid\">" + "\n".join(items) + "</div>"


def on_filter(languages: list[str], tags: list[str], search: str):
    """Gradio handler: recompute the counter markdown and card HTML for the
    current language/tag/search selections.
    """
    matching = filter_tasks(languages, tags, search)
    shown, total = len(matching), len(ALL_TASKS)
    if shown == total:
        counter = f"**{total} tasks**"
    else:
        counter = f"**Showing {shown} of {total} tasks**"
    return counter, render_cards(matching)


def on_toggle_language_choices(show_all: bool, selected_langs: list[str], tags: list[str], search: str):
    """Swap the language checkbox list between the top-N and the full set.

    Selections absent from the new choice list are dropped, and the results
    are re-filtered with whatever selections survive.
    """
    choices = ALL_LANGS if show_all else TOP_LANGS
    surviving = [lang for lang in (selected_langs or []) if lang in choices]
    matching = filter_tasks(surviving, tags, search)
    shown, total = len(matching), len(ALL_TASKS)
    counter = f"**Showing {shown} of {total} tasks**" if shown != total else f"**{total} tasks**"
    return gr.update(choices=choices, value=surviving), counter, render_cards(matching)


def on_toggle_tags_visibility(show: bool, selected_tags: list[str], languages: list[str], search: str):
    """Show or hide the tag checkbox group without dropping its selections.

    Hidden selections keep participating in filtering, so toggling visibility
    alone never changes which cards are displayed.
    """
    kept_tags: list[str] = selected_tags or []
    matching = filter_tasks(languages, kept_tags, search)
    shown, total = len(matching), len(ALL_TASKS)
    counter = f"**Showing {shown} of {total} tasks**" if shown != total else f"**{total} tasks**"
    # Keep the value even while hidden so the filter still applies it.
    return gr.update(visible=show, value=kept_tags), counter, render_cards(matching)


# UI definition: two-column layout (filters on the left, cards on the right),
# with live re-filtering wired to every control change.
with gr.Blocks(title="Lighteval Tasks Explorer", css=None) as demo:
    # Header / hero
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                """
                <h2 style="margin:6px 0 2px 0;"><a href="https://huggingface.co/huggingface/lighteval">Lighteval</a> Tasks Explorer</h2>
                <p style="margin:0 0 12px 0; color:var(--muted);">Browse tasks by language, tags and search the task descriptions.</p>
                """
            )
            # Live "N tasks" counter; every handler below returns an update for it.
            task_counter = gr.Markdown(f"**{len(ALL_TASKS)} tasks**")

    # Controls and results in two columns (left: controls, right: cards)
    with gr.Row(equal_height=False):
        with gr.Column(scale=2):
            gr.Markdown("⭐⭐⭐ Recommended benchmarks are marked with a star icon.")
            # Search with interactive debounce
            search_tb = gr.Textbox(label="Search", placeholder="Search in module path, tags, abstract…", value="", interactive=True)
            # We want debounce behavior: use .change with every character by setting interactive=True and triggering on input
            # Filters
            with gr.Group():
                gr.Markdown("**Languages**")
                show_all_langs = gr.Checkbox(label="Show all languages", value=False)
                lang_dd = gr.CheckboxGroup(choices=TOP_LANGS, value=[])  # default none selected
            with gr.Group():
                gr.Markdown("**Benchmark type**")
                show_tags_filters = gr.Checkbox(label="Show tag checkboxes", value=False)
                tag_dd = gr.CheckboxGroup(choices=ALL_TAGS, value=[], visible=False)
            # small hint
            gr.Markdown("Tip: use the filters and search together. Results update live.")

        with gr.Column(scale=5):
            cards = gr.HTML()
            # put an initially visible loading placeholder
            cards.value = "<div style='padding:18px'>Loading tasks…</div>"

    # Wire interactions
    # Toggle expand/collapse language choices
    show_all_langs.change(on_toggle_language_choices, inputs=[show_all_langs, lang_dd, tag_dd, search_tb], outputs=[lang_dd, task_counter, cards])
    # Toggle tag filter visibility (keeps values)
    show_tags_filters.change(on_toggle_tags_visibility, inputs=[show_tags_filters, tag_dd, lang_dd, search_tb], outputs=[tag_dd, task_counter, cards])

    # Live filtering: wire change events on controls to update cards.
    # Textbox: trigger on every change (interactive True). If Gradio runtime has debounce param, it's used internally.
    search_tb.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
    lang_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
    tag_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])

    # Initial load: display all tasks (starred benchmarks first)
    initial_tasks = filter_tasks([], [], "")
    cards.value = render_cards(initial_tasks)


if __name__ == "__main__":
    # GitPython is only needed when run as a script, hence the local import.
    from git import Repo  # pip install gitpython

    git_url = "https://github.com/huggingface/lighteval.git"
    repo_dir = "./lighteval"
    
    # NOTE(review): ALL_TASKS is built at module import time, i.e. BEFORE this
    # clone/pull runs — on a fresh checkout the first launch will show zero
    # tasks until the process is restarted. Worth confirming and reordering.
    if os.path.exists(repo_dir) and os.path.isdir(os.path.join(repo_dir, ".git")):
        print(f"Pulling latest changes from {git_url}...")
        repo = Repo(repo_dir)
        repo.remotes.origin.pull()
    else:
        print(f"Cloning {git_url} to {repo_dir}...")
        Repo.clone_from(git_url, repo_dir)
    
    # Run with `python benchmark_finder/app.py`
    demo.launch()