Linker1907's picture
fix
430430b
raw
history blame
18.5 kB
"""
Gradio dashboard to explore Lighteval tasks.
Scans `src/lighteval/tasks/tasks` and `src/lighteval/tasks/multilingual/tasks`
for module-level docstrings with this format:
name: <task display name>
dataset: <dataset id(s)>
abstract: <free text>
languages: <comma/newline separated language codes or names>
tags: <comma/newline separated tags>
paper: <url>
This file stays outside the lighteval src tree, per request.
"""
import ast
import os
import re
from collections import Counter
from git import Repo # pip install gitpython
from dataclasses import dataclass
import gradio as gr
# Bootstrap: clone (or refresh) the lighteval repository so its task files
# can be scanned below. Runs at import time before the index is built.
git_url = "https://github.com/huggingface/lighteval.git"
repo_dir = "./lighteval"
if os.path.exists(repo_dir) and os.path.isdir(os.path.join(repo_dir, ".git")):
    # An existing checkout is detected by the presence of a .git directory;
    # just pull the latest changes from origin.
    print(f"Pulling latest changes from {git_url}...")
    repo = Repo(repo_dir)
    repo.remotes.origin.pull()
else:
    # Fresh environment: clone the repository next to this script.
    print(f"Cloning {git_url} to {repo_dir}...")
    Repo.clone_from(git_url, repo_dir)
# Root used for relative paths; the clone above lives in ./lighteval.
REPO_ROOT = "."
# Directories scanned for task modules carrying metadata docstrings.
TASK_DIRS = [
    os.path.join(REPO_ROOT, "lighteval", "src", "lighteval", "tasks", "tasks"),
    os.path.join(REPO_ROOT, "lighteval", "src", "lighteval", "tasks", "multilingual", "tasks"),
]
# Benchmarks highlighted with a star and sorted first in the card grid.
# Matching is fuzzy (case-, underscore-, space- and colon-insensitive), so
# entries here may use any of those spellings.
star_benchmarks = [
    "aime",
    "mmlu_pro",
    "gpqa:diamond",
    "hle",
    "arc_agi_2",
    "ifbench",
    "ifeval",
    "live code bench",
    "math 500",
    "mix_eval",
    "musr",
    "simpleqa",
    "MMLU pro"
]
@dataclass
class TaskDoc:
    """Metadata parsed from one task module's docstring."""

    file_path: str  # path of the source file as discovered on disk
    module: str  # path relative to REPO_ROOT; used for display and links
    abstract: str  # free-text description (may be empty)
    languages: list[str]  # lowercased language tokens from the docstring
    tags: list[str]  # lowercased tag tokens from the docstring
    paper: str | None  # paper URL, if provided
    dataset: str | None  # comma-separated Hugging Face dataset id(s), if provided
    name: str | None = None  # display name from the docstring, if provided
def read_file_text(path: str) -> str | None:
try:
with open(path, "r", encoding="utf-8") as f:
return f.read()
except Exception:
return None
def parse_module_docstring(text: str) -> str | None:
try:
mod = ast.parse(text)
return ast.get_docstring(mod)
except Exception:
# Fallback: naive regex for triple-quoted string at top
m = re.match(r"^\s*([\'\"])\1\1([\s\S]*?)\1\1\1", text)
return m.group(2).strip() if m else None
def parse_sections(doc: str) -> dict[str, str]:
    """Split a task docstring into its known metadata sections.

    Recognized keys: name, dataset, abstract, languages, tags, paper.
    A section starts either with a bare `key:` line (value on the following
    lines) or with an inline `key: value` line — the latter is the format
    documented in this file's module docstring, which the previous parser
    did not recognize. Text before the first key is ignored; unknown
    `something:` lines are treated as plain content.

    Args:
        doc: The module docstring text.

    Returns:
        Mapping of every known key to its (possibly empty) text value.
    """
    known_keys = ("name", "dataset", "abstract", "languages", "tags", "paper")
    bare_markers = {key + ":" for key in known_keys}
    out: dict[str, str] = {key: "" for key in known_keys}
    current_key: str | None = None

    def _append(key: str, text: str) -> None:
        # Accumulate content lines, newline-separated, trimmed as a whole.
        out[key] = (out[key] + ("\n" if out[key] else "") + text).strip()

    for raw_line in doc.splitlines():
        line = raw_line.rstrip()
        stripped = line.strip()
        # Bare `key:` line — value follows on subsequent lines.
        if line.endswith(":") and stripped.lower() in bare_markers:
            current_key = line[:-1].strip().lower()
            continue
        # Inline `key: value` line — split at the first colon only, so
        # URLs such as `paper: https://...` keep their scheme intact.
        head, sep, tail = stripped.partition(":")
        if sep and head.lower() in known_keys and tail.strip():
            current_key = head.lower()
            _append(current_key, tail.strip())
            continue
        if current_key is not None:
            _append(current_key, line)
    return out
def split_list_field(value: str) -> list[str]:
    """Turn a comma- or newline-separated field into a list of tokens.

    Blank entries are dropped; surrounding whitespace is stripped.
    """
    if not value:
        return []
    tokens = (part.strip() for part in re.split(r"[\n,]", value))
    return [token for token in tokens if token]
def discover_task_files() -> list[str]:
    """Collect candidate task files from TASK_DIRS.

    Picks up top-level *.py files in each directory plus any `main.py`
    inside subdirectories. Returns a sorted, duplicate-free list of paths.
    """
    print(f"Discovering task files in: {TASK_DIRS}")
    found: set = set()
    for base in TASK_DIRS:
        print(f"Discovering task files in: {base}")
        if not os.path.isdir(base):
            continue
        # Python files sitting directly in the task directory.
        found.update(
            os.path.join(base, entry)
            for entry in os.listdir(base)
            if entry.endswith(".py")
        )
        # Package-style tasks expose their code via a nested main.py.
        for dirpath, _dirnames, filenames in os.walk(base):
            if dirpath != base and "main.py" in filenames:
                found.add(os.path.join(dirpath, "main.py"))
    # A set plus a final sort gives the same sorted, unique result the
    # original ordered-dedup-then-sort produced.
    return sorted(found)
def index_tasks() -> tuple[list[TaskDoc], list[str], list[str]]:
docs: list[TaskDoc] = []
language_counts: Counter = Counter()
tag_set: set = set()
for path in discover_task_files():
text = read_file_text(path)
if not text:
continue
doc = parse_module_docstring(text)
if not doc:
continue
sections = parse_sections(doc)
abstract = sections.get("abstract", "").strip()
langs = [lang.lower() for lang in split_list_field(sections.get("languages", ""))]
tgs = [t.lower() for t in split_list_field(sections.get("tags", ""))]
paper = sections.get("paper", "").strip() or None
dataset = sections.get("dataset", "").strip() or None
name = sections.get("name", "").strip() or None
for lang in langs:
language_counts[lang] += 1
for t in tgs:
tag_set.add(t)
module = os.path.relpath(path, REPO_ROOT)
docs.append(TaskDoc(file_path=path, module=module, abstract=abstract, languages=langs, tags=tgs, paper=paper, dataset=dataset, name=name))
languages_sorted = [
lang for lang, _ in sorted(language_counts.items(), key=lambda kv: (-kv[1], kv[0]))
]
tags_sorted = sorted(tag_set)
return docs, languages_sorted, tags_sorted
def build_index() -> tuple[list[TaskDoc], list[str], list[str]]:
return index_tasks()
# Index everything once at import time; all filtering reads these globals.
ALL_TASKS, ALL_LANGS, ALL_TAGS = build_index()
TOP_LANGS = ALL_LANGS[:8]  # most frequent languages, shown by default
def normalize_name_for_matching(name: str) -> str:
    """Normalize a benchmark identifier for fuzzy comparison.

    Lowercases the name and strips underscores, whitespace and colons so
    that e.g. "MMLU Pro", "mmlu_pro" and "mmlu:pro" all compare equal.
    """
    lowered = name.lower()
    return re.sub(r"[_\s:]+", "", lowered)
def is_starred_benchmark(td: TaskDoc) -> bool:
    """Return True when *td* matches an entry in `star_benchmarks`.

    Compares several identifiers — docstring name, display name, module
    file stem and dataset id — using normalized equality and substring
    checks, so entries can be spelled loosely in the star list.
    """
    path_parts = td.module.replace("\\", "/").split("/")
    stem = path_parts[-1].rsplit(".", 1)[0]
    # Package tasks live in <dir>/main.py; fall back to the directory name.
    fallback_name = path_parts[-2] if stem == "main" and len(path_parts) >= 2 else stem

    raw_name = (td.name or "").lower().strip()
    display_name = (td.name or fallback_name).replace("_", " ").lower().strip()

    norm_display = normalize_name_for_matching(display_name)
    norm_module = normalize_name_for_matching(stem)
    norm_raw = normalize_name_for_matching(raw_name)
    norm_dataset = normalize_name_for_matching(td.dataset or "")

    for star in star_benchmarks:
        norm_star = normalize_name_for_matching(star)
        star_lc = star.lower()
        # Exact matches first, then progressively looser substring checks.
        if norm_star == norm_display or norm_star == norm_module or norm_star == norm_raw:
            return True
        if norm_star in norm_display or norm_star in norm_module:
            return True
        if norm_dataset and norm_star in norm_dataset:
            return True
        if star_lc in display_name or star_lc in stem.lower():
            return True
    return False
def filter_tasks(languages: list[str], tags: list[str], search: str) -> list[TaskDoc]:
    """Filter ALL_TASKS by languages, tags and a free-text search string.

    Within the language group and within the tag group the semantics are
    OR (any selected value matches); the groups combine with AND. The
    search string is matched case-insensitively against module path,
    abstract, tags and dataset. Starred benchmarks sort first, then tasks
    are ordered by lowercased name (module path as fallback).
    """
    wanted_langs = {lang.lower() for lang in (languages or [])}
    wanted_tags = {tag.lower() for tag in (tags or [])}
    needle = (search or "").strip().lower()

    results: list[TaskDoc] = []
    for task in ALL_TASKS:
        if wanted_langs and wanted_langs.isdisjoint(task.languages):
            continue
        if wanted_tags and wanted_tags.isdisjoint(task.tags):
            continue
        if needle:
            haystack = " ".join(
                [task.module, task.abstract, ", ".join(task.tags), task.dataset or ""]
            ).lower()
            if needle not in haystack:
                continue
        results.append(task)
    results.sort(key=lambda task: (not is_starred_benchmark(task), (task.name or task.module).lower()))
    return results
def render_cards(tasks: list[TaskDoc]) -> str:
    """Render the given tasks as a self-contained HTML string.

    The result is a <style> block (light + dark mode rules) followed by a
    responsive grid with one <article> card per task showing name, tags,
    languages, abstract, dataset links and source/paper links.
    """
    # Responsive grid of pretty cards; show all details without clicks
    items: list[str] = []
    for t in tasks:
        parts = t.module.replace("\\", "/").split("/")
        base_no_ext = parts[-1].rsplit(".", 1)[0]
        # Package tasks live in <dir>/main.py; use the directory name instead.
        fallback_name = parts[-2] if base_no_ext == "main" and len(parts) >= 2 else base_no_ext
        task_name = (t.name or fallback_name).replace("_", " ").title()
        mod_path = t.module.replace("\\", "/")
        # Drop the leading clone-directory component ("lighteval/") to get the
        # in-repo path for the GitHub link. NOTE(review): assumes the module
        # path always contains at least one "/" — confirm for top-level files.
        mod_path = mod_path.split("/", 1)[1]
        source_html = f'<a href="https://github.com/huggingface/lighteval/blob/main/{mod_path}" target="_blank" rel="noopener">source</a>'
        paper_html = f'<a href="{t.paper}" target="_blank" rel="noopener">paper</a>' if t.paper else ""
        tags_html = " ".join([f'<span class=\"chip\" title=\"tag: {tag}\">{tag}</span>' for tag in t.tags]) if t.tags else ""
        langs_html = " ".join([f'<span class=\"chip chip-lang\" title=\"language: {lang}\">{lang}</span>' for lang in t.languages]) if t.languages else ""
        chips_tags_html = f'<div class="chips chips-tags">{tags_html}</div>' if tags_html else ""
        chips_langs_html = f'<div class="chips chips-langs">{langs_html}</div>' if langs_html else ""
        abstract_html = (t.abstract or "-").replace("\n", "<br/>")
        # Separator only appears when a paper link follows the source link.
        sep_html = ' <span class="sep">|</span> ' if paper_html else ""
        links_html = f"{source_html}{sep_html}{paper_html}"
        dataset_links = []
        if t.dataset:
            # One Hub link per comma-separated dataset id.
            for ds in [d.strip() for d in t.dataset.split(",") if d.strip()]:
                dataset_links.append(f'<a class="dataset" href="https://huggingface.co/datasets/{ds}" target="_blank" rel="noopener">{ds}</a>')
        dataset_html = " ".join(dataset_links) if dataset_links else ""
        star_icon = "⭐ " if is_starred_benchmark(t) else ""
        items.append(
            f"""
<article class="card" tabindex="0" aria-label="Task {task_name}">
<div class="title"><span class="title-text">{star_icon}{task_name}</span> <span class="dataset-inline">{dataset_html}</span></div>
{chips_tags_html}
{chips_langs_html}
<div class="abstract">{abstract_html}</div>
<div class="links">{links_html}</div>
</article>
"""
        )
    # CSS includes light and dark mode support
    style = """
<style>
/* layout */
.cards-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(280px, 1fr)); gap: 16px; margin-top: 10px; }
/* card base */
.card { border-radius: 12px; padding: 14px; transition: box-shadow 160ms ease, transform 120ms ease, border-color 120ms ease; outline: none; }
.card:hover, .card:focus { transform: translateY(-4px); box-shadow: 0 10px 30px rgba(2,6,23,0.08); }
.title { display:flex; align-items:center; gap:8px; flex-wrap:wrap; }
.title-text { font-weight: 600; font-size: 16px; font-family: ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial; }
.dataset-inline { font-size: 12px; }
.chips { margin: 6px 0 4px 0; display:flex; gap:2px; flex-wrap:wrap; }
.chips-tags { margin: 6px 0 2px 0; }
.chips-langs { margin: 0 0 4px 0; }
.chip { display:inline-block; padding:1px 2px; border-radius:999px; font-size:12px; background: #e6f2ff; color: #1e3a8a; }
.chip-lang { background: #e8f5e9; color: #166534; }
.abstract { color: #475569; font-size: 13.5px; line-height: 1.35; margin-top: 6px; min-height: 48px; }
.links { margin-top: 10px; font-size:12px; }
.links a { text-decoration: none; font-weight: 600; }
.sep { color: #94a3b8; margin: 0 8px; }
.dataset { margin-left: 8px; font-size: 12px; color: #0ea5e9; background: #ecfeff; padding: 2px 6px; border-radius: 6px; text-decoration: none; }
/* Light mode */
:root {
--bg: #f8fafc;
--card-bg: #ffffff;
--card-border: #e6f2ff;
--title-color: #1e3a8a;
--text-color: #0f172a;
--muted: #475569;
--link: #2563eb;
}
/* Dark mode overrides */
@media (prefers-color-scheme: dark) {
:root {
--bg: #0b1220;
--card-bg: #071022;
--card-border: #0f2a44;
--title-color: #93c5fd;
--text-color: #e6eef8;
--muted: #cbd5e1;
--link: #6ea8ff;
}
}
/* apply */
body { background: var(--bg); color: var(--text-color); }
.card { background: var(--card-bg); border: 1px solid var(--card-border); color: var(--text-color); }
.title-text { color: var(--title-color); }
.abstract { color: var(--muted); }
.links a { color: var(--link); }
.chips-tags .chip { background: #e6f2ff; color: #1e3a8a; }
.chips-langs .chip { background: #e8f5e9; color: #166534; }
/* tweak chips for dark mode for better contrast */
@media (prefers-color-scheme: dark) {
.chips-tags .chip { background: rgba(29,78,216,0.35); color: #e6eef8; border: 1px solid rgba(148,163,184,0.15); }
.chips-langs .chip { background: rgba(22,101,52,0.35); color: #e6eef8; border: 1px solid rgba(148,163,184,0.15); }
}
/* small screens adjustments */
@media (max-width: 520px) {
.cards-grid { gap: 10px; }
.title-text { font-size: 15px; }
}
</style>
"""
    return style + "<div class=\"cards-grid\">" + "\n".join(items) + "</div>"
def on_filter(languages: list[str], tags: list[str], search: str):
    """Gradio callback: recompute the counter markdown and the cards HTML."""
    matched = filter_tasks(languages, tags, search)
    shown, total = len(matched), len(ALL_TASKS)
    if shown == total:
        counter_text = f"**{total} tasks**"
    else:
        counter_text = f"**Showing {shown} of {total} tasks**"
    return counter_text, render_cards(matched)
def on_toggle_language_choices(show_all: bool, selected_langs: list[str], tags: list[str], search: str):
    """Gradio callback: swap between the short and full language choice lists.

    Selections that fall outside the visible choices are dropped so the
    filter never uses a hidden language.
    """
    choices = ALL_LANGS if show_all else TOP_LANGS
    kept = [lang for lang in (selected_langs or []) if lang in choices]
    matched = filter_tasks(kept, tags, search)
    shown, total = len(matched), len(ALL_TASKS)
    counter_text = f"**{total} tasks**" if shown == total else f"**Showing {shown} of {total} tasks**"
    return gr.update(choices=choices, value=kept), counter_text, render_cards(matched)
def on_toggle_tags_visibility(show: bool, selected_tags: list[str], languages: list[str], search: str):
    """Gradio callback: show or hide the tag checkbox group without clearing it.

    Selections are preserved either way, so tags chosen before hiding the
    group keep filtering the results on purpose.
    """
    kept_tags: list[str] = selected_tags or []
    matched = filter_tasks(languages, kept_tags, search)
    shown, total = len(matched), len(ALL_TASKS)
    counter_text = f"**{total} tasks**" if shown == total else f"**Showing {shown} of {total} tasks**"
    # The component stays bound to its values even while hidden.
    return gr.update(visible=show, value=kept_tags), counter_text, render_cards(matched)
# Build the Gradio UI: a header row, a controls column (search + language and
# tag filters) and a results column holding the rendered HTML cards.
with gr.Blocks(title="Lighteval Tasks Explorer", css=None) as demo:
    # Header / hero
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                """
<h2 style="margin:6px 0 2px 0;"><a href="https://huggingface.co/huggingface/lighteval">Lighteval</a> Tasks Explorer</h2>
<p style="margin:0 0 12px 0; color:var(--muted);">Browse tasks by language, tags and search the task descriptions.</p>
"""
            )
            # Live counter, rewritten by every filter callback.
            task_counter = gr.Markdown(f"**{len(ALL_TASKS)} tasks**")
    # Controls and results in two columns (left: controls, right: cards)
    with gr.Row(equal_height=False):
        with gr.Column(scale=2):
            gr.Markdown("⭐⭐⭐ Recommended benchmarks are marked with a star icon.")
            # Search with interactive debounce
            search_tb = gr.Textbox(label="Search", placeholder="Search in module path, tags, abstract…", value="", interactive=True)
            # We want debounce behavior: use .change with every character by setting interactive=True and triggering on input
            # Filters
            with gr.Group():
                gr.Markdown("**Languages**")
                show_all_langs = gr.Checkbox(label="Show all languages", value=False)
                lang_dd = gr.CheckboxGroup(choices=TOP_LANGS, value=[])  # default none selected
            with gr.Group():
                gr.Markdown("**Benchmark type**")
                show_tags_filters = gr.Checkbox(label="Show tag checkboxes", value=False)
                # Hidden until the checkbox above is ticked; values persist.
                tag_dd = gr.CheckboxGroup(choices=ALL_TAGS, value=[], visible=False)
            # small hint
            gr.Markdown("Tip: use the filters and search together. Results update live.")
        with gr.Column(scale=5):
            cards = gr.HTML()
            # put an initially visible loading placeholder
            cards.value = "<div style='padding:18px'>Loading tasks…</div>"
    # Wire interactions
    # Toggle expand/collapse language choices
    show_all_langs.change(on_toggle_language_choices, inputs=[show_all_langs, lang_dd, tag_dd, search_tb], outputs=[lang_dd, task_counter, cards])
    # Toggle tag filter visibility (keeps values)
    show_tags_filters.change(on_toggle_tags_visibility, inputs=[show_tags_filters, tag_dd, lang_dd, search_tb], outputs=[tag_dd, task_counter, cards])
    # Live filtering: wire change events on controls to update cards.
    # Textbox: trigger on every change (interactive True). If Gradio runtime has debounce param, it's used internally.
    search_tb.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
    lang_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
    tag_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
    # Initial load: display all tasks (starred benchmarks first)
    initial_tasks = filter_tasks([], [], "")
    cards.value = render_cards(initial_tasks)
# Run with `python benchmark_finder/app.py`
demo.launch()