# Page header captured together with the file (not program code):
# SaylorTwift (HF Staff) - "Update app.py" - commit 8964485 (verified)
"""
Gradio dashboard to explore Lighteval tasks.
Loads tasks from the lighteval Registry and displays them in a searchable,
filterable interface.
"""
import html
import re
from collections import Counter
from dataclasses import dataclass, field

import gradio as gr
from lighteval.tasks.registry import Registry
# Build the lighteval registry at import time, with no custom tasks and with
# load_multilingual switched on.
registry = Registry(custom_tasks=None, load_multilingual=True)
# Registry dump consumed by index_tasks(), which reads the "module",
# "docstring" and "tasks" keys of every entry.
modules_data = registry.get_tasks_dump()
# Benchmarks that get a star on their card and are sorted first in the results.
# is_starred_benchmark() compares these after normalize_name_for_matching()
# (lowercase; underscores, whitespace and colons removed), so "math 500" and
# "math_500" would be the same key.
star_benchmarks = [
    "aime",
    "mmlu_pro",
    "gpqa",
    "hle",
    "arc_agi_2",
    "ifbench",
    "ifeval",
    "live code bench",
    "math 500",
    "mix_eval",
    "musr",
    "simpleqa",
    # NOTE(review): normalizes to the same key as "mmlu_pro" above, so this
    # entry looks redundant - confirm before removing.
    "MMLU pro"
]
@dataclass
class TaskDoc:
    """Metadata of one task module, as shown on a card in the dashboard.

    Instances are created by index_tasks() from the registry dump.
    """

    module: str  # dotted module path; also used to build the GitHub source link
    abstract: str  # description text, stripped of surrounding whitespace
    languages: list[str]  # language names, lowercased
    tags: list[str]  # tags, lowercased
    paper: str | None  # paper link, or None when the field is empty
    dataset: str | None  # comma-separated dataset ids, or None when there are none
    name: str | None = None  # display name, or None when the field is empty
    # Task names listed under "Run using lighteval:" on the card.
    task_names: list[str] = field(default_factory=list)
def _module_to_github_path(module: str) -> str:
"""Convert module path to GitHub source URL path."""
if module.startswith("lighteval."):
mod_path_parts = module[len("lighteval."):].split(".")
return "src/lighteval/" + "/".join(mod_path_parts) + ".py"
return "src/lighteval/" + module.replace(".", "/") + ".py"
def index_tasks() -> tuple[list[TaskDoc], list[str], list[str]]:
    """Turn the registry dump into TaskDoc records plus filter vocabularies.

    Returns:
        A tuple ``(docs, languages, tags)`` where ``docs`` holds one TaskDoc
        per entry of ``modules_data``, ``languages`` is ordered by descending
        usage count (ties alphabetical) and ``tags`` is sorted alphabetically.
    """
    docs: list[TaskDoc] = []
    lang_freq: Counter = Counter()
    seen_tags: set = set()

    for entry in modules_data:
        meta = entry.get("docstring", {})

        langs = [lang.lower() for lang in meta.get("languages", [])]
        tag_list = [t.lower() for t in meta.get("tags", [])]
        # The dataset field is a list; the card wants one comma-separated string.
        datasets = meta.get("dataset", [])

        lang_freq.update(langs)
        seen_tags.update(tag_list)

        docs.append(
            TaskDoc(
                module=entry.get("module", ""),
                abstract=meta.get("abstract", "").strip(),
                languages=langs,
                tags=tag_list,
                # Empty strings are stored as None.
                paper=meta.get("paper", "").strip() or None,
                dataset=", ".join(datasets) if datasets else None,
                name=meta.get("name", "").strip() or None,
                # Keep only tasks that actually carry a name.
                task_names=[
                    task.get("name", "")
                    for task in entry.get("tasks", [])
                    if task.get("name")
                ],
            )
        )

    # Most used languages first, alphabetical among equals.
    languages_sorted = sorted(lang_freq, key=lambda lang: (-lang_freq[lang], lang))
    return docs, languages_sorted, sorted(seen_tags)
# Index the registry once at import time; the filter and render callbacks read
# these module-level lists.
ALL_TASKS, ALL_LANGS, ALL_TAGS = index_tasks()
# Languages offered by default in the language filter: ALL_LANGS is ordered by
# descending usage count, so this is the 8 most used ones.
TOP_LANGS = ALL_LANGS[:8]
def normalize_name_for_matching(name: str) -> str:
    """Return *name* lowercased with underscores, whitespace and colons removed.

    Used to compare benchmark names regardless of separator style.
    """
    # Splitting on the separator runs and gluing the pieces back drops them.
    pieces = re.split(r"[_\s:]+", name.lower())
    return "".join(pieces)
def is_starred_benchmark(td: TaskDoc) -> bool:
    """Tell whether *td* matches one of the names in ``star_benchmarks``.

    A task matches when a normalized star name occurs inside the normalized
    display name, the normalized last module component, or the normalized
    dataset string (when the task has one).
    """
    parts = td.module.split(".")
    leaf = parts[-1]
    # A module called "main" is named after its parent package instead.
    fallback = parts[-2] if leaf == "main" and len(parts) >= 2 else leaf
    display = (td.name or fallback).replace("_", " ").lower().strip()

    haystacks = [
        normalize_name_for_matching(display),
        normalize_name_for_matching(leaf),
    ]
    dataset_key = normalize_name_for_matching(td.dataset or "")
    if dataset_key:
        haystacks.append(dataset_key)

    # Containment on the normalized forms also covers exact equality and
    # containment of the raw lowercased star name, so no separate checks
    # are needed for those.
    for star_name in star_benchmarks:
        needle = normalize_name_for_matching(star_name)
        if any(needle in hay for hay in haystacks):
            return True
    return False
def filter_tasks(languages: list[str], tags: list[str], search: str) -> list[TaskDoc]:
    """Select the tasks matching the language, tag and text filters.

    A task is kept when it has at least one selected language (if any are
    selected), at least one selected tag (if any are selected), and the
    search text occurs, case-insensitively, in its module path, abstract,
    tags or dataset string. Starred benchmarks come first; within each
    group tasks are ordered by name (falling back to the module path).
    """
    wanted_langs = {lang.lower() for lang in (languages or [])}
    wanted_tags = {t.lower() for t in (tags or [])}
    needle = (search or "").strip().lower()

    def keep(td: TaskDoc) -> bool:
        if wanted_langs and wanted_langs.isdisjoint(td.languages):
            return False
        if wanted_tags and wanted_tags.isdisjoint(td.tags):
            return False
        if not needle:
            return True
        haystack = " ".join(
            (td.module, td.abstract, ", ".join(td.tags), td.dataset or "")
        ).lower()
        return needle in haystack

    return sorted(
        (td for td in ALL_TASKS if keep(td)),
        key=lambda td: (not is_starred_benchmark(td), (td.name or td.module).lower()),
    )
def truncate_text(text: str, max_length: int = 250) -> str:
    """Shorten *text* to at most *max_length* characters plus an ellipsis.

    Text that already fits is returned unchanged. Otherwise the cut is moved
    back to the last space, but only when that space lies beyond 70% of
    *max_length*; "..." is then appended.
    """
    if len(text) <= max_length:
        return text

    clipped = text[:max_length]
    boundary = clipped.rfind(" ")
    # Snap to the word boundary only if that does not throw away too much text.
    keep = boundary if boundary > max_length * 0.7 else len(clipped)
    return clipped[:keep] + "..."
def group_task_names_by_prefix(task_names: list[str]) -> list[str]:
    """Collapse task names that share the part before the first colon.

    When several names share a prefix, the prefix alone is emitted once, at
    the position of its first occurrence. A prefix used by a single name
    keeps that full name. Names without a colon are emitted unchanged.
    """
    # First pass: collect the members of every prefix.
    members: dict[str, list[str]] = {}
    for full_name in task_names:
        if ":" in full_name:
            members.setdefault(full_name.partition(":")[0], []).append(full_name)

    # Second pass: walk the names in order, emitting each prefix only once.
    collapsed: list[str] = []
    emitted: set[str] = set()
    for full_name in task_names:
        if ":" not in full_name:
            collapsed.append(full_name)
            continue
        head = full_name.partition(":")[0]
        if head in emitted:
            continue
        emitted.add(head)
        siblings = members[head]
        collapsed.append(head if len(siblings) > 1 else siblings[0])
    return collapsed
def _card_display_name(module: str, name: str | None) -> str:
    """Return the card title: the declared name, else one derived from the module path."""
    parts = module.split(".")
    leaf = parts[-1]
    # A module called "main" is named after its parent package instead.
    fallback = parts[-2] if leaf == "main" and len(parts) >= 2 else leaf
    return (name or fallback).replace("_", " ").title()


def _chips_block(values: list[str], kind: str, chip_class: str, wrapper_class: str) -> str:
    """Return a chips <div> with one escaped chip per value, or "" when there are none."""
    if not values:
        return ""
    chips = " ".join(
        f'<span class="{chip_class}" title="{kind}: {html.escape(value)}">{html.escape(value)}</span>'
        for value in values
    )
    return f'<div class="{wrapper_class}">{chips}</div>'


def _dataset_links_html(dataset: str | None, limit: int = 6) -> str:
    """Return Hub links for the first *limit* datasets plus a "+N more" badge for the rest."""
    if not dataset:
        return ""
    names = [d.strip() for d in dataset.split(",") if d.strip()]
    links = [
        f'<a class="dataset" href="https://huggingface.co/datasets/{html.escape(ds)}" '
        f'target="_blank" rel="noopener">{html.escape(ds)}</a>'
        for ds in names[:limit]
    ]
    if len(names) > limit:
        links.append(f'<span class="dataset-more">+{len(names) - limit} more</span>')
    return " ".join(links)


def _task_names_block(task_names: list[str], visible_count: int = 3) -> str:
    """Return the "Run using lighteval" section, folding extra names into a <details>."""
    if not task_names:
        return ""
    # Collapse names sharing a prefix so large suites do not flood the card.
    pills = [
        f'<span class="task-name">{html.escape(name)}</span>'
        for name in group_task_names_by_prefix(task_names)
    ]
    hidden = pills[visible_count:]
    details_html = ""
    if hidden:
        details_html = (
            '<details class="task-names-details">'
            f'<summary class="task-names-summary">Show {len(hidden)} more</summary>'
            f'<div class="task-names-list task-names-remaining">{" ".join(hidden)}</div>'
            "</details>"
        )
    return (
        '<div class="task-names">'
        '<div class="task-names-label">Run using lighteval:</div>'
        f'<div class="task-names-list">{" ".join(pills[:visible_count])}</div>'
        f"{details_html}"
        "</div>"
    )


def render_cards(tasks: list[TaskDoc]) -> str:
    """Render *tasks* as an HTML grid of cards.

    Every piece of registry-provided text (name, abstract, tags, languages,
    dataset ids, paper link, task names) is HTML-escaped before being placed
    in element content or attributes, so characters such as ``<``, ``&`` or
    ``"`` are displayed literally instead of breaking the markup.

    Args:
        tasks: Tasks to display, in display order.

    Returns:
        A single ``<div class="cards-grid">`` element containing one
        ``<article class="card">`` per task.
    """
    items: list[str] = []
    for t in tasks:
        title = html.escape(_card_display_name(t.module, t.name))

        source_url = "https://github.com/huggingface/lighteval/blob/main/" + _module_to_github_path(t.module)
        links_html = f'<a href="{html.escape(source_url)}" target="_blank" rel="noopener">source</a>'
        if t.paper:
            links_html += (
                ' <span class="sep">|</span> '
                f'<a href="{html.escape(t.paper)}" target="_blank" rel="noopener">paper</a>'
            )

        # Truncate before escaping so an entity such as "&amp;" is never cut in half.
        abstract_html = html.escape(truncate_text(t.abstract or "-")).replace("\n", "<br/>")
        star_icon = "⭐ " if is_starred_benchmark(t) else ""

        items.append(
            "\n".join(
                [
                    f'<article class="card" tabindex="0" aria-label="Task {title}">',
                    f'<div class="title"><span class="title-text">{star_icon}{title}</span> '
                    f'<span class="dataset-inline">{_dataset_links_html(t.dataset)}</span></div>',
                    _chips_block(t.tags, "tag", "chip", "chips chips-tags"),
                    _chips_block(t.languages, "language", "chip chip-lang", "chips chips-langs"),
                    f'<div class="abstract">{abstract_html}</div>',
                    _task_names_block(t.task_names),
                    f'<div class="links">{links_html}</div>',
                    "</article>",
                ]
            )
        )
    return '<div class="cards-grid">' + "\n".join(items) + "</div>"
def _render_results(tasks: list[TaskDoc]) -> tuple[str, str]:
    """Return the (counter markdown, cards HTML) pair displayed for *tasks*."""
    count = len(tasks)
    total = len(ALL_TASKS)
    # Only mention the subset size when a filter actually hides something.
    counter_text = f"**Showing {count} of {total} tasks**" if count != total else f"**{total} tasks**"
    return counter_text, render_cards(tasks)


def on_filter(languages: list[str], tags: list[str], search: str):
    """Recompute the counter and cards after any filter or search change.

    Returns:
        ``(counter_markdown, cards_html)``.
    """
    return _render_results(filter_tasks(languages, tags, search))


def on_toggle_language_choices(show_all: bool, selected_langs: list[str], tags: list[str], search: str):
    """Switch the language filter between the top languages and all languages.

    Selected languages that are not part of the new choice list are dropped,
    and the results are refreshed for the remaining selection.

    Returns:
        ``(language_group_update, counter_markdown, cards_html)``.
    """
    choices = ALL_LANGS if show_all else TOP_LANGS
    kept = [lang for lang in (selected_langs or []) if lang in choices]
    counter_text, cards_html = _render_results(filter_tasks(kept, tags, search))
    return gr.update(choices=choices, value=kept), counter_text, cards_html


def on_toggle_tags_visibility(show: bool, selected_tags: list[str], languages: list[str], search: str):
    """Show or hide the tag checkboxes while keeping the current tag selection.

    The selected tags stay applied to the results even when the checkboxes
    are hidden.

    Returns:
        ``(tag_group_update, counter_markdown, cards_html)``.
    """
    tags_value: list[str] = selected_tags or []
    counter_text, cards_html = _render_results(filter_tasks(languages, tags_value, search))
    return gr.update(visible=show, value=tags_value), counter_text, cards_html
# Custom CSS for the app, passed to gr.Blocks(css=...) when the UI is built.
# It styles the classes emitted by render_cards(): the cards grid, card title,
# tag/language chips, abstract, task-name pills, links and dataset badges.
# Page, card, title, text and link colors come from CSS variables declared
# under :root; a prefers-color-scheme: dark media query overrides them.
custom_css = """
/* layout */
.cards-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(280px, 1fr)); gap: 20px; margin-top: 10px; }
/* card base */
.card {
border-radius: 16px;
padding: 18px;
transition: all 0.3s cubic-bezier(0.4, 0, 0.2, 1);
outline: none;
position: relative;
overflow: hidden;
border: 2px solid transparent;
}
.card::before {
content: '';
position: absolute;
top: 0;
left: -100%;
width: 100%;
height: 100%;
background: linear-gradient(
90deg,
transparent,
rgba(255, 255, 255, 0.1),
transparent
);
transition: left 0.5s;
}
.card:hover::before {
left: 100%;
}
.card:hover, .card:focus {
transform: translateY(-6px) scale(1.02);
box-shadow: 0 20px 40px rgba(0, 0, 0, 0.12), 0 8px 16px rgba(0, 0, 0, 0.08);
}
.title {
display: flex;
align-items: center;
gap: 8px;
flex-wrap: wrap;
position: relative;
z-index: 1;
}
.title-text {
font-weight: 700;
font-size: 17px;
font-family: ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial;
letter-spacing: -0.01em;
}
.dataset-inline {
font-size: 12px;
display: flex;
flex-wrap: wrap;
gap: 6px;
align-items: center;
margin-left: 8px;
}
.chips {
margin: 8px 0 6px 0;
display: flex;
gap: 4px;
flex-wrap: wrap;
}
.chips-tags { margin: 8px 0 4px 0; }
.chips-langs { margin: 4px 0 6px 0; }
.chip {
display: inline-block;
padding: 4px 10px;
border-radius: 12px;
font-size: 11px;
font-weight: 500;
background: linear-gradient(135deg, #e6f2ff 0%, #d6e9ff 100%);
color: #1e3a8a;
transition: all 0.2s ease;
border: 1px solid rgba(30, 58, 138, 0.1);
}
.chip:hover {
transform: translateY(-1px);
box-shadow: 0 2px 8px rgba(30, 58, 138, 0.2);
}
.chip-lang {
background: linear-gradient(135deg, #e8f5e9 0%, #d4edda 100%);
color: #166534;
border-color: rgba(22, 101, 52, 0.1);
}
.chip-lang:hover {
box-shadow: 0 2px 8px rgba(22, 101, 52, 0.2);
}
.abstract {
color: #475569;
font-size: 13.5px;
line-height: 1.6;
margin-top: 8px;
min-height: 48px;
}
.task-names {
margin-top: 10px;
padding-top: 8px;
border-top: 1px solid rgba(148, 163, 184, 0.15);
}
.task-names-label {
font-size: 11px;
font-weight: 600;
color: #64748b;
margin-bottom: 6px;
text-transform: uppercase;
letter-spacing: 0.5px;
}
.task-names-list {
display: flex;
flex-wrap: wrap;
gap: 6px;
}
.task-names-remaining {
margin-top: 8px;
padding-top: 8px;
border-top: 1px solid rgba(148, 163, 184, 0.15);
}
.task-names-details {
margin-top: 8px;
}
.task-names-summary {
font-size: 11px;
font-weight: 600;
color: #64748b;
cursor: pointer;
user-select: none;
padding: 4px 8px;
border-radius: 4px;
display: inline-block;
transition: all 0.2s ease;
background: rgba(148, 163, 184, 0.1);
}
.task-names-summary:hover {
background: rgba(148, 163, 184, 0.2);
color: #475569;
}
.task-names-summary::-webkit-details-marker {
display: none;
}
.task-names-details[open] .task-names-summary {
margin-bottom: 8px;
}
.task-name {
display: inline-block;
padding: 3px 8px;
border-radius: 6px;
font-size: 11px;
font-weight: 500;
background: linear-gradient(135deg, #fef3c7 0%, #fde68a 100%);
color: #92400e;
border: 1px solid rgba(146, 64, 14, 0.2);
font-family: 'Monaco', 'Menlo', 'Ubuntu Mono', monospace;
transition: all 0.2s ease;
}
.task-name:hover {
transform: translateY(-1px);
box-shadow: 0 2px 6px rgba(146, 64, 14, 0.2);
background: linear-gradient(135deg, #fde68a 0%, #fcd34d 100%);
}
.links {
margin-top: 12px;
font-size: 12px;
padding-top: 8px;
border-top: 1px solid rgba(148, 163, 184, 0.2);
}
.links a {
text-decoration: none;
font-weight: 600;
transition: all 0.2s ease;
position: relative;
}
.links a::after {
content: '';
position: absolute;
width: 0;
height: 2px;
bottom: -2px;
left: 0;
background: currentColor;
transition: width 0.3s ease;
}
.links a:hover::after {
width: 100%;
}
.links a:hover {
transform: translateX(2px);
}
.sep { color: #94a3b8; margin: 0 8px; }
.dataset {
display: inline-block;
font-size: 12px;
color: #0ea5e9;
background: linear-gradient(135deg, #ecfeff 0%, #e0f7fa 100%);
padding: 4px 10px;
border-radius: 8px;
text-decoration: none;
transition: all 0.2s ease;
border: 1px solid rgba(14, 165, 233, 0.2);
font-weight: 500;
white-space: nowrap;
}
.dataset:hover {
transform: translateY(-1px);
box-shadow: 0 4px 12px rgba(14, 165, 233, 0.3);
background: linear-gradient(135deg, #e0f7fa 0%, #d1f2eb 100%);
}
.dataset-more {
display: inline-block;
font-size: 12px;
color: #64748b;
background: linear-gradient(135deg, #f1f5f9 0%, #e2e8f0 100%);
padding: 4px 10px;
border-radius: 8px;
font-weight: 500;
white-space: nowrap;
}
/* Light mode */
:root {
--bg-start: #f8fafc;
--bg-end: #f1f5f9;
--card-bg: #ffffff;
--card-border: rgba(226, 232, 240, 0.8);
--title-color: #1e3a8a;
--text-color: #0f172a;
--muted: #475569;
--link: #2563eb;
}
/* Dark mode overrides */
@media (prefers-color-scheme: dark) {
:root {
--bg-start: #0b1220;
--bg-end: #0f172a;
--card-bg: #071022;
--card-border: rgba(15, 42, 68, 0.8);
--title-color: #93c5fd;
--text-color: #e6eef8;
--muted: #cbd5e1;
--link: #6ea8ff;
}
.dataset-more {
color: #94a3b8;
background: linear-gradient(135deg, rgba(148, 163, 184, 0.15) 0%, rgba(148, 163, 184, 0.1) 100%);
}
.chips-tags .chip {
background: linear-gradient(135deg, rgba(29, 78, 216, 0.35) 0%, rgba(29, 78, 216, 0.25) 100%);
color: #e6eef8;
border: 1px solid rgba(148, 163, 184, 0.15);
}
.chips-langs .chip {
background: linear-gradient(135deg, rgba(22, 101, 52, 0.35) 0%, rgba(22, 101, 52, 0.25) 100%);
color: #e6eef8;
border: 1px solid rgba(148, 163, 184, 0.15);
}
.links {
border-top-color: rgba(148, 163, 184, 0.3);
}
.task-names {
border-top-color: rgba(148, 163, 184, 0.25);
}
.task-names-label {
color: #94a3b8;
}
.task-name {
background: linear-gradient(135deg, rgba(146, 64, 14, 0.3) 0%, rgba(146, 64, 14, 0.2) 100%);
color: #fbbf24;
border-color: rgba(146, 64, 14, 0.3);
}
.task-name:hover {
background: linear-gradient(135deg, rgba(146, 64, 14, 0.4) 0%, rgba(146, 64, 14, 0.3) 100%);
box-shadow: 0 2px 6px rgba(251, 191, 36, 0.3);
}
.task-names-summary {
background: rgba(148, 163, 184, 0.15);
color: #94a3b8;
}
.task-names-summary:hover {
background: rgba(148, 163, 184, 0.25);
color: #cbd5e1;
}
.task-names-remaining {
border-top-color: rgba(148, 163, 184, 0.25);
}
}
/* apply */
body {
background: linear-gradient(135deg, var(--bg-start) 0%, var(--bg-end) 100%);
background-attachment: fixed;
color: var(--text-color);
min-height: 100vh;
}
.card {
background: var(--card-bg);
border: 2px solid var(--card-border);
color: var(--text-color);
backdrop-filter: blur(10px);
}
.title-text { color: var(--title-color); }
.abstract { color: var(--muted); }
.links a { color: var(--link); }
/* small screens adjustments */
@media (max-width: 520px) {
.cards-grid {
gap: 12px;
grid-template-columns: 1fr;
}
.title-text { font-size: 16px; }
.card { padding: 14px; }
}
"""
# Unfiltered task list, rendered once so the page opens with the cards in place.
initial_tasks = filter_tasks([], [], "")

# NOTE(review): the component nesting below was reconstructed from a copy of
# the file that had lost its indentation - confirm against the deployed layout.
with gr.Blocks(title="Lighteval Tasks Explorer", css=custom_css) as demo:
    # Header: title, one-line description and the live task counter.
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                """
<h2 style="margin:6px 0 2px 0;"><a href="https://github.com/huggingface/lighteval">Lighteval</a> Tasks Explorer</h2>
<p style="margin:0 0 12px 0; color:var(--muted);">Browse tasks by language, tags and search the task descriptions.</p>
"""
            )
            task_counter = gr.Markdown(f"**{len(ALL_TASKS)} tasks**")

    # Body: filter controls on the left, result cards on the right.
    with gr.Row(equal_height=False):
        with gr.Column(scale=2):
            gr.Markdown("⭐⭐⭐ Recommended benchmarks are marked with a star icon.")
            search_tb = gr.Textbox(label="Search", placeholder="Search in module path, tags, abstract…", value="")
            with gr.Group():
                gr.Markdown("**Languages**")
                show_all_langs = gr.Checkbox(label="Show all languages", value=False)
                lang_dd = gr.CheckboxGroup(choices=TOP_LANGS, value=[])
            with gr.Group():
                gr.Markdown("**Benchmark type**")
                show_tags_filters = gr.Checkbox(label="Show tag checkboxes", value=False)
                tag_dd = gr.CheckboxGroup(choices=ALL_TAGS, value=[], visible=False)
            gr.Markdown("Tip: use the filters and search together. Results update live.")
        with gr.Column(scale=5):
            # Pass the initial cards through the constructor instead of assigning
            # a placeholder to .value and overwriting it afterwards.
            cards = gr.HTML(value=render_cards(initial_tasks))

    # The two toggles reconfigure their checkbox group and refresh the results.
    show_all_langs.change(on_toggle_language_choices, inputs=[show_all_langs, lang_dd, tag_dd, search_tb], outputs=[lang_dd, task_counter, cards])
    show_tags_filters.change(on_toggle_tags_visibility, inputs=[show_tags_filters, tag_dd, lang_dd, search_tb], outputs=[tag_dd, task_counter, cards])

    # Any change to the search box or a filter re-runs the same callback.
    search_tb.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
    lang_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
    tag_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])

if __name__ == "__main__":
    demo.launch()