"""
Gradio dashboard to explore Lighteval tasks.
Scans `src/lighteval/tasks/tasks` and `src/lighteval/tasks/multilingual/tasks`
for module-level docstrings with this format:
name: <task display name>
dataset: <dataset id(s)>
abstract: <free text>
languages: <comma/newline separated language codes or names>
tags: <comma/newline separated tags>
paper: <url>
This file stays outside the lighteval src tree, per request.
"""
import ast
import os
import re
from collections import Counter
from dataclasses import dataclass
import gradio as gr
REPO_ROOT = "."
TASK_DIRS = [
os.path.join(REPO_ROOT, "lighteval", "src", "lighteval", "tasks", "tasks"),
os.path.join(REPO_ROOT, "lighteval", "src", "lighteval", "tasks", "multilingual", "tasks"),
]
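# NOTE: these paths assume the lighteval repo has been cloned to ./lighteval
# (see ensure_repo() below).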
star_benchmarks = [
"aime",
"mmlu_pro",
"gpqa:diamond",
"hle",
"arc_agi_2",
"ifbench",
"ifeval",
"live code bench",
"math 500",
"mix_eval",
"musr",
"simpleqa",
"MMLU pro"
]
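# Entries are matched loosely by is_starred_benchmark via
# normalize_name_for_matching, so "mmlu_pro", "MMLU Pro" and "mmlu pro"
# are all equivalent.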
@dataclass
class TaskDoc:
file_path: str
module: str
abstract: str
languages: list[str]
tags: list[str]
paper: str | None
dataset: str | None
name: str | None = None
def read_file_text(path: str) -> str | None:
try:
with open(path, "r", encoding="utf-8") as f:
return f.read()
except Exception:
return None
def parse_module_docstring(text: str) -> str | None:
try:
mod = ast.parse(text)
return ast.get_docstring(mod)
except Exception:
# Fallback: naive regex for triple-quoted string at top
m = re.match(r"^\s*([\'\"])\1\1([\s\S]*?)\1\1\1", text)
return m.group(2).strip() if m else None
def parse_sections(doc: str) -> dict[str, str]:
    # Simple section parser. Accepts both documented styles:
    #   "key: value" on a single line, and
    #   "key:" on its own line followed by the value on subsequent lines.
    # Expected keys: name, dataset, abstract, languages, tags, paper
    out: dict[str, str] = {"name": "", "dataset": "", "abstract": "", "languages": "", "tags": "", "paper": ""}
    current_key: str | None = None
    for raw_line in doc.splitlines():
        line = raw_line.rstrip()
        m = re.match(r"(name|dataset|abstract|languages|tags|paper):\s*(.*)$", line.strip(), re.IGNORECASE)
        if m:
            current_key = m.group(1).lower()
            inline_value = m.group(2).strip()
            if inline_value:
                out[current_key] = inline_value
            continue
        if current_key is not None:
            # Preserve paragraphs; list fields are split and normalized later
            out[current_key] = (out[current_key] + ("\n" if out[current_key] else "") + line).strip()
    return out
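# Example: parse_sections("name: MMLU\ntags:\nknowledge, multiple-choice")
# -> {"name": "MMLU", "tags": "knowledge, multiple-choice", ...} (remaining keys empty)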
def split_list_field(value: str) -> list[str]:
if not value:
return []
# Support comma and newline separated values
parts = re.split(r"[\n,]", value)
cleaned: list[str] = []
for p in parts:
token = p.strip()
if not token:
continue
cleaned.append(token)
return cleaned
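# Example: split_list_field("en, fr\nzh") -> ["en", "fr", "zh"]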
def discover_task_files() -> list[str]:
files: list[str] = []
print(f"Discovering task files in: {TASK_DIRS}")
for base in TASK_DIRS:
print(f"Discovering task files in: {base}")
if not os.path.isdir(base):
continue
# Top-level python files in the directory
for name in os.listdir(base):
if name.endswith(".py"):
files.append(os.path.join(base, name))
# Also include subdirectory main.py files
for dirpath, dirnames, filenames in os.walk(base):
if dirpath == base:
continue
if "main.py" in filenames:
files.append(os.path.join(dirpath, "main.py"))
# Deduplicate while preserving order
seen: set = set()
unique_files: list[str] = []
for p in files:
if p in seen:
continue
seen.add(p)
unique_files.append(p)
return sorted(unique_files)
def index_tasks() -> tuple[list[TaskDoc], list[str], list[str]]:
docs: list[TaskDoc] = []
language_counts: Counter = Counter()
tag_set: set = set()
for path in discover_task_files():
text = read_file_text(path)
if not text:
continue
doc = parse_module_docstring(text)
if not doc:
continue
sections = parse_sections(doc)
abstract = sections.get("abstract", "").strip()
langs = [lang.lower() for lang in split_list_field(sections.get("languages", ""))]
tgs = [t.lower() for t in split_list_field(sections.get("tags", ""))]
paper = sections.get("paper", "").strip() or None
dataset = sections.get("dataset", "").strip() or None
name = sections.get("name", "").strip() or None
for lang in langs:
language_counts[lang] += 1
for t in tgs:
tag_set.add(t)
module = os.path.relpath(path, REPO_ROOT)
docs.append(TaskDoc(file_path=path, module=module, abstract=abstract, languages=langs, tags=tgs, paper=paper, dataset=dataset, name=name))
languages_sorted = [
lang for lang, _ in sorted(language_counts.items(), key=lambda kv: (-kv[1], kv[0]))
]
tags_sorted = sorted(tag_set)
return docs, languages_sorted, tags_sorted
def ensure_repo() -> None:
    # Clone or update the lighteval repo *before* indexing; otherwise the first
    # run would build an empty index from a directory that does not exist yet.
    from git import Repo  # pip install gitpython

    git_url = "https://github.com/huggingface/lighteval.git"
    repo_dir = "./lighteval"
    if os.path.isdir(os.path.join(repo_dir, ".git")):
        print(f"Pulling latest changes from {git_url}...")
        Repo(repo_dir).remotes.origin.pull()
    else:
        print(f"Cloning {git_url} to {repo_dir}...")
        Repo.clone_from(git_url, repo_dir)

ensure_repo()
ALL_TASKS, ALL_LANGS, ALL_TAGS = index_tasks()
TOP_LANGS = ALL_LANGS[:8]  # only the most common languages are shown until "Show all languages" is checked
def normalize_name_for_matching(name: str) -> str:
# Normalize for comparison: lowercase, remove underscores/spaces/colons
return re.sub(r"[_\s:]+", "", name.lower())
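# e.g. "GPQA: Diamond" -> "gpqadiamond", so "gpqa:diamond", "gpqa_diamond" and
# "GPQA Diamond" all compare equal after normalization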
def is_starred_benchmark(td: TaskDoc) -> bool:
# Check multiple possible identifiers
parts = td.module.replace("\\", "/").split("/")
base_no_ext = parts[-1].rsplit(".", 1)[0]
fallback_name = parts[-2] if base_no_ext == "main" and len(parts) >= 2 else base_no_ext
# Normalize all possible identifiers
task_name_raw = (td.name or "").lower().strip()
task_name_display = (td.name or fallback_name).replace("_", " ").lower().strip()
normalized_task_display = normalize_name_for_matching(task_name_display)
normalized_module = normalize_name_for_matching(base_no_ext)
normalized_name = normalize_name_for_matching(task_name_raw)
# Also check dataset if available
normalized_dataset = normalize_name_for_matching(td.dataset or "")
# Check against star_benchmarks list - try multiple matching strategies
    for star_name in star_benchmarks:
        normalized_star = normalize_name_for_matching(star_name)
        # Substring match on the display name and module name, exact match on
        # the raw name, and substring match on the dataset id when present
        if (
            normalized_star in normalized_task_display
            or normalized_star in normalized_module
            or normalized_star == normalized_name
            or (normalized_dataset and normalized_star in normalized_dataset)
        ):
            return True
return False
def filter_tasks(languages: list[str], tags: list[str], search: str) -> list[TaskDoc]:
selected_langs = [lang.lower() for lang in (languages or [])]
selected_tags = [t.lower() for t in (tags or [])]
search_lc = (search or "").strip().lower()
out: list[TaskDoc] = []
for td in ALL_TASKS:
if selected_langs and not any(lang in td.languages for lang in selected_langs):
continue
if selected_tags and not any(t in td.tags for t in selected_tags):
continue
        if search_lc:
            # Search the name, module path, abstract, tags, and dataset ids
            hay = " ".join([td.name or "", td.module, td.abstract, ", ".join(td.tags), td.dataset or ""]).lower()
if search_lc not in hay:
continue
out.append(td)
# Sort: starred benchmarks first, then by name
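    # `not is_starred_benchmark(td)` is False for starred tasks, and False sorts before True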
out.sort(key=lambda td: (not is_starred_benchmark(td), (td.name or td.module).lower()))
return out
def render_cards(tasks: list[TaskDoc]) -> str:
# Responsive grid of pretty cards; show all details without clicks
items: list[str] = []
for t in tasks:
parts = t.module.replace("\\", "/").split("/")
base_no_ext = parts[-1].rsplit(".", 1)[0]
fallback_name = parts[-2] if base_no_ext == "main" and len(parts) >= 2 else base_no_ext
task_name = (t.name or fallback_name).replace("_", " ").title()
mod_path = t.module.replace("\\", "/")
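        # Drop the leading clone directory ("lighteval/") so the link targets
        # the path as it appears inside the GitHub repository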
mod_path = mod_path.split("/", 1)[1]
source_html = f'<a href="https://github.com/huggingface/lighteval/blob/main/{mod_path}" target="_blank" rel="noopener">source</a>'
paper_html = f'<a href="{t.paper}" target="_blank" rel="noopener">paper</a>' if t.paper else ""
tags_html = " ".join([f'<span class=\"chip\" title=\"tag: {tag}\">{tag}</span>' for tag in t.tags]) if t.tags else ""
langs_html = " ".join([f'<span class=\"chip chip-lang\" title=\"language: {lang}\">{lang}</span>' for lang in t.languages]) if t.languages else ""
chips_tags_html = f'<div class="chips chips-tags">{tags_html}</div>' if tags_html else ""
chips_langs_html = f'<div class="chips chips-langs">{langs_html}</div>' if langs_html else ""
abstract_html = (t.abstract or "-").replace("\n", "<br/>")
sep_html = ' <span class="sep">|</span> ' if paper_html else ""
links_html = f"{source_html}{sep_html}{paper_html}"
dataset_links = []
if t.dataset:
for ds in [d.strip() for d in t.dataset.split(",") if d.strip()]:
dataset_links.append(f'<a class="dataset" href="https://huggingface.co/datasets/{ds}" target="_blank" rel="noopener">{ds}</a>')
dataset_html = " ".join(dataset_links) if dataset_links else ""
star_icon = "⭐ " if is_starred_benchmark(t) else ""
items.append(
f"""
<article class="card" tabindex="0" aria-label="Task {task_name}">
<div class="title"><span class="title-text">{star_icon}{task_name}</span> <span class="dataset-inline">{dataset_html}</span></div>
{chips_tags_html}
{chips_langs_html}
<div class="abstract">{abstract_html}</div>
<div class="links">{links_html}</div>
</article>
"""
)
# CSS includes light and dark mode support
style = """
<style>
/* layout */
.cards-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(280px, 1fr)); gap: 16px; margin-top: 10px; }
/* card base */
.card { border-radius: 12px; padding: 14px; transition: box-shadow 160ms ease, transform 120ms ease, border-color 120ms ease; outline: none; }
.card:hover, .card:focus { transform: translateY(-4px); box-shadow: 0 10px 30px rgba(2,6,23,0.08); }
.title { display:flex; align-items:center; gap:8px; flex-wrap:wrap; }
.title-text { font-weight: 600; font-size: 16px; font-family: ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial; }
.dataset-inline { font-size: 12px; }
.chips { margin: 6px 0 4px 0; display:flex; gap:2px; flex-wrap:wrap; }
.chips-tags { margin: 6px 0 2px 0; }
.chips-langs { margin: 0 0 4px 0; }
.chip { display:inline-block; padding:1px 2px; border-radius:999px; font-size:12px; background: #e6f2ff; color: #1e3a8a; }
.chip-lang { background: #e8f5e9; color: #166534; }
.abstract { color: #475569; font-size: 13.5px; line-height: 1.35; margin-top: 6px; min-height: 48px; }
.links { margin-top: 10px; font-size:12px; }
.links a { text-decoration: none; font-weight: 600; }
.sep { color: #94a3b8; margin: 0 8px; }
.dataset { margin-left: 8px; font-size: 12px; color: #0ea5e9; background: #ecfeff; padding: 2px 6px; border-radius: 6px; text-decoration: none; }
/* Light mode */
:root {
--bg: #f8fafc;
--card-bg: #ffffff;
--card-border: #e6f2ff;
--title-color: #1e3a8a;
--text-color: #0f172a;
--muted: #475569;
--link: #2563eb;
}
/* Dark mode overrides */
@media (prefers-color-scheme: dark) {
:root {
--bg: #0b1220;
--card-bg: #071022;
--card-border: #0f2a44;
--title-color: #93c5fd;
--text-color: #e6eef8;
--muted: #cbd5e1;
--link: #6ea8ff;
}
}
/* apply */
body { background: var(--bg); color: var(--text-color); }
.card { background: var(--card-bg); border: 1px solid var(--card-border); color: var(--text-color); }
.title-text { color: var(--title-color); }
.abstract { color: var(--muted); }
.links a { color: var(--link); }
.chips-tags .chip { background: #e6f2ff; color: #1e3a8a; }
.chips-langs .chip { background: #e8f5e9; color: #166534; }
/* tweak chips for dark mode for better contrast */
@media (prefers-color-scheme: dark) {
.chips-tags .chip { background: rgba(29,78,216,0.35); color: #e6eef8; border: 1px solid rgba(148,163,184,0.15); }
.chips-langs .chip { background: rgba(22,101,52,0.35); color: #e6eef8; border: 1px solid rgba(148,163,184,0.15); }
}
/* small screens adjustments */
@media (max-width: 520px) {
.cards-grid { gap: 10px; }
.title-text { font-size: 15px; }
}
</style>
"""
return style + "<div class=\"cards-grid\">" + "\n".join(items) + "</div>"
def counter_markdown(count: int) -> str:
    # Shared by all handlers so the counter text stays consistent
    total = len(ALL_TASKS)
    return f"**Showing {count} of {total} tasks**" if count != total else f"**{total} tasks**"

def on_filter(languages: list[str], tags: list[str], search: str):
    tasks = filter_tasks(languages, tags, search)
    return counter_markdown(len(tasks)), render_cards(tasks)
def on_toggle_language_choices(show_all: bool, selected_langs: list[str], tags: list[str], search: str):
choices = ALL_LANGS if show_all else TOP_LANGS
kept = [lang for lang in (selected_langs or []) if lang in choices]
    tasks = filter_tasks(kept, tags, search)
    return gr.update(choices=choices, value=kept), counter_markdown(len(tasks)), render_cards(tasks)
def on_toggle_tags_visibility(show: bool, selected_tags: list[str], languages: list[str], search: str):
# Only toggle visibility; preserve current tag selections and keep them active in filtering
tags_value: list[str] = selected_tags or []
    tasks = filter_tasks(languages, tags_value, search)
    # When hiding, the component keeps its value while invisible, so the
    # chosen tags continue to filter the results
    return gr.update(visible=show, value=tags_value), counter_markdown(len(tasks)), render_cards(tasks)
with gr.Blocks(title="Lighteval Tasks Explorer", css=None) as demo:
# Header / hero
with gr.Row():
with gr.Column():
gr.Markdown(
"""
<h2 style="margin:6px 0 2px 0;"><a href="https://huggingface.co/huggingface/lighteval">Lighteval</a> Tasks Explorer</h2>
<p style="margin:0 0 12px 0; color:var(--muted);">Browse tasks by language, tags and search the task descriptions.</p>
"""
)
task_counter = gr.Markdown(f"**{len(ALL_TASKS)} tasks**")
# Controls and results in two columns (left: controls, right: cards)
with gr.Row(equal_height=False):
with gr.Column(scale=2):
gr.Markdown("⭐⭐⭐ Recommended benchmarks are marked with a star icon.")
            # Search box; the .change handler below fires on every edit
            search_tb = gr.Textbox(label="Search", placeholder="Search in name, module path, tags, abstract…", value="", interactive=True)
# Filters
with gr.Group():
gr.Markdown("**Languages**")
show_all_langs = gr.Checkbox(label="Show all languages", value=False)
lang_dd = gr.CheckboxGroup(choices=TOP_LANGS, value=[]) # default none selected
with gr.Group():
gr.Markdown("**Benchmark type**")
show_tags_filters = gr.Checkbox(label="Show tag checkboxes", value=False)
tag_dd = gr.CheckboxGroup(choices=ALL_TAGS, value=[], visible=False)
# small hint
gr.Markdown("Tip: use the filters and search together. Results update live.")
with gr.Column(scale=5):
cards = gr.HTML()
            # initial content is rendered after the event wiring below
# Wire interactions
# Toggle expand/collapse language choices
show_all_langs.change(on_toggle_language_choices, inputs=[show_all_langs, lang_dd, tag_dd, search_tb], outputs=[lang_dd, task_counter, cards])
# Toggle tag filter visibility (keeps values)
show_tags_filters.change(on_toggle_tags_visibility, inputs=[show_tags_filters, tag_dd, lang_dd, search_tb], outputs=[tag_dd, task_counter, cards])
    # Live filtering: update the cards whenever any control changes.
    # The textbox fires .change on every edit; no explicit debounce is applied.
search_tb.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
lang_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
tag_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
# Initial load: display all tasks (starred benchmarks first)
initial_tasks = filter_tasks([], [], "")
cards.value = render_cards(initial_tasks)
if __name__ == "__main__":
from git import Repo # pip install gitpython
git_url = "https://github.com/huggingface/lighteval.git"
repo_dir = "./lighteval"
if os.path.exists(repo_dir) and os.path.isdir(os.path.join(repo_dir, ".git")):
print(f"Pulling latest changes from {git_url}...")
repo = Repo(repo_dir)
repo.remotes.origin.pull()
else:
print(f"Cloning {git_url} to {repo_dir}...")
Repo.clone_from(git_url, repo_dir)
# Run with `python benchmark_finder/app.py`
demo.launch()