Linker1907's picture
fix
430430b
raw
history blame
18.5 kB
"""
Gradio dashboard to explore Lighteval tasks.
Scans `src/lighteval/tasks/tasks` and `src/lighteval/tasks/multilingual/tasks`
for module-level docstrings with this format:
name: <task display name>
dataset: <dataset id(s)>
abstract: <free text>
languages: <comma/newline separated language codes or names>
tags: <comma/newline separated tags>
paper: <url>
This file stays outside the lighteval src tree, per request.
"""
import ast
import os
import re
from collections import Counter
from git import Repo # pip install gitpython
from dataclasses import dataclass
import gradio as gr
# Bootstrap: clone (or refresh) the lighteval repository so its task files
# can be scanned below. Runs at import time before the index is built.
git_url = "https://github.com/huggingface/lighteval.git"
repo_dir = "./lighteval"
if os.path.exists(repo_dir) and os.path.isdir(os.path.join(repo_dir, ".git")):
    # An existing checkout is detected by the presence of a .git directory;
    # just pull the latest changes from origin.
    print(f"Pulling latest changes from {git_url}...")
    repo = Repo(repo_dir)
    repo.remotes.origin.pull()
else:
    # Fresh environment: clone the repository next to this script.
    print(f"Cloning {git_url} to {repo_dir}...")
    Repo.clone_from(git_url, repo_dir)
# Root used for relative paths; the clone above lives in ./lighteval.
REPO_ROOT = "."
# Directories scanned for task modules carrying metadata docstrings.
TASK_DIRS = [
    os.path.join(REPO_ROOT, "lighteval", "src", "lighteval", "tasks", "tasks"),
    os.path.join(REPO_ROOT, "lighteval", "src", "lighteval", "tasks", "multilingual", "tasks"),
]
# Benchmarks highlighted with a star and sorted first in the card grid.
# Matching is fuzzy (case-, underscore-, space- and colon-insensitive), so
# entries here may use any of those spellings.
star_benchmarks = [
    "aime",
    "mmlu_pro",
    "gpqa:diamond",
    "hle",
    "arc_agi_2",
    "ifbench",
    "ifeval",
    "live code bench",
    "math 500",
    "mix_eval",
    "musr",
    "simpleqa",
    "MMLU pro"
]
@dataclass
class TaskDoc:
    """Metadata parsed from one task module's docstring."""

    file_path: str  # path of the source file as discovered on disk
    module: str  # path relative to REPO_ROOT; used for display and links
    abstract: str  # free-text description (may be empty)
    languages: list[str]  # lowercased language tokens from the docstring
    tags: list[str]  # lowercased tag tokens from the docstring
    paper: str | None  # paper URL, if provided
    dataset: str | None  # comma-separated Hugging Face dataset id(s), if provided
    name: str | None = None  # display name from the docstring, if provided
def read_file_text(path: str) -> str | None:
try:
with open(path, "r", encoding="utf-8") as f:
return f.read()
except Exception:
return None
def parse_module_docstring(text: str) -> str | None:
try:
mod = ast.parse(text)
return ast.get_docstring(mod)
except Exception:
# Fallback: naive regex for triple-quoted string at top
m = re.match(r"^\s*([\'\"])\1\1([\s\S]*?)\1\1\1", text)
return m.group(2).strip() if m else None
def parse_sections(doc: str) -> dict[str, str]:
    """Split a task docstring into its known metadata sections.

    Recognized keys: name, dataset, abstract, languages, tags, paper.
    A section starts either with a bare `key:` line (value on the following
    lines) or with an inline `key: value` line — the latter is the format
    documented in this file's module docstring, which the previous parser
    did not recognize. Text before the first key is ignored; unknown
    `something:` lines are treated as plain content.

    Args:
        doc: The module docstring text.

    Returns:
        Mapping of every known key to its (possibly empty) text value.
    """
    known_keys = ("name", "dataset", "abstract", "languages", "tags", "paper")
    bare_markers = {key + ":" for key in known_keys}
    out: dict[str, str] = {key: "" for key in known_keys}
    current_key: str | None = None

    def _append(key: str, text: str) -> None:
        # Accumulate content lines, newline-separated, trimmed as a whole.
        out[key] = (out[key] + ("\n" if out[key] else "") + text).strip()

    for raw_line in doc.splitlines():
        line = raw_line.rstrip()
        stripped = line.strip()
        # Bare `key:` line — value follows on subsequent lines.
        if line.endswith(":") and stripped.lower() in bare_markers:
            current_key = line[:-1].strip().lower()
            continue
        # Inline `key: value` line — split at the first colon only, so
        # URLs such as `paper: https://...` keep their scheme intact.
        head, sep, tail = stripped.partition(":")
        if sep and head.lower() in known_keys and tail.strip():
            current_key = head.lower()
            _append(current_key, tail.strip())
            continue
        if current_key is not None:
            _append(current_key, line)
    return out
def split_list_field(value: str) -> list[str]:
    """Turn a comma- or newline-separated field into a list of tokens.

    Blank entries are dropped; surrounding whitespace is stripped.
    """
    if not value:
        return []
    tokens = (part.strip() for part in re.split(r"[\n,]", value))
    return [token for token in tokens if token]
def discover_task_files() -> list[str]:
    """Collect candidate task files from TASK_DIRS.

    Picks up top-level *.py files in each directory plus any `main.py`
    inside subdirectories. Returns a sorted, duplicate-free list of paths.
    """
    print(f"Discovering task files in: {TASK_DIRS}")
    found: set = set()
    for base in TASK_DIRS:
        print(f"Discovering task files in: {base}")
        if not os.path.isdir(base):
            continue
        # Python files sitting directly in the task directory.
        found.update(
            os.path.join(base, entry)
            for entry in os.listdir(base)
            if entry.endswith(".py")
        )
        # Package-style tasks expose their code via a nested main.py.
        for dirpath, _dirnames, filenames in os.walk(base):
            if dirpath != base and "main.py" in filenames:
                found.add(os.path.join(dirpath, "main.py"))
    # A set plus a final sort gives the same sorted, unique result the
    # original ordered-dedup-then-sort produced.
    return sorted(found)
def index_tasks() -> tuple[list[TaskDoc], list[str], list[str]]:
docs: list[TaskDoc] = []
language_counts: Counter = Counter()
tag_set: set = set()
for path in discover_task_files():
text = read_file_text(path)
if not text:
continue
doc = parse_module_docstring(text)
if not doc:
continue
sections = parse_sections(doc)
abstract = sections.get("abstract", "").strip()
langs = [lang.lower() for lang in split_list_field(sections.get("languages", ""))]
tgs = [t.lower() for t in split_list_field(sections.get("tags", ""))]
paper = sections.get("paper", "").strip() or None
dataset = sections.get("dataset", "").strip() or None
name = sections.get("name", "").strip() or None
for lang in langs:
language_counts[lang] += 1
for t in tgs:
tag_set.add(t)
module = os.path.relpath(path, REPO_ROOT)
docs.append(TaskDoc(file_path=path, module=module, abstract=abstract, languages=langs, tags=tgs, paper=paper, dataset=dataset, name=name))
languages_sorted = [
lang for lang, _ in sorted(language_counts.items(), key=lambda kv: (-kv[1], kv[0]))
]
tags_sorted = sorted(tag_set)
return docs, languages_sorted, tags_sorted
def build_index() -> tuple[list[TaskDoc], list[str], list[str]]:
return index_tasks()
# Index everything once at import time; all filtering reads these globals.
ALL_TASKS, ALL_LANGS, ALL_TAGS = build_index()
TOP_LANGS = ALL_LANGS[:8]  # most frequent languages, shown by default
def normalize_name_for_matching(name: str) -> str:
    """Normalize a benchmark identifier for fuzzy comparison.

    Lowercases the name and strips underscores, whitespace and colons so
    that e.g. "MMLU Pro", "mmlu_pro" and "mmlu:pro" all compare equal.
    """
    lowered = name.lower()
    return re.sub(r"[_\s:]+", "", lowered)
def is_starred_benchmark(td: TaskDoc) -> bool:
    """Return True when *td* matches an entry in `star_benchmarks`.

    Compares several identifiers — docstring name, display name, module
    file stem and dataset id — using normalized equality and substring
    checks, so entries can be spelled loosely in the star list.
    """
    path_parts = td.module.replace("\\", "/").split("/")
    stem = path_parts[-1].rsplit(".", 1)[0]
    # Package tasks live in <dir>/main.py; fall back to the directory name.
    fallback_name = path_parts[-2] if stem == "main" and len(path_parts) >= 2 else stem

    raw_name = (td.name or "").lower().strip()
    display_name = (td.name or fallback_name).replace("_", " ").lower().strip()

    norm_display = normalize_name_for_matching(display_name)
    norm_module = normalize_name_for_matching(stem)
    norm_raw = normalize_name_for_matching(raw_name)
    norm_dataset = normalize_name_for_matching(td.dataset or "")

    for star in star_benchmarks:
        norm_star = normalize_name_for_matching(star)
        star_lc = star.lower()
        # Exact matches first, then progressively looser substring checks.
        if norm_star == norm_display or norm_star == norm_module or norm_star == norm_raw:
            return True
        if norm_star in norm_display or norm_star in norm_module:
            return True
        if norm_dataset and norm_star in norm_dataset:
            return True
        if star_lc in display_name or star_lc in stem.lower():
            return True
    return False
def filter_tasks(languages: list[str], tags: list[str], search: str) -> list[TaskDoc]:
    """Filter ALL_TASKS by languages, tags and a free-text search string.

    Within the language group and within the tag group the semantics are
    OR (any selected value matches); the groups combine with AND. The
    search string is matched case-insensitively against module path,
    abstract, tags and dataset. Starred benchmarks sort first, then tasks
    are ordered by lowercased name (module path as fallback).
    """
    wanted_langs = {lang.lower() for lang in (languages or [])}
    wanted_tags = {tag.lower() for tag in (tags or [])}
    needle = (search or "").strip().lower()

    results: list[TaskDoc] = []
    for task in ALL_TASKS:
        if wanted_langs and wanted_langs.isdisjoint(task.languages):
            continue
        if wanted_tags and wanted_tags.isdisjoint(task.tags):
            continue
        if needle:
            haystack = " ".join(
                [task.module, task.abstract, ", ".join(task.tags), task.dataset or ""]
            ).lower()
            if needle not in haystack:
                continue
        results.append(task)
    results.sort(key=lambda task: (not is_starred_benchmark(task), (task.name or task.module).lower()))
    return results
def render_cards(tasks: list[TaskDoc]) -> str:
    """Render the given tasks as a self-contained HTML string.

    The result is a <style> block (light + dark mode rules) followed by a
    responsive grid with one <article> card per task showing name, tags,
    languages, abstract, dataset links and source/paper links.
    """
    # Responsive grid of pretty cards; show all details without clicks
    items: list[str] = []
    for t in tasks:
        parts = t.module.replace("\\", "/").split("/")
        base_no_ext = parts[-1].rsplit(".", 1)[0]
        # Package tasks live in <dir>/main.py; use the directory name instead.
        fallback_name = parts[-2] if base_no_ext == "main" and len(parts) >= 2 else base_no_ext
        task_name = (t.name or fallback_name).replace("_", " ").title()
        mod_path = t.module.replace("\\", "/")
        # Drop the leading clone-directory component ("lighteval/") to get the
        # in-repo path for the GitHub link. NOTE(review): assumes the module
        # path always contains at least one "/" — confirm for top-level files.
        mod_path = mod_path.split("/", 1)[1]
        source_html = f'<a href="https://github.com/huggingface/lighteval/blob/main/{mod_path}" target="_blank" rel="noopener">source</a>'
        paper_html = f'<a href="{t.paper}" target="_blank" rel="noopener">paper</a>' if t.paper else ""
        tags_html = " ".join([f'<span class=\"chip\" title=\"tag: {tag}\">{tag}</span>' for tag in t.tags]) if t.tags else ""
        langs_html = " ".join([f'<span class=\"chip chip-lang\" title=\"language: {lang}\">{lang}</span>' for lang in t.languages]) if t.languages else ""
        chips_tags_html = f'<div class="chips chips-tags">{tags_html}</div>' if tags_html else ""
        chips_langs_html = f'<div class="chips chips-langs">{langs_html}</div>' if langs_html else ""
        abstract_html = (t.abstract or "-").replace("\n", "<br/>")
        # Separator only appears when a paper link follows the source link.
        sep_html = ' <span class="sep">|</span> ' if paper_html else ""
        links_html = f"{source_html}{sep_html}{paper_html}"
        dataset_links = []
        if t.dataset:
            # One Hub link per comma-separated dataset id.
            for ds in [d.strip() for d in t.dataset.split(",") if d.strip()]:
                dataset_links.append(f'<a class="dataset" href="https://huggingface.co/datasets/{ds}" target="_blank" rel="noopener">{ds}</a>')
        dataset_html = " ".join(dataset_links) if dataset_links else ""
        star_icon = "⭐ " if is_starred_benchmark(t) else ""
        items.append(
            f"""
<article class="card" tabindex="0" aria-label="Task {task_name}">
<div class="title"><span class="title-text">{star_icon}{task_name}</span> <span class="dataset-inline">{dataset_html}</span></div>
{chips_tags_html}
{chips_langs_html}
<div class="abstract">{abstract_html}</div>
<div class="links">{links_html}</div>
</article>
"""
        )
    # CSS includes light and dark mode support
    style = """
<style>
/* layout */
.cards-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(280px, 1fr)); gap: 16px; margin-top: 10px; }
/* card base */
.card { border-radius: 12px; padding: 14px; transition: box-shadow 160ms ease, transform 120ms ease, border-color 120ms ease; outline: none; }
.card:hover, .card:focus { transform: translateY(-4px); box-shadow: 0 10px 30px rgba(2,6,23,0.08); }
.title { display:flex; align-items:center; gap:8px; flex-wrap:wrap; }
.title-text { font-weight: 600; font-size: 16px; font-family: ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial; }
.dataset-inline { font-size: 12px; }
.chips { margin: 6px 0 4px 0; display:flex; gap:2px; flex-wrap:wrap; }
.chips-tags { margin: 6px 0 2px 0; }
.chips-langs { margin: 0 0 4px 0; }
.chip { display:inline-block; padding:1px 2px; border-radius:999px; font-size:12px; background: #e6f2ff; color: #1e3a8a; }
.chip-lang { background: #e8f5e9; color: #166534; }
.abstract { color: #475569; font-size: 13.5px; line-height: 1.35; margin-top: 6px; min-height: 48px; }
.links { margin-top: 10px; font-size:12px; }
.links a { text-decoration: none; font-weight: 600; }
.sep { color: #94a3b8; margin: 0 8px; }
.dataset { margin-left: 8px; font-size: 12px; color: #0ea5e9; background: #ecfeff; padding: 2px 6px; border-radius: 6px; text-decoration: none; }
/* Light mode */
:root {
--bg: #f8fafc;
--card-bg: #ffffff;
--card-border: #e6f2ff;
--title-color: #1e3a8a;
--text-color: #0f172a;
--muted: #475569;
--link: #2563eb;
}
/* Dark mode overrides */
@media (prefers-color-scheme: dark) {
:root {
--bg: #0b1220;
--card-bg: #071022;
--card-border: #0f2a44;
--title-color: #93c5fd;
--text-color: #e6eef8;
--muted: #cbd5e1;
--link: #6ea8ff;
}
}
/* apply */
body { background: var(--bg); color: var(--text-color); }
.card { background: var(--card-bg); border: 1px solid var(--card-border); color: var(--text-color); }
.title-text { color: var(--title-color); }
.abstract { color: var(--muted); }
.links a { color: var(--link); }
.chips-tags .chip { background: #e6f2ff; color: #1e3a8a; }
.chips-langs .chip { background: #e8f5e9; color: #166534; }
/* tweak chips for dark mode for better contrast */
@media (prefers-color-scheme: dark) {
.chips-tags .chip { background: rgba(29,78,216,0.35); color: #e6eef8; border: 1px solid rgba(148,163,184,0.15); }
.chips-langs .chip { background: rgba(22,101,52,0.35); color: #e6eef8; border: 1px solid rgba(148,163,184,0.15); }
}
/* small screens adjustments */
@media (max-width: 520px) {
.cards-grid { gap: 10px; }
.title-text { font-size: 15px; }
}
</style>
"""
    return style + "<div class=\"cards-grid\">" + "\n".join(items) + "</div>"
def on_filter(languages: list[str], tags: list[str], search: str):
    """Gradio callback: recompute the counter markdown and the cards HTML."""
    matched = filter_tasks(languages, tags, search)
    shown, total = len(matched), len(ALL_TASKS)
    if shown == total:
        counter_text = f"**{total} tasks**"
    else:
        counter_text = f"**Showing {shown} of {total} tasks**"
    return counter_text, render_cards(matched)
def on_toggle_language_choices(show_all: bool, selected_langs: list[str], tags: list[str], search: str):
    """Gradio callback: swap between the short and full language choice lists.

    Selections that fall outside the visible choices are dropped so the
    filter never uses a hidden language.
    """
    choices = ALL_LANGS if show_all else TOP_LANGS
    kept = [lang for lang in (selected_langs or []) if lang in choices]
    matched = filter_tasks(kept, tags, search)
    shown, total = len(matched), len(ALL_TASKS)
    counter_text = f"**{total} tasks**" if shown == total else f"**Showing {shown} of {total} tasks**"
    return gr.update(choices=choices, value=kept), counter_text, render_cards(matched)
def on_toggle_tags_visibility(show: bool, selected_tags: list[str], languages: list[str], search: str):
    """Gradio callback: show or hide the tag checkbox group without clearing it.

    Selections are preserved either way, so tags chosen before hiding the
    group keep filtering the results on purpose.
    """
    kept_tags: list[str] = selected_tags or []
    matched = filter_tasks(languages, kept_tags, search)
    shown, total = len(matched), len(ALL_TASKS)
    counter_text = f"**{total} tasks**" if shown == total else f"**Showing {shown} of {total} tasks**"
    # The component stays bound to its values even while hidden.
    return gr.update(visible=show, value=kept_tags), counter_text, render_cards(matched)
# Build the Gradio UI: a header row, a controls column (search + language and
# tag filters) and a results column holding the rendered HTML cards.
with gr.Blocks(title="Lighteval Tasks Explorer", css=None) as demo:
    # Header / hero
    with gr.Row():
        with gr.Column():
            gr.Markdown(
                """
<h2 style="margin:6px 0 2px 0;"><a href="https://huggingface.co/huggingface/lighteval">Lighteval</a> Tasks Explorer</h2>
<p style="margin:0 0 12px 0; color:var(--muted);">Browse tasks by language, tags and search the task descriptions.</p>
"""
            )
            # Live counter, rewritten by every filter callback.
            task_counter = gr.Markdown(f"**{len(ALL_TASKS)} tasks**")
    # Controls and results in two columns (left: controls, right: cards)
    with gr.Row(equal_height=False):
        with gr.Column(scale=2):
            gr.Markdown("⭐⭐⭐ Recommended benchmarks are marked with a star icon.")
            # Search with interactive debounce
            search_tb = gr.Textbox(label="Search", placeholder="Search in module path, tags, abstract…", value="", interactive=True)
            # We want debounce behavior: use .change with every character by setting interactive=True and triggering on input
            # Filters
            with gr.Group():
                gr.Markdown("**Languages**")
                show_all_langs = gr.Checkbox(label="Show all languages", value=False)
                lang_dd = gr.CheckboxGroup(choices=TOP_LANGS, value=[])  # default none selected
            with gr.Group():
                gr.Markdown("**Benchmark type**")
                show_tags_filters = gr.Checkbox(label="Show tag checkboxes", value=False)
                # Hidden until the checkbox above is ticked; values persist.
                tag_dd = gr.CheckboxGroup(choices=ALL_TAGS, value=[], visible=False)
            # small hint
            gr.Markdown("Tip: use the filters and search together. Results update live.")
        with gr.Column(scale=5):
            cards = gr.HTML()
            # put an initially visible loading placeholder
            cards.value = "<div style='padding:18px'>Loading tasks…</div>"
    # Wire interactions
    # Toggle expand/collapse language choices
    show_all_langs.change(on_toggle_language_choices, inputs=[show_all_langs, lang_dd, tag_dd, search_tb], outputs=[lang_dd, task_counter, cards])
    # Toggle tag filter visibility (keeps values)
    show_tags_filters.change(on_toggle_tags_visibility, inputs=[show_tags_filters, tag_dd, lang_dd, search_tb], outputs=[tag_dd, task_counter, cards])
    # Live filtering: wire change events on controls to update cards.
    # Textbox: trigger on every change (interactive True). If Gradio runtime has debounce param, it's used internally.
    search_tb.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
    lang_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
    tag_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[task_counter, cards])
    # Initial load: display all tasks (starred benchmarks first)
    initial_tasks = filter_tasks([], [], "")
    cards.value = render_cards(initial_tasks)
# Run with `python benchmark_finder/app.py`
demo.launch()