"""
Gradio dashboard to explore Lighteval tasks.

Scans `src/lighteval/tasks/tasks` and `src/lighteval/tasks/multilingual/tasks`
for module-level docstrings carrying these fields (each key on its own line,
with its value on the following line(s)):

name: <task display name>
dataset: <dataset id(s)>
abstract: <free text>
languages: <comma/newline separated language codes or names>
tags: <comma/newline separated tags>
paper: <url>

This file stays outside the lighteval src tree, per request.
"""
import ast
import json
import os
import re
from collections import Counter
from dataclasses import asdict, dataclass
from typing import Dict, List, Optional, Tuple
import gradio as gr
REPO_ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
TASK_DIRS = [
os.path.join(REPO_ROOT, "src", "lighteval", "tasks", "tasks"),
os.path.join(REPO_ROOT, "src", "lighteval", "tasks", "multilingual", "tasks"),
]
# The cache is a bare filename, resolved against the current working directory;
# save_index() guards against the empty dirname this produces.
CACHE_PATH = "tasks_index.json"
@dataclass
class TaskDoc:
file_path: str
module: str
abstract: str
languages: List[str]
tags: List[str]
paper: Optional[str]
dataset: Optional[str]
name: Optional[str] = None
def read_file_text(path: str) -> Optional[str]:
try:
with open(path, "r", encoding="utf-8") as f:
return f.read()
except Exception:
return None
def parse_module_docstring(text: str) -> Optional[str]:
try:
mod = ast.parse(text)
return ast.get_docstring(mod)
except Exception:
# Fallback: naive regex for triple-quoted string at top
m = re.match(r"^\s*([\'\"])\1\1([\s\S]*?)\1\1\1", text)
return m.group(2).strip() if m else None
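# For illustration, a hypothetical task module docstring in the shape that
# parse_sections() below understands (all names and values are made up):
#
#   """
#   name:
#   My Benchmark
#
#   dataset:
#   my-org/my-dataset
#
#   abstract:
#   One-paragraph description of what the benchmark evaluates.
#
#   languages:
#   en, fr
#
#   tags:
#   qa, reasoning
#
#   paper:
#   https://example.com/paper
#   """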
def parse_sections(doc: str) -> Dict[str, str]:
    # Minimal section parser: a line consisting solely of a known key plus ':'
    # starts a new section; subsequent lines accumulate as that section's value.
    # Expected keys: name, dataset, abstract, languages, tags, paper
out: Dict[str, str] = {"name": "", "dataset": "", "abstract": "", "languages": "", "tags": "", "paper": ""}
current_key: Optional[str] = None
for raw_line in doc.splitlines():
line = raw_line.rstrip()
if line.endswith(":") and line.strip().lower() in {"name:", "dataset:", "abstract:", "languages:", "tags:", "paper:"}:
current_key = line[:-1].strip().lower()
continue
if current_key is not None:
# Preserve paragraphs; we will normalize later
out[current_key] = (out[current_key] + ("\n" if out[current_key] else "") + line).strip()
return out
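# Rough sketch of the expected behaviour (illustrative values only):
#   parse_sections("name:\nMy Benchmark\n\ntags:\nqa, reasoning\n")
#   # -> {"name": "My Benchmark", "dataset": "", "abstract": "",
#   #     "languages": "", "tags": "qa, reasoning", "paper": ""}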
def split_list_field(value: str) -> List[str]:
if not value:
return []
# Support comma and newline separated values
parts = re.split(r"[\n,]", value)
cleaned: List[str] = []
for p in parts:
token = p.strip()
if not token:
continue
cleaned.append(token)
return cleaned
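# Illustrative behaviour (assumed inputs):
#   split_list_field("en, fr\nde") -> ["en", "fr", "de"]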
def discover_task_files() -> List[str]:
files: List[str] = []
for base in TASK_DIRS:
if not os.path.isdir(base):
continue
# Top-level python files in the directory
for name in os.listdir(base):
if name.endswith(".py"):
files.append(os.path.join(base, name))
# Also include subdirectory main.py files
for dirpath, dirnames, filenames in os.walk(base):
if dirpath == base:
continue
if "main.py" in filenames:
files.append(os.path.join(dirpath, "main.py"))
# Deduplicate while preserving order
seen: set = set()
unique_files: List[str] = []
for p in files:
if p in seen:
continue
seen.add(p)
unique_files.append(p)
return sorted(unique_files)
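# Hypothetical shape of the result (actual paths depend on the checkout):
#   [
#       "<repo>/src/lighteval/tasks/tasks/my_task.py",
#       "<repo>/src/lighteval/tasks/tasks/my_suite/main.py",
#   ]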
def index_tasks() -> Tuple[List[TaskDoc], List[str], List[str]]:
docs: List[TaskDoc] = []
language_counts: Counter = Counter()
tag_set: set = set()
for path in discover_task_files():
text = read_file_text(path)
if not text:
continue
doc = parse_module_docstring(text)
if not doc:
continue
sections = parse_sections(doc)
abstract = sections.get("abstract", "").strip()
langs = [lang.lower() for lang in split_list_field(sections.get("languages", ""))]
tgs = [t.lower() for t in split_list_field(sections.get("tags", ""))]
paper = sections.get("paper", "").strip() or None
dataset = sections.get("dataset", "").strip() or None
name = sections.get("name", "").strip() or None
for lang in langs:
language_counts[lang] += 1
for t in tgs:
tag_set.add(t)
module = os.path.relpath(path, REPO_ROOT)
docs.append(TaskDoc(file_path=path, module=module, abstract=abstract, languages=langs, tags=tgs, paper=paper, dataset=dataset, name=name))
languages_sorted = [
lang for lang, _ in sorted(language_counts.items(), key=lambda kv: (-kv[1], kv[0]))
]
tags_sorted = sorted(tag_set)
return docs, languages_sorted, tags_sorted
def save_index(path: str, tasks: List[TaskDoc], langs: List[str], tags: List[str]) -> None:
data = {
"tasks": [asdict(t) for t in tasks],
"languages": list(langs),
"tags": list(tags),
}
os.makedirs(os.path.dirname(path) or ".", exist_ok=True)
with open(path, "w", encoding="utf-8") as f:
json.dump(data, f, ensure_ascii=False, indent=2)
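# Sketch of the cached JSON layout (field values are illustrative):
#   {
#     "tasks": [{"file_path": "...", "module": "...", "abstract": "...",
#                "languages": ["en"], "tags": ["qa"], "paper": null,
#                "dataset": null, "name": "My Benchmark"}],
#     "languages": ["en"],
#     "tags": ["qa"]
#   }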
def load_index(path: str) -> Optional[Tuple[List[TaskDoc], List[str], List[str]]]:
if not os.path.exists(path):
return None
with open(path, "r", encoding="utf-8") as f:
data = json.load(f)
tasks = [TaskDoc(**t) for t in data.get("tasks", [])]
langs = list(data.get("languages", []))
tags = list(data.get("tags", []))
return tasks, langs, tags
def build_and_cache_index() -> Tuple[List[TaskDoc], List[str], List[str]]:
tasks, langs, tags = index_tasks()
save_index(CACHE_PATH, tasks, langs, tags)
return tasks, langs, tags
_loaded = load_index(CACHE_PATH)
if _loaded is None:
print("Building and caching index...")
ALL_TASKS, ALL_LANGS, ALL_TAGS = build_and_cache_index()
else:
print("Loading index from cache...")
ALL_TASKS, ALL_LANGS, ALL_TAGS = _loaded
print(f"Loaded {len(ALL_TASKS)} tasks from cache")
TOP_LANGS = ALL_LANGS[:8]  # only the most common languages are shown until "Show all languages" is ticked
def filter_tasks(languages: List[str], tags: List[str], search: str) -> List[TaskDoc]:
selected_langs = [lang.lower() for lang in (languages or [])]
selected_tags = [t.lower() for t in (tags or [])]
search_lc = (search or "").strip().lower()
out: List[TaskDoc] = []
for td in ALL_TASKS:
if selected_langs and not any(lang in td.languages for lang in selected_langs):
continue
if selected_tags and not any(t in td.tags for t in selected_tags):
continue
if search_lc:
# Search module path, abstract, tags, and dataset names
hay = " ".join([td.module, td.abstract, ", ".join(td.tags), (td.dataset or "")]).lower()
if search_lc not in hay:
continue
out.append(td)
return out
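# Illustrative call (hypothetical query): filter_tasks(["en"], [], "arithmetic")
# keeps tasks whose languages include "en" and whose module path, abstract,
# tags, or dataset id contain "arithmetic" (case-insensitive).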
def render_cards(tasks: List[TaskDoc]) -> str:
# Responsive grid of pretty cards; show all details without clicks
items: List[str] = []
for t in tasks:
parts = t.module.replace("\\", "/").split("/")
base_no_ext = parts[-1].rsplit(".", 1)[0]
fallback_name = parts[-2] if base_no_ext == "main" and len(parts) >= 2 else base_no_ext
task_name = (t.name or fallback_name).replace("_", " ").title()
mod_path = t.module.replace("\\", "/")
source_html = f'<a href="https://github.com/huggingface/lighteval/blob/main/{mod_path}" target="_blank" rel="noopener">source</a>'
paper_html = f'<a href="{t.paper}" target="_blank" rel="noopener">paper</a>' if t.paper else ""
tags_html = " ".join([f'<span class="chip" title="tag: {tag}">{tag}</span>' for tag in t.tags]) if t.tags else ""
langs_html = " ".join([f'<span class="chip chip-lang" title="language: {lang}">{lang}</span>' for lang in t.languages]) if t.languages else ""
abstract_html = (t.abstract or "-").replace("\n", "<br/>")
sep_html = ' <span class="sep">|</span> ' if paper_html else ""
links_html = f"{source_html}{sep_html}{paper_html}"
dataset_links = []
if t.dataset:
for ds in [d.strip() for d in t.dataset.split(",") if d.strip()]:
dataset_links.append(f'<a class="dataset" href="https://huggingface.co/datasets/{ds}" target="_blank" rel="noopener">{ds}</a>')
dataset_html = " ".join(dataset_links) if dataset_links else ""
items.append(
f"""
<article class="card" tabindex="0" aria-label="Task {task_name}">
<div class="title"><span class="title-text">{task_name}</span> <span class="dataset-inline">{dataset_html}</span></div>
<div class="chips">{tags_html} {langs_html}</div>
<div class="abstract">{abstract_html}</div>
<div class="links">{links_html}</div>
</article>
"""
)
# CSS includes light and dark mode support
style = """
<style>
/* layout */
.cards-grid { display: grid; grid-template-columns: repeat(auto-fill, minmax(280px, 1fr)); gap: 16px; margin-top: 10px; }
/* card base */
.card { border-radius: 12px; padding: 14px; transition: box-shadow 160ms ease, transform 120ms ease, border-color 120ms ease; outline: none; }
.card:hover, .card:focus { transform: translateY(-4px); box-shadow: 0 10px 30px rgba(2,6,23,0.08); }
.title { display:flex; align-items:center; gap:8px; flex-wrap:wrap; }
.title-text { font-weight: 600; font-size: 16px; font-family: ui-sans-serif, system-ui, -apple-system, "Segoe UI", Roboto, "Helvetica Neue", Arial; }
.dataset-inline { font-size: 12px; }
.chips { margin: 8px 0 6px 0; display:flex; gap:8px; flex-wrap:wrap; }
.chip { display:inline-block; padding:4px 8px; border-radius:999px; font-size:12px; background: rgba(99,102,241,0.08); color: #3730a3; }
.chip-lang { background: rgba(14,165,233,0.08); color: #0369a1; }
.abstract { color: #475569; font-size: 13.5px; line-height: 1.35; margin-top: 6px; min-height: 48px; }
.links { margin-top: 10px; font-size:12px; }
.links a { text-decoration: none; font-weight: 600; }
.sep { color: #94a3b8; margin: 0 8px; }
.dataset { margin-left: 8px; font-size: 12px; color: #0ea5e9; background: #ecfeff; padding: 2px 6px; border-radius: 6px; text-decoration: none; }
/* Light mode */
:root {
--bg: #f8fafc;
--card-bg: #ffffff;
--card-border: #e6f2ff;
--title-color: #1e3a8a;
--text-color: #0f172a;
--muted: #475569;
--link: #2563eb;
}
/* Dark mode overrides */
@media (prefers-color-scheme: dark) {
:root {
--bg: #0b1220;
--card-bg: #071022;
--card-border: #0f2a44;
--title-color: #93c5fd;
--text-color: #e6eef8;
--muted: #cbd5e1;
--link: #6ea8ff;
}
}
/* apply */
body { background: var(--bg); color: var(--text-color); }
.card { background: var(--card-bg); border: 1px solid var(--card-border); color: var(--text-color); }
.title-text { color: var(--title-color); }
.abstract { color: var(--muted); }
.links a { color: var(--link); }
    /* tweak chips for dark mode for better contrast */
@media (prefers-color-scheme: dark) {
.chip { background: rgba(255,255,255,0.04); color: var(--text-color); border: 1px solid rgba(255,255,255,0.02); }
.chip-lang { background: rgba(255,255,255,0.02); }
}
/* small screens adjustments */
@media (max-width: 520px) {
.cards-grid { gap: 10px; }
.title-text { font-size: 15px; }
}
</style>
"""
return style + "<div class=\"cards-grid\">" + "\n".join(items) + "</div>"
def on_filter(languages: List[str], tags: List[str], search: str):
tasks = filter_tasks(languages, tags, search)
return render_cards(tasks)
def on_toggle_language_choices(show_all: bool, selected_langs: List[str], tags: List[str], search: str):
choices = ALL_LANGS if show_all else TOP_LANGS
kept = [lang for lang in (selected_langs or []) if lang in choices]
tasks = filter_tasks(kept, tags, search)
return gr.update(choices=choices, value=kept), render_cards(tasks)
def on_toggle_tags_visibility(show: bool, selected_tags: List[str], languages: List[str], search: str):
# Only toggle visibility; preserve current tag selections and keep them active in filtering
tags_value: List[str] = selected_tags or []
tasks = filter_tasks(languages, tags_value, search)
    # When hidden, the CheckboxGroup keeps its value, so the hidden selections still apply to filtering.
return gr.update(visible=show, value=tags_value), render_cards(tasks)
with gr.Blocks(title="Lighteval Tasks Explorer", css=None) as demo:
# Header / hero
with gr.Row():
gr.Markdown(
"""
<h2 style="margin:6px 0 2px 0;">Lighteval Tasks Explorer</h2>
<p style="margin:0 0 12px 0; color:var(--muted);">Browse tasks by language, tags and search the task descriptions.</p>
"""
)
# Controls and results in two columns (left: controls, right: cards)
with gr.Row(equal_height=False):
with gr.Column(scale=2):
            # Free-text search. Results update on every edit via the .change event
            # wired up below; no explicit debounce is applied.
            search_tb = gr.Textbox(label="Search", placeholder="Search in module path, tags, abstract…", value="", interactive=True)
# Filters
with gr.Group():
gr.Markdown("**Languages**")
show_all_langs = gr.Checkbox(label="Show all languages", value=False)
lang_dd = gr.CheckboxGroup(choices=TOP_LANGS, value=[]) # default none selected
with gr.Group():
gr.Markdown("**Benchmark type**")
show_tags_filters = gr.Checkbox(label="Show tag checkboxes", value=False)
tag_dd = gr.CheckboxGroup(choices=ALL_TAGS, value=[], visible=False)
# small hint
gr.Markdown("Tip: use the filters and search together. Results update live.")
with gr.Column(scale=5):
cards = gr.HTML()
# put an initially visible loading placeholder
cards.value = "<div style='padding:18px'>Loading tasks…</div>"
# Wire interactions
# Toggle expand/collapse language choices
show_all_langs.change(on_toggle_language_choices, inputs=[show_all_langs, lang_dd, tag_dd, search_tb], outputs=[lang_dd, cards])
# Toggle tag filter visibility (keeps values)
show_tags_filters.change(on_toggle_tags_visibility, inputs=[show_tags_filters, tag_dd, lang_dd, search_tb], outputs=[tag_dd, cards])
    # Live filtering: wire change events on the controls so the cards update as
    # the search text, language selection, or tag selection changes.
search_tb.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[cards])
lang_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[cards])
tag_dd.change(on_filter, inputs=[lang_dd, tag_dd, search_tb], outputs=[cards])
# Initial load: display all tasks
cards.value = render_cards(ALL_TASKS)
if __name__ == "__main__":
# Run with `python benchmark_finder/app.py`
demo.launch()