"""Gradio application for the leaderboard Space: downloads evaluation results, builds the
filterable benchmark tables, and exposes the submission queue."""

import functools
import gc
import os
import time

import gradio as gr
import pandas as pd
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

from src.about import (
    CITATION_BUTTON_LABEL, CITATION_BUTTON_TEXT, EVALUATION_QUEUE_TEXT, INTRODUCTION_TEXT,
    LLM_BENCHMARKS_TEXT_1, LLM_BENCHMARKS_TEXT_2, CROSS_EVALUATION_METRICS,
    NOTE_GENERATION_METRICS, HEALTHBENCH_METRICS, TITLE, LOGO, FIVE_PILLAR_DIAGRAM,
)
from src.display.css_html_js import custom_css
from src.display.utils import (
    DATASET_BENCHMARK_COLS, OPEN_ENDED_BENCHMARK_COLS, MED_SAFETY_BENCHMARK_COLS,
    MEDICAL_SUMMARIZATION_BENCHMARK_COLS, ACI_BENCHMARK_COLS, SOAP_BENCHMARK_COLS,
    HEALTHBENCH_BENCHMARK_COLS, HEALTHBENCH_HARD_BENCHMARK_COLS,
    DATASET_COLS, OPEN_ENDED_COLS, MED_SAFETY_COLS, MEDICAL_SUMMARIZATION_COLS,
    ACI_COLS, SOAP_COLS, HEALTHBENCH_COLS, HEALTHBENCH_HARD_COLS,
    EVAL_COLS, EVAL_TYPES, NUMERIC_INTERVALS, TYPES,
    AutoEvalColumn, ModelType, Precision, WeightType, fields, render_generation_templates,
    OpenEndedArabic_COLS, OpenEndedArabic_BENCHMARK_COLS,
    OpenEndedFrench_COLS, OpenEndedFrench_BENCHMARK_COLS,
    OpenEndedPortuguese_COLS, OpenEndedPortuguese_BENCHMARK_COLS,
    OpenEndedRomanian_COLS, OpenEndedRomanian_BENCHMARK_COLS,
    OpenEndedGreek_COLS, OpenEndedGreek_BENCHMARK_COLS,
    OpenEndedSpanish_COLS, OpenEndedSpanish_BENCHMARK_COLS,
    ClosedEndedMultilingual_COLS, ClosedEndedMultilingual_BENCHMARK_COLS,
)
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.populate import get_evaluation_queue_df, get_leaderboard_df
from src.submission.submit import add_new_eval

# =====================================================================================
# 1. SETUP AND DATA LOADING
# =====================================================================================


def restart_space():
    API.restart_space(repo_id=REPO_ID)


print("Downloading evaluation data...")
try:
    snapshot_download(repo_id=QUEUE_REPO, local_dir=EVAL_REQUESTS_PATH, repo_type="dataset", token=TOKEN)
    snapshot_download(repo_id=RESULTS_REPO, local_dir=EVAL_RESULTS_PATH, repo_type="dataset", token=TOKEN)
    print("Downloads complete.")
except Exception as e:
    print(f"An error occurred during download: {e}")
    restart_space()

print("Loading all dataframes into a central dictionary...")
start_time = time.time()
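
# Build one leaderboard dataframe per evaluation track; the first element of each returned
# tuple is unused here. All tables are kept in memory in the ALL_DATASETS dict below so the
# UI callbacks can filter them without re-reading the result files.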
_, harness_datasets_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, DATASET_COLS, DATASET_BENCHMARK_COLS, "accuracy", "datasets")
_, open_ended_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OPEN_ENDED_COLS, OPEN_ENDED_BENCHMARK_COLS, "score", "open_ended")
_, med_safety_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MED_SAFETY_COLS, MED_SAFETY_BENCHMARK_COLS, "score", "med_safety")
_, medical_summarization_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, MEDICAL_SUMMARIZATION_COLS, MEDICAL_SUMMARIZATION_BENCHMARK_COLS, "score", "medical_summarization")
_, aci_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ACI_COLS, ACI_BENCHMARK_COLS, "score", "aci")
_, soap_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, SOAP_COLS, SOAP_BENCHMARK_COLS, "score", "soap")
_, healthbench_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, HEALTHBENCH_COLS, HEALTHBENCH_BENCHMARK_COLS, "score", "healthbench")
_, healthbench_hard_original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, HEALTHBENCH_HARD_COLS, HEALTHBENCH_HARD_BENCHMARK_COLS, "score", "healthbench_hard")
_, open_ended_arabic_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedArabic_COLS, OpenEndedArabic_BENCHMARK_COLS, "score", "open_ended_arabic")
_, open_ended_french_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedFrench_COLS, OpenEndedFrench_BENCHMARK_COLS, "score", "open_ended_french")
_, open_ended_portuguese_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedPortuguese_COLS, OpenEndedPortuguese_BENCHMARK_COLS, "score", "open_ended_portuguese")
_, open_ended_romanian_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedRomanian_COLS, OpenEndedRomanian_BENCHMARK_COLS, "score", "open_ended_romanian")
_, open_ended_greek_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedGreek_COLS, OpenEndedGreek_BENCHMARK_COLS, "score", "open_ended_greek")
_, open_ended_spanish_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, OpenEndedSpanish_COLS, OpenEndedSpanish_BENCHMARK_COLS, "score", "open_ended_spanish")
_, closed_ended_multilingual_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, ClosedEndedMultilingual_COLS, ClosedEndedMultilingual_BENCHMARK_COLS, "score", "closed_ended_multilingual")

ALL_DATASETS = {
    "datasets": harness_datasets_original_df,
    "open_ended": open_ended_original_df,
    "med_safety": med_safety_original_df,
    "medical_summarization": medical_summarization_original_df,
    "aci": aci_original_df,
    "soap": soap_original_df,
    "healthbench": healthbench_original_df,
    "healthbench_hard": healthbench_hard_original_df,
    "open_ended_arabic": open_ended_arabic_df,
    "open_ended_french": open_ended_french_df,
    "open_ended_portuguese": open_ended_portuguese_df,
    "open_ended_romanian": open_ended_romanian_df,
    "open_ended_greek": open_ended_greek_df,
    "open_ended_spanish": open_ended_spanish_df,
    "closed_ended_multilingual": closed_ended_multilingual_df,
}

end_time = time.time()
print(f"Dataframes loaded in {end_time - start_time:.2f} seconds.")

# Evaluation Queue DataFrames
(finished_eval_queue_df, running_eval_queue_df, pending_eval_queue_df) = get_evaluation_queue_df(EVAL_REQUESTS_PATH, EVAL_COLS)

# =====================================================================================
# 2. EFFICIENT FILTERING LOGIC
# =====================================================================================


def search_table(df: pd.DataFrame, query: str) -> pd.DataFrame:
    """Return the rows of `df` whose model name contains `query` (case-insensitive)."""
    return df[df[AutoEvalColumn.model.name].str.contains(query, case=False)]


def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
    """Apply a semicolon-separated list of search queries and deduplicate by model name."""
    final_df = []
    if query != "":
        queries = [q.strip() for q in query.split(";")]
        for _q in queries:
            _q = _q.strip()
            if _q != "":
                temp_filtered_df = search_table(filtered_df, _q)
                if len(temp_filtered_df) > 0:
                    final_df.append(temp_filtered_df)
        if len(final_df) > 0:
            filtered_df = pd.concat(final_df)
            filtered_df = filtered_df.drop_duplicates(subset=[AutoEvalColumn.model.name])
    return filtered_df


def filter_models(
    df: pd.DataFrame,
    type_query: list,
    domain_specific_query: list,
    size_query: list,
    precision_query: list,
    show_deleted: bool,
) -> pd.DataFrame:
    """Filter the leaderboard by model type, domain specificity, parameter count and precision."""
    filtered_df = df

    if type_query is not None:
        type_name = [t.split(" ")[1] for t in type_query]
        filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type.name].isin(type_name)]

    if domain_specific_query is not None:
        domain_specifics = []
        if "🏥 Clinical models" in domain_specific_query:
            domain_specifics.append(True)
        if "Generic models" in domain_specific_query:
            domain_specifics.append(False)
        filtered_df = filtered_df.loc[df[AutoEvalColumn.is_domain_specific.name].isin(domain_specifics)]

    if precision_query is not None:
        if AutoEvalColumn.precision.name in df.columns:
            filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]

    if size_query is not None:
        numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
        params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
        mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
        filtered_df = filtered_df.loc[mask]

    return filtered_df


def get_filtered_table(
    shown_columns: list,
    query: str,
    domain_specific_query: list,
    size_query: list,
    *,  # force subset_name to be a keyword-only argument
    subset_name: str,
):
    """Look up the dataframe for `subset_name` in ALL_DATASETS, apply the UI filters and
    return only the selected (plus always-visible) columns."""
    original_df = ALL_DATASETS[subset_name]
    type_query = None  # the model-type filter is not exposed in this UI
    filtered_df = filter_models(original_df, type_query, domain_specific_query, size_query, None, False)
    filtered_df = filter_queries(query, filtered_df)
    always_here_cols = [AutoEvalColumn.model.name]
    available_cols = [c for c in shown_columns if c in filtered_df.columns]
    final_df = filtered_df[always_here_cols + available_cols]
    del filtered_df
    gc.collect()
    return final_df

# =====================================================================================
# 3. REUSABLE UI CREATION FUNCTION
# =====================================================================================


def create_leaderboard_ui(subset_name: str, column_choices: list, default_columns: list):
    """Creates a full leaderboard UI block (search bar, filters and table) for a given subset."""
    with gr.Row():
        with gr.Column():
            with gr.Row():
                search_bar = gr.Textbox(
                    placeholder="🔍 Search for models...",
                    show_label=False,
                    elem_id=f"search-bar-{subset_name}",
                )
            with gr.Row():
                shown_columns = gr.CheckboxGroup(
                    choices=column_choices,
                    value=default_columns,
                    label="Select columns to show",
                    elem_id=f"column-select-{subset_name}",
                    interactive=True,
                )
        with gr.Column(min_width=320):
            filter_domain_specific = gr.CheckboxGroup(
                label="Domain Specificity",
                choices=["🏥 Clinical models", "Generic models"],
                value=["🏥 Clinical models", "Generic models"],
                interactive=True,
                elem_id=f"filter-domain-{subset_name}",
            )
            filter_columns_size = gr.CheckboxGroup(
                label="Model sizes (in billions of parameters)",
                choices=list(NUMERIC_INTERVALS.keys()),
                value=list(NUMERIC_INTERVALS.keys()),
                interactive=True,
                elem_id=f"filter-size-{subset_name}",
            )

    # Bind the subset name so the same callback can serve every leaderboard tab.
    update_fn = functools.partial(get_filtered_table, subset_name=subset_name)
    initial_df = update_fn(
        shown_columns=default_columns,
        query="",
        domain_specific_query=["🏥 Clinical models", "Generic models"],
        size_query=list(NUMERIC_INTERVALS.keys()),
    )

    leaderboard_table = gr.Dataframe(
        value=initial_df,
        headers=[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + default_columns,
        datatype=TYPES,
        elem_id=f"leaderboard-table-{subset_name}",
        interactive=False,
    )

    inputs = [shown_columns, search_bar, filter_domain_specific, filter_columns_size]
    # Attach listeners to all input components
    for component in inputs:
        if isinstance(component, gr.Textbox):
            component.submit(update_fn, inputs, leaderboard_table)
        else:
            component.change(update_fn, inputs, leaderboard_table)

    return leaderboard_table


# =====================================================================================
# 4. GRADIO DEMO UI (Main application layout)
# =====================================================================================

demo = gr.Blocks(css=custom_css)
with demo:
    gr.HTML(LOGO)
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        with gr.TabItem("🏅 Open Ended Evaluation", elem_id="llm-benchmark-tab-table", id=1):
            with gr.Tabs(elem_classes="tab-buttons6") as language_tabs:
                LANGUAGES = {
                    "🇺🇸 English": "open_ended",
                    "🇦🇪 Arabic": "open_ended_arabic",
                    "🇫🇷 French": "open_ended_french",
                    "🇪🇸 Spanish": "open_ended_spanish",
                    "🇵🇹 Portuguese": "open_ended_portuguese",
                    "🇷🇴 Romanian": "open_ended_romanian",
                    "🇬🇷 Greek": "open_ended_greek",
                }
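
                # One sub-tab per language: English answers are judged by Llama 3.1 70B Instruct,
                # all other languages by Qwen 2.5 72B Instruct (see the note rendered in each tab).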
                for idx, (label, subset) in enumerate(LANGUAGES.items()):
                    with gr.TabItem(label, elem_id=f"llm-benchmark-tab-open-{subset}", id=idx):
                        judge_text = (
                            "**Note:** Llama 3.1 70B Instruct has been used as judge for English."
                            if label == "🇺🇸 English"
                            else "**Note:** Qwen 2.5 72B Instruct has been used as judge for this language."
                        )
                        gr.Markdown(judge_text, elem_classes="markdown-text")
                        create_leaderboard_ui(
                            subset_name=subset,
                            column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)],
                            default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.open_ended_col)],
                        )
                        with gr.Accordion("💬 Generation templates", open=False):
                            with gr.Accordion("Response generation", open=False):
                                render_generation_templates(task="open_ended", generation_type="response_generation")
                            with gr.Accordion("Scoring Rubric", open=False):
                                render_generation_templates(task="open_ended", generation_type="scoring_rubric")

        with gr.TabItem("🏅 Medical Summarization", elem_id="llm-benchmark-tab-table", id=2):
            gr.Markdown(CROSS_EVALUATION_METRICS, elem_classes="markdown-text")
            create_leaderboard_ui(
                subset_name="medical_summarization",
                column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)],
                default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.medical_summarization_col)],
            )
            with gr.Accordion("💬 Generation templates", open=False):
                with gr.Accordion("Response generation", open=False):
                    render_generation_templates(task="medical_summarization", generation_type="response_generation")
                with gr.Accordion("Question generation", open=False):
                    render_generation_templates(task="ce", generation_type="question_generation")
                with gr.Accordion("Cross Examination", open=False):
                    render_generation_templates(task="ce", generation_type="cross_examination")

        with gr.TabItem("🏅 Note generation", elem_id="llm-benchmark-tab-table", id=3):
            gr.Markdown(NOTE_GENERATION_METRICS, elem_classes="markdown-text")
            with gr.Tabs(elem_classes="tab-buttons2"):
                with gr.TabItem("ACI Bench", id=0):
                    create_leaderboard_ui(
                        subset_name="aci",
                        column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)],
                        default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.aci_col)],
                    )
                with gr.TabItem("SOAP Notes", id=1):
                    create_leaderboard_ui(
                        subset_name="soap",
                        column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)],
                        default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.soap_col)],
                    )
            # Add accordions for this section if needed, similar to other tabs

        with gr.TabItem("🏅 HealthBench", elem_id="llm-benchmark-tab-table", id=4):
            gr.Markdown(HEALTHBENCH_METRICS, elem_classes="markdown-text")
            with gr.Tabs(elem_classes="tab-buttons2"):
                with gr.TabItem("HealthBench", id=0):
                    create_leaderboard_ui(
                        subset_name="healthbench",
                        column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_col)],
                        default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_col)],
                    )
                with gr.TabItem("HealthBench-Hard", id=1):
                    create_leaderboard_ui(
                        subset_name="healthbench_hard",
                        column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_hard_col)],
                        default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.healthbench_hard_col)],
                    )

        with gr.TabItem("🏅 Med Safety", elem_id="llm-benchmark-tab-table", id=5):
            create_leaderboard_ui(
                subset_name="med_safety",
                column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)],
                default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.med_safety_col)],
            )
            with gr.Accordion("💬 Generation templates", open=False):
                with gr.Accordion("Response generation", open=False):
                    render_generation_templates(task="med_safety", generation_type="response_generation")
                with gr.Accordion("Scoring Rubric", open=False):
                    render_generation_templates(task="med_safety", generation_type="scoring_rubric")

        with gr.TabItem("🏅 Closed Ended Evaluation", elem_id="llm-benchmark-tab-closed", id=6):
            with gr.Tabs(elem_classes="tab-buttons2"):
                with gr.TabItem("English", id=0):
                    create_leaderboard_ui(
                        subset_name="datasets",
                        column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)],
                        default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.dataset_task_col)],
                    )
                with gr.TabItem("🌍 Multilingual", id=1):
                    gr.Markdown("📊 **Dataset Information:** This tab uses the Global MMLU dataset, filtered to the medical subcategory (10.7% of the data).")
                    create_leaderboard_ui(
                        subset_name="closed_ended_multilingual",
                        column_choices=[c.name for c in fields(AutoEvalColumn) if not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_multilingual_col)],
                        default_columns=[c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden and not c.never_hidden and (c.invariant or c.closed_ended_multilingual_col)],
                    )

        with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=7):
            gr.Markdown(LLM_BENCHMARKS_TEXT_1, elem_classes="markdown-text")
            gr.HTML(FIVE_PILLAR_DIAGRAM)
            gr.Markdown(LLM_BENCHMARKS_TEXT_2, elem_classes="markdown-text")
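
        # Submission tab: shows the current evaluation queues and collects new evaluation
        # requests, which are forwarded to `add_new_eval` when the form is submitted.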
", elem_id="llm-benchmark-tab-table", id=8): with gr.Column(): gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text") with gr.Accordion(f"βœ… Finished Evaluations ({len(finished_eval_queue_df)})", open=False): gr.Dataframe(value=finished_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5) with gr.Accordion(f"πŸ”„ Running Evaluation Queue ({len(running_eval_queue_df)})", open=False): gr.Dataframe(value=running_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5) with gr.Accordion(f"⏳ Pending Evaluation Queue ({len(pending_eval_queue_df)})", open=False): gr.Dataframe(value=pending_eval_queue_df, headers=EVAL_COLS, datatype=EVAL_TYPES, row_count=5) with gr.Row(): gr.Markdown("# βœ‰οΈβœ¨ Submit your model here!", elem_classes="markdown-text") with gr.Row(): with gr.Column(): model_name_textbox = gr.Textbox(label="Model name") revision_name_textbox = gr.Textbox(label="Revision commit", placeholder="main") model_type = gr.Dropdown( choices=[t.to_str(" : ") for t in ModelType if t != ModelType.Unknown], label="Model type", multiselect=False, value=None, interactive=True, ) with gr.Column(): precision = gr.Dropdown( choices=[i.value.name for i in Precision if i != Precision.Unknown], label="Precision", multiselect=False, value="auto", interactive=True, ) weight_type = gr.Dropdown( choices=[i.value.name for i in WeightType], label="Weights type", multiselect=False, value=WeightType.Original.value.name, interactive=False, ) base_model_name_textbox = gr.Textbox(label="Base model (for delta or adapter weights)", interactive=False) with gr.Row(): domain_specific_toggle = gr.Checkbox( label="Domain specific", value=False, info="Is your model medically oriented?", ) chat_template_toggle = gr.Checkbox( label="Use chat template", value=False, info="Is your model a chat model?", ) submit_button = gr.Button("Submit Eval") submission_result = gr.Markdown() submit_button.click( add_new_eval, [ model_name_textbox, base_model_name_textbox, revision_name_textbox, model_type, domain_specific_toggle, chat_template_toggle, precision, weight_type ], submission_result, ) with gr.Row(): with gr.Accordion("πŸ“™ Citation", open=False): gr.Textbox( value=CITATION_BUTTON_TEXT, label=CITATION_BUTTON_LABEL, lines=20, elem_id="citation-button", show_copy_button=True, ) scheduler = BackgroundScheduler() scheduler.add_job(restart_space, "interval", seconds=86400) scheduler.start() demo.queue(default_concurrency_limit=40).launch(allowed_paths=['./assets/'], share=True , ssr_mode=False)