import gradio as gr
from gradio_rangeslider import RangeSlider
import pandas as pd
import argparse
from apscheduler.schedulers.background import BackgroundScheduler
from huggingface_hub import snapshot_download

from src.data_utils import get_dataframe_category, get_dataframe_language
import src.config as configs
from src.display.formatting import get_display_model_name
from utils import start_watchdog_in_background
import time

# Parse command line arguments at the top level
parser = argparse.ArgumentParser()
parser.add_argument("--ip", type=str, default="127.0.0.1", help="IP address to launch the app on")
parser.add_argument("--port", type=int, default=7860, help="Port to launch the app on")
parser.add_argument("--mode", default="open", choices=["open"])
args = parser.parse_args()

from utils import get_profile_and_organizations, download_with_restart
from vis_utils import (
    load_leaderboard_data,
    create_domain_radar_chart,
    create_len_overall_scatter,
    load_leaderboard_language_data,
    create_language_radar_chart,
)
from src.about import (
    CITATION_BUTTON_LABEL,
    CITATION_BUTTON_TEXT,
    EVALUATION_QUEUE_TEXT,
    EVALUATION_QUEUE_TEXT_OPTION1,
    INTRODUCTION_TEXT,
    BANNER,
    TITLE,
    LINK,
)
from src.display.css_html_js import custom_css
from src.display.utils import Precision
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.submission.submit import add_new_eval_option
from ui import create_leaderboard_tab

if args.mode == "open":
    def restart_space():
        API.restart_space(repo_id=REPO_ID)

    ### Space initialisation
    # download_with_restart (see utils) presumably retries the snapshot and
    # restarts the Space via restart_func on failure.
    download_with_restart(
        snapshot_download,
        repo_id=QUEUE_REPO,
        local_dir=EVAL_REQUESTS_PATH,
        repo_type="dataset",
        token=TOKEN,
        restart_func=restart_space,
    )
    download_with_restart(
        snapshot_download,
        repo_id=RESULTS_REPO,
        local_dir=EVAL_RESULTS_PATH,
        repo_type="dataset",
        token=TOKEN,
        restart_func=restart_space,
    )

theme = gr.themes.Default(
    primary_hue="gray",
    neutral_hue="gray",
)


def create_benchmark_tab_content(data_prefix: str):
    gr.HTML(INTRODUCTION_TEXT)
    gr.HTML("""
    <h2>Category Analysis</h2>
    <p>TRUEBench consists of 10 categories and 46 sub-categories that are highly related to productivity assistants.</p>
    """)

    # --- Category Explanation Box (2x5 grid, emoji, desc from about.py) ---
    from src.about import CATEGORY_DESCRIPTIONS
    gr.HTML(f"""
    <div style="display: grid; grid-template-columns: repeat(5, 1fr); gap: 8px;">
        <div><b>📝 Content Generation</b><br>{CATEGORY_DESCRIPTIONS["Content Generation"]}</div>
        <div><b>✂️ Editing</b><br>{CATEGORY_DESCRIPTIONS["Editing"]}</div>
        <div><b>📊 Data Analysis</b><br>{CATEGORY_DESCRIPTIONS["Data Analysis"]}</div>
        <div><b>🧠 Reasoning</b><br>{CATEGORY_DESCRIPTIONS["Reasoning"]}</div>
        <div><b>🦄 Hallucination</b><br>{CATEGORY_DESCRIPTIONS["Hallucination"]}</div>
        <div><b>🛡️ Safety</b><br>{CATEGORY_DESCRIPTIONS["Safety"]}</div>
        <div><b>🔁 Repetition</b><br>{CATEGORY_DESCRIPTIONS["Repetition"]}</div>
        <div><b>📝 Summarization</b><br>{CATEGORY_DESCRIPTIONS["Summarization"]}</div>
        <div><b>🌐 Translation</b><br>{CATEGORY_DESCRIPTIONS["Translation"]}</div>
        <div><b>💬 Multi-Turn</b><br>{CATEGORY_DESCRIPTIONS["Multi-Turn"]}</div>
    </div>
    """)

    df_cat = get_dataframe_category(data_prefix=data_prefix)
    gr.HTML("")  # decorative spacer

    leaderboard_tab_cat = create_leaderboard_tab(
        df_cat,
        "Category",
        mode=args.mode,
    )
    gr.HTML("")  # decorative spacer

    # --- Category Radar Chart Section ---
    from vis_utils import load_leaderboard_data, create_domain_radar_chart
    initial_df_cat = load_leaderboard_data(data_prefix=data_prefix)
    # Top 5 models based on leaderboard (Average Accuracy)
    if "Overall" in initial_df_cat.columns:
        top5_models_cat = initial_df_cat.sort_values("Overall", ascending=False)['Model Name'].tolist()[:5]
    else:
        top5_models_cat = initial_df_cat['Model Name'].tolist()[:5]
    gr.HTML("")  # decorative spacer

    # Radar chart model selector (up to 5)
    display_names_cat = initial_df_cat['Model Name'].apply(get_display_model_name).tolist()
    original_names_cat = initial_df_cat['Model Name'].tolist()
    display_to_original_cat = dict(zip(display_names_cat, original_names_cat))
    top5_display_names_cat = [get_display_model_name(m) for m in top5_models_cat]
    model_selector_cat = gr.Dropdown(
        choices=display_names_cat,
        value=top5_display_names_cat,
        multiselect=True,
        label="🎯 Select Models for Radar Chart",
        info="Choose up to 5 models to visualize",
        elem_classes=["dropdown", "custom-dropdown"],
        interactive=True,
        filterable=True,
        allow_custom_value=False,
    )
    gr.HTML("")  # (styles handled in custom_css)
    radar_chart_cat = gr.Plot(
        label="",
        value=create_domain_radar_chart(
            initial_df_cat,
            "Average Accuracy",
            top5_models_cat,
            mode=args.mode,
        ),
        elem_classes=["radar-chart", "plot-container"],
    )
    gr.HTML("")  # decorative spacer

    # --- Speed Med Bar Plot Section (NEW) ---
    import json
    with open(f"src/data/{data_prefix}/time_data.json", "r") as f:
        time_data = json.load(f)
    time_data_state = gr.State(value=time_data)
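    # Assumption about the file schema: time_data.json holds the per-model
    # speed measurements (tokens/s and GPU count) summarized in the section
    # header below; create_speed_med_bar_plot consumes it as-is.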

    gr.HTML("""
    <h2>Speed per GPU</h2>
    <p><b>Speed per GPU</b> is the number of tokens generated per second, divided by the number of GPUs used during inference.</p>
    <p><b>Setting:</b> We measured speed in an H100 GPU environment consisting of 4 nodes with 8 GPUs each, using vLLM and Ray, with the tensor parallel size set between 1 and 32 (in the plot, GPU refers to the tensor parallel size).
    We performed inference by sending asynchronous requests to the served model, with concurrency set to 32.</p>
    <p><b>Note:</b> We measured speed by serving open-source models directly; proprietary models are excluded from the plot.</p>
    """)

    # --- Speed Bar Plot UI: Row with left (sort selector) and right (min/max range sliders) ---
    category_columns = [col for col in configs.ON_LOAD_COLUMNS_CATEGORY if col not in configs.CATEGORY_EXCLUDED_COLUMNS]
    default_category = "Overall"
    default_x_axis_sort_by = "Overall Score"
    with gr.Row():
        with gr.Column(scale=1):
            x_axis_sort_by = gr.Radio(
                choices=["Overall Score", "Speed per GPU"],
                value="Overall Score",
                label="Sort X-Axis by",
                elem_id=f"x-axis-btn-radio-{data_prefix.replace('/', '')}",  # Make elem_id unique
                elem_classes=["x-axis-btn-radio"],
                interactive=True,
                show_label=True,
            )
        with gr.Column(scale=1):
            min_max_score_slider = RangeSlider(
                minimum=0,
                maximum=100,
                value=(0, 100),
                step=1,
                label="Minimum and Maximum Overall Score",
                interactive=True,
            )
        with gr.Column(scale=1):
            min_max_param_size_slider = RangeSlider(
                minimum=0,
                maximum=1000,
                value=(0, 1000),
                step=1,
                label="Minimum and Maximum Parameter Size (B)",
                interactive=True,
            )

    # Speed Bar Plot
    from vis_utils import create_speed_med_bar_plot
    speed_med_bar_plot = gr.Plot(
        label="",
        value=create_speed_med_bar_plot(
            initial_df_cat,
            time_data,
            min_size=0,
            max_size=1000,
            min_score=0,
            max_score=100,
            category=default_category,
            theme="light",
            x_axis_sort_by=default_x_axis_sort_by,
            mode=args.mode,
        ),
        elem_classes=["speed-med-bar-plot", "plot-container"],
    )
    gr.HTML("")  # decorative spacer
") # --- Event handler: update Speed bar plot and dials when category or dials change --- def update_speed_med_bar_plot(x_axis_sort_by, min_max_size, min_max_score, current_time_data_state, current_leaderboard_df=None): df = current_leaderboard_df if current_leaderboard_df is not None else initial_df_cat return create_speed_med_bar_plot( df, current_time_data_state, min_size=min_max_size[0], max_size=min_max_size[1], min_score=min_max_score[0], max_score=min_max_score[1], theme="light", x_axis_sort_by=x_axis_sort_by, mode=args.mode ) # Connect category selector to dials and plot x_axis_sort_by.change( fn=update_speed_med_bar_plot, inputs=[x_axis_sort_by, min_max_param_size_slider, min_max_score_slider, time_data_state], outputs=speed_med_bar_plot ) min_max_param_size_slider.change( fn=update_speed_med_bar_plot, inputs=[x_axis_sort_by, min_max_param_size_slider, min_max_score_slider, time_data_state], outputs=speed_med_bar_plot ) min_max_score_slider.change( fn=update_speed_med_bar_plot, inputs=[x_axis_sort_by, min_max_param_size_slider, min_max_score_slider, time_data_state], outputs=speed_med_bar_plot ) # Connect leaderboard filters to dials and plot (if leaderboard_tab_cat provides a filtered DataFrame state) if "df_state" in leaderboard_tab_cat: leaderboard_tab_cat["df_state"].change( fn=lambda df, x_axis_sort_by, min_max_size, min_max_score, time_data: update_speed_med_bar_plot(x_axis_sort_by, min_max_size, min_max_score, time_data, df), inputs=[leaderboard_tab_cat["df_state"], x_axis_sort_by, min_max_param_size_slider, min_max_score_slider, time_data_state], outputs=speed_med_bar_plot ) # Update radar chart when model_selector_cat selection changes def update_radar_chart_cat(selected_display_names): # If no selection, fallback to top-5 if not selected_display_names or len(selected_display_names) == 0: df = load_leaderboard_data(data_prefix=data_prefix) selected_display_names = [get_display_model_name(m) for m in df['Model Name'].tolist()[:5]] selected_models = [display_to_original_cat[name] for name in selected_display_names if name in display_to_original_cat] return create_domain_radar_chart( load_leaderboard_data(data_prefix=data_prefix), "Average Accuracy", selected_models, mode=args.mode ) model_selector_cat.change( fn=update_radar_chart_cat, inputs=model_selector_cat, outputs=radar_chart_cat ) # --- Med. Len. vs Overall Scatter Plot Section --- from vis_utils import create_len_overall_scatter import json with open(f"src/data/{data_prefix}/length_data.json", "r") as f: length_data = json.load(f) # --- Create a Gradio State component to hold length_data --- length_data_state = gr.State(value=length_data) gr.HTML("""

    gr.HTML("""
    <h2>Output Length vs. Category Score</h2>
    <p>Explore the relationship between median output length and model performance by category.</p>
    <p><b>Median Length:</b> median number of tokens, including both Think and Answer<br>
    <b>Median Response Length:</b> median number of answer tokens, excluding Think</p>
    <p><b>Note:</b> We measured token lengths for open-source models only; proprietary models are excluded from the plot.</p>
    """)

    # Category selection buttons (HTML + Gradio Radio for event)
    category_columns = [col for col in configs.ON_LOAD_COLUMNS_CATEGORY if col not in configs.CATEGORY_EXCLUDED_COLUMNS]
    # (cat-btn-radio related style block removed, now handled in custom_css)
    category_selector = gr.Radio(
        choices=category_columns,
        value="Overall",
        label="Select Category for Y-Axis",
        elem_id=f"cat-btn-radio-{data_prefix.replace('/', '')}",  # Make elem_id unique
        elem_classes=["cat-btn-radio"],
        interactive=True,
        show_label=True,
    )
    x_axis_selector = gr.Radio(
        choices=["Median Length", "Median Response Length"],
        value="Median Length",
        label="Select X-Axis Data",
        # Make elem_id unique (distinct from the speed-plot sort radio above,
        # which would otherwise share the same id on this page)
        elem_id=f"len-x-axis-btn-radio-{data_prefix.replace('/', '')}",
        elem_classes=["x-axis-btn-radio"],
        interactive=True,
        show_label=True,
    )
    gr.HTML("")  # decorative spacer
    scatter_plot_cat = gr.Plot(
        label="",
        value=create_len_overall_scatter(
            load_leaderboard_data(data_prefix=data_prefix),
            y_col="Overall",
            length_data=length_data,
            x_axis_data_source=x_axis_selector.value,
        ),
        elem_classes=["efficiency-chart", "plot-container"],
    )
    gr.HTML("")  # decorative spacer
    gr.HTML("")  # decorative spacer

    # Update plot when category or x-axis selection changes
    def update_scatter_plot_cat(selected_category, selected_x_source, current_length_data_state):
        return create_len_overall_scatter(
            load_leaderboard_data(data_prefix=data_prefix),
            y_col=selected_category,
            length_data=current_length_data_state,
            x_axis_data_source=selected_x_source,
        )

    category_selector.change(
        fn=update_scatter_plot_cat,
        inputs=[category_selector, x_axis_selector, length_data_state],
        outputs=scatter_plot_cat,
    )
    x_axis_selector.change(
        fn=update_scatter_plot_cat,
        inputs=[category_selector, x_axis_selector, length_data_state],
        outputs=scatter_plot_cat,
    )

    # When leaderboard selectors change, synchronize model_selector_cat and radar_chart_cat to the top-5.
    # Assumption: unified_filter returns a 3-tuple whose last element is the
    # filtered model ranking; only its top-5 slice is used here.
    def update_model_selector_and_radar_chart_cat_from_leaderboard(types, model_types, thinks, df, sort_col):
        _, _, top5_models = leaderboard_tab_cat["unified_filter"](types, model_types, thinks, df, sort_col)
        top5_display_names = [get_display_model_name(m) for m in top5_models[:5]]
        return gr.update(value=top5_display_names), create_domain_radar_chart(
            load_leaderboard_data(data_prefix=data_prefix),
            "Average Accuracy",
            top5_models[:5],
            mode=args.mode,
        )

    leaderboard_selectors_cat = [
        leaderboard_tab_cat["type_selector"],
        leaderboard_tab_cat["model_type_selector"],
        leaderboard_tab_cat["think_selector"],
        leaderboard_tab_cat["df_state"],
        leaderboard_tab_cat["sort_col_dropdown"],
    ]
    for selector in leaderboard_selectors_cat:
        selector.change(
            fn=update_model_selector_and_radar_chart_cat_from_leaderboard,
            inputs=leaderboard_selectors_cat,
            outputs=[model_selector_cat, radar_chart_cat],
        )

    gr.HTML("""
    <h2>Language Analysis</h2>
    <p>As a multilingual benchmark, TRUEBench supports a total of 12 user input languages: Korean (KO), English (EN), Japanese (JA), Chinese (ZH), Polish (PL), German (DE), Portuguese (PT), Spanish (ES), French (FR), Italian (IT), Russian (RU), and Vietnamese (VI).</p>
    """)

    df_lang = get_dataframe_language(data_prefix=data_prefix)
    leaderboard_tab_lang = create_leaderboard_tab(
        df_lang,
        "Language",
        mode=args.mode,
    )

    # --- Language Radar Chart Section ---
    from vis_utils import load_leaderboard_language_data, create_language_radar_chart
    initial_df_lang = load_leaderboard_language_data(data_prefix=data_prefix)
    # Top 5 models based on leaderboard (Overall)
    if "Overall" in initial_df_lang.columns:
        top5_models_lang = initial_df_lang.sort_values("Overall", ascending=False)['Model Name'].tolist()[:5]
    else:
        top5_models_lang = initial_df_lang['Model Name'].tolist()[:5]
    gr.HTML("")  # decorative spacer
    # Add model selector
    display_names_lang = initial_df_lang['Model Name'].apply(get_display_model_name).tolist()
    original_names_lang = initial_df_lang['Model Name'].tolist()
    display_to_original_lang = dict(zip(display_names_lang, original_names_lang))
    top5_display_names_lang = [get_display_model_name(m) for m in top5_models_lang]
    model_selector_lang = gr.Dropdown(
        choices=display_names_lang,
        value=top5_display_names_lang,
        multiselect=True,
        label="🎯 Select Models for Radar Chart",
        info="Choose up to 5 models to visualize",
        elem_classes=["dropdown", "custom-dropdown"],
        interactive=True,
        filterable=True,
        allow_custom_value=False,
    )
    gr.HTML("")  # (styles handled in custom_css)
    radar_chart_lang = gr.Plot(
        label="",
        value=create_language_radar_chart(
            initial_df_lang,
            "Average Accuracy",
            top5_models_lang,
            mode=args.mode,
        ),
        elem_classes=["radar-chart", "plot-container"],
    )
    gr.HTML("")  # decorative spacer

    # Update radar chart when model_selector_lang selection changes
    def update_radar_chart_lang(selected_display_names):
        if not selected_display_names:
            df = load_leaderboard_language_data(data_prefix=data_prefix)
            selected_display_names = [get_display_model_name(m) for m in df['Model Name'].tolist()[:5]]
        selected_models = [display_to_original_lang[name] for name in selected_display_names if name in display_to_original_lang]
        return create_language_radar_chart(
            load_leaderboard_language_data(data_prefix=data_prefix),
            "Average Accuracy",
            selected_models,
            mode=args.mode,
        )

    model_selector_lang.change(
        fn=update_radar_chart_lang,
        inputs=model_selector_lang,
        outputs=radar_chart_lang,
    )

    # When leaderboard selectors change, automatically synchronize model_selector_lang and radar_chart_lang to the top-5
    def update_model_selector_and_radar_chart_lang_from_leaderboard(types, model_types, thinks, df, sort_col):
        _, _, top5_models = leaderboard_tab_lang["unified_filter"](types, model_types, thinks, df, sort_col)
        top5_display_names = [get_display_model_name(m) for m in top5_models[:5]]
        return gr.update(value=top5_display_names), create_language_radar_chart(
            load_leaderboard_language_data(data_prefix=data_prefix),
            "Average Accuracy",
            top5_models[:5],
            mode=args.mode,
        )

    leaderboard_selectors_lang = [
        leaderboard_tab_lang["type_selector"],
        leaderboard_tab_lang["model_type_selector"],
        leaderboard_tab_lang["think_selector"],
        leaderboard_tab_lang["df_state"],
        leaderboard_tab_lang["sort_col_dropdown"],
    ]
    for selector in leaderboard_selectors_lang:
        selector.change(
            fn=update_model_selector_and_radar_chart_lang_from_leaderboard,
            inputs=leaderboard_selectors_lang,
            outputs=[model_selector_lang, radar_chart_lang],
        )


demo = gr.Blocks(css=custom_css, theme=theme)
with demo:
    gr.HTML(BANNER + TITLE + LINK)
    user_state = gr.State()
    organization_state = gr.State()
    with gr.Tabs(elem_classes="tab-buttons") as main_tabs:
        if args.mode == "open":
            tab_configurations = [
                {"data_prefix": "open/", "tab_name": "TRUEBench", "tab_id": 2},
            ]
        else:
            tab_configurations = [
                {"data_prefix": f"{args.mode}-public/", "tab_name": "TRUEBench (public set)", "tab_id": 2},
                {"data_prefix": f"{args.mode}-full/", "tab_name": "TRUEBench (private set)", "tab_id": 3},
            ]
        for config in tab_configurations:
            with gr.TabItem(config["tab_name"], elem_id="llm-benchmark-tab-table", id=config["tab_id"]):
                create_benchmark_tab_content(data_prefix=config["data_prefix"])
", elem_id="llm-benchmark-tab-table", id=4): with gr.Column(): with gr.Row(): gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text") with gr.Row(): gr.Markdown(EVALUATION_QUEUE_TEXT_OPTION1, elem_classes="markdown-text") with gr.Row(): gr.Markdown("## ✉️ Submit your model here!", elem_classes="markdown-text") if args.mode == "open": login_button = gr.LoginButton() with gr.Row(): with gr.Column(): contact_email = gr.Textbox(label="Contact Email", placeholder="Your email address", interactive=True) model_name_textbox = gr.Textbox(label="Model Name") model_type_dropdown = gr.Dropdown( choices=["Instruct", "Think", "Hybrid"], label="Model Type (Instruct, Think, or Hybrid)", multiselect=False, value="Instruct", interactive=True, ) think_type_dropdown = gr.Dropdown( choices=["On", "Off"], label="Think Mode (On/Off)", multiselect=False, value="Off", interactive=False, ) precision = gr.Dropdown( choices=[i.value.name for i in Precision if i != Precision.Unknown], label="Precision", multiselect=False, value="float16", interactive=True, ) # --- Dynamically control think_type based on model_type and connect event --- def update_think_type(model_type_value): if model_type_value == "Instruct": return gr.update(value="Off", interactive=False) elif model_type_value == "Think": return gr.update(value="On", interactive=False) else: # Hybrid return gr.update(value="On", interactive=True) model_type_dropdown.change( fn=update_think_type, inputs=model_type_dropdown, outputs=think_type_dropdown ) response_prefix_textbox = gr.Textbox(label="Response prefix", placeholder="(e.g., )") with gr.Column(): yml_textbox_placeholder = """# vLLM serving parameters # Refence: https://docs.vllm.ai/en/latest/cli/serve.html llm_serve_args: max_model_len: tensor_parallel_size: dtype: ... # OpenAI-compatible API (chat completion) # Reference: https://platform.openai.com/docs/api-reference/chat sampling_params: top_p: temperature: presence_penalty: ... # vLLM sampling parameters # Reference: https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#chat-api_1 extra_body: chat_template_kwargs: enable_thinking: ... top_k: repetition_penalty: ...""" yml_textbox = gr.Textbox( label="Configuration (YAML format)", elem_id="yml-textbox", lines=7, value=yml_textbox_placeholder ) upbox = gr.File( label="Upload configuration file as .yml or .yaml", file_types=[".yml", ".yaml"], type="filepath", height=150 ) # Add Translate to JSON button below upbox translate_button = gr.Button( "Translate to JSON", elem_id="translate-to-json-btn", elem_classes=["translate-btn"], scale=None ) # Add custom style for the button gr.HTML( ''' ''' ) with gr.Column(): requirements_textbox = gr.Textbox(label="(Optional) Requirements", lines=30, elem_id="requirements-textbox") output_dict = gr.Code(label="Translated Python Dictionary", language="json") submit_button = gr.Button("Submit Eval") submission_result = gr.Markdown() def parse_and_display_yaml_config(upbox_path, yml_textbox_value): import yaml, json if upbox_path: try: with open(upbox_path, "r", encoding="utf-8") as f: data = yaml.safe_load(f) if data is None: return "YAML file is empty." return json.dumps(data, indent=4, ensure_ascii=False) except Exception as e: return f"Error parsing YAML file: {e}" elif yml_textbox_value and yml_textbox_value.strip(): try: data = yaml.safe_load(yml_textbox_value) if data is None: return "YAML textbox is empty or invalid." 
            if args.mode == "open":
                event = submit_button.click(get_profile_and_organizations, inputs=[], outputs=[user_state, organization_state])
                event.then(
                    add_new_eval_option,
                    [
                        contact_email,
                        model_name_textbox,
                        model_type_dropdown,
                        think_type_dropdown,
                        precision,
                        response_prefix_textbox,
                        requirements_textbox,
                        user_state,
                        organization_state,
                        yml_textbox,
                        upbox,
                    ],
                    submission_result,
                ).then(
                    fn=parse_and_display_yaml_config,
                    inputs=[upbox, yml_textbox],
                    outputs=output_dict,
                )
            translate_button.click(
                fn=parse_and_display_yaml_config,
                inputs=[upbox, yml_textbox],
                outputs=output_dict,
            )

    with gr.Row():
        with gr.Accordion("📙 Citation", open=False):
            citation_button = gr.Textbox(
                value=CITATION_BUTTON_TEXT,
                label=CITATION_BUTTON_LABEL,
                lines=9,
                elem_id="citation-button",
                show_copy_button=True,
            )

    if args.mode == "open":
        def health_fn() -> str:
            try:
                initial_df_cat_ = load_leaderboard_data()
                if "Overall" in initial_df_cat_.columns:
                    return "ok"
                else:
                    return "error"
            except Exception:
                return "error"

        gr.api(health_fn, api_name="health")

demo.load(fn=start_watchdog_in_background, inputs=None, outputs=None, queue=False)

if args.mode == "open":
    demo.queue(default_concurrency_limit=40).launch(prevent_thread_lock=True)
    # Keep the main thread alive; the Space is served in the background.
    while True:
        time.sleep(600)
else:
    if __name__ == "__main__":
        demo.queue(default_concurrency_limit=40).launch(server_name=args.ip, server_port=args.port)
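# Typical local invocation (assuming this file is named app.py, which is a
# guess based on the Gradio/Spaces layout; all argument values shown are the
# defaults defined above):
#   python app.py --ip 127.0.0.1 --port 7860 --mode open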