import gradio as gr
from gradio_rangeslider import RangeSlider
import argparse
from huggingface_hub import snapshot_download
from src.data_utils import get_dataframe_category, get_dataframe_language
import src.config as configs
from src.display.formatting import get_display_model_name
from utils import start_watchdog_in_background
import time
# Parse command line arguments at the top level
parser = argparse.ArgumentParser()
parser.add_argument("--ip", type=str, default="127.0.0.1", help="IP address to launch the app on")
parser.add_argument("--port", type=int, default=7860, help="Port to launch the app on")
parser.add_argument("--mode", default="open", choices=["open"])
args = parser.parse_args()
from utils import get_profile_and_organizations, download_with_restart
from vis_utils import load_leaderboard_data, create_domain_radar_chart, create_len_overall_scatter, load_leaderboard_language_data, create_language_radar_chart
from src.about import (
CITATION_BUTTON_LABEL,
CITATION_BUTTON_TEXT,
EVALUATION_QUEUE_TEXT,
EVALUATION_QUEUE_TEXT_OPTION1,
INTRODUCTION_TEXT,
BANNER,
TITLE,
LINK,
)
from src.display.css_html_js import custom_css
from src.display.utils import Precision
from src.envs import API, EVAL_REQUESTS_PATH, EVAL_RESULTS_PATH, QUEUE_REPO, REPO_ID, RESULTS_REPO, TOKEN
from src.submission.submit import add_new_eval_option
from ui import create_leaderboard_tab
if args.mode == "open":
def restart_space():
API.restart_space(repo_id=REPO_ID)
### Space initialisation
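# download_with_restart wraps snapshot_download and, if the download keeps
# failing, calls restart_func to restart the Space (see utils.py).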
download_with_restart(
snapshot_download,
repo_id=QUEUE_REPO,
local_dir=EVAL_REQUESTS_PATH,
repo_type="dataset",
token=TOKEN,
restart_func=restart_space
)
download_with_restart(
snapshot_download,
repo_id=RESULTS_REPO,
local_dir=EVAL_RESULTS_PATH,
repo_type="dataset",
token=TOKEN,
restart_func=restart_space
)
theme = gr.themes.Default(
primary_hue="gray",
neutral_hue="gray"
)
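# Builds the full contents of one benchmark tab: the category leaderboard,
# its radar chart, the speed-per-GPU bar plot, the length-vs-score scatter
# plot, and the per-language leaderboard with its own radar chart.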
def create_benchmark_tab_content(data_prefix: str):
gr.HTML(INTRODUCTION_TEXT)
gr.HTML("""
TRUEBench consists of 10 categories and 46 sub-categories that are highly related to productivity assistants.
""")
# --- Category Explanation Box (2x5 grid, emoji, desc from about.py) ---
from src.about import CATEGORY_DESCRIPTIONS
gr.HTML(f"""
📝 Content Generation: {CATEGORY_DESCRIPTIONS["Content Generation"]}<br>
✂️ Editing: {CATEGORY_DESCRIPTIONS["Editing"]}<br>
📊 Data Analysis: {CATEGORY_DESCRIPTIONS["Data Analysis"]}<br>
🧠 Reasoning: {CATEGORY_DESCRIPTIONS["Reasoning"]}<br>
🦄 Hallucination: {CATEGORY_DESCRIPTIONS["Hallucination"]}<br>
🛡️ Safety: {CATEGORY_DESCRIPTIONS["Safety"]}<br>
🔁 Repetition: {CATEGORY_DESCRIPTIONS["Repetition"]}<br>
📝 Summarization: {CATEGORY_DESCRIPTIONS["Summarization"]}<br>
🌐 Translation: {CATEGORY_DESCRIPTIONS["Translation"]}<br>
💬 Multi-Turn: {CATEGORY_DESCRIPTIONS["Multi-Turn"]}
""")
df_cat = get_dataframe_category(data_prefix=data_prefix)
gr.HTML("""
""")
leaderboard_tab_cat = create_leaderboard_tab(
df_cat,
"Category",
mode=args.mode
)
gr.HTML("
")
# --- Category Radar Chart Section ---
initial_df_cat = load_leaderboard_data(data_prefix=data_prefix)
# Top 5 models based on leaderboard (Average Accuracy)
if "Overall" in initial_df_cat.columns:
top5_models_cat = initial_df_cat.sort_values("Overall", ascending=False)['Model Name'].tolist()[:5]
else:
top5_models_cat = initial_df_cat['Model Name'].tolist()[:5]
gr.HTML('')
# Radar chart model selector (up to 5)
display_names_cat = initial_df_cat['Model Name'].apply(get_display_model_name).tolist()
original_names_cat = initial_df_cat['Model Name'].tolist()
display_to_original_cat = dict(zip(display_names_cat, original_names_cat))
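# Map display names (shown in the dropdown) back to the original model names
# expected by the chart functions.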
top5_display_names_cat = [get_display_model_name(m) for m in top5_models_cat]
model_selector_cat = gr.Dropdown(
choices=display_names_cat,
value=top5_display_names_cat,
multiselect=True,
max_choices=5,
label="🎯 Select Models for Radar Chart",
info="Choose up to 5 models to visualize",
elem_classes=["dropdown", "custom-dropdown"],
interactive=True,
filterable=True,
allow_custom_value=False
)
gr.HTML("""
""")
radar_chart_cat = gr.Plot(
label="",
value=create_domain_radar_chart(
initial_df_cat,
"Average Accuracy",
top5_models_cat,
mode=args.mode
),
elem_classes=["radar-chart", "plot-container"]
)
gr.HTML('')
# --- Speed Med Bar Plot Section (NEW) ---
import json
with open(f"src/data/{data_prefix}/time_data.json", "r") as f:
time_data = json.load(f)
time_data_state = gr.State(value=time_data)
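# Keep the parsed timing data in per-session state so event handlers can
# reuse it without re-reading the JSON file.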
gr.HTML("""
Speed per GPU is the number of tokens generated per second divided by the number of GPUs used during inference.<br>
Setting: We measured speed in an H100 GPU environment of 4 nodes with 8 GPUs each, using vLLM and Ray with tensor parallel sizes between 1 and 32 (in the plot, GPU refers to the tensor parallel size).<br>
We performed inference by sending asynchronous requests to the served model, with the concurrency set to 32.<br>
Note: We measured speed by serving open-source models directly; proprietary models are excluded from the plot.
""")
# --- Speed Bar Plot UI: Row with left (category selector) and right (min/max dials) ---
category_columns = [col for col in configs.ON_LOAD_COLUMNS_CATEGORY if col not in configs.CATEGORY_EXCLUDED_COLUMNS]
default_category = "Overall"
default_x_axis_sort_by = "Overall Score"
with gr.Row():
with gr.Column(scale=1):
x_axis_sort_by = gr.Radio(
choices=["Overall Score", "Speed per GPU"],
value="Overall Score",
label="Sort X-Axis by",
elem_id=f"x-axis-btn-radio-{data_prefix.replace('/', '')}", # Make elem_id unique
elem_classes=["x-axis-btn-radio"],
interactive=True,
show_label=True
)
with gr.Column(scale=1):
min_max_score_slider = RangeSlider(
minimum=0,
maximum=100,
value=(0, 100),
step=1,
label="Minimum and Maximum Overall Score",
interactive=True
)
with gr.Column(scale=1):
min_max_param_size_slider = RangeSlider(
minimum=0,
maximum=1000,
value=(0, 1000),
step=1,
label="Minimum and Maximum Parameter Size (B)",
interactive=True
)
# Speed Bar Plot
from vis_utils import create_speed_med_bar_plot
speed_med_bar_plot = gr.Plot(
label="",
value=create_speed_med_bar_plot(
initial_df_cat,
time_data,
min_size=0,
max_size=1000,
min_score=0,
max_score=100,
category=default_category,
theme="light",
x_axis_sort_by=default_x_axis_sort_by,
mode=args.mode
),
elem_classes=["speed-med-bar-plot", "plot-container"]
)
gr.HTML("
")
# --- Event handler: update Speed bar plot and dials when category or dials change ---
def update_speed_med_bar_plot(x_axis_sort_by, min_max_size, min_max_score, current_time_data_state, current_leaderboard_df=None):
df = current_leaderboard_df if current_leaderboard_df is not None else initial_df_cat
return create_speed_med_bar_plot(
df,
current_time_data_state,
min_size=min_max_size[0],
max_size=min_max_size[1],
min_score=min_max_score[0],
max_score=min_max_score[1],
theme="light",
x_axis_sort_by=x_axis_sort_by,
mode=args.mode
)
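# Note: `category` is not passed here; create_speed_med_bar_plot is assumed to
# fall back to its default category (the initial plot above passes it explicitly).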
# Connect category selector to dials and plot
x_axis_sort_by.change(
fn=update_speed_med_bar_plot,
inputs=[x_axis_sort_by, min_max_param_size_slider, min_max_score_slider, time_data_state],
outputs=speed_med_bar_plot
)
min_max_param_size_slider.change(
fn=update_speed_med_bar_plot,
inputs=[x_axis_sort_by, min_max_param_size_slider, min_max_score_slider, time_data_state],
outputs=speed_med_bar_plot
)
min_max_score_slider.change(
fn=update_speed_med_bar_plot,
inputs=[x_axis_sort_by, min_max_param_size_slider, min_max_score_slider, time_data_state],
outputs=speed_med_bar_plot
)
# Connect leaderboard filters to dials and plot (if leaderboard_tab_cat provides a filtered DataFrame state)
if "df_state" in leaderboard_tab_cat:
leaderboard_tab_cat["df_state"].change(
fn=lambda df, x_axis_sort_by, min_max_size, min_max_score, time_data: update_speed_med_bar_plot(x_axis_sort_by, min_max_size, min_max_score, time_data, df),
inputs=[leaderboard_tab_cat["df_state"], x_axis_sort_by, min_max_param_size_slider, min_max_score_slider, time_data_state],
outputs=speed_med_bar_plot
)
# Update radar chart when model_selector_cat selection changes
def update_radar_chart_cat(selected_display_names):
# If no selection, fallback to top-5
if not selected_display_names:
df = load_leaderboard_data(data_prefix=data_prefix)
selected_display_names = [get_display_model_name(m) for m in df['Model Name'].tolist()[:5]]
selected_models = [display_to_original_cat[name] for name in selected_display_names if name in display_to_original_cat]
return create_domain_radar_chart(
load_leaderboard_data(data_prefix=data_prefix),
"Average Accuracy",
selected_models,
mode=args.mode
)
model_selector_cat.change(
fn=update_radar_chart_cat,
inputs=model_selector_cat,
outputs=radar_chart_cat
)
# --- Med. Len. vs Overall Scatter Plot Section ---
with open(f"src/data/{data_prefix}/length_data.json", "r") as f:
length_data = json.load(f)
# --- Create a Gradio State component to hold length_data ---
length_data_state = gr.State(value=length_data)
gr.HTML("""
Explore the relationship between median output length and model performance by category.<br>
Median Length: median number of tokens, including both Think and Answer.<br>
Median Response Length: median number of Answer tokens, excluding Think.<br>
Note: We measured token lengths for open-source models only; proprietary models are excluded from the plot.
""")
# Category selection buttons (HTML + Gradio Radio for event)
category_columns = [col for col in configs.ON_LOAD_COLUMNS_CATEGORY if col not in configs.CATEGORY_EXCLUDED_COLUMNS]
# (cat-btn-radio related style block removed, now handled in custom_css)
category_selector = gr.Radio(
choices=category_columns,
value="Overall",
label="Select Category for Y-Axis",
elem_id=f"cat-btn-radio-{data_prefix.replace('/', '')}", # Make elem_id unique
elem_classes=["cat-btn-radio"],
interactive=True,
show_label=True
)
x_axis_selector = gr.Radio(
choices=["Median Length", "Median Response Length"],
value="Median Length",
label="Select X-Axis Data",
elem_id=f"x-axis-btn-radio-{data_prefix.replace('/', '')}", # Make elem_id unique
elem_classes=["x-axis-btn-radio"],
interactive=True,
show_label=True
)
gr.HTML('')
scatter_plot_cat = gr.Plot(
label="",
value=create_len_overall_scatter(
load_leaderboard_data(data_prefix=data_prefix),
y_col="Overall",
length_data=length_data,
x_axis_data_source=x_axis_selector.value
),
elem_classes=["efficiency-chart", "plot-container"]
)
gr.HTML('')
gr.HTML("")
# Update plot when category or x-axis selection changes
def update_scatter_plot_cat(selected_category, selected_x_source, current_length_data_state):
return create_len_overall_scatter(
load_leaderboard_data(data_prefix=data_prefix),
y_col=selected_category,
length_data=current_length_data_state,
x_axis_data_source=selected_x_source
)
category_selector.change(
fn=update_scatter_plot_cat,
inputs=[category_selector, x_axis_selector, length_data_state],
outputs=scatter_plot_cat
)
x_axis_selector.change(
fn=update_scatter_plot_cat,
inputs=[category_selector, x_axis_selector, length_data_state],
outputs=scatter_plot_cat
)
# When leaderboard selectors change, synchronize model_selector_cat and radar_chart_cat to top-5
def update_model_selector_and_radar_chart_cat_from_leaderboard(types, model_types, thinks, df, sort_col):
_, _, top5_models = leaderboard_tab_cat["unified_filter"](types, model_types, thinks, df, sort_col)
top5_display_names = [get_display_model_name(m) for m in top5_models[:5]]
return gr.update(value=top5_display_names), create_domain_radar_chart(
load_leaderboard_data(data_prefix=data_prefix),
"Average Accuracy",
top5_models[:5],
mode=args.mode
)
leaderboard_selectors_cat = [
leaderboard_tab_cat["type_selector"],
leaderboard_tab_cat["model_type_selector"],
leaderboard_tab_cat["think_selector"],
leaderboard_tab_cat["df_state"],
leaderboard_tab_cat["sort_col_dropdown"]
]
for selector in leaderboard_selectors_cat:
selector.change(
fn=update_model_selector_and_radar_chart_cat_from_leaderboard,
inputs=leaderboard_selectors_cat,
outputs=[model_selector_cat, radar_chart_cat]
)
gr.HTML("""
As a multilingual benchmark, TRUEBench supports a total of 12 user input languages: Korean (KO), English (EN), Japanese (JA), Chinese (ZH), Polish (PL), German (DE), Portuguese (PT), Spanish (ES), French (FR), Italian (IT), Russian (RU), and Vietnamese (VI).
""")
df_lang = get_dataframe_language(data_prefix=data_prefix)
leaderboard_tab_lang = create_leaderboard_tab(
df_lang,
"Language",
mode=args.mode
)
# --- Language Radar Chart Section ---
initial_df_lang = load_leaderboard_language_data(data_prefix=data_prefix)
# Top 5 models based on leaderboard (Overall)
if "Overall" in initial_df_lang.columns:
top5_models_lang = initial_df_lang.sort_values("Overall", ascending=False)['Model Name'].tolist()[:5]
else:
top5_models_lang = initial_df_lang['Model Name'].tolist()[:5]
gr.HTML('')
# Add model selector
display_names_lang = initial_df_lang['Model Name'].apply(get_display_model_name).tolist()
original_names_lang = initial_df_lang['Model Name'].tolist()
display_to_original_lang = dict(zip(display_names_lang, original_names_lang))
top5_display_names_lang = [get_display_model_name(m) for m in top5_models_lang]
model_selector_lang = gr.Dropdown(
choices=display_names_lang,
value=top5_display_names_lang,
multiselect=True,
max_choices=5,
label="🎯 Select Models for Radar Chart",
info="Choose up to 5 models to visualize",
elem_classes=["dropdown", "custom-dropdown"],
interactive=True,
filterable=True,
allow_custom_value=False
)
gr.HTML("""
""")
radar_chart_lang = gr.Plot(
label="",
value=create_language_radar_chart(
initial_df_lang,
"Average Accuracy",
top5_models_lang,
mode=args.mode
),
elem_classes=["radar-chart", "plot-container"]
)
gr.HTML('')
# Update radar chart when model_selector_lang selection changes
def update_radar_chart_lang(selected_display_names):
if not selected_display_names:
df = load_leaderboard_language_data(data_prefix=data_prefix)
selected_display_names = [get_display_model_name(m) for m in df['Model Name'].tolist()[:5]]
selected_models = [display_to_original_lang[name] for name in selected_display_names if name in display_to_original_lang]
return create_language_radar_chart(
load_leaderboard_language_data(data_prefix=data_prefix),
"Average Accuracy",
selected_models,
mode=args.mode
)
model_selector_lang.change(
fn=update_radar_chart_lang,
inputs=model_selector_lang,
outputs=radar_chart_lang
)
# When leaderboard selectors change, automatically synchronize model_selector_lang and radar_chart_lang to top-5
def update_model_selector_and_radar_chart_lang_from_leaderboard(types, model_types, thinks, df, sort_col):
_, _, top5_models = leaderboard_tab_lang["unified_filter"](types, model_types, thinks, df, sort_col)
top5_display_names = [get_display_model_name(m) for m in top5_models[:5]]
return gr.update(value=top5_display_names), create_language_radar_chart(
load_leaderboard_language_data(data_prefix=data_prefix),
"Average Accuracy",
top5_models[:5],
mode=args.mode
)
leaderboard_selectors_lang = [
leaderboard_tab_lang["type_selector"],
leaderboard_tab_lang["model_type_selector"],
leaderboard_tab_lang["think_selector"],
leaderboard_tab_lang["df_state"],
leaderboard_tab_lang["sort_col_dropdown"]
]
for selector in leaderboard_selectors_lang:
selector.change(
fn=update_model_selector_and_radar_chart_lang_from_leaderboard,
inputs=leaderboard_selectors_lang,
outputs=[model_selector_lang, radar_chart_lang]
)
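# --- Assemble the Gradio app ---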
demo = gr.Blocks(css=custom_css, theme=theme)
with demo:
gr.HTML(BANNER + TITLE + LINK)
user_state = gr.State()
organization_state = gr.State()
with gr.Tabs(elem_classes="tab-buttons") as main_tabs:
if args.mode == "open":
tab_configurations = [
{"data_prefix": "open/", "tab_name": "TRUEBench", "tab_id": 2}
]
else:
tab_configurations = [
{"data_prefix": f"{args.mode}-public/", "tab_name": "TRUEBench (public set)", "tab_id": 2},
{"data_prefix": f"{args.mode}-full/", "tab_name": "TRUEBench (private set)", "tab_id": 3}
]
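# Only "open" is currently accepted by --mode, so the else branch above is
# effectively dormant; it would build separate public/private tabs for other modes.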
for config in tab_configurations:
with gr.TabItem(config["tab_name"], elem_id="llm-benchmark-tab-table", id=config["tab_id"]):
create_benchmark_tab_content(data_prefix=config["data_prefix"])
with gr.TabItem("🚀 Submit here! ", elem_id="llm-benchmark-tab-table", id=4):
with gr.Column():
with gr.Row():
gr.Markdown(EVALUATION_QUEUE_TEXT, elem_classes="markdown-text")
with gr.Row():
gr.Markdown(EVALUATION_QUEUE_TEXT_OPTION1, elem_classes="markdown-text")
with gr.Row():
gr.Markdown("## ✉️ Submit your model here!", elem_classes="markdown-text")
if args.mode == "open":
login_button = gr.LoginButton()
with gr.Row():
with gr.Column():
contact_email = gr.Textbox(label="Contact Email", placeholder="Your email address", interactive=True)
model_name_textbox = gr.Textbox(label="Model Name")
model_type_dropdown = gr.Dropdown(
choices=["Instruct", "Think", "Hybrid"],
label="Model Type (Instruct, Think, or Hybrid)",
multiselect=False,
value="Instruct",
interactive=True,
)
think_type_dropdown = gr.Dropdown(
choices=["On", "Off"],
label="Think Mode (On/Off)",
multiselect=False,
value="Off",
interactive=False,
)
precision = gr.Dropdown(
choices=[i.value.name for i in Precision if i != Precision.Unknown],
label="Precision",
multiselect=False,
value="float16",
interactive=True,
)
# --- Dynamically control think_type based on model_type and connect event ---
def update_think_type(model_type_value):
if model_type_value == "Instruct":
return gr.update(value="Off", interactive=False)
elif model_type_value == "Think":
return gr.update(value="On", interactive=False)
else: # Hybrid
return gr.update(value="On", interactive=True)
model_type_dropdown.change(
fn=update_think_type,
inputs=model_type_dropdown,
outputs=think_type_dropdown
)
response_prefix_textbox = gr.Textbox(label="Response prefix", placeholder="(e.g., )")
with gr.Column():
yml_textbox_placeholder = """# vLLM serving parameters
# Reference: https://docs.vllm.ai/en/latest/cli/serve.html
llm_serve_args:
max_model_len:
tensor_parallel_size:
dtype:
...
# OpenAI-compatible API (chat completion)
# Reference: https://platform.openai.com/docs/api-reference/chat
sampling_params:
top_p:
temperature:
presence_penalty:
...
# vLLM sampling parameters
# Reference: https://docs.vllm.ai/en/latest/serving/openai_compatible_server.html#chat-api_1
extra_body:
chat_template_kwargs:
enable_thinking:
...
top_k:
repetition_penalty:
..."""
yml_textbox = gr.Textbox(
label="Configuration (YAML format)",
elem_id="yml-textbox",
lines=7,
value=yml_textbox_placeholder
)
upbox = gr.File(
label="Upload configuration file as .yml or .yaml",
file_types=[".yml", ".yaml"],
type="filepath",
height=150
)
# Add Translate to JSON button below upbox
translate_button = gr.Button(
"Translate to JSON",
elem_id="translate-to-json-btn",
elem_classes=["translate-btn"],
scale=None
)
# Add custom style for the button
gr.HTML("")
with gr.Column():
requirements_textbox = gr.Textbox(label="(Optional) Requirements", lines=30, elem_id="requirements-textbox")
output_dict = gr.Code(label="Translated Python Dictionary", language="json")
submit_button = gr.Button("Submit Eval")
submission_result = gr.Markdown()
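# Parse the uploaded YAML file (preferred) or the textbox contents and
# pretty-print the result as JSON for the preview pane.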
def parse_and_display_yaml_config(upbox_path, yml_textbox_value):
import yaml, json
if upbox_path:
try:
with open(upbox_path, "r", encoding="utf-8") as f:
data = yaml.safe_load(f)
if data is None:
return "YAML file is empty."
return json.dumps(data, indent=4, ensure_ascii=False)
except Exception as e:
return f"Error parsing YAML file: {e}"
elif yml_textbox_value and yml_textbox_value.strip():
try:
data = yaml.safe_load(yml_textbox_value)
if data is None:
return "YAML textbox is empty or invalid."
return json.dumps(data, indent=4, ensure_ascii=False)
except Exception as e:
return f"Error parsing YAML textbox: {e}"
else:
return ""
if args.mode == "open":
event = submit_button.click(get_profile_and_organizations, inputs=[], outputs=[user_state, organization_state])
event.then(
add_new_eval_option,
[
contact_email,
model_name_textbox,
model_type_dropdown,
think_type_dropdown,
precision,
response_prefix_textbox,
requirements_textbox,
user_state,
organization_state,
yml_textbox,
upbox,
],
submission_result,
).then(
fn=parse_and_display_yaml_config,
inputs=[upbox, yml_textbox],
outputs=output_dict
)
translate_button.click(
fn=parse_and_display_yaml_config,
inputs=[upbox, yml_textbox],
outputs=output_dict
)
with gr.Row():
with gr.Accordion("📙 Citation", open=False):
citation_button = gr.Textbox(
value=CITATION_BUTTON_TEXT,
label=CITATION_BUTTON_LABEL,
lines=9,
elem_id="citation-button",
show_copy_button=True,
)
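# Lightweight health check: returns "ok" when the leaderboard data loads,
# so an external monitor (e.g. the watchdog) can poll the Space.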
if args.mode == "open":
def health_fn() -> str:
try:
initial_df_cat_ = load_leaderboard_data()
if "Overall" in initial_df_cat_.columns:
return "ok"
else:
return "error"
except Exception:
return "error"
gr.api(health_fn, api_name="health")
demo.load(fn=lambda: start_watchdog_in_background(), inputs=None, outputs=None, queue=False)
if args.mode == "open":
demo.queue(default_concurrency_limit=40).launch(prevent_thread_lock=True)
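# launch(prevent_thread_lock=True) returns immediately, so sleep forever to
# keep the main thread (and with it the app) alive.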
while True:
time.sleep(600)
else:
if __name__ == "__main__":
demo.queue(default_concurrency_limit=40).launch(server_name=args.ip, server_port=args.port)