H2H-eval-comparator

Sleeping

App Files Files Community

H2H-eval-comparator / app.py

rohansampath

Update app.py

a955837 verified 9 months ago

raw

history blame

22.5 kB

	import gradio as gr
	import os
	from huggingface_hub import login
	from mmlu_pro_eval_adapted import evaluate_mmlu_pro
	import spaces
	import pandas as pd
	import time
	import traceback
	from dataset_previews import mmlupro_dataset_preview, format_preview_for_display

	# Read token and login
	hf_token = os.getenv("HF_READ_WRITE_TOKEN")
	if hf_token:
	login(hf_token)
	else:
	print("⚠️ No HF_READ_WRITE_TOKEN found in environment")

	# ---------------------------------------------------------------------------
	# 1. Model configuration
	# ---------------------------------------------------------------------------
	model_name = "mistralai/Mistral-7B-v0.1"

	# ---------------------------------------------------------------------------
	# 2. MMLU-Pro Evaluation
	# ---------------------------------------------------------------------------
	@spaces.GPU(duration=240)
	def run_mmlu_evaluation(subject_selection_mode, num_subjects, selected_subjects, num_shots, all_questions, num_questions, progress=gr.Progress()):
	"""
	Runs the MMLU evaluation with the specified parameters.

	Args:
	subject_selection_mode (str): Mode of subject selection ("all", "number", or "specific")
	num_subjects (int): Number of subjects to evaluate (1-14)
	selected_subjects (list): List of specific subjects to evaluate
	num_shots (int): Number of few-shot examples (0-5)
	all_questions (bool): Whether to evaluate all questions per subject
	num_questions (int): Number of examples per subject (1-100 or all)
	progress (gr.Progress): Progress indicator
	"""
	try:
	# Convert parameters if needed
	if subject_selection_mode == "all":
	num_subjects = -1
	selected_subjects = []
	elif subject_selection_mode == "specific":
	num_subjects = len(selected_subjects) if selected_subjects else -1

	if all_questions:
	num_questions = -1

	# Run evaluation with timing
	start_time = time.time()
	results = evaluate_mmlu_pro(
	model_name,
	num_subjects=num_subjects,
	num_questions=num_questions,
	num_shots=num_shots,
	specific_subjects=selected_subjects if subject_selection_mode == "specific" else None
	)
	elapsed_time = time.time() - start_time

	# Format results
	overall_acc = results["overall_accuracy"]
	min_subject, min_acc = results["min_accuracy_subject"]
	max_subject, max_acc = results["max_accuracy_subject"]

	# Create DataFrame from results table
	results_df = pd.DataFrame(results["full_accuracy_table"])

	# Calculate totals for the overall row
	total_samples = results_df['Num_samples'].sum()
	total_correct = results_df['Num_correct'].sum()

	# Create overall row
	overall_row = pd.DataFrame({
	'Subject': ['Overall'],
	'Num_samples': [total_samples],
	'Num_correct': [total_correct],
	'Accuracy': [overall_acc]
	})

	# Concatenate overall row with results
	results_df = pd.concat([overall_row, results_df], ignore_index=True)

	# Format the report
	report = (
	f"### Overall Results\n"
	f"* Overall Accuracy: {overall_acc:.3f}\n"
	f"* Best Performance: {max_subject} ({max_acc:.3f})\n"
	f"* Worst Performance: {min_subject} ({min_acc:.3f})\n"
	f"* Evaluation completed in {elapsed_time:.2f} seconds\n"
	)

	# Return values that re-enable UI components after completion
	return (report,
	results_df,
	gr.update(interactive=True),
	gr.update(visible=False),
	gr.update(interactive=True),
	gr.update(interactive=True),
	gr.update(interactive=True),
	gr.update(interactive=True),
	gr.update(interactive=True),
	gr.update(visible=True))

	except Exception as e:
	# Handle errors gracefully
	error_trace = traceback.format_exc()
	error_message = f"### Error during evaluation\n```\n{error_trace}\n```"

	# Re-enable UI components on error
	return (error_message,
	None,
	gr.update(interactive=True),
	gr.update(visible=False),
	gr.update(interactive=True),
	gr.update(interactive=True),
	gr.update(interactive=True),
	gr.update(interactive=True),
	gr.update(interactive=True),
	gr.update(visible=False))

	# ---------------------------------------------------------------------------
	# 3. Gradio Interface
	# ---------------------------------------------------------------------------
	with gr.Blocks(css="""
	#preview_header {
	margin-bottom: 10px;
	margin-top: 5px;
	}
	#preview_table {
	background-color: #f8f9fa;
	border-radius: 8px;
	padding: 10px;
	}
	h1 {
	text-align: center;
	}
	.section-divider {
	border-top: 1px solid #ddd;
	margin: 12px 0;
	}
	.config-box {
	border: 1px solid #ddd;
	border-radius: 8px;
	padding: 15px;
	margin: 10px;
	background-color: #f9f9f9;
	}
	""") as demo:
	gr.Markdown("# Head-to-Head Model Evaluation Comparator")
	gr.Markdown("""
	This demo evaluates two models (or one model with two different configs), head-to-head, on a benchmark dataset.

	Available Datasets: [MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro)

	Available Models: [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1)
	""")

	# Dataset Selection Section
	gr.Markdown("## (A) Select Dataset for Evaluation")

	with gr.Row():
	dataset_dropdown = gr.Dropdown(
	choices=["(Select Dataset)", "MMLU-Pro"],
	value="(Select Dataset)",
	label="Dataset",
	info="Select a dataset to perform the Head-to-Head Evaluation on. Available Datasets: [MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro)"
	)
	preview_toggle = gr.Button("Show Dataset Preview", interactive=False, variant="secondary")

	# Dataset Preview Container - Initially hidden
	with gr.Column(visible=False) as dataset_preview_container:
	gr.Markdown("## Dataset Preview", elem_id="preview_header")
	preview_output = gr.DataFrame(
	interactive=False,
	wrap=True,
	elem_id="preview_table"
	)
	# Add a divider instead of lots of space
	gr.Markdown("<div class='section-divider'></div>")

	# Preview data state to store the loaded preview data
	preview_data_state = gr.State(None)

	# MMLU Config Container - Initially hidden until dataset is selected
	with gr.Column(visible=False) as mmlu_config_container:
	gr.Markdown("## (B) Select Dataset Configuration Options")

	with gr.Row():
	# Left column for subject selection
	with gr.Column(scale=1):
	with gr.Group(elem_classes=["config-box"]):
	gr.Markdown("### Choose Subjects")

	subject_selection_mode = gr.Radio(
	choices=["Evaluate All Subjects", "Choose Number of Subjects", "Specify which Subjects to Evaluate"],
	value="Evaluate All Subjects",
	label="Subject Selection Mode"
	)

	# Subject number slider - initially hidden, shown when "Choose Number of Subjects" is selected
	with gr.Column(visible=False) as num_subjects_container:
	num_subjects_slider = gr.Slider(
	minimum=1,
	maximum=14, # Will be updated dynamically based on preview data
	value=1,
	step=1,
	label="Number of Subjects",
	info="Number of subjects to evaluate. They will be loaded in alphabetical order."
	)

	# Subject checkboxes - initially hidden, shown when "Specify which Subjects to Evaluate" is selected
	with gr.Column(visible=False) as specific_subjects_container:
	# Will be populated dynamically from the preview data
	specific_subjects = gr.CheckboxGroup(
	choices=[], # Will be populated from preview data
	label="Select Specific Subjects",
	info="Select which specific subjects to evaluate"
	)

	# Right column for few-shot examples
	with gr.Column(scale=1):
	with gr.Group(elem_classes=["config-box"]):
	gr.Markdown("### Few-shot Configuration")

	num_shots_slider = gr.Slider(
	minimum=0,
	maximum=5,
	value=5,
	step=1,
	label="Number of Few-shot Examples",
	info="Number of examples to use for few-shot learning (0-5). They will be loaded in order of question_id."
	)

	# Add a small space
	gr.Markdown(" ")

	with gr.Row():
	all_questions_checkbox = gr.Checkbox(
	label="Evaluate All Questions",
	value=False,
	info="When checked, evaluates all available questions for each subject"
	)
	questions_info_text = gr.Markdown(visible=False, value="All questions across the selected subjects will be evaluated")

	with gr.Row(elem_id="questions_selection_row"):
	questions_container = gr.Column(scale=1, elem_id="questions_slider_container")

	with questions_container:
	num_questions_slider = gr.Slider(
	minimum=1,
	maximum=100,
	value=20,
	step=1,
	label="Questions per Subject",
	info="Choose a subset of questions (1-100) per subject. They will be loaded in order of question_id.",
	interactive=True
	)

	with gr.Row():
	with gr.Column(scale=1):
	eval_mmlu_button = gr.Button("Run MMLU-Pro Evaluation", variant="primary", interactive=True)
	cancel_mmlu_button = gr.Button("Cancel Evaluation", variant="stop", visible=False)

	# Results Section - Initially hidden
	with gr.Column(visible=False) as results_container:
	results_output = gr.Markdown(label="Evaluation Results")

	# Results table - Initially hidden until evaluation completes
	with gr.Column(visible=False) as results_table_container:
	with gr.Row():
	results_table = gr.DataFrame(
	interactive=True,
	label="Detailed Results (Sortable)",
	visible=True
	)

	# Track evaluation state
	evaluation_state = gr.State({"running": False})

	# Track preview visibility state
	preview_visibility = gr.State(False)

	# Function to process dataset preview data
	def get_subject_choices_from_preview(preview_data):
	if not preview_data or 'subject_counts' not in preview_data:
	print("FAILURE HERE")
	return [], 0

	# Get subject counts from preview data
	subject_counts = preview_data['subject_counts']

	# Sort subjects alphabetically
	subjects = sorted(subject_counts.keys())

	# Format as "Subject (n=count)"
	formatted_subjects = [f"{subject} (n={subject_counts[subject]})" for subject in subjects]
	print("Formatted Subjects", formatted_subjects)


	return formatted_subjects, len(subjects)

	# Function to load preview data and update UI
	def load_dataset_preview(dataset):
	if dataset == "MMLU-Pro":
	# Load the preview data
	preview_data = mmlupro_dataset_preview(regenerate_preview=False)

	# Extract subject choices and count
	subject_choices, subject_count = get_subject_choices_from_preview(preview_data)

	# Update the UI components
	return (
	preview_data, # Store the preview data
	gr.update(choices=subject_choices), # Update checkbox choices
	gr.update(maximum=subject_count, value=1) # Update slider max
	)
	return None, gr.update(), gr.update()

	# Function to show/hide configuration based on selected dataset
	def update_interface_based_on_dataset(dataset, current_visibility):
	if dataset == "MMLU-Pro":
	return (
	gr.update(visible=True), # mmlu_config_container
	gr.update(visible=True), # results_container
	gr.update(interactive=True), # preview_toggle
	gr.update(visible=False), # dataset_preview_container - hide it initially
	False, # Reset preview_visibility to False
	gr.update(value="Show Dataset Preview") # Reset button text
	)
	else:
	return (
	gr.update(visible=False), # mmlu_config_container
	gr.update(visible=False), # results_container
	gr.update(interactive=False), # preview_toggle
	gr.update(visible=False), # dataset_preview_container - hide when no dataset
	False, # Reset preview_visibility to False
	gr.update(value="Show Dataset Preview") # Reset button text
	)

	# Connect dataset dropdown to show/hide appropriate configuration and load preview data
	dataset_dropdown.change(
	fn=load_dataset_preview,
	inputs=[dataset_dropdown],
	outputs=[preview_data_state, specific_subjects, num_subjects_slider],
	).then(
	fn=update_interface_based_on_dataset,
	inputs=[dataset_dropdown, preview_visibility],
	outputs=[mmlu_config_container, results_container, preview_toggle, dataset_preview_container, preview_visibility, preview_toggle]
	)

	# Function to toggle dataset preview visibility
	def toggle_preview(dataset, preview_visibility, preview_data):
	# Toggle the visibility state
	is_visible = not preview_visibility

	# Update button text based on new state
	button_text = "Hide Dataset Preview" if is_visible else "Show Dataset Preview"

	# Format and show preview if becoming visible
	if is_visible and dataset == "MMLU-Pro":
	formatted_preview = format_preview_for_display(preview_data)
	return is_visible, gr.update(visible=True), formatted_preview, gr.update(value=button_text)
	elif is_visible:
	# For other datasets (not implemented yet)
	return is_visible, gr.update(visible=True), None, gr.update(value=button_text)
	else:
	# Hiding the preview
	return is_visible, gr.update(visible=False), None, gr.update(value=button_text)

	# Connect preview toggle to show/hide dataset information
	preview_toggle.click(
	fn=toggle_preview,
	inputs=[dataset_dropdown, preview_visibility, preview_data_state],
	outputs=[preview_visibility, dataset_preview_container, preview_output, preview_toggle]
	)

	# Function to update UI based on subject selection mode
	def update_subject_selection_ui(mode):
	if mode == "Evaluate All Subjects":
	return gr.update(visible=False), gr.update(visible=False)
	elif mode == "Choose Number of Subjects":
	return gr.update(visible=True), gr.update(visible=False)
	else: # "Specify which Subjects to Evaluate"
	return gr.update(visible=False), gr.update(visible=True)

	# Connect subject selection mode to UI updates
	subject_selection_mode.change(
	fn=update_subject_selection_ui,
	inputs=[subject_selection_mode],
	outputs=[num_subjects_container, specific_subjects_container]
	)

	# Update interface based on all_questions checkbox
	def update_questions_interface(checked):
	if checked:
	return gr.update(visible=False), gr.update(visible=True)
	else:
	return gr.update(visible=True), gr.update(visible=False)

	all_questions_checkbox.change(
	fn=update_questions_interface,
	inputs=[all_questions_checkbox],
	outputs=[questions_container, questions_info_text]
	)

	# Function to convert subject selection mode to parameters
	def get_subject_mode_param(mode):
	if mode == "Evaluate All Subjects":
	return "all"
	elif mode == "Choose Number of Subjects":
	return "number"
	else: # "Specify which Subjects to Evaluate"
	return "specific"

	# Function to extract subject names from checkboxes
	def get_subject_names(selected_subjects):
	# Extract just the subject name without the count
	return [subject.split(" (")[0] for subject in selected_subjects]

	# Function to disable UI components during evaluation
	def start_evaluation(state):
	if state["running"]:
	return [
	state,
	gr.update(interactive=False),
	gr.update(interactive=False),
	gr.update(interactive=False),
	gr.update(interactive=False),
	gr.update(interactive=False),
	gr.update(interactive=False),
	gr.update(visible=True),
	"Evaluation already in progress. Please wait.",
	None,
	gr.update(visible=False)
	]

	# Update state to running
	state["running"] = True

	return [
	state,
	gr.update(interactive=False), # subject_selection_mode
	gr.update(interactive=False), # num_subjects_slider
	gr.update(interactive=False), # specific_subjects
	gr.update(interactive=False), # num_shots_slider
	gr.update(interactive=False), # all_questions_checkbox
	gr.update(interactive=False), # num_questions_slider
	gr.update(interactive=False), # eval_mmlu_button
	gr.update(visible=True), # cancel_mmlu_button
	"Starting evaluation...", # results_output
	None, # results_table
	gr.update(visible=False) # results_table_container
	]

	# Function to reset UI after evaluation
	def finish_evaluation(state):
	state["running"] = False
	return state

	# Function to handle cancel button click
	def cancel_evaluation(state):
	# Note: This doesn't actually stop the evaluation process
	# It only updates the UI state to appear canceled
	state["running"] = False
	return [
	state,
	gr.update(interactive=True), # subject_selection_mode
	gr.update(interactive=True), # num_subjects_slider
	gr.update(interactive=True), # specific_subjects
	gr.update(interactive=True), # num_shots_slider
	gr.update(interactive=True), # all_questions_checkbox
	gr.update(interactive=True), # num_questions_slider
	gr.update(interactive=True), # eval_mmlu_button
	gr.update(visible=False), # cancel_mmlu_button
	"⚠️ Evaluation canceled by user (note: backend process may continue running)", # results_output
	None, # results_table
	gr.update(visible=False) # results_table_container
	]

	# Connect MMLU evaluation button with state tracking
	eval_mmlu_button.click(
	fn=start_evaluation,
	inputs=[evaluation_state],
	outputs=[
	evaluation_state,
	subject_selection_mode,
	num_subjects_slider,
	specific_subjects,
	num_shots_slider,
	all_questions_checkbox,
	num_questions_slider,
	eval_mmlu_button,
	cancel_mmlu_button,
	results_output,
	results_table,
	results_table_container
	]
	).then(
	fn=lambda mode, num, subjects, shots, all_q, num_q:
	run_mmlu_evaluation(
	get_subject_mode_param(mode),
	num,
	get_subject_names(subjects),
	shots,
	all_q,
	num_q
	),
	inputs=[
	subject_selection_mode,
	num_subjects_slider,
	specific_subjects,
	num_shots_slider,
	all_questions_checkbox,
	num_questions_slider
	],
	outputs=[
	results_output,
	results_table,
	eval_mmlu_button,
	cancel_mmlu_button,
	subject_selection_mode,
	num_subjects_slider,
	num_shots_slider,
	all_questions_checkbox,
	num_questions_slider,
	results_table_container
	]
	).then(
	fn=finish_evaluation,
	inputs=[evaluation_state],
	outputs=[evaluation_state]
	)

	# Connect cancel button
	cancel_mmlu_button.click(
	fn=cancel_evaluation,
	inputs=[evaluation_state],
	outputs=[
	evaluation_state,
	subject_selection_mode,
	num_subjects_slider,
	specific_subjects,
	num_shots_slider,
	all_questions_checkbox,
	num_questions_slider,
	eval_mmlu_button,
	cancel_mmlu_button,
	results_output,
	results_table,
	results_table_container
	]
	)

	demo.launch()