import gradio as gr
import os
from huggingface_hub import login
from mmlu_pro_eval_adapted import evaluate_mmlu_pro
import spaces
import pandas as pd
import time
import traceback
from dataset_previews import mmlupro_dataset_preview, format_preview_for_display
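
# Note (assumption): `spaces` is never referenced in this file. On ZeroGPU
# Spaces it still needs to be imported early, and the @spaces.GPU decorator
# that actually requests a GPU presumably lives inside mmlu_pro_eval_adapted.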

# Read token and login
hf_token = os.getenv("HF_READ_WRITE_TOKEN")
if hf_token:
    login(hf_token)
else:
    print("⚠️ No HF_READ_WRITE_TOKEN found in environment")

# ---------------------------------------------------------------------------
# 1. Model configuration
# ---------------------------------------------------------------------------
model_name = "mistralai/Mistral-7B-v0.1"

# ---------------------------------------------------------------------------
# 2. MMLU-Pro Evaluation
# ---------------------------------------------------------------------------
def run_mmlu_evaluation(subject_selection_mode, num_subjects, selected_subjects,
                        num_shots, all_questions, num_questions, progress=gr.Progress()):
    """
    Runs the MMLU-Pro evaluation with the specified parameters.

    Args:
        subject_selection_mode (str): Mode of subject selection ("all", "number", or "specific")
        num_subjects (int): Number of subjects to evaluate (1-14)
        selected_subjects (list): List of specific subjects to evaluate
        num_shots (int): Number of few-shot examples (0-5)
        all_questions (bool): Whether to evaluate all questions per subject
        num_questions (int): Number of questions per subject (1-100); ignored when all_questions is True
        progress (gr.Progress): Progress indicator

    Returns:
        A tuple (report, results_df, *component_updates) whose order must match
        the `outputs` list wired to this function below.
    """
    try:
        # Normalize parameters: -1 means "all" for both subjects and questions
        if subject_selection_mode == "all":
            num_subjects = -1
            selected_subjects = []
        elif subject_selection_mode == "specific":
            num_subjects = len(selected_subjects) if selected_subjects else -1
        if all_questions:
            num_questions = -1

        # Run evaluation with timing
        start_time = time.time()
        results = evaluate_mmlu_pro(
            model_name,
            num_subjects=num_subjects,
            num_questions=num_questions,
            num_shots=num_shots,
            specific_subjects=selected_subjects if subject_selection_mode == "specific" else None
        )
        elapsed_time = time.time() - start_time
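
        # `results` is expected to contain (as consumed below):
        #   "overall_accuracy": float
        #   "min_accuracy_subject" / "max_accuracy_subject": (subject, accuracy) pairs
        #   "full_accuracy_table": records with Subject, Num_samples, Num_correct, Accuracy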

        # Format results
        overall_acc = results["overall_accuracy"]
        min_subject, min_acc = results["min_accuracy_subject"]
        max_subject, max_acc = results["max_accuracy_subject"]

        # Create DataFrame from the per-subject results table
        results_df = pd.DataFrame(results["full_accuracy_table"])

        # Calculate totals for the overall row
        total_samples = results_df['Num_samples'].sum()
        total_correct = results_df['Num_correct'].sum()

        # Create the overall summary row and prepend it to the per-subject rows
        overall_row = pd.DataFrame({
            'Subject': ['**Overall**'],
            'Num_samples': [total_samples],
            'Num_correct': [total_correct],
            'Accuracy': [overall_acc]
        })
        results_df = pd.concat([overall_row, results_df], ignore_index=True)

        # Format the report
        report = (
            f"### Overall Results\n"
            f"* Overall Accuracy: {overall_acc:.3f}\n"
            f"* Best Performance: {max_subject} ({max_acc:.3f})\n"
            f"* Worst Performance: {min_subject} ({min_acc:.3f})\n"
            f"* Evaluation completed in {elapsed_time:.2f} seconds\n"
        )

        # Return values that re-enable UI components after completion
        # (order must match the `outputs` list of the evaluation `.then()` call)
        return (report,
                results_df,
                gr.update(interactive=True),   # eval_mmlu_button
                gr.update(visible=False),      # cancel_mmlu_button
                gr.update(interactive=True),   # subject_selection_mode
                gr.update(interactive=True),   # specific_subjects
                gr.update(interactive=True),   # num_subjects_slider
                gr.update(interactive=True),   # num_shots_slider
                gr.update(interactive=True),   # all_questions_checkbox
                gr.update(interactive=True),   # num_questions_slider
                gr.update(visible=True))       # results_table_container
    except Exception:
        # Handle errors gracefully: surface the traceback in the results pane
        error_trace = traceback.format_exc()
        error_message = f"### Error during evaluation\n```\n{error_trace}\n```"
        # Re-enable UI components on error (same output order as the success path)
        return (error_message,
                None,
                gr.update(interactive=True),   # eval_mmlu_button
                gr.update(visible=False),      # cancel_mmlu_button
                gr.update(interactive=True),   # subject_selection_mode
                gr.update(interactive=True),   # specific_subjects
                gr.update(interactive=True),   # num_subjects_slider
                gr.update(interactive=True),   # num_shots_slider
                gr.update(interactive=True),   # all_questions_checkbox
                gr.update(interactive=True),   # num_questions_slider
                gr.update(visible=False))      # results_table_container

# ---------------------------------------------------------------------------
# 3. Gradio Interface
# ---------------------------------------------------------------------------
with gr.Blocks(css="""
    #preview_header {
        margin-bottom: 10px;
        margin-top: 5px;
    }
    #preview_table {
        background-color: #f8f9fa;
        border-radius: 8px;
        padding: 10px;
    }
    h1 {
        text-align: center;
    }
    .section-divider {
        border-top: 1px solid #ddd;
        margin: 12px 0;
    }
    .config-box {
        border: 1px solid #ddd;
        border-radius: 8px;
        padding: 15px;
        margin: 10px;
        background-color: #f9f9f9;
    }
""") as demo:
| gr.Markdown("# Head-to-Head Model Evaluation Comparator") | |
| gr.Markdown(""" | |
| This demo evaluates two models (or one model with two different configs), head-to-head, on a benchmark dataset. | |
| Available Datasets: [MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro) | |
| Available Models: [Mistral-7B-v0.1](https://huggingface.co/mistralai/Mistral-7B-v0.1) | |
| """) | |

    # Dataset Selection Section
    gr.Markdown("## (A) Select Dataset for Evaluation")
    with gr.Row():
        dataset_dropdown = gr.Dropdown(
            choices=["(Select Dataset)", "MMLU-Pro"],
            value="(Select Dataset)",
            label="Dataset",
            info="Select a dataset to perform the Head-to-Head Evaluation on. Available Datasets: [MMLU-Pro](https://huggingface.co/datasets/TIGER-Lab/MMLU-Pro)"
        )
        preview_toggle = gr.Button("Show Dataset Preview", interactive=False, variant="secondary")

    # Dataset Preview Container - initially hidden
    with gr.Column(visible=False) as dataset_preview_container:
        gr.Markdown("## Dataset Preview", elem_id="preview_header")
        preview_output = gr.DataFrame(
            interactive=False,
            wrap=True,
            elem_id="preview_table"
        )
        # Add a divider instead of lots of space
        gr.Markdown("<div class='section-divider'></div>")

    # Preview data state to store the loaded preview data
    preview_data_state = gr.State(None)

    # MMLU Config Container - initially hidden until a dataset is selected
    with gr.Column(visible=False) as mmlu_config_container:
        gr.Markdown("## (B) Select Dataset Configuration Options")
        with gr.Row():
            # Left column for subject selection
            with gr.Column(scale=1):
                with gr.Group(elem_classes=["config-box"]):
                    gr.Markdown("### Choose Subjects")
                    subject_selection_mode = gr.Radio(
                        choices=["Evaluate All Subjects", "Choose Number of Subjects", "Specify which Subjects to Evaluate"],
                        value="Evaluate All Subjects",
                        label="Subject Selection Mode"
                    )
                    # Subject number slider - shown only when "Choose Number of Subjects" is selected
                    with gr.Column(visible=False) as num_subjects_container:
                        num_subjects_slider = gr.Slider(
                            minimum=1,
                            maximum=14,  # Updated dynamically from the preview data
                            value=1,
                            step=1,
                            label="Number of Subjects",
                            info="Number of subjects to evaluate. They will be loaded in alphabetical order."
                        )
                    # Subject checkboxes - shown only when "Specify which Subjects to Evaluate" is selected
                    with gr.Column(visible=False) as specific_subjects_container:
                        specific_subjects = gr.CheckboxGroup(
                            choices=[],  # Populated from the preview data
                            label="Select Specific Subjects",
                            info="Select which specific subjects to evaluate"
                        )
            # Right column for few-shot examples
            with gr.Column(scale=1):
                with gr.Group(elem_classes=["config-box"]):
                    gr.Markdown("### Few-shot Configuration")
                    num_shots_slider = gr.Slider(
                        minimum=0,
                        maximum=5,
                        value=5,
                        step=1,
                        label="Number of Few-shot Examples",
                        info="Number of examples to use for few-shot learning (0-5). They will be loaded in order of question_id."
                    )
                    # Add a small space
                    gr.Markdown(" ")
                    with gr.Row():
                        all_questions_checkbox = gr.Checkbox(
                            label="Evaluate All Questions",
                            value=False,
                            info="When checked, evaluates all available questions for each subject"
                        )
                        questions_info_text = gr.Markdown(
                            visible=False,
                            value="**All questions across the selected subjects will be evaluated**"
                        )
                    with gr.Row(elem_id="questions_selection_row"):
                        questions_container = gr.Column(scale=1, elem_id="questions_slider_container")
                        with questions_container:
                            num_questions_slider = gr.Slider(
                                minimum=1,
                                maximum=100,
                                value=20,
                                step=1,
                                label="Questions per Subject",
                                info="Choose a subset of questions (1-100) per subject. They will be loaded in order of question_id.",
                                interactive=True
                            )
        with gr.Row():
            with gr.Column(scale=1):
                eval_mmlu_button = gr.Button("Run MMLU-Pro Evaluation", variant="primary", interactive=True)
                cancel_mmlu_button = gr.Button("Cancel Evaluation", variant="stop", visible=False)
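
    # UI pattern: sections are created hidden (visible=False) and toggled from
    # event handlers via gr.update(visible=...), rather than being rebuilt on demand.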

    # Results Section - initially hidden
    with gr.Column(visible=False) as results_container:
        results_output = gr.Markdown(label="Evaluation Results")

    # Results table - initially hidden until evaluation completes
    with gr.Column(visible=False) as results_table_container:
        with gr.Row():
            results_table = gr.DataFrame(
                interactive=True,
                label="Detailed Results (Sortable)",
                visible=True
            )

    # Track evaluation state
    evaluation_state = gr.State({"running": False})
    # Track preview visibility state
    preview_visibility = gr.State(False)
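    # (gr.State values are per-session, so concurrent visitors get independent
    # evaluation and preview state.)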

    # Build "Subject (n=count)" choices from the dataset preview data
    def get_subject_choices_from_preview(preview_data):
        if not preview_data or 'subject_counts' not in preview_data:
            print("⚠️ Preview data is missing 'subject_counts'; no subject choices available")
            return [], 0
        # Sort subjects alphabetically and format as "Subject (n=count)"
        subject_counts = preview_data['subject_counts']
        subjects = sorted(subject_counts.keys())
        formatted_subjects = [f"{subject} (n={subject_counts[subject]})" for subject in subjects]
        return formatted_subjects, len(subjects)
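    # Example (illustrative counts):
    #   {"biology": 717, "math": 1351} -> (["biology (n=717)", "math (n=1351)"], 2)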

    # Function to load preview data and update UI
    def load_dataset_preview(dataset):
        if dataset == "MMLU-Pro":
            # Load the preview data
            preview_data = mmlupro_dataset_preview(regenerate_preview=False)
            # Extract subject choices and count
            subject_choices, subject_count = get_subject_choices_from_preview(preview_data)
            # Update the UI components
            return (
                preview_data,                               # store the preview data
                gr.update(choices=subject_choices),         # update checkbox choices
                gr.update(maximum=subject_count, value=1)   # update slider max
            )
        return None, gr.update(), gr.update()

    # Function to show/hide configuration based on the selected dataset
    def update_interface_based_on_dataset(dataset):
        if dataset == "MMLU-Pro":
            return (
                gr.update(visible=True),                                     # mmlu_config_container
                gr.update(visible=True),                                     # results_container
                gr.update(interactive=True, value="Show Dataset Preview"),   # preview_toggle - enabled, text reset
                gr.update(visible=False),                                    # dataset_preview_container - hidden initially
                False                                                        # reset preview_visibility
            )
        else:
            return (
                gr.update(visible=False),                                    # mmlu_config_container
                gr.update(visible=False),                                    # results_container
                gr.update(interactive=False, value="Show Dataset Preview"),  # preview_toggle - disabled, text reset
                gr.update(visible=False),                                    # dataset_preview_container
                False                                                        # reset preview_visibility
            )

    # Connect dataset dropdown: load the preview data, then show/hide the
    # appropriate configuration sections
    dataset_dropdown.change(
        fn=load_dataset_preview,
        inputs=[dataset_dropdown],
        outputs=[preview_data_state, specific_subjects, num_subjects_slider],
    ).then(
        fn=update_interface_based_on_dataset,
        inputs=[dataset_dropdown],
        outputs=[mmlu_config_container, results_container, preview_toggle, dataset_preview_container, preview_visibility]
    )
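    # (.then() steps run sequentially, so the subject choices are populated
    # before the configuration panel becomes visible.)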

    # Function to toggle dataset preview visibility
    def toggle_preview(dataset, currently_visible, preview_data):
        # Toggle the visibility state
        is_visible = not currently_visible
        # Update button text based on the new state
        button_text = "Hide Dataset Preview" if is_visible else "Show Dataset Preview"
        # Format and show the preview if it is becoming visible
        if is_visible and dataset == "MMLU-Pro":
            formatted_preview = format_preview_for_display(preview_data)
            return is_visible, gr.update(visible=True), formatted_preview, gr.update(value=button_text)
        elif is_visible:
            # Other datasets are not implemented yet
            return is_visible, gr.update(visible=True), None, gr.update(value=button_text)
        else:
            # Hiding the preview
            return is_visible, gr.update(visible=False), None, gr.update(value=button_text)

    # Connect preview toggle to show/hide dataset information
    preview_toggle.click(
        fn=toggle_preview,
        inputs=[dataset_dropdown, preview_visibility, preview_data_state],
        outputs=[preview_visibility, dataset_preview_container, preview_output, preview_toggle]
    )

    # Function to update UI based on subject selection mode
    def update_subject_selection_ui(mode):
        if mode == "Evaluate All Subjects":
            return gr.update(visible=False), gr.update(visible=False)
        elif mode == "Choose Number of Subjects":
            return gr.update(visible=True), gr.update(visible=False)
        else:  # "Specify which Subjects to Evaluate"
            return gr.update(visible=False), gr.update(visible=True)

    # Connect subject selection mode to UI updates
    subject_selection_mode.change(
        fn=update_subject_selection_ui,
        inputs=[subject_selection_mode],
        outputs=[num_subjects_container, specific_subjects_container]
    )

    # Update interface based on the all_questions checkbox
    def update_questions_interface(checked):
        if checked:
            return gr.update(visible=False), gr.update(visible=True)
        else:
            return gr.update(visible=True), gr.update(visible=False)

    all_questions_checkbox.change(
        fn=update_questions_interface,
        inputs=[all_questions_checkbox],
        outputs=[questions_container, questions_info_text]
    )

    # Function to convert subject selection mode to parameters
    def get_subject_mode_param(mode):
        if mode == "Evaluate All Subjects":
            return "all"
        elif mode == "Choose Number of Subjects":
            return "number"
        else:  # "Specify which Subjects to Evaluate"
            return "specific"

    # Function to extract subject names from the checkbox labels
    def get_subject_names(selected_subjects):
        # Extract just the subject name without the count
        return [subject.split(" (")[0] for subject in selected_subjects]
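    # Example: get_subject_names(["math (n=1351)"]) -> ["math"]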

    # Function to disable UI components during evaluation
    def start_evaluation(state):
        if state["running"]:
            # Another run is in progress: keep everything locked and warn the user
            return [
                state,
                gr.update(interactive=False),  # subject_selection_mode
                gr.update(interactive=False),  # num_subjects_slider
                gr.update(interactive=False),  # specific_subjects
                gr.update(interactive=False),  # num_shots_slider
                gr.update(interactive=False),  # all_questions_checkbox
                gr.update(interactive=False),  # num_questions_slider
                gr.update(interactive=False),  # eval_mmlu_button
                gr.update(visible=True),       # cancel_mmlu_button
                "Evaluation already in progress. Please wait.",  # results_output
                None,                          # results_table
                gr.update(visible=False)       # results_table_container
            ]
        # Update state to running
        state["running"] = True
        return [
            state,
            gr.update(interactive=False),  # subject_selection_mode
            gr.update(interactive=False),  # num_subjects_slider
            gr.update(interactive=False),  # specific_subjects
            gr.update(interactive=False),  # num_shots_slider
            gr.update(interactive=False),  # all_questions_checkbox
            gr.update(interactive=False),  # num_questions_slider
            gr.update(interactive=False),  # eval_mmlu_button
            gr.update(visible=True),       # cancel_mmlu_button
            "Starting evaluation...",      # results_output
            None,                          # results_table
            gr.update(visible=False)       # results_table_container
        ]

    # Function to reset the running flag after evaluation
    def finish_evaluation(state):
        state["running"] = False
        return state

    # Function to handle cancel button click.
    # Note: this doesn't actually stop the evaluation process;
    # it only updates the UI state to appear canceled.
    def cancel_evaluation(state):
        state["running"] = False
        return [
            state,
            gr.update(interactive=True),   # subject_selection_mode
            gr.update(interactive=True),   # num_subjects_slider
            gr.update(interactive=True),   # specific_subjects
            gr.update(interactive=True),   # num_shots_slider
            gr.update(interactive=True),   # all_questions_checkbox
            gr.update(interactive=True),   # num_questions_slider
            gr.update(interactive=True),   # eval_mmlu_button
            gr.update(visible=False),      # cancel_mmlu_button
            "⚠️ Evaluation canceled by user (note: backend process may continue running)",  # results_output
            None,                          # results_table
            gr.update(visible=False)       # results_table_container
        ]
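    # (If true cancellation is ever needed, gradio event listeners accept a
    # `cancels=[...]` argument that aborts the targeted pending event; the
    # handler above only resets the UI.)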

    # Connect MMLU evaluation button with state tracking
    eval_mmlu_button.click(
        fn=start_evaluation,
        inputs=[evaluation_state],
        outputs=[
            evaluation_state,
            subject_selection_mode,
            num_subjects_slider,
            specific_subjects,
            num_shots_slider,
            all_questions_checkbox,
            num_questions_slider,
            eval_mmlu_button,
            cancel_mmlu_button,
            results_output,
            results_table,
            results_table_container
        ]
    ).then(
        fn=lambda mode, num, subjects, shots, all_q, num_q: run_mmlu_evaluation(
            get_subject_mode_param(mode),
            num,
            get_subject_names(subjects),
            shots,
            all_q,
            num_q
        ),
        inputs=[
            subject_selection_mode,
            num_subjects_slider,
            specific_subjects,
            num_shots_slider,
            all_questions_checkbox,
            num_questions_slider
        ],
        outputs=[
            results_output,
            results_table,
            eval_mmlu_button,
            cancel_mmlu_button,
            subject_selection_mode,
            specific_subjects,
            num_subjects_slider,
            num_shots_slider,
            all_questions_checkbox,
            num_questions_slider,
            results_table_container
        ]
    ).then(
        fn=finish_evaluation,
        inputs=[evaluation_state],
        outputs=[evaluation_state]
    )

    # Connect cancel button
    cancel_mmlu_button.click(
        fn=cancel_evaluation,
        inputs=[evaluation_state],
        outputs=[
            evaluation_state,
            subject_selection_mode,
            num_subjects_slider,
            specific_subjects,
            num_shots_slider,
            all_questions_checkbox,
            num_questions_slider,
            eval_mmlu_button,
            cancel_mmlu_button,
            results_output,
            results_table,
            results_table_container
        ]
    )
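
# Note: long-running evaluations usually warrant demo.queue() before launch()
# so requests don't hit HTTP timeouts.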
demo.launch()