#!/usr/bin/env python3
"""
GAIA Benchmark AI Agent - Hugging Face Space
============================================
A Gradio-based web interface for running GAIA benchmark evaluations
on Hugging Face Spaces with GPU acceleration.
"""
import gradio as gr
import torch
import json
import os
import logging
import time
import re
from datetime import datetime
from typing import Dict, List, Optional, Tuple, Any
from dataclasses import dataclass
import pandas as pd
from pathlib import Path

# Core ML libraries
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline
)
from datasets import load_dataset
from huggingface_hub import HfApi, hf_hub_download

# Import leaderboard integration
from gaia_leaderboard_integration import (
    enhanced_gaia_agent,
    run_custom_benchmark_interface,
    load_test_questions_interface,
    preview_dataset_structure_interface,
    get_leaderboard_info,
    get_question_selection_info
)

# Setup logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# ================================
# CORE DATA STRUCTURES
# ================================
@dataclass
class GAIAQuestion:
    """Structure for GAIA benchmark questions"""
    task_id: str
    question: str
    level: int
    final_answer: Optional[str] = None
    file_name: Optional[str] = None
    annotator_metadata: Optional[Dict] = None

    @classmethod
    def from_dict(cls, data: dict):
        return cls(**{k: v for k, v in data.items() if k in cls.__annotations__})


@dataclass
class GAIAResponse:
    """Structure for GAIA responses"""
    task_id: str
    model_answer: str
    reasoning_trace: str
    final_answer: str
    processing_time: float = 0.0
    confidence_score: float = 0.0
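
# Illustrative construction from a raw dict (a sketch; the values are placeholders,
# not real benchmark data). from_dict() silently drops keys that are not declared
# fields, so extra dataset columns do not break construction:
#   q = GAIAQuestion.from_dict({"task_id": "sample_001",
#                               "question": "What is the capital of France?",
#                               "level": 1,
#                               "extra_column": "ignored"})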
# ================================
# GAIA PROMPT MANAGEMENT
# ================================
class GAIAPromptManager:
    """Manages GAIA-specific prompting and formatting"""

    GAIA_SYSTEM_PROMPT = """You are a general AI assistant. I will ask you a question. Report your thoughts, and finish your answer with the following template:
FINAL ANSWER: [YOUR FINAL ANSWER]
YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma separated list of numbers and/or strings. If you are asked for a number, don't use comma to write your number neither use units such as $ or percent sign unless specified otherwise. If you are asked for a string, don't use articles, neither abbreviations (e.g. for cities), and write the digits in plain text unless specified otherwise. If you are asked for a comma separated list, apply the above rules depending of whether the element to be put in the list is a number or a string."""

    @staticmethod
    def create_gaia_prompt(question: str) -> str:
        """Create a properly formatted GAIA prompt"""
        return f"{GAIAPromptManager.GAIA_SYSTEM_PROMPT}\n\nQuestion: {question}\n\nLet me think step by step:"

    @staticmethod
    def extract_final_answer(response: str) -> Tuple[str, str]:
        """Extract the final answer and reasoning trace from a model response"""
        final_answer_pattern = r"FINAL ANSWER:\s*(.+?)(?:\n|$)"
        match = re.search(final_answer_pattern, response, re.IGNORECASE | re.DOTALL)
        if match:
            final_answer = match.group(1).strip()
            reasoning = response[:match.start()].strip()
        else:
            # Fallback: if the template marker is missing, treat the last line as the answer
            lines = response.strip().split('\n')
            final_answer = lines[-1].strip() if lines else ""
            reasoning = '\n'.join(lines[:-1]) if len(lines) > 1 else response
        return final_answer, reasoning
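
# Illustrative usage (not executed at import time; the completion text is a made-up example).
# Given a raw completion such as:
#   "Paris is the capital of France.\nFINAL ANSWER: Paris"
# extract_final_answer() returns ("Paris", "Paris is the capital of France."),
# i.e. the answer string and the reasoning trace that preceded the template marker.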
# ================================
# HF SPACES OPTIMIZED MODEL MANAGER
# ================================
class HFSpaceModelManager:
    """Hugging Face Spaces optimized model manager"""

    SPACE_MODELS = {
        "Fast & Light": {
            "name": "microsoft/DialoGPT-medium",
            "size": "~345MB",
            "speed": "Fast",
            "quality": "Good",
            "gpu_required": False
        },
        "Balanced": {
            "name": "stabilityai/stablelm-zephyr-3b",
            "size": "~3GB",
            "speed": "Medium",
            "quality": "Better",
            "gpu_required": True
        },
        "High Quality": {
            "name": "HuggingFaceH4/zephyr-7b-beta",
            "size": "~7GB",
            "speed": "Slower",
            "quality": "Best",
            "gpu_required": True
        },
        "Instruction Following": {
            "name": "mistralai/Mistral-7B-Instruct-v0.1",
            "size": "~7GB",
            "speed": "Medium",
            "quality": "Excellent",
            "gpu_required": True
        }
    }

    def __init__(self, model_choice: str = "Fast & Light"):
        self.model_config = self.SPACE_MODELS[model_choice]
        self.model_name = self.model_config["name"]
        self.tokenizer = None
        self.model = None
        self.pipeline = None
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def load_model(self, progress_callback=None) -> str:
        """Load the model with progress updates"""
        try:
            if progress_callback:
                progress_callback(0.1, "Loading tokenizer...")
            self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
            if self.tokenizer.pad_token is None:
                self.tokenizer.pad_token = self.tokenizer.eos_token

            if progress_callback:
                progress_callback(0.3, "Configuring model...")
            # Quantize 7B models to 4-bit NF4 on GPU so they fit in Space memory
            quantization_config = None
            if self.device == "cuda" and "7b" in self.model_name.lower():
                quantization_config = BitsAndBytesConfig(
                    load_in_4bit=True,
                    bnb_4bit_compute_dtype=torch.float16,
                    bnb_4bit_use_double_quant=True,
                    bnb_4bit_quant_type="nf4"
                )

            if progress_callback:
                progress_callback(0.6, "Loading model weights...")
            self.model = AutoModelForCausalLM.from_pretrained(
                self.model_name,
                quantization_config=quantization_config,
                device_map="auto" if self.device == "cuda" else None,
                torch_dtype=torch.float16 if self.device == "cuda" else torch.float32,
                trust_remote_code=True
            )

            if progress_callback:
                progress_callback(0.9, "Creating pipeline...")
            self.pipeline = pipeline(
                "text-generation",
                model=self.model,
                tokenizer=self.tokenizer,
                max_new_tokens=384,
                temperature=0.7,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                device=0 if self.device == "cuda" else -1
            )

            if progress_callback:
                progress_callback(1.0, "Model loaded successfully!")
            return f"✅ Model '{self.model_name}' loaded successfully on {self.device.upper()}"
        except Exception as e:
            error_msg = f"❌ Error loading model: {str(e)}"
            logger.error(error_msg)
            return error_msg

    def generate_response(self, prompt: str, max_tokens: int = 384) -> str:
        """Generate a response with error handling"""
        if self.pipeline is None:
            return "❌ Model not loaded. Please load a model first."
        try:
            # Crude character-level truncation to keep long prompts inside the context window
            max_input_length = 1000
            if len(prompt) > max_input_length:
                prompt = prompt[:max_input_length] + "..."
            outputs = self.pipeline(
                prompt,
                max_new_tokens=max_tokens,
                temperature=0.7,
                do_sample=True,
                return_full_text=False,
                pad_token_id=self.tokenizer.eos_token_id
            )
            response = outputs[0]['generated_text'].strip()
            return response
        except Exception as e:
            return f"❌ Error generating response: {str(e)}"
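
# Illustrative standalone usage (a sketch; in the Space the manager is only driven
# through the Gradio UI defined below, and loading a model downloads weights):
#   manager = HFSpaceModelManager("Fast & Light")
#   print(manager.load_model())
#   print(manager.generate_response("Question: What is 2 + 2?"))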
# ================================
# DATASET MANAGEMENT
# ================================
class GAIADatasetManager:
    """Manages GAIA dataset loading and sample generation"""

    @staticmethod
    def load_gaia_dataset(split: str = "test", max_questions: int = None) -> Tuple[List[GAIAQuestion], str]:
        """Load the GAIA dataset from the Hugging Face Hub"""
        try:
            dataset = load_dataset("gaia-benchmark/GAIA", split=split, trust_remote_code=True)
            # Limit the number of rows while keeping row-wise iteration
            if max_questions:
                dataset = dataset.select(range(min(max_questions, len(dataset))))
            questions = []
            for i, item in enumerate(dataset):
                question = GAIAQuestion(
                    task_id=item.get('task_id', f'gaia_{split}_{i:03d}'),
                    question=item['Question'],
                    level=item['Level'],
                    final_answer=item.get('Final answer', None),
                    file_name=item.get('file_name', None),
                    annotator_metadata=item.get('Annotator Metadata', None)
                )
                questions.append(question)
            status = f"✅ Loaded {len(questions)} questions from GAIA {split} split"
            return questions, status
        except Exception as e:
            error_msg = f"❌ Error loading GAIA dataset: {str(e)}"
            return GAIADatasetManager.get_sample_questions(), error_msg

    @staticmethod
    def get_sample_questions() -> List[GAIAQuestion]:
        """Get sample questions for testing"""
        sample_data = [
            {
                "task_id": "sample_001",
                "question": "What is the capital of France?",
                "level": 1,
                "final_answer": "Paris"
            },
            {
                "task_id": "sample_002",
                "question": "Calculate 144 divided by 12.",
                "level": 1,
                "final_answer": "12"
            },
            {
                "task_id": "sample_003",
                "question": "What is the largest planet in our solar system?",
                "level": 1,
                "final_answer": "Jupiter"
            },
            {
                "task_id": "sample_004",
                "question": "Convert 100 degrees Celsius to Fahrenheit.",
                "level": 2,
                "final_answer": "212"
            },
            {
                "task_id": "sample_005",
                "question": "List the first three even numbers greater than zero.",
                "level": 1,
                "final_answer": "2, 4, 6"
            },
            {
                "task_id": "sample_006",
                "question": "What year did the Berlin Wall fall?",
                "level": 1,
                "final_answer": "1989"
            },
            {
                "task_id": "sample_007",
                "question": "What is the chemical symbol for water?",
                "level": 1,
                "final_answer": "H2O"
            },
            {
                "task_id": "sample_008",
                "question": "How many continents are there?",
                "level": 1,
                "final_answer": "7"
            }
        ]
        return [GAIAQuestion.from_dict(data) for data in sample_data]
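
# Illustrative usage: get_sample_questions() returns eight GAIAQuestion objects
# (e.g. task_id "sample_001", question "What is the capital of France?",
# final_answer "Paris") and doubles as the fallback whenever the GAIA dataset
# cannot be downloaded from the Hub:
#   questions = GAIADatasetManager.get_sample_questions()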
# ================================
# MAIN GAIA AGENT FOR HF SPACES
# ================================
class GAIASpaceAgent:
    """Main GAIA agent optimized for Hugging Face Spaces"""

    def __init__(self):
        self.model_manager = None
        self.prompt_manager = GAIAPromptManager()
        self.current_model = None
        self.evaluation_results: List[GAIAResponse] = []

    def initialize_model(self, model_choice: str, progress=gr.Progress()) -> str:
        """Initialize model with progress tracking"""
        try:
            progress(0, desc="Initializing model manager...")
            self.model_manager = HFSpaceModelManager(model_choice)
            self.current_model = model_choice

            def progress_callback(value, desc):
                progress(value, desc=desc)

            result = self.model_manager.load_model(progress_callback)
            self.evaluation_results = []
            return result
        except Exception as e:
            return f"❌ Failed to initialize model: {str(e)}"

    def process_single_question(self, question_text: str, progress=gr.Progress()) -> Tuple[str, str, str, float]:
        """Process a single question with detailed output"""
        if self.model_manager is None or self.model_manager.pipeline is None:
            return "❌ No model loaded", "", "", 0.0
        start_time = time.time()
        try:
            progress(0.2, desc="Creating GAIA prompt...")
            prompt = self.prompt_manager.create_gaia_prompt(question_text)
            progress(0.4, desc="Generating response...")
            raw_response = self.model_manager.generate_response(prompt)
            progress(0.8, desc="Extracting final answer...")
            final_answer, reasoning = self.prompt_manager.extract_final_answer(raw_response)
            processing_time = time.time() - start_time
            progress(1.0, desc="Complete!")
            return final_answer, raw_response, reasoning, processing_time
        except Exception as e:
            processing_time = time.time() - start_time
            error_msg = f"❌ Error processing question: {str(e)}"
            return error_msg, "", "", processing_time
    def batch_evaluate(self, questions: List[GAIAQuestion], progress=gr.Progress()) -> Tuple[str, str, str]:
        """Evaluate multiple questions with progress tracking"""
        if self.model_manager is None:
            return "❌ No model loaded", "", ""
        results = []
        total_questions = len(questions)
        progress(0, desc=f"Starting evaluation of {total_questions} questions...")
        for i, question in enumerate(questions):
            try:
                progress((i + 1) / total_questions,
                         desc=f"Processing question {i + 1}/{total_questions}: {question.task_id}")
                start_time = time.time()
                prompt = self.prompt_manager.create_gaia_prompt(question.question)
                raw_response = self.model_manager.generate_response(prompt)
                final_answer, reasoning = self.prompt_manager.extract_final_answer(raw_response)
                processing_time = time.time() - start_time
                response = GAIAResponse(
                    task_id=question.task_id,
                    model_answer=raw_response,
                    reasoning_trace=reasoning,
                    final_answer=final_answer,
                    processing_time=processing_time
                )
                results.append(response)
                self.evaluation_results.append(response)
            except Exception as e:
                logger.error(f"Error processing {question.task_id}: {e}")
                error_response = GAIAResponse(
                    task_id=question.task_id,
                    model_answer=f"Error: {str(e)}",
                    reasoning_trace="Processing failed",
                    final_answer="ERROR",
                    processing_time=0.0
                )
                results.append(error_response)
                self.evaluation_results.append(error_response)
        summary = self._generate_summary(results)
        detailed_results = self._generate_detailed_results(results, questions)
        jsonl_content = self._generate_jsonl(results)
        return summary, detailed_results, jsonl_content
    def _generate_summary(self, results: List[GAIAResponse]) -> str:
        """Generate evaluation summary"""
        total = len(results)
        errors = sum(1 for r in results if r.final_answer == "ERROR")
        successful = total - errors
        avg_time = sum(r.processing_time for r in results) / total if total > 0 else 0
        total_time = sum(r.processing_time for r in results)
        # Guard against division by zero when every question failed instantly
        questions_per_minute = (total / (total_time / 60)) if total_time > 0 else 0.0
        summary = f"""
# 📊 GAIA Evaluation Summary
## Overall Statistics
- **Total Questions**: {total}
- **Successful**: {successful}
- **Errors**: {errors}
- **Success Rate**: {(successful / total * 100):.1f}%
## Performance Metrics
- **Average Processing Time**: {avg_time:.2f}s
- **Total Processing Time**: {total_time:.2f}s
- **Questions per Minute**: {questions_per_minute:.1f}
## Model Information
- **Model**: {self.current_model}
- **Device**: {self.model_manager.device.upper() if self.model_manager else 'Unknown'}
"""
        return summary
    def _generate_detailed_results(self, results: List[GAIAResponse], questions: List[GAIAQuestion]) -> str:
        """Generate detailed results breakdown"""
        detailed = "# 📋 Detailed Results\n\n"
        for i, (result, question) in enumerate(zip(results, questions), 1):
            status = "✅" if result.final_answer != "ERROR" else "❌"
            detailed += f"""
## Question {i}: {question.task_id} {status}
**Question**: {question.question}
**Model Answer**: {result.final_answer}
**Expected Answer**: {question.final_answer if question.final_answer else 'N/A'}
**Processing Time**: {result.processing_time:.2f}s
**Level**: {question.level}
---
"""
        return detailed

    def _generate_jsonl(self, results: List[GAIAResponse]) -> str:
        """Generate JSONL format for download"""
        jsonl_lines = []
        for result in results:
            line = {
                "task_id": result.task_id,
                "model_answer": result.model_answer,
                "reasoning_trace": result.reasoning_trace
            }
            jsonl_lines.append(json.dumps(line))
        return '\n'.join(jsonl_lines)
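
# Each line of the downloadable file is one JSON object with the three keys built
# above, e.g. (illustrative values only):
#   {"task_id": "sample_001", "model_answer": "FINAL ANSWER: Paris", "reasoning_trace": "..."}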
# ================================
# GLOBAL AGENT INSTANCE
# ================================
gaia_agent = GAIASpaceAgent()

# ================================
# GRADIO INTERFACE FUNCTIONS
# ================================
def load_model_interface(model_choice: str, progress=gr.Progress()):
    """Interface function for model loading"""
    return gaia_agent.initialize_model(model_choice, progress)


def single_question_interface(question: str, progress=gr.Progress()):
    """Interface function for single question processing"""
    if not question.strip():
        return "Please enter a question", "", "", "0.00s"
    final_answer, full_response, reasoning, proc_time = gaia_agent.process_single_question(question, progress)
    return (
        final_answer,
        full_response,
        reasoning,
        f"{proc_time:.2f}s"
    )


def batch_evaluate_interface(dataset_choice: str, max_questions: int, progress=gr.Progress()):
    """Interface function for batch evaluation"""
    if gaia_agent.model_manager is None:
        return "❌ Please load a model first", "", ""
    progress(0.1, desc="Loading dataset...")
    if dataset_choice == "Sample Questions":
        questions = GAIADatasetManager.get_sample_questions()
        status_msg = f"✅ Loaded {len(questions)} sample questions"
    else:
        questions, status_msg = GAIADatasetManager.load_gaia_dataset("test", max_questions)
    if max_questions and len(questions) > max_questions:
        questions = questions[:max_questions]
    progress(0.2, desc=f"{status_msg}. Starting evaluation...")
    summary, detailed, jsonl = gaia_agent.batch_evaluate(questions, progress)
    return summary, detailed, jsonl


def get_model_info(model_choice: str):
    """Get information about the selected model"""
    if model_choice in HFSpaceModelManager.SPACE_MODELS:
        config = HFSpaceModelManager.SPACE_MODELS[model_choice]
        return f"""
**Model**: {config['name']}
**Size**: {config['size']}
**Speed**: {config['speed']}
**Quality**: {config['quality']}
**GPU Required**: {'Yes' if config['gpu_required'] else 'No'}
"""
    return "Model information not available"
# ================================
# GRADIO APP CREATION
# ================================
def create_gaia_app():
    """Create the main Gradio application"""
    with gr.Blocks(
        title="GAIA Benchmark AI Agent",
        theme=gr.themes.Soft()
    ) as app:
        gr.HTML("""
        <div style="text-align: center; font-size: 2.5em; font-weight: bold; margin-bottom: 20px;">
            🧠 GAIA Benchmark AI Agent
        </div>
        <p style="text-align: center; font-size: 1.2em; color: #666;">
            Evaluate AI models on the GAIA benchmark with step-by-step reasoning
        </p>
        """)

        with gr.Tabs():
            # TAB 1: MODEL SETUP
            with gr.Tab("🔧 Model Setup"):
                gr.Markdown("## Choose and Load Your Model")
                with gr.Row():
                    with gr.Column(scale=2):
                        model_dropdown = gr.Dropdown(
                            choices=list(HFSpaceModelManager.SPACE_MODELS.keys()),
                            value="Fast & Light",
                            label="Select Model"
                        )
                        model_info = gr.Markdown(
                            value=get_model_info("Fast & Light"),
                            label="Model Information"
                        )
                        load_btn = gr.Button("🚀 Load Model", variant="primary", size="lg")
                    with gr.Column(scale=1):
                        gpu_info = gr.Markdown(f"""
                        ### System Info
                        **CUDA Available**: {torch.cuda.is_available()}
                        {f"**GPU**: {torch.cuda.get_device_name(0)}" if torch.cuda.is_available() else "**Device**: CPU"}
                        """)
                        model_status = gr.Textbox(
                            label="Model Status",
                            value="No model loaded",
                            interactive=False
                        )
                model_dropdown.change(
                    fn=get_model_info,
                    inputs=[model_dropdown],
                    outputs=[model_info]
                )
                load_btn.click(
                    fn=load_model_interface,
                    inputs=[model_dropdown],
                    outputs=[model_status]
                )
            # TAB 2: SINGLE QUESTION
            with gr.Tab("❓ Single Question"):
                gr.Markdown("## Test Individual Questions")
                with gr.Row():
                    with gr.Column():
                        question_input = gr.Textbox(
                            label="Enter your question",
                            placeholder="e.g., What is the capital of France?",
                            lines=3
                        )
                        process_btn = gr.Button("🤔 Process Question", variant="primary")
                        gr.Markdown("### Example Questions:")
                        example_questions = [
                            "What is the capital of France?",
                            "Calculate 144 divided by 12",
                            "What is the largest planet in our solar system?",
                            "Convert 100 degrees Celsius to Fahrenheit"
                        ]
                        for example in example_questions:
                            gr.Button(f"📝 {example}", size="sm").click(
                                lambda x=example: x,
                                outputs=[question_input]
                            )
                    with gr.Column():
                        final_answer_output = gr.Textbox(
                            label="🎯 Final Answer",
                            interactive=False
                        )
                        processing_time = gr.Textbox(
                            label="⏱️ Processing Time",
                            interactive=False
                        )
                        with gr.Accordion("🧠 Full Response", open=False):
                            full_response = gr.Textbox(
                                label="Complete Model Response",
                                lines=8,
                                interactive=False
                            )
                        with gr.Accordion("🔍 Reasoning Trace", open=False):
                            reasoning_trace = gr.Textbox(
                                label="Step-by-step Reasoning",
                                lines=6,
                                interactive=False
                            )
                process_btn.click(
                    fn=single_question_interface,
                    inputs=[question_input],
                    outputs=[final_answer_output, full_response, reasoning_trace, processing_time]
                )
            # TAB 3: BATCH EVALUATION
            with gr.Tab("📊 Batch Evaluation"):
                gr.Markdown("## Evaluate Multiple Questions")
                with gr.Row():
                    dataset_choice = gr.Radio(
                        choices=["Sample Questions", "GAIA Test Set"],
                        value="Sample Questions",
                        label="Dataset Choice"
                    )
                    max_questions = gr.Slider(
                        minimum=1,
                        maximum=50,
                        value=5,
                        step=1,
                        label="Max Questions"
                    )
                evaluate_btn = gr.Button("🚀 Start Batch Evaluation", variant="primary", size="lg")
                with gr.Row():
                    with gr.Column():
                        summary_output = gr.Markdown(
                            label="📊 Evaluation Summary",
                            value="No evaluation completed yet"
                        )
                    with gr.Column():
                        download_output = gr.File(
                            label="💾 Download Results (JSONL)",
                            visible=False
                        )
                with gr.Accordion("📋 Detailed Results", open=False):
                    detailed_output = gr.Markdown(
                        value="Run an evaluation to see detailed results"
                    )

                def batch_eval_with_download(*args):
                    summary, detailed, jsonl_content = batch_evaluate_interface(*args)
                    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                    filename = f"gaia_results_{timestamp}.jsonl"
                    with open(filename, 'w') as f:
                        f.write(jsonl_content)
                    return summary, detailed, filename

                evaluate_btn.click(
                    fn=batch_eval_with_download,
                    inputs=[dataset_choice, max_questions],
                    outputs=[summary_output, detailed_output, download_output]
                ).then(
                    lambda: gr.update(visible=True),
                    outputs=[download_output]
                )
            # TAB 4: FULL BENCHMARK
            with gr.Tab("🏆 Full Benchmark"):
                gr.Markdown("## Official GAIA Leaderboard Benchmark")
                with gr.Row():
                    with gr.Column():
                        test_preview_btn = gr.Button("🔍 Preview Test Questions", variant="secondary")
                        test_preview_output = gr.Markdown(
                            value="Click above to preview official test questions"
                        )
                        dataset_structure_btn = gr.Button("📁 Preview Dataset Structure", variant="secondary")
                        dataset_structure_output = gr.Markdown(
                            value="Click above to see actual GAIA dataset structure"
                        )
                    with gr.Column():
                        question_count = gr.Slider(
                            minimum=10,
                            maximum=300,
                            value=20,
                            step=10,
                            label="Number of Questions"
                        )
                        selection_strategy = gr.Dropdown(
                            choices=["balanced", "random", "sequential"],
                            value="balanced",
                            label="Selection Strategy"
                        )
                benchmark_btn = gr.Button("🎯 Run Benchmark", variant="primary", size="lg")
                benchmark_status = gr.Textbox(
                    label="📊 Benchmark Status",
                    value="Ready to run benchmark",
                    interactive=False
                )
                with gr.Row():
                    with gr.Column():
                        benchmark_report = gr.Markdown(
                            label="📈 Benchmark Report",
                            value="Run benchmark to see detailed results"
                        )
                    with gr.Column():
                        submission_file = gr.File(
                            label="💾 Download Submission File (JSONL)",
                            visible=False
                        )
                        metadata_file = gr.File(
                            label="📋 Download Metadata File",
                            visible=False
                        )

                # Event handlers
                test_preview_btn.click(
                    fn=lambda: load_test_questions_interface(max_questions=10, selection_type="balanced"),
                    outputs=[test_preview_output]
                )
                dataset_structure_btn.click(
                    fn=preview_dataset_structure_interface,
                    outputs=[dataset_structure_output]
                )

                def run_benchmark_wrapper(count, strategy, progress=gr.Progress()):
                    return run_custom_benchmark_interface(count, strategy, progress)

                def show_download_files(status, report, sub_file, meta_file):
                    return (
                        status,
                        report,
                        sub_file,
                        meta_file,
                        gr.update(visible=True),
                        gr.update(visible=True)
                    )

                benchmark_btn.click(
                    fn=run_benchmark_wrapper,
                    inputs=[question_count, selection_strategy],
                    outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
                ).then(
                    fn=show_download_files,
                    inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
                    outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
                )
    return app

# ================================
# MAIN APPLICATION
# ================================
if __name__ == "__main__":
    app = create_gaia_app()
    app.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False
    )