Update app.py

app.py CHANGED
@@ -30,14 +30,14 @@ from transformers import (
 from datasets import load_dataset
 from huggingface_hub import HfApi, hf_hub_download
 
-# Import leaderboard integration
+# Import leaderboard integration
 from gaia_leaderboard_integration import (
     enhanced_gaia_agent,
-    run_custom_benchmark_interface,
+    run_custom_benchmark_interface,
     load_test_questions_interface,
-    preview_dataset_structure_interface,
+    preview_dataset_structure_interface,
     get_leaderboard_info,
-    get_question_selection_info
+    get_question_selection_info
 )
 
 # Setup logging
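The leaderboard helpers imported here live in `gaia_leaderboard_integration.py`, which is not part of this diff. For readers following along without that module, a minimal stand-in consistent with how these functions are called later in `app.py` could look like the sketch below; only the names and call signatures come from this diff, the bodies are placeholders.

```python
# Hypothetical stub of gaia_leaderboard_integration.py. Names and signatures
# match the call sites visible in app.py; everything else is placeholder.

def get_leaderboard_info() -> str:
    # Markdown blurb shown in the Full Benchmark tab.
    return "See https://huggingface.co/spaces/gaia-benchmark/leaderboard"

def get_question_selection_info() -> str:
    return "Selection strategies: balanced, random, sequential."

def load_test_questions_interface(max_questions: int = 10,
                                  selection_type: str = "balanced") -> str:
    # Markdown preview of the selected official test questions.
    return f"Previewing {max_questions} questions ({selection_type})."

def preview_dataset_structure_interface() -> str:
    return "2023/test/metadata.jsonl and 2023/validation/metadata.jsonl"

def run_custom_benchmark_interface(count: int, strategy: str, progress=None):
    # Must return (status, report, submission_path, metadata_path) to match
    # the four outputs wired to it in the Full Benchmark tab.
    return "Ready", "No results yet", None, None

enhanced_gaia_agent = None  # the real module exposes a configured agent object
```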
@@ -45,6 +45,16 @@ logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
 # ================================
+# MAIN APPLICATION
+# ================================
+
+if __name__ == "__main__":
+    app = create_gaia_app()
+    app.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False
+    )
 # CORE DATA STRUCTURES
 # ================================
 
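Note that this hunk inserts the `__main__` guard at module line 48, well before `create_gaia_app` is defined further down the file (the matching removal at the bottom of this diff is where it used to live). Python executes module-level statements top to bottom, so when run as a script this now fails before the UI is ever built:

```python
# Minimal reproduction of the ordering problem this hunk introduces.
if __name__ == "__main__":
    app = create_gaia_app()  # NameError: 'create_gaia_app' is not defined yet

def create_gaia_app():
    ...
```

Keeping the guard after the last definition, as it was before this change, avoids the `NameError`.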
@@ -114,7 +124,6 @@ YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma sepa
 class HFSpaceModelManager:
     """Hugging Face Spaces optimized model manager"""
 
-    # Space-friendly models with different capabilities
     SPACE_MODELS = {
         "Fast & Light": {
             "name": "microsoft/DialoGPT-medium",
@@ -160,7 +169,6 @@ class HFSpaceModelManager:
         if progress_callback:
             progress_callback(0.1, "Loading tokenizer...")
 
-        # Load tokenizer
         self.tokenizer = AutoTokenizer.from_pretrained(self.model_name)
         if self.tokenizer.pad_token is None:
             self.tokenizer.pad_token = self.tokenizer.eos_token
@@ -168,7 +176,6 @@ class HFSpaceModelManager:
         if progress_callback:
             progress_callback(0.3, "Configuring model...")
 
-        # Configure quantization for GPU spaces
         quantization_config = None
         if self.device == "cuda" and "7b" in self.model_name.lower():
             quantization_config = BitsAndBytesConfig(
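The remaining `BitsAndBytesConfig(...)` arguments fall outside this hunk's context window. For reference, a typical 4-bit setup for a ~7B model on a T4-class GPU, which is the case this `if` guards, might look like the following sketch; the exact arguments `app.py` passes are not shown in this diff.

```python
import torch
from transformers import BitsAndBytesConfig

# Common 4-bit quantization settings for ~7B models on a 16 GB GPU.
# Illustrative values only; app.py's actual arguments are outside this hunk.
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
)
```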
@@ -181,7 +188,6 @@ class HFSpaceModelManager:
         if progress_callback:
             progress_callback(0.6, "Loading model weights...")
 
-        # Load model
         self.model = AutoModelForCausalLM.from_pretrained(
             self.model_name,
             quantization_config=quantization_config,
@@ -193,7 +199,6 @@ class HFSpaceModelManager:
         if progress_callback:
             progress_callback(0.9, "Creating pipeline...")
 
-        # Create pipeline
         self.pipeline = pipeline(
             "text-generation",
             model=self.model,
@@ -221,7 +226,6 @@ class HFSpaceModelManager:
             return "❌ Model not loaded. Please load a model first."
 
         try:
-            # Truncate prompt if too long
             max_input_length = 1000
             if len(prompt) > max_input_length:
                 prompt = prompt[:max_input_length] + "..."
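The truncation above counts characters, not tokens, so 1000 characters can still overflow a small context window once tokenized, and cutting mid-word before appending "..." can garble the prompt. A token-aware variant using the already-loaded tokenizer could look like this sketch; `max_input_tokens` is an assumed budget, not a value from `app.py`:

```python
def truncate_prompt(tokenizer, prompt: str, max_input_tokens: int = 512) -> str:
    # Truncate on token boundaries so the budget matches what the model sees.
    ids = tokenizer.encode(prompt, truncation=True, max_length=max_input_tokens)
    return tokenizer.decode(ids, skip_special_tokens=True)
```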
@@ -351,13 +355,10 @@ class GAIASpaceAgent:
         self.model_manager = HFSpaceModelManager(model_choice)
         self.current_model = model_choice
 
-        # Load model with progress updates
         def progress_callback(value, desc):
             progress(value, desc=desc)
 
         result = self.model_manager.load_model(progress_callback)
-
-        # Clear any previous results when changing models
         self.evaluation_results = []
 
         return result
@@ -374,22 +375,15 @@
 
         try:
             progress(0.2, desc="Creating GAIA prompt...")
-
-            # Create GAIA prompt
             prompt = self.prompt_manager.create_gaia_prompt(question_text)
 
             progress(0.4, desc="Generating response...")
-
-            # Generate response
             raw_response = self.model_manager.generate_response(prompt)
 
             progress(0.8, desc="Extracting final answer...")
-
-            # Extract final answer and reasoning
             final_answer, reasoning = self.prompt_manager.extract_final_answer(raw_response)
 
             processing_time = time.time() - start_time
-
             progress(1.0, desc="Complete!")
 
             return final_answer, raw_response, reasoning, processing_time
@@ -415,17 +409,11 @@
                     desc=f"Processing question {i + 1}/{total_questions}: {question.task_id}")
 
                 start_time = time.time()
-
-                # Create prompt and generate response
                 prompt = self.prompt_manager.create_gaia_prompt(question.question)
                 raw_response = self.model_manager.generate_response(prompt)
-
-                # Extract final answer
                 final_answer, reasoning = self.prompt_manager.extract_final_answer(raw_response)
-
                 processing_time = time.time() - start_time
 
-                # Create response object
                 response = GAIAResponse(
                     task_id=question.task_id,
                     model_answer=raw_response,
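`extract_final_answer` itself is not part of this diff, but the hunk header above quotes the system prompt ("YOUR FINAL ANSWER should be a number OR as few words as possible OR a comma sepa[rated list]..."), which suggests the model is asked to end with an explicit final-answer line. A minimal extractor consistent with that convention might be the following; the exact marker string used by the `PromptManager` is an assumption:

```python
import re

def extract_final_answer(raw_response: str) -> tuple[str, str]:
    """Split a raw response into (final_answer, reasoning).

    Assumes a GAIA-style "FINAL ANSWER: ..." line; the marker app.py
    actually uses is not visible in this diff.
    """
    match = re.search(r"FINAL ANSWER:\s*(.+)", raw_response, re.IGNORECASE)
    if match:
        return match.group(1).strip(), raw_response[: match.start()].strip()
    # Fallback: treat the last non-empty line as the answer.
    lines = [line.strip() for line in raw_response.splitlines() if line.strip()]
    return (lines[-1] if lines else ""), raw_response
```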
@@ -449,13 +437,8 @@
                 results.append(error_response)
                 self.evaluation_results.append(error_response)
 
-        # Generate summary
         summary = self._generate_summary(results)
-
-        # Generate detailed results
         detailed_results = self._generate_detailed_results(results, questions)
-
-        # Generate downloadable JSONL
         jsonl_content = self._generate_jsonl(results)
 
         return summary, detailed_results, jsonl_content
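`_generate_jsonl` is also outside this diff, but the submission format it must produce is documented in the Information tab removed further down: one JSON object per line with `task_id`, `model_answer`, and `reasoning_trace` fields. A minimal implementation under that assumption:

```python
import json

def generate_jsonl(results) -> str:
    # One JSON object per line, per the GAIA leaderboard submission format
    # documented in the (removed) Information tab below. The reasoning_trace
    # attribute is assumed to exist on the response objects.
    return "\n".join(
        json.dumps(
            {
                "task_id": r.task_id,
                "model_answer": r.model_answer,
                "reasoning_trace": r.reasoning_trace,
            },
            ensure_ascii=False,
        )
        for r in results
    ) + "\n"
```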
@@ -530,7 +513,6 @@
 # GLOBAL AGENT INSTANCE
 # ================================
 
-# Initialize global agent
 gaia_agent = GAIASpaceAgent()
 
 # ================================
@@ -562,20 +544,17 @@ def batch_evaluate_interface(dataset_choice: str, max_questions: int, progress=gr.Progress()):
 
     progress(0.1, desc="Loading dataset...")
 
-    # Load questions based on choice
     if dataset_choice == "Sample Questions":
         questions = GAIADatasetManager.get_sample_questions()
         status_msg = f"✅ Loaded {len(questions)} sample questions"
     else:
         questions, status_msg = GAIADatasetManager.load_gaia_dataset("test", max_questions)
 
-    # Limit questions
     if max_questions and len(questions) > max_questions:
         questions = questions[:max_questions]
 
     progress(0.2, desc=f"{status_msg}. Starting evaluation...")
 
-    # Run evaluation
     summary, detailed, jsonl = gaia_agent.batch_evaluate(questions, progress)
 
     return summary, detailed, jsonl
@@ -602,26 +581,11 @@ def create_gaia_app():
 
     with gr.Blocks(
         title="GAIA Benchmark AI Agent",
-        theme=gr.themes.Soft(),
-        css="""
-        .gradio-container {
-            font-family: 'Arial', sans-serif;
-        }
-        .main-header {
-            text-align: center;
-            background: linear-gradient(45deg, #2196F3, #21CBF3);
-            -webkit-background-clip: text;
-            -webkit-text-fill-color: transparent;
-            font-size: 2.5em;
-            font-weight: bold;
-            margin-bottom: 20px;
-        }
-        """
+        theme=gr.themes.Soft()
     ) as app:
 
-        # Header
         gr.HTML("""
-            <div class="main-header">
+            <div style="text-align: center; font-size: 2.5em; font-weight: bold; margin-bottom: 20px;">
                 🧠 GAIA Benchmark AI Agent
             </div>
             <p style="text-align: center; font-size: 1.2em; color: #666;">
@@ -631,9 +595,7 @@ def create_gaia_app():
 
         with gr.Tabs():
 
-            # ===============================
             # TAB 1: MODEL SETUP
-            # ===============================
             with gr.Tab("🔧 Model Setup"):
                 gr.Markdown("## Choose and Load Your Model")
 
@@ -642,8 +604,7 @@ def create_gaia_app():
                     model_dropdown = gr.Dropdown(
                         choices=list(HFSpaceModelManager.SPACE_MODELS.keys()),
                         value="Fast & Light",
-                        label="Select Model",
-                        info="Choose based on your quality vs speed preference"
+                        label="Select Model"
                     )
 
                     model_info = gr.Markdown(
@@ -655,7 +616,7 @@ def create_gaia_app():
 
                 with gr.Column(scale=1):
                     gpu_info = gr.Markdown(f"""
-                    ###
+                    ### System Info
                     **CUDA Available**: {torch.cuda.is_available()}
                     {f"**GPU**: {torch.cuda.get_device_name(0)}" if torch.cuda.is_available() else "**Device**: CPU"}
                     """)
@@ -666,23 +627,19 @@ def create_gaia_app():
                         interactive=False
                     )
 
-                # Update model info when selection changes
                 model_dropdown.change(
                     fn=get_model_info,
                     inputs=[model_dropdown],
                     outputs=[model_info]
                 )
 
-                # Load model when button clicked
                 load_btn.click(
                     fn=load_model_interface,
                     inputs=[model_dropdown],
                     outputs=[model_status]
                 )
 
-            # ===============================
             # TAB 2: SINGLE QUESTION
-            # ===============================
             with gr.Tab("❓ Single Question"):
                 gr.Markdown("## Test Individual Questions")
 
@@ -696,8 +653,7 @@ def create_gaia_app():
 
                     process_btn = gr.Button("🤔 Process Question", variant="primary")
 
-
-                    gr.Markdown("### 💡 Example Questions:")
+                    gr.Markdown("### Example Questions:")
                     example_questions = [
                         "What is the capital of France?",
                         "Calculate 144 divided by 12",
@@ -705,11 +661,8 @@ def create_gaia_app():
                         "Convert 100 degrees Celsius to Fahrenheit"
                     ]
 
-                    for example in example_questions:
-                        gr.Button(
-                            f"📝 {example}",
-                            size="sm"
-                        ).click(
+                    for example in example_questions:
+                        gr.Button(f"📝 {example}", size="sm").click(
                             lambda x=example: x,
                             outputs=[question_input]
                         )
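The `lambda x=example: x` idiom on the unchanged line here is what keeps each example button bound to its own question. A bare `lambda: example` would close over the loop variable late, so every button would emit the last list entry; binding the value as a default argument freezes it at definition time:

```python
# Late binding: every callback sees the loop variable's final value.
callbacks = [lambda: q for q in ["a", "b", "c"]]
print([f() for f in callbacks])      # ['c', 'c', 'c']

# Default-argument binding, as used in app.py: each callback keeps its value.
callbacks = [lambda q=q: q for q in ["a", "b", "c"]]
print([f() for f in callbacks])      # ['a', 'b', 'c']
```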
@@ -739,16 +692,13 @@ def create_gaia_app():
                         interactive=False
                     )
 
-                # Process single question
                 process_btn.click(
                     fn=single_question_interface,
                     inputs=[question_input],
                     outputs=[final_answer_output, full_response, reasoning_trace, processing_time]
                 )
 
-            # ===============================
             # TAB 3: BATCH EVALUATION
-            # ===============================
             with gr.Tab("📊 Batch Evaluation"):
                 gr.Markdown("## Evaluate Multiple Questions")
 
@@ -756,8 +706,7 @@ def create_gaia_app():
                     dataset_choice = gr.Radio(
                         choices=["Sample Questions", "GAIA Test Set"],
                         value="Sample Questions",
-                        label="Dataset Choice",
-                        info="Start with sample questions to test your setup"
+                        label="Dataset Choice"
                     )
 
                     max_questions = gr.Slider(
@@ -765,8 +714,7 @@ def create_gaia_app():
                         maximum=50,
                         value=5,
                         step=1,
-                        label="Max Questions",
-                        info="Number of questions to evaluate"
+                        label="Max Questions"
                     )
 
                     evaluate_btn = gr.Button("🚀 Start Batch Evaluation", variant="primary", size="lg")
@@ -789,11 +737,9 @@ def create_gaia_app():
                         value="Run an evaluation to see detailed results"
                     )
 
-                # Batch evaluation
                 def batch_eval_with_download(*args):
                     summary, detailed, jsonl_content = batch_evaluate_interface(*args)
 
-                    # Save JSONL for download
                     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
                     filename = f"gaia_results_{timestamp}.jsonl"
 
@@ -811,47 +757,29 @@ def create_gaia_app():
                     outputs=[download_output]
                 )
 
-            # ===============================
-            # TAB 4: FULL BENCHMARK (ENHANCED FOR 300 QUESTIONS)
-            # ===============================
+            # TAB 4: FULL BENCHMARK
             with gr.Tab("🏆 Full Benchmark"):
                 gr.Markdown("## Official GAIA Leaderboard Benchmark")
 
                 with gr.Row():
                     with gr.Column():
-                        gr.Markdown(get_leaderboard_info())
-
-                    with gr.Column():
-                        # Test questions preview
                         test_preview_btn = gr.Button("🔍 Preview Test Questions", variant="secondary")
                         test_preview_output = gr.Markdown(
                             value="Click above to preview official test questions"
                         )
 
-                        # Dataset structure preview (NEW)
                         dataset_structure_btn = gr.Button("📁 Preview Dataset Structure", variant="secondary")
                         dataset_structure_output = gr.Markdown(
                             value="Click above to see actual GAIA dataset structure"
                        )
-
-
-
-
-                with gr.Row():
-                    # Preset buttons for common configurations
-                    quick_test_btn = gr.Button("🚀 Quick Test (20 questions)", variant="secondary")
-                    medium_test_btn = gr.Button("📊 Medium Test (50 questions)", variant="secondary")
-                    full_benchmark_btn = gr.Button("🏆 FULL BENCHMARK (300 questions)", variant="primary", size="lg")
-
-                # Advanced configuration (collapsible)
-                with gr.Accordion("🎛️ Advanced Configuration", open=False):
-                    with gr.Row():
-                        custom_count = gr.Slider(
+
+                    with gr.Column():
+                        question_count = gr.Slider(
                             minimum=10,
-                            maximum=300,
-                            value=
+                            maximum=300,
+                            value=20,
                             step=10,
-                            label="
+                            label="Number of Questions"
                         )
 
                         selection_strategy = gr.Dropdown(
@@ -859,22 +787,9 @@ def create_gaia_app():
                             value="balanced",
                             label="Selection Strategy"
                         )
-
-
-                # Show selection info
-                selection_info = gr.Markdown(get_question_selection_info())
-
-                # Warning message for full benchmark
-                gr.Markdown("""
-                **⚠️ Full 300-Question Benchmark Warning**:
-                - **Time**: 1-3 hours depending on model and hardware
-                - **Cost**: ~$1-3 on GPU (T4 Small recommended)
-                - **Purpose**: Official leaderboard submission
-                - **Recommendation**: Test with smaller batches first
-                """)
+
+                        benchmark_btn = gr.Button("🎯 Run Benchmark", variant="primary", size="lg")
 
-                # Results section
                 benchmark_status = gr.Textbox(
                     label="📊 Benchmark Status",
                     value="Ready to run benchmark",
@@ -889,7 +804,6 @@ def create_gaia_app():
                 )
 
                 with gr.Column():
-                    # Download files
                     submission_file = gr.File(
                         label="💾 Download Submission File (JSONL)",
                         visible=False
@@ -899,20 +813,8 @@ def create_gaia_app():
                         label="📋 Download Metadata File",
                         visible=False
                     )
-
-                    gr.Markdown("""
-                    ### 📤 Leaderboard Submission Steps
-                    1. **Download** the JSONL file above
-                    2. **Visit** [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
-                    3. **Upload** your submission file
-                    4. **View** your model's ranking!
-                    """)
 
-            # ================================
-            # EVENT HANDLERS (FIXED FUNCTION CALLS)
-            # ================================
-
-            # Preview functions
+            # Event handlers
             test_preview_btn.click(
                 fn=lambda: load_test_questions_interface(max_questions=10, selection_type="balanced"),
                 outputs=[test_preview_output]
@@ -923,240 +825,29 @@ def create_gaia_app():
                 outputs=[dataset_structure_output]
             )
 
-
-            def run_quick_test(progress=gr.Progress()):
-                return run_custom_benchmark_interface(20, "balanced", progress)
-
-            def run_medium_test(progress=gr.Progress()):
-                return run_custom_benchmark_interface(50, "balanced", progress)
-
-            def run_full_300_benchmark(progress=gr.Progress()):
-                return run_custom_benchmark_interface(300, "balanced", progress)
-
-            def run_custom_benchmark_wrapper(count, strategy, progress=gr.Progress()):
+            def run_benchmark_wrapper(count, strategy, progress=gr.Progress()):
                 return run_custom_benchmark_interface(count, strategy, progress)
 
-            # Helper function to show download files
             def show_download_files(status, report, sub_file, meta_file):
                 return (
                     status,
                     report,
                     sub_file,
                     meta_file,
-                    gr.update(visible=True),
-                    gr.update(visible=True)
+                    gr.update(visible=True),
+                    gr.update(visible=True)
                 )
 
-
-            quick_test_btn.click(
-                fn=run_quick_test,
+            benchmark_btn.click(
+                fn=run_benchmark_wrapper,
+                inputs=[question_count, selection_strategy],
                 outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
             ).then(
                 fn=show_download_files,
                 inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
                 outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
             )
-
-            medium_test_btn.click(
-                fn=run_medium_test,
-                outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
-            ).then(
-                fn=show_download_files,
-                inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
-                outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
-            )
-
-            # FULL 300-question benchmark
-            full_benchmark_btn.click(
-                fn=run_full_300_benchmark,
-                outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
-            ).then(
-                fn=show_download_files,
-                inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
-                outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
-            )
-
-            # Custom benchmark
-            custom_benchmark_btn.click(
-                fn=run_custom_benchmark_wrapper,
-                inputs=[custom_count, selection_strategy],
-                outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
-            ).then(
-                fn=show_download_files,
-                inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
-                outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
-            )
-
-            # ===============================
-            # TAB 5: INFORMATION (UPDATED)
-            # ===============================
-            with gr.Tab("ℹ️ Information"):
-                gr.Markdown("""
-                # 🧠 GAIA Benchmark AI Agent
-
-                ## What is GAIA?
-                GAIA (General AI Assistant) is a benchmark designed to test AI assistants on real-world questions that require:
-                - **Reasoning**: Multi-step logical thinking
-                - **Multi-modality**: Handling text, images, and other file types
-                - **Web browsing**: Finding and using external information
-                - **Tool use**: Calculator, code execution, etc.
-
-                ## 🏆 GAIA Public Leaderboard
-                GAIA provides a **public leaderboard hosted on Hugging Face** where you can:
-                - Test your models against **300 official testing questions**
-                - Compare performance with state-of-the-art systems
-                - Track progress in AI reasoning capabilities
-                - Contribute to research community benchmarks
-
-                **Leaderboard URL**: [https://huggingface.co/spaces/gaia-benchmark/leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
-
-                ## 🎯 How to Use This Space
-
-                ### 1. Model Setup
-                - Choose a model based on your needs (speed vs quality)
-                - Load the model (this may take a few minutes)
-                - Wait for "Model loaded successfully" message
-
-                ### 2. Test Single Questions
-                - Start with the "Single Question" tab
-                - Try example questions to verify everything works
-                - Enter your own questions to test model capabilities
-
-                ### 3. Batch Evaluation
-                - Use "Sample Questions" first to test your setup
-                - Then try "GAIA Test Set" for real benchmark evaluation
-                - Download results in JSONL format for submission
-
-                ### 4. Full Benchmark (Enhanced!)
-                - **Quick Tests**: 20 or 50 questions for rapid iteration
-                - **Custom Configuration**: Choose exact question count and strategy
-                - **Full 300-Question Benchmark**: Complete official evaluation
-                - **Leaderboard Ready**: Automatic JSONL generation for submission
-
-                ## 📊 Model Recommendations
-
-                | Model | Best For | Memory | Speed | Quality | 300Q Time | Cost (T4) |
-                |-------|----------|---------|-------|---------|-----------|-----------|
-                | Fast & Light | Quick testing | Low | Fast | Good | 45-75 min | ~$0.60-1.00 |
-                | Balanced | General use | Medium | Medium | Better | 60-120 min | ~$1.00-2.00 |
-                | High Quality | Best results | High | Slow | Best | 90-180 min | ~$1.50-3.00 |
-                | Instruction Following | Complex reasoning | High | Medium | Excellent | 75-150 min | ~$1.25-2.50 |
-
-                ## 🏅 Benchmark Performance Expectations
-
-                Based on current leaderboard standings, expect these performance ranges:
-
-                | Difficulty Level | Top Models | Good Models | Baseline Models |
-                |------------------|------------|-------------|-----------------|
-                | **Level 1** (Basic) | 85-95% | 70-85% | 50-70% |
-                | **Level 2** (Intermediate) | 65-80% | 45-65% | 25-45% |
-                | **Level 3** (Advanced) | 35-60% | 20-35% | 10-20% |
-                | **Overall Average** | 65-75% | 45-65% | 30-45% |
-
-                ## 🚀 Flexible Benchmarking Features
-
-                ### 🎯 **Custom Question Selection**
-                - **Question Count**: Choose 10-300 questions
-                - **Selection Strategies**: Balanced, Random, Sequential
-                - **Level Distribution**: Automatic balancing across difficulties
-                - **Reproducible**: Consistent results with same settings
-
-                ### 📊 **Smart Sampling**
-                - **Balanced**: Realistic distribution (40% L1, 35% L2, 25% L3)
-                - **Representative**: Questions from all difficulty levels
-                - **Efficient**: Test fewer questions while maintaining quality
-
-                ### ⚡ **Quick Options**
-                - **Quick Test (20Q)**: 5-15 minutes, ~$0.10-0.25
-                - **Medium Test (50Q)**: 15-30 minutes, ~$0.25-0.50
-                - **Full Benchmark (300Q)**: 1-3 hours, ~$1-3
-
-                ## 🔄 Continuous Benchmarking Workflow
-
-                1. **Development**: Start with Quick Test (20 questions)
-                2. **Validation**: Use Medium Test (50 questions) for validation
-                3. **Optimization**: Iterate on model improvements
-                4. **Benchmarking**: Run Full Benchmark (300 questions) when ready
-                5. **Submission**: Upload to official GAIA leaderboard
-                6. **Analysis**: Compare with other models and iterate
-
-                ## 📋 Official Dataset Integration
-
-                ### **Metadata.jsonl Structure**
-                - **Questions**: Stored in `2023/validation/metadata.jsonl` and `2023/test/metadata.jsonl`
-                - **Additional Files**: Some questions reference images, documents, or data files
-                - **Format**: Each line contains one question in JSON format
-                - **Fields**: `task_id`, `Question`, `Level`, `file_name` (optional), `Final answer` (validation only)
-
-                ### **Submission Format**
-                Results are saved in official GAIA leaderboard format:
-                ```json
-                {"task_id": "gaia_001", "model_answer": "[FULL RESPONSE]", "reasoning_trace": "[STEP-BY-STEP REASONING]"}
-                {"task_id": "gaia_002", "model_answer": "[FULL RESPONSE]", "reasoning_trace": "[STEP-BY-STEP REASONING]"}
-                ```
-
-                ## ⚡ Pro Tips for Best Results
-
-                ### **Performance Optimization**
-                1. **Start Small**: Always test with Quick Test first
-                2. **Choose Wisely**: Balance speed vs quality based on your goals
-                3. **Monitor Resources**: Use GPU acceleration for larger models
-                4. **Validate Format**: Ensure JSONL files are properly formatted
-
-                ### **Leaderboard Strategy**
-                1. **Baseline First**: Get initial results with Quick Test
-                2. **Iterate Quickly**: Test improvements on Medium Test
-                3. **Full Benchmark**: Run complete evaluation when ready
-                4. **Compare Results**: Analyze performance across difficulty levels
-                5. **Document Approach**: Include model details and methodology
-
-                ### **Cost Management**
-                - **Development**: Use Quick Test (20Q) for rapid iteration (~$0.10-0.25)
-                - **Validation**: Use Medium Test (50Q) for validation (~$0.25-0.50)
-                - **Production**: Use Full Benchmark (300Q) for final submission (~$1-3)
-                - **Hardware**: T4 Small GPU recommended for best price/performance
-
-                ### **Common Pitfalls to Avoid**
-                - Don't run full benchmark on untested models
-                - Ensure stable internet connection for long evaluations
-                - Verify submission file format before uploading
-                - Check GPU memory usage for large models
-                - Save intermediate results during long runs
-
-                ## 🎯 Getting Started Checklist
-
-                - [ ] **Load Model**: Choose and load a model in "Model Setup"
-                - [ ] **Test Single**: Try example questions in "Single Question"
-                - [ ] **Quick Test**: Run 20-question benchmark to verify setup
-                - [ ] **Preview Dataset**: Check "Preview Test Questions" in Full Benchmark
-                - [ ] **Medium Test**: Run 50-question validation benchmark
-                - [ ] **Full Benchmark**: Run complete 300-question evaluation when ready
-                - [ ] **Download Files**: Get JSONL submission and metadata files
-                - [ ] **Submit**: Upload to GAIA leaderboard
-                - [ ] **Compare**: Analyze your results against other models!
-
-                ## 🔗 Resources
-                - [GAIA Paper](https://arxiv.org/abs/2311.12983) - Original research paper
-                - [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard) - Official rankings
-                - [GAIA Dataset](https://huggingface.co/datasets/gaia-benchmark/GAIA) - Official dataset repository
-                - [Hugging Face Spaces](https://huggingface.co/docs/hub/spaces) - Deployment documentation
-
-                ---
-
-                **Ready to start benchmarking?** Begin with the Model Setup tab, then progress through Quick Test → Medium Test → Full Benchmark. Good luck climbing the leaderboard! 🚀
-                """)
 
     return app
 
-# ================================
-# MAIN APPLICATION
-# ================================
-
-if __name__ == "__main__":
-    # Create and launch the Gradio app
-    app = create_gaia_app()
-    app.launch(
-        server_name="0.0.0.0",
-        server_port=7860,
-        share=False
-    )
+# ================================
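The handler wiring in the last hunk leans on two Gradio patterns: event chaining with `.then()`, which runs a second callback once the first finishes, and `gr.update(visible=True)` to reveal the initially hidden `gr.File` components. Reduced to its essentials (component names here are illustrative, not taken from `app.py`):

```python
import gradio as gr

with gr.Blocks() as demo:
    run_btn = gr.Button("Run")
    status = gr.Textbox(label="Status")
    result_file = gr.File(label="Download", visible=False)

    def run_job():
        # Step 1: produce a status string and a file path for the outputs.
        # (Sketch only: the path must exist at runtime for gr.File to serve it.)
        return "done", "results.jsonl"

    def reveal(_path):
        # Step 2: un-hide the download component once a file is available.
        return gr.update(visible=True)

    run_btn.click(fn=run_job, outputs=[status, result_file]).then(
        fn=reveal, inputs=[result_file], outputs=[result_file]
    )

demo.launch()
```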