Update app.py

app.py CHANGED
@@ -30,12 +30,14 @@ from transformers import (
 from datasets import load_dataset
 from huggingface_hub import HfApi, hf_hub_download
 
-# Import leaderboard integration
+# Import leaderboard integration (CORRECTED IMPORTS)
 from gaia_leaderboard_integration import (
     enhanced_gaia_agent,
-    run_leaderboard_benchmark_interface
+    run_custom_benchmark_interface,  # ← FIXED: was run_leaderboard_benchmark_interface
     load_test_questions_interface,
-
+    preview_dataset_structure_interface,  # ← NEW FUNCTION
+    get_leaderboard_info,
+    get_question_selection_info  # ← NEW FUNCTION
 )
 
 # Setup logging
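For context, the import fix above only works if `gaia_leaderboard_integration` actually exports these names. Below is a minimal, hypothetical stub of that surface, with signatures inferred from the call sites later in this diff; the real module in this repo is the source of truth, and the argument names here are assumptions:

```python
# Hypothetical stub of gaia_leaderboard_integration, inferred from the call
# sites in this diff. The real module in this repo is authoritative.
import gradio as gr

enhanced_gaia_agent = None  # the agent object the app drives (placeholder)

def run_custom_benchmark_interface(count, strategy, progress=gr.Progress()):
    # Must return (status, report, submission_file, metadata_file) to match
    # the four outputs wired to the benchmark buttons later in this diff.
    raise NotImplementedError

def load_test_questions_interface(max_questions=10, selection_type="balanced"):
    raise NotImplementedError  # preview text for the test-question panel

def preview_dataset_structure_interface():
    raise NotImplementedError  # description of the GAIA dataset layout

def get_leaderboard_info():
    raise NotImplementedError  # static info about the leaderboard

def get_question_selection_info():
    raise NotImplementedError  # Markdown explaining the selection strategies
```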
@@ -809,10 +811,8 @@ def create_gaia_app():
                 outputs=[download_output]
             )
 
-        # Add this to your Full Benchmark tab in app.py
-
         # ===============================
-        # TAB 4: FULL BENCHMARK (
+        # TAB 4: FULL BENCHMARK (ENHANCED FOR 300 QUESTIONS)
         # ===============================
         with gr.Tab("🏆 Full Benchmark"):
             gr.Markdown("## Official GAIA Leaderboard Benchmark")
@@ -834,52 +834,36 @@ def create_gaia_app():
                     value="Click above to see actual GAIA dataset structure"
                 )
 
-            #
-            gr.Markdown("###
+            # Quick benchmark options
+            gr.Markdown("### 🎯 Quick Benchmark Options")
 
             with gr.Row():
-
-
-
+                # Preset buttons for common configurations
+                quick_test_btn = gr.Button("🚀 Quick Test (20 questions)", variant="secondary")
+                medium_test_btn = gr.Button("📊 Medium Test (50 questions)", variant="secondary")
+                full_benchmark_btn = gr.Button("🏆 FULL BENCHMARK (300 questions)", variant="primary", size="lg")
+
+            # Advanced configuration (collapsible)
+            with gr.Accordion("🎛️ Advanced Configuration", open=False):
+                with gr.Row():
+                    custom_count = gr.Slider(
                         minimum=10,
                         maximum=300,
                         value=50,
                         step=10,
-                        label="
-                        info="Choose how many questions to evaluate (300 = full benchmark)"
+                        label="Custom Question Count"
                     )
 
-                    # Selection strategy
                     selection_strategy = gr.Dropdown(
                         choices=["balanced", "random", "sequential"],
                         value="balanced",
-                        label="
-                        info="Balanced recommended for representative evaluation"
-                    )
-
-                with gr.Column():
-                    # Configuration info
-                    config_info = gr.Markdown(
-                        value=get_question_selection_info()
+                        label="Selection Strategy"
                     )
-
-            # Benchmark execution
-            gr.Markdown("### 🚀 Run Benchmark")
-
-            with gr.Row():
-                # Custom benchmark button
-                custom_benchmark_btn = gr.Button(
-                    "🎯 Start Custom Benchmark",
-                    variant="primary",
-                    size="lg"
-                )
 
-
-
-
-
-                    size="lg"
-                )
+                custom_benchmark_btn = gr.Button("🎯 Run Custom Benchmark", variant="secondary")
+
+                # Show selection info
+                selection_info = gr.Markdown(get_question_selection_info())
 
             # Warning message for full benchmark
             gr.Markdown("""
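The new layout replaces the old slider-and-column arrangement with preset buttons plus a collapsible accordion for advanced options. A self-contained sketch of that pattern, with a dummy handler and shortened labels (all names here are illustrative, not the Space's actual code):

```python
# Minimal runnable sketch of the preset-buttons + collapsible-config layout.
import gradio as gr

def fake_benchmark(count, strategy):
    return f"Would run {count} questions with '{strategy}' selection"

with gr.Blocks() as demo:
    with gr.Row():
        quick = gr.Button("Quick Test (20 questions)", variant="secondary")
        full = gr.Button("FULL BENCHMARK (300 questions)", variant="primary")
    # Advanced options stay out of the way until the accordion is opened.
    with gr.Accordion("Advanced Configuration", open=False):
        count = gr.Slider(minimum=10, maximum=300, value=50, step=10,
                          label="Custom Question Count")
        strategy = gr.Dropdown(choices=["balanced", "random", "sequential"],
                               value="balanced", label="Selection Strategy")
        custom = gr.Button("Run Custom Benchmark")
    status = gr.Textbox(label="Status")

    # Preset buttons hard-code their configuration; the custom button reads
    # the slider and dropdown values as inputs.
    quick.click(fn=lambda: fake_benchmark(20, "balanced"), outputs=[status])
    full.click(fn=lambda: fake_benchmark(300, "balanced"), outputs=[status])
    custom.click(fn=fake_benchmark, inputs=[count, strategy], outputs=[status])

if __name__ == "__main__":
    demo.launch()
```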
@@ -890,7 +874,7 @@ def create_gaia_app():
             - **Recommendation**: Test with smaller batches first
             """)
 
-            # Results section
+            # Results section
             benchmark_status = gr.Textbox(
                 label="📊 Benchmark Status",
                 value="Ready to run benchmark",
@@ -900,7 +884,7 @@ def create_gaia_app():
             with gr.Row():
                 with gr.Column():
                     benchmark_report = gr.Markdown(
-                        label="📈 Benchmark Report",
+                        label="📈 Benchmark Report",
                         value="Run benchmark to see detailed results"
                     )
@@ -912,69 +896,97 @@ def create_gaia_app():
                     )
 
                     metadata_file = gr.File(
-                        label="📋 Download Metadata File",
+                        label="📋 Download Metadata File",
                         visible=False
                     )
 
                     gr.Markdown("""
-                    ### 📤 Leaderboard Submission
-                    1. Download the JSONL file above
-                    2. Visit [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
-                    3. Upload your submission file
-                    4. View your model's ranking!
+                    ### 📤 Leaderboard Submission Steps
+                    1. **Download** the JSONL file above
+                    2. **Visit** [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
+                    3. **Upload** your submission file
+                    4. **View** your model's ranking!
                     """)
 
-            #
+            # ================================
+            # EVENT HANDLERS (FIXED FUNCTION CALLS)
+            # ================================
+
+            # Preview functions
             test_preview_btn.click(
                 fn=lambda: load_test_questions_interface(max_questions=10, selection_type="balanced"),
                 outputs=[test_preview_output]
             )
 
-            # NEW: Dataset structure preview
             dataset_structure_btn.click(
                 fn=preview_dataset_structure_interface,
                 outputs=[dataset_structure_output]
             )
 
-            #
-            def
-                return run_custom_benchmark_interface(
+            # Quick benchmark functions
+            def run_quick_test(progress=gr.Progress()):
+                return run_custom_benchmark_interface(20, "balanced", progress)
+
+            def run_medium_test(progress=gr.Progress()):
+                return run_custom_benchmark_interface(50, "balanced", progress)
 
-            # Full 300-question benchmark
             def run_full_300_benchmark(progress=gr.Progress()):
                 return run_custom_benchmark_interface(300, "balanced", progress)
 
-            def
-
+            def run_custom_benchmark_wrapper(count, strategy, progress=gr.Progress()):
+                return run_custom_benchmark_interface(count, strategy, progress)
+
+            # Helper function to show download files
+            def show_download_files(status, report, sub_file, meta_file):
                 return (
                     status,
                     report,
                     sub_file,
                     meta_file,
                     gr.update(visible=True),  # Show submission file
                     gr.update(visible=True)   # Show metadata file
                 )
 
-            #
-
-                fn=
-                inputs=[question_count, selection_strategy],
+            # Quick test events
+            quick_test_btn.click(
+                fn=run_quick_test,
                 outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
             ).then(
-                fn=
+                fn=show_download_files,
                 inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
                 outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
             )
 
-
+            medium_test_btn.click(
+                fn=run_medium_test,
+                outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
+            ).then(
+                fn=show_download_files,
+                inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
+                outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
+            )
+
+            # FULL 300-question benchmark
             full_benchmark_btn.click(
                 fn=run_full_300_benchmark,
                 outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
             ).then(
-                fn=
+                fn=show_download_files,
                 inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
                 outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
             )
+
+            # Custom benchmark
+            custom_benchmark_btn.click(
+                fn=run_custom_benchmark_wrapper,
+                inputs=[custom_count, selection_strategy],
+                outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
+            ).then(
+                fn=show_download_files,
+                inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
+                outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
+            )
 
         # ===============================
         # TAB 5: INFORMATION (UPDATED)
         # ===============================
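All the handlers above follow the same two-step pattern: `.click()` runs the benchmark and fills the four outputs, then the chained `.then()` flips the hidden `gr.File` components visible via `gr.update(visible=True)`. A minimal runnable sketch of that chaining, using a stand-in benchmark function (file contents and names are illustrative):

```python
# Runnable sketch of the .click(...).then(...) chaining used above.
import gradio as gr
import json, tempfile

def run_benchmark():
    # Write a tiny stand-in submission file so gr.File has something to serve.
    path = tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False).name
    with open(path, "w") as f:
        f.write(json.dumps({"task_id": "demo_001", "model_answer": "42"}) + "\n")
    return "Done", path

with gr.Blocks() as demo:
    btn = gr.Button("Run")
    status = gr.Textbox(label="Status")
    sub = gr.File(label="Download Submission", visible=False)

    # Step 1 fills the outputs; step 2 reveals the hidden File component.
    btn.click(fn=run_benchmark, outputs=[status, sub]).then(
        fn=lambda: gr.update(visible=True), outputs=[sub]
    )

if __name__ == "__main__":
    demo.launch()
```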
@@ -1015,19 +1027,20 @@ def create_gaia_app():
             - Then try "GAIA Test Set" for real benchmark evaluation
             - Download results in JSONL format for submission
 
-            ### 4. Full Benchmark (
-            -
-            -
-            -
+            ### 4. Full Benchmark (Enhanced!)
+            - **Quick Tests**: 20 or 50 questions for rapid iteration
+            - **Custom Configuration**: Choose exact question count and strategy
+            - **Full 300-Question Benchmark**: Complete official evaluation
+            - **Leaderboard Ready**: Automatic JSONL generation for submission
 
             ## 📊 Model Recommendations
 
-            | Model | Best For | Memory | Speed | Quality |
-
-            | Fast & Light | Quick testing | Low | Fast | Good |
-            | Balanced | General use | Medium | Medium | Better |
-            | High Quality | Best results | High | Slow | Best |
-            | Instruction Following | Complex reasoning | High | Medium | Excellent |
+            | Model | Best For | Memory | Speed | Quality | 300Q Time | Cost (T4) |
+            |-------|----------|---------|-------|---------|-----------|-----------|
+            | Fast & Light | Quick testing | Low | Fast | Good | 45-75 min | ~$0.60-1.00 |
+            | Balanced | General use | Medium | Medium | Better | 60-120 min | ~$1.00-2.00 |
+            | High Quality | Best results | High | Slow | Best | 90-180 min | ~$1.50-3.00 |
+            | Instruction Following | Complex reasoning | High | Medium | Excellent | 75-150 min | ~$1.25-2.50 |
 
             ## 🏅 Benchmark Performance Expectations
@@ -1040,22 +1053,42 @@ def create_gaia_app():
             | **Level 3** (Advanced) | 35-60% | 20-35% | 10-20% |
             | **Overall Average** | 65-75% | 45-65% | 30-45% |
 
-            ## 🚀
-
-
-
-
-
-
-            6. **Iteration**: Improve and re-benchmark
-
-
-            -
-            -
-            -
-            - [Hugging Face Spaces](https://huggingface.co/docs/hub/spaces) - Deployment documentation
-
+            ## 🚀 Flexible Benchmarking Features
+
+            ### 🎯 **Custom Question Selection**
+            - **Question Count**: Choose 10-300 questions
+            - **Selection Strategies**: Balanced, Random, Sequential
+            - **Level Distribution**: Automatic balancing across difficulties
+            - **Reproducible**: Consistent results with same settings
+
+            ### 📊 **Smart Sampling**
+            - **Balanced**: Realistic distribution (40% L1, 35% L2, 25% L3)
+            - **Representative**: Questions from all difficulty levels
+            - **Efficient**: Test fewer questions while maintaining quality
+
+            ### ⚡ **Quick Options**
+            - **Quick Test (20Q)**: 5-15 minutes, ~$0.10-0.25
+            - **Medium Test (50Q)**: 15-30 minutes, ~$0.25-0.50
+            - **Full Benchmark (300Q)**: 1-3 hours, ~$1-3
+
+            ## 🔄 Continuous Benchmarking Workflow
+
+            1. **Development**: Start with Quick Test (20 questions)
+            2. **Validation**: Use Medium Test (50 questions) for validation
+            3. **Optimization**: Iterate on model improvements
+            4. **Benchmarking**: Run Full Benchmark (300 questions) when ready
+            5. **Submission**: Upload to official GAIA leaderboard
+            6. **Analysis**: Compare with other models and iterate
+
+            ## 📋 Official Dataset Integration
+
+            ### **Metadata.jsonl Structure**
+            - **Questions**: Stored in `2023/validation/metadata.jsonl` and `2023/test/metadata.jsonl`
+            - **Additional Files**: Some questions reference images, documents, or data files
+            - **Format**: Each line contains one question in JSON format
+            - **Fields**: `task_id`, `Question`, `Level`, `file_name` (optional), `Final answer` (validation only)
+
+            ### **Submission Format**
             Results are saved in official GAIA leaderboard format:
             ```json
             {"task_id": "gaia_001", "model_answer": "[FULL RESPONSE]", "reasoning_trace": "[STEP-BY-STEP REASONING]"}
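The "balanced" strategy described in that hunk (40% Level 1, 35% Level 2, 25% Level 3 over `metadata.jsonl`) could be implemented roughly as below. This is an illustrative reimplementation, not the code in `gaia_leaderboard_integration`; note that the GAIA dataset is gated, so `hf_hub_download` needs an authenticated Hugging Face token:

```python
# Sketch: balanced sampling over GAIA's metadata.jsonl (40/35/25 split).
import json
import random
from huggingface_hub import hf_hub_download

LEVEL_WEIGHTS = {1: 0.40, 2: 0.35, 3: 0.25}  # distribution quoted above

def load_metadata(split="validation"):
    # Each line of metadata.jsonl is one question with task_id / Question /
    # Level / optional file_name (plus Final answer on the validation split).
    path = hf_hub_download(repo_id="gaia-benchmark/GAIA", repo_type="dataset",
                           filename=f"2023/{split}/metadata.jsonl")
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

def balanced_sample(questions, count, seed=0):
    rng = random.Random(seed)  # fixed seed -> reproducible selections
    picked = []
    for level, weight in LEVEL_WEIGHTS.items():
        pool = [q for q in questions if int(q["Level"]) == level]
        take = min(len(pool), round(count * weight))
        picked.extend(rng.sample(pool, take))
    # Rounding can leave the sample slightly short; top up at random.
    if len(picked) < count:
        rest = [q for q in questions if q not in picked]
        picked.extend(rng.sample(rest, min(len(rest), count - len(picked))))
    return picked[:count]
```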
@@ -1064,20 +1097,26 @@ def create_gaia_app():
 
             ## ⚡ Pro Tips for Best Results
 
-            ### Performance Optimization
-            1. **Start Small**: Always test with
+            ### **Performance Optimization**
+            1. **Start Small**: Always test with Quick Test first
             2. **Choose Wisely**: Balance speed vs quality based on your goals
             3. **Monitor Resources**: Use GPU acceleration for larger models
             4. **Validate Format**: Ensure JSONL files are properly formatted
 
-            ### Leaderboard Strategy
-            1. **Baseline First**: Get initial results with
-            2. **Iterate Quickly**: Test improvements on
+            ### **Leaderboard Strategy**
+            1. **Baseline First**: Get initial results with Quick Test
+            2. **Iterate Quickly**: Test improvements on Medium Test
             3. **Full Benchmark**: Run complete evaluation when ready
             4. **Compare Results**: Analyze performance across difficulty levels
             5. **Document Approach**: Include model details and methodology
 
-            ###
+            ### **Cost Management**
+            - **Development**: Use Quick Test (20Q) for rapid iteration (~$0.10-0.25)
+            - **Validation**: Use Medium Test (50Q) for validation (~$0.25-0.50)
+            - **Production**: Use Full Benchmark (300Q) for final submission (~$1-3)
+            - **Hardware**: T4 Small GPU recommended for best price/performance
+
+            ### **Common Pitfalls to Avoid**
             - Don't run full benchmark on untested models
             - Ensure stable internet connection for long evaluations
             - Verify submission file format before uploading
@@ -1086,18 +1125,25 @@ def create_gaia_app():
 
             ## 🎯 Getting Started Checklist
 
-            - [ ] Load and
-            - [ ] Try example questions in "Single Question"
-            - [ ] Run
-            - [ ]
-            - [ ] Run
-            - [ ]
-            - [ ]
-            - [ ]
+            - [ ] **Load Model**: Choose and load a model in "Model Setup"
+            - [ ] **Test Single**: Try example questions in "Single Question"
+            - [ ] **Quick Test**: Run 20-question benchmark to verify setup
+            - [ ] **Preview Dataset**: Check "Preview Test Questions" in Full Benchmark
+            - [ ] **Medium Test**: Run 50-question validation benchmark
+            - [ ] **Full Benchmark**: Run complete 300-question evaluation when ready
+            - [ ] **Download Files**: Get JSONL submission and metadata files
+            - [ ] **Submit**: Upload to GAIA leaderboard
+            - [ ] **Compare**: Analyze your results against other models!
+
+            ## 🔗 Resources
+            - [GAIA Paper](https://arxiv.org/abs/2311.12983) - Original research paper
+            - [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard) - Official rankings
+            - [GAIA Dataset](https://huggingface.co/datasets/gaia-benchmark/GAIA) - Official dataset repository
+            - [Hugging Face Spaces](https://huggingface.co/docs/hub/spaces) - Deployment documentation
 
             ---
 
-            **Ready to start benchmarking?** Begin with the Model Setup tab
+            **Ready to start benchmarking?** Begin with the Model Setup tab, then progress through Quick Test → Medium Test → Full Benchmark. Good luck climbing the leaderboard! 🚀
             """)
 
     return app
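Finally, the submission format shown in the info tab is one JSON object per line with `task_id`, `model_answer`, and `reasoning_trace`. A small hypothetical helper that writes and sanity-checks a file in that shape before upload (not part of the Space's code):

```python
# Sketch: write and validate a leaderboard submission in JSONL format.
import json

REQUIRED_KEYS = ("task_id", "model_answer", "reasoning_trace")

def write_submission(results, path="submission.jsonl"):
    with open(path, "w", encoding="utf-8") as f:
        for row in results:
            missing = set(REQUIRED_KEYS) - row.keys()
            if missing:
                raise ValueError(f"{row.get('task_id')}: missing {missing}")
            # Emit exactly the required fields, one JSON object per line.
            f.write(json.dumps({k: row[k] for k in REQUIRED_KEYS}) + "\n")
    return path

# Example usage with a single dummy result:
write_submission([{"task_id": "gaia_001",
                   "model_answer": "42",
                   "reasoning_trace": "Computed directly."}])
```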