Update app.py

app.py CHANGED
@@ -30,12 +30,14 @@ from transformers import (
 from datasets import load_dataset
 from huggingface_hub import HfApi, hf_hub_download
 
-# Import leaderboard integration
+# Import leaderboard integration (CORRECTED IMPORTS)
 from gaia_leaderboard_integration import (
     enhanced_gaia_agent,
-    run_leaderboard_benchmark_interface
+    run_custom_benchmark_interface,  # ← FIXED: was run_leaderboard_benchmark_interface
     load_test_questions_interface,
-
+    preview_dataset_structure_interface,  # ← NEW FUNCTION
+    get_leaderboard_info,
+    get_question_selection_info  # ← NEW FUNCTION
 )
 
 # Setup logging
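For context, the import fix above only works if `gaia_leaderboard_integration` actually exports these names. Below is a minimal, hypothetical stub of that surface, with signatures inferred from the call sites later in this diff; the real module in this repo is the source of truth, and the argument names here are assumptions:

```python
# Hypothetical stub of gaia_leaderboard_integration, inferred from the call
# sites in this diff. The real module in this repo is authoritative.
import gradio as gr

enhanced_gaia_agent = None  # the agent object the app drives (placeholder)

def run_custom_benchmark_interface(count, strategy, progress=gr.Progress()):
    # Must return (status, report, submission_file, metadata_file) to match
    # the four outputs wired to the benchmark buttons later in this diff.
    raise NotImplementedError

def load_test_questions_interface(max_questions=10, selection_type="balanced"):
    raise NotImplementedError  # preview text for the test-question panel

def preview_dataset_structure_interface():
    raise NotImplementedError  # description of the GAIA dataset layout

def get_leaderboard_info():
    raise NotImplementedError  # static info about the leaderboard

def get_question_selection_info():
    raise NotImplementedError  # Markdown explaining the selection strategies
```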
@@ -809,10 +811,8 @@ def create_gaia_app():
                 outputs=[download_output]
             )
 
-        # Add this to your Full Benchmark tab in app.py
-
         # ===============================
-        # TAB 4: FULL BENCHMARK (
+        # TAB 4: FULL BENCHMARK (ENHANCED FOR 300 QUESTIONS)
         # ===============================
         with gr.Tab("🏆 Full Benchmark"):
             gr.Markdown("## Official GAIA Leaderboard Benchmark")
@@ -834,52 +834,36 @@ def create_gaia_app():
                     value="Click above to see actual GAIA dataset structure"
                 )
 
-            #
-            gr.Markdown("###
+            # Quick benchmark options
+            gr.Markdown("### 🎯 Quick Benchmark Options")
 
             with gr.Row():
-
-
-
+                # Preset buttons for common configurations
+                quick_test_btn = gr.Button("🚀 Quick Test (20 questions)", variant="secondary")
+                medium_test_btn = gr.Button("📊 Medium Test (50 questions)", variant="secondary")
+                full_benchmark_btn = gr.Button("🏆 FULL BENCHMARK (300 questions)", variant="primary", size="lg")
+
+            # Advanced configuration (collapsible)
+            with gr.Accordion("🎛️ Advanced Configuration", open=False):
+                with gr.Row():
+                    custom_count = gr.Slider(
                         minimum=10,
                         maximum=300,
                         value=50,
                         step=10,
-                        label="
-                        info="Choose how many questions to evaluate (300 = full benchmark)"
+                        label="Custom Question Count"
                     )
 
-                    # Selection strategy
                     selection_strategy = gr.Dropdown(
                         choices=["balanced", "random", "sequential"],
                         value="balanced",
-                        label="
-                        info="Balanced recommended for representative evaluation"
-                    )
-
-                with gr.Column():
-                    # Configuration info
-                    config_info = gr.Markdown(
-                        value=get_question_selection_info()
+                        label="Selection Strategy"
                     )
-
-            # Benchmark execution
-            gr.Markdown("### 🚀 Run Benchmark")
-
-            with gr.Row():
-                # Custom benchmark button
-                custom_benchmark_btn = gr.Button(
-                    "🎯 Start Custom Benchmark",
-                    variant="primary",
-                    size="lg"
-                )
 
-
-
-
-
-                    size="lg"
-                )
+                custom_benchmark_btn = gr.Button("🎯 Run Custom Benchmark", variant="secondary")
+
+                # Show selection info
+                selection_info = gr.Markdown(get_question_selection_info())
 
             # Warning message for full benchmark
             gr.Markdown("""
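The new layout replaces the old slider-and-column arrangement with preset buttons plus a collapsible accordion for advanced options. A self-contained sketch of that pattern, with a dummy handler and shortened labels (all names here are illustrative, not the Space's actual code):

```python
# Minimal runnable sketch of the preset-buttons + collapsible-config layout.
import gradio as gr

def fake_benchmark(count, strategy):
    return f"Would run {count} questions with '{strategy}' selection"

with gr.Blocks() as demo:
    with gr.Row():
        quick = gr.Button("Quick Test (20 questions)", variant="secondary")
        full = gr.Button("FULL BENCHMARK (300 questions)", variant="primary")
    # Advanced options stay out of the way until the accordion is opened.
    with gr.Accordion("Advanced Configuration", open=False):
        count = gr.Slider(minimum=10, maximum=300, value=50, step=10,
                          label="Custom Question Count")
        strategy = gr.Dropdown(choices=["balanced", "random", "sequential"],
                               value="balanced", label="Selection Strategy")
        custom = gr.Button("Run Custom Benchmark")
    status = gr.Textbox(label="Status")

    # Preset buttons hard-code their configuration; the custom button reads
    # the slider and dropdown values as inputs.
    quick.click(fn=lambda: fake_benchmark(20, "balanced"), outputs=[status])
    full.click(fn=lambda: fake_benchmark(300, "balanced"), outputs=[status])
    custom.click(fn=fake_benchmark, inputs=[count, strategy], outputs=[status])

if __name__ == "__main__":
    demo.launch()
```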
@@ -890,7 +874,7 @@ def create_gaia_app():
             - **Recommendation**: Test with smaller batches first
             """)
 
-            # Results section
+            # Results section
             benchmark_status = gr.Textbox(
                 label="📊 Benchmark Status",
                 value="Ready to run benchmark",
@@ -900,7 +884,7 @@ def create_gaia_app():
             with gr.Row():
                 with gr.Column():
                     benchmark_report = gr.Markdown(
-                        label="📈 Benchmark Report",
+                        label="📈 Benchmark Report",
                         value="Run benchmark to see detailed results"
                     )
@@ -912,69 +896,97 @@ def create_gaia_app():
                     )
 
                     metadata_file = gr.File(
-                        label="📋 Download Metadata File",
+                        label="📋 Download Metadata File",
                         visible=False
                     )
 
                     gr.Markdown("""
-                    ### 📤 Leaderboard Submission
-                    1. Download the JSONL file above
-                    2. Visit [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
-                    3. Upload your submission file
-                    4. View your model's ranking!
+                    ### 📤 Leaderboard Submission Steps
+                    1. **Download** the JSONL file above
+                    2. **Visit** [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
+                    3. **Upload** your submission file
+                    4. **View** your model's ranking!
                     """)
 
-            #
+            # ================================
+            # EVENT HANDLERS (FIXED FUNCTION CALLS)
+            # ================================
+
+            # Preview functions
             test_preview_btn.click(
                 fn=lambda: load_test_questions_interface(max_questions=10, selection_type="balanced"),
                 outputs=[test_preview_output]
             )
 
-            # NEW: Dataset structure preview
             dataset_structure_btn.click(
                 fn=preview_dataset_structure_interface,
                 outputs=[dataset_structure_output]
             )
 
-            #
-            def
-                return run_custom_benchmark_interface(
+            # Quick benchmark functions
+            def run_quick_test(progress=gr.Progress()):
+                return run_custom_benchmark_interface(20, "balanced", progress)
+
+            def run_medium_test(progress=gr.Progress()):
+                return run_custom_benchmark_interface(50, "balanced", progress)
 
-            # Full 300-question benchmark
             def run_full_300_benchmark(progress=gr.Progress()):
                 return run_custom_benchmark_interface(300, "balanced", progress)
 
-            def
-
+            def run_custom_benchmark_wrapper(count, strategy, progress=gr.Progress()):
+                return run_custom_benchmark_interface(count, strategy, progress)
+
+            # Helper function to show download files
+            def show_download_files(status, report, sub_file, meta_file):
                 return (
                     status,
                     report,
                     sub_file,
                     meta_file,
                     gr.update(visible=True),  # Show submission file
                     gr.update(visible=True)   # Show metadata file
                 )
 
-            #
-
-                fn=
-                inputs=[question_count, selection_strategy],
+            # Quick test events
+            quick_test_btn.click(
+                fn=run_quick_test,
                 outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
             ).then(
-                fn=
+                fn=show_download_files,
                 inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
                 outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
             )
 
-
+            medium_test_btn.click(
+                fn=run_medium_test,
+                outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
+            ).then(
+                fn=show_download_files,
+                inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
+                outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
+            )
+
+            # FULL 300-question benchmark
             full_benchmark_btn.click(
                 fn=run_full_300_benchmark,
                 outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
             ).then(
-                fn=
+                fn=show_download_files,
                 inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
                 outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
             )
+
+            # Custom benchmark
+            custom_benchmark_btn.click(
+                fn=run_custom_benchmark_wrapper,
+                inputs=[custom_count, selection_strategy],
+                outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
+            ).then(
+                fn=show_download_files,
+                inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
+                outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
+            )
 
         # ===============================
         # TAB 5: INFORMATION (UPDATED)
         # ===============================
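All the handlers above follow the same two-step pattern: `.click()` runs the benchmark and fills the four outputs, then the chained `.then()` flips the hidden `gr.File` components visible via `gr.update(visible=True)`. A minimal runnable sketch of that chaining, using a stand-in benchmark function (file contents and names are illustrative):

```python
# Runnable sketch of the .click(...).then(...) chaining used above.
import gradio as gr
import json, tempfile

def run_benchmark():
    # Write a tiny stand-in submission file so gr.File has something to serve.
    path = tempfile.NamedTemporaryFile(suffix=".jsonl", delete=False).name
    with open(path, "w") as f:
        f.write(json.dumps({"task_id": "demo_001", "model_answer": "42"}) + "\n")
    return "Done", path

with gr.Blocks() as demo:
    btn = gr.Button("Run")
    status = gr.Textbox(label="Status")
    sub = gr.File(label="Download Submission", visible=False)

    # Step 1 fills the outputs; step 2 reveals the hidden File component.
    btn.click(fn=run_benchmark, outputs=[status, sub]).then(
        fn=lambda: gr.update(visible=True), outputs=[sub]
    )

if __name__ == "__main__":
    demo.launch()
```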
@@ -1015,19 +1027,20 @@ def create_gaia_app():
             - Then try "GAIA Test Set" for real benchmark evaluation
             - Download results in JSONL format for submission
 
-            ### 4. Full Benchmark (
-            -
-            -
-            -
+            ### 4. Full Benchmark (Enhanced!)
+            - **Quick Tests**: 20 or 50 questions for rapid iteration
+            - **Custom Configuration**: Choose exact question count and strategy
+            - **Full 300-Question Benchmark**: Complete official evaluation
+            - **Leaderboard Ready**: Automatic JSONL generation for submission
 
             ## 📊 Model Recommendations
 
-            | Model | Best For | Memory | Speed | Quality |
-
-            | Fast & Light | Quick testing | Low | Fast | Good |
-            | Balanced | General use | Medium | Medium | Better |
-            | High Quality | Best results | High | Slow | Best |
-            | Instruction Following | Complex reasoning | High | Medium | Excellent |
+            | Model | Best For | Memory | Speed | Quality | 300Q Time | Cost (T4) |
+            |-------|----------|---------|-------|---------|-----------|-----------|
+            | Fast & Light | Quick testing | Low | Fast | Good | 45-75 min | ~$0.60-1.00 |
+            | Balanced | General use | Medium | Medium | Better | 60-120 min | ~$1.00-2.00 |
+            | High Quality | Best results | High | Slow | Best | 90-180 min | ~$1.50-3.00 |
+            | Instruction Following | Complex reasoning | High | Medium | Excellent | 75-150 min | ~$1.25-2.50 |
 
             ## 🏅 Benchmark Performance Expectations
@@ -1040,22 +1053,42 @@ def create_gaia_app():
             | **Level 3** (Advanced) | 35-60% | 20-35% | 10-20% |
             | **Overall Average** | 65-75% | 45-65% | 30-45% |
 
-            ## 🚀
-
-
-
-
-
-
-            6. **Iteration**: Improve and re-benchmark
-
-
-            -
-            -
-            -
-            - [Hugging Face Spaces](https://huggingface.co/docs/hub/spaces) - Deployment documentation
-
+            ## 🚀 Flexible Benchmarking Features
+
+            ### 🎯 **Custom Question Selection**
+            - **Question Count**: Choose 10-300 questions
+            - **Selection Strategies**: Balanced, Random, Sequential
+            - **Level Distribution**: Automatic balancing across difficulties
+            - **Reproducible**: Consistent results with same settings
+
+            ### 📊 **Smart Sampling**
+            - **Balanced**: Realistic distribution (40% L1, 35% L2, 25% L3)
+            - **Representative**: Questions from all difficulty levels
+            - **Efficient**: Test fewer questions while maintaining quality
+
+            ### ⚡ **Quick Options**
+            - **Quick Test (20Q)**: 5-15 minutes, ~$0.10-0.25
+            - **Medium Test (50Q)**: 15-30 minutes, ~$0.25-0.50
+            - **Full Benchmark (300Q)**: 1-3 hours, ~$1-3
+
+            ## 🔄 Continuous Benchmarking Workflow
+
+            1. **Development**: Start with Quick Test (20 questions)
+            2. **Validation**: Use Medium Test (50 questions) for validation
+            3. **Optimization**: Iterate on model improvements
+            4. **Benchmarking**: Run Full Benchmark (300 questions) when ready
+            5. **Submission**: Upload to official GAIA leaderboard
+            6. **Analysis**: Compare with other models and iterate
+
+            ## 📋 Official Dataset Integration
+
+            ### **Metadata.jsonl Structure**
+            - **Questions**: Stored in `2023/validation/metadata.jsonl` and `2023/test/metadata.jsonl`
+            - **Additional Files**: Some questions reference images, documents, or data files
+            - **Format**: Each line contains one question in JSON format
+            - **Fields**: `task_id`, `Question`, `Level`, `file_name` (optional), `Final answer` (validation only)
+
+            ### **Submission Format**
             Results are saved in official GAIA leaderboard format:
             ```json
             {"task_id": "gaia_001", "model_answer": "[FULL RESPONSE]", "reasoning_trace": "[STEP-BY-STEP REASONING]"}
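The "balanced" strategy described in that hunk (40% Level 1, 35% Level 2, 25% Level 3 over `metadata.jsonl`) could be implemented roughly as below. This is an illustrative reimplementation, not the code in `gaia_leaderboard_integration`; note that the GAIA dataset is gated, so `hf_hub_download` needs an authenticated Hugging Face token:

```python
# Sketch: balanced sampling over GAIA's metadata.jsonl (40/35/25 split).
import json
import random
from huggingface_hub import hf_hub_download

LEVEL_WEIGHTS = {1: 0.40, 2: 0.35, 3: 0.25}  # distribution quoted above

def load_metadata(split="validation"):
    # Each line of metadata.jsonl is one question with task_id / Question /
    # Level / optional file_name (plus Final answer on the validation split).
    path = hf_hub_download(repo_id="gaia-benchmark/GAIA", repo_type="dataset",
                           filename=f"2023/{split}/metadata.jsonl")
    with open(path, encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

def balanced_sample(questions, count, seed=0):
    rng = random.Random(seed)  # fixed seed -> reproducible selections
    picked = []
    for level, weight in LEVEL_WEIGHTS.items():
        pool = [q for q in questions if int(q["Level"]) == level]
        take = min(len(pool), round(count * weight))
        picked.extend(rng.sample(pool, take))
    # Rounding can leave the sample slightly short; top up at random.
    if len(picked) < count:
        rest = [q for q in questions if q not in picked]
        picked.extend(rng.sample(rest, min(len(rest), count - len(picked))))
    return picked[:count]
```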
@@ -1064,20 +1097,26 @@ def create_gaia_app():
 
             ## ⚡ Pro Tips for Best Results
 
-            ### Performance Optimization
-            1. **Start Small**: Always test with
+            ### **Performance Optimization**
+            1. **Start Small**: Always test with Quick Test first
             2. **Choose Wisely**: Balance speed vs quality based on your goals
             3. **Monitor Resources**: Use GPU acceleration for larger models
             4. **Validate Format**: Ensure JSONL files are properly formatted
 
-            ### Leaderboard Strategy
-            1. **Baseline First**: Get initial results with
-            2. **Iterate Quickly**: Test improvements on
+            ### **Leaderboard Strategy**
+            1. **Baseline First**: Get initial results with Quick Test
+            2. **Iterate Quickly**: Test improvements on Medium Test
             3. **Full Benchmark**: Run complete evaluation when ready
             4. **Compare Results**: Analyze performance across difficulty levels
             5. **Document Approach**: Include model details and methodology
 
-            ###
+            ### **Cost Management**
+            - **Development**: Use Quick Test (20Q) for rapid iteration (~$0.10-0.25)
+            - **Validation**: Use Medium Test (50Q) for validation (~$0.25-0.50)
+            - **Production**: Use Full Benchmark (300Q) for final submission (~$1-3)
+            - **Hardware**: T4 Small GPU recommended for best price/performance
+
+            ### **Common Pitfalls to Avoid**
             - Don't run full benchmark on untested models
             - Ensure stable internet connection for long evaluations
             - Verify submission file format before uploading
@@ -1086,18 +1125,25 @@ def create_gaia_app():
 
             ## 🎯 Getting Started Checklist
 
-            - [ ] Load and
-            - [ ] Try example questions in "Single Question"
-            - [ ] Run
-            - [ ]
-            - [ ] Run
-            - [ ]
-            - [ ]
-            - [ ]
+            - [ ] **Load Model**: Choose and load a model in "Model Setup"
+            - [ ] **Test Single**: Try example questions in "Single Question"
+            - [ ] **Quick Test**: Run 20-question benchmark to verify setup
+            - [ ] **Preview Dataset**: Check "Preview Test Questions" in Full Benchmark
+            - [ ] **Medium Test**: Run 50-question validation benchmark
+            - [ ] **Full Benchmark**: Run complete 300-question evaluation when ready
+            - [ ] **Download Files**: Get JSONL submission and metadata files
+            - [ ] **Submit**: Upload to GAIA leaderboard
+            - [ ] **Compare**: Analyze your results against other models!
+
+            ## 🔗 Resources
+            - [GAIA Paper](https://arxiv.org/abs/2311.12983) - Original research paper
+            - [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard) - Official rankings
+            - [GAIA Dataset](https://huggingface.co/datasets/gaia-benchmark/GAIA) - Official dataset repository
+            - [Hugging Face Spaces](https://huggingface.co/docs/hub/spaces) - Deployment documentation
 
             ---
 
-            **Ready to start benchmarking?** Begin with the Model Setup tab
+            **Ready to start benchmarking?** Begin with the Model Setup tab, then progress through Quick Test → Medium Test → Full Benchmark. Good luck climbing the leaderboard! 🚀
             """)
 
     return app
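Finally, the submission format shown in the info tab is one JSON object per line with `task_id`, `model_answer`, and `reasoning_trace`. A small hypothetical helper that writes and sanity-checks a file in that shape before upload (not part of the Space's code):

```python
# Sketch: write and validate a leaderboard submission in JSONL format.
import json

REQUIRED_KEYS = ("task_id", "model_answer", "reasoning_trace")

def write_submission(results, path="submission.jsonl"):
    with open(path, "w", encoding="utf-8") as f:
        for row in results:
            missing = set(REQUIRED_KEYS) - row.keys()
            if missing:
                raise ValueError(f"{row.get('task_id')}: missing {missing}")
            # Emit exactly the required fields, one JSON object per line.
            f.write(json.dumps({k: row[k] for k in REQUIRED_KEYS}) + "\n")
    return path

# Example usage with a single dummy result:
write_submission([{"task_id": "gaia_001",
                   "model_answer": "42",
                   "reasoning_trace": "Computed directly."}])
```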