Ashokdll committed
Commit ba63dbb · verified · 1 Parent(s): 8fea07e

Update app.py

Files changed (1)
  1. app.py +150 -104
app.py CHANGED
@@ -30,12 +30,14 @@ from transformers import (
  from datasets import load_dataset
  from huggingface_hub import HfApi, hf_hub_download

- # Import leaderboard integration
  from gaia_leaderboard_integration import (
  enhanced_gaia_agent,
- run_leaderboard_benchmark_interface,
  load_test_questions_interface,
- get_leaderboard_info
  )

  # Setup logging
@@ -809,10 +811,8 @@ def create_gaia_app():
  outputs=[download_output]
  )

- # Add this to your Full Benchmark tab in app.py
-
  # ===============================
- # TAB 4: FULL BENCHMARK (UPDATED FOR 300 QUESTIONS)
  # ===============================
  with gr.Tab("🏆 Full Benchmark"):
  gr.Markdown("## Official GAIA Leaderboard Benchmark")
@@ -834,52 +834,36 @@ def create_gaia_app():
  value="Click above to see actual GAIA dataset structure"
  )

- # Benchmark Configuration Section (NEW)
- gr.Markdown("### 🎛️ Benchmark Configuration")

  with gr.Row():
- with gr.Column():
- # Question count selection
- question_count = gr.Slider(
  minimum=10,
- maximum=300,
  value=50,
  step=10,
- label="Number of Questions",
- info="Choose how many questions to evaluate (300 = full benchmark)"
  )

- # Selection strategy
  selection_strategy = gr.Dropdown(
  choices=["balanced", "random", "sequential"],
  value="balanced",
- label="Question Selection Strategy",
- info="Balanced recommended for representative evaluation"
- )
-
- with gr.Column():
- # Configuration info
- config_info = gr.Markdown(
- value=get_question_selection_info()
  )
-
- # Benchmark execution
- gr.Markdown("### 🚀 Run Benchmark")
-
- with gr.Row():
- # Custom benchmark button
- custom_benchmark_btn = gr.Button(
- "🎯 Start Custom Benchmark",
- variant="primary",
- size="lg"
- )

- # Full 300-question benchmark button
- full_benchmark_btn = gr.Button(
- "🏆 Start FULL 300-Question Benchmark",
- variant="secondary",
- size="lg"
- )

  # Warning message for full benchmark
  gr.Markdown("""
@@ -890,7 +874,7 @@ def create_gaia_app():
  - **Recommendation**: Test with smaller batches first
  """)

- # Results section
  benchmark_status = gr.Textbox(
  label="📊 Benchmark Status",
  value="Ready to run benchmark",
@@ -900,7 +884,7 @@ def create_gaia_app():
  with gr.Row():
  with gr.Column():
  benchmark_report = gr.Markdown(
- label="📈 Benchmark Report",
  value="Run benchmark to see detailed results"
  )

@@ -912,69 +896,97 @@ def create_gaia_app():
  )

  metadata_file = gr.File(
- label="📋 Download Metadata File",
  visible=False
  )

  gr.Markdown("""
- ### 📤 Leaderboard Submission
- 1. Download the JSONL file above
- 2. Visit [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
- 3. Upload your submission file
- 4. View your model's ranking!
  """)

- # Event handlers
  test_preview_btn.click(
  fn=lambda: load_test_questions_interface(max_questions=10, selection_type="balanced"),
  outputs=[test_preview_output]
  )

- # NEW: Dataset structure preview
  dataset_structure_btn.click(
  fn=preview_dataset_structure_interface,
- outputs=[dataset_structure_output]
  )

- # Custom benchmark with user settings
- def run_custom_benchmark_with_settings(num_questions, strategy, progress=gr.Progress()):
- return run_custom_benchmark_interface(num_questions, strategy, progress)

- # Full 300-question benchmark
  def run_full_300_benchmark(progress=gr.Progress()):
  return run_custom_benchmark_interface(300, "balanced", progress)

- def benchmark_with_files(*args):
- status, report, sub_file, meta_file = args[0], args[1], args[2], args[3]
  return (
- status,
  report,
- sub_file,
  meta_file,
  gr.update(visible=True), # Show submission file
  gr.update(visible=True) # Show metadata file
  )

- # Custom benchmark event
- custom_benchmark_btn.click(
- fn=run_custom_benchmark_with_settings,
- inputs=[question_count, selection_strategy],
  outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
  ).then(
- fn=benchmark_with_files,
  inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
  outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
  )

- # Full 300-question benchmark event
  full_benchmark_btn.click(
  fn=run_full_300_benchmark,
  outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
  ).then(
- fn=benchmark_with_files,
  inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
  outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
  )
  # ===============================
  # TAB 5: INFORMATION (UPDATED)
  # ===============================
@@ -1015,19 +1027,20 @@ def create_gaia_app():
  - Then try "GAIA Test Set" for real benchmark evaluation
  - Download results in JSONL format for submission

- ### 4. Full Benchmark (NEW!)
- - Run complete evaluation on all 300 official test questions
- - Get leaderboard-ready submission files
- - Upload directly to GAIA leaderboard for ranking

  ## 📊 Model Recommendations

- | Model | Best For | Memory | Speed | Quality | Leaderboard Ready |
- |-------|----------|---------|-------|---------|------------------|
- | Fast & Light | Quick testing | Low | Fast | Good | |
- | Balanced | General use | Medium | Medium | Better | |
- | High Quality | Best results | High | Slow | Best | |
- | Instruction Following | Complex reasoning | High | Medium | Excellent | |

  ## 🏅 Benchmark Performance Expectations

@@ -1040,22 +1053,42 @@ def create_gaia_app():
  | **Level 3** (Advanced) | 35-60% | 20-35% | 10-20% |
  | **Overall Average** | 65-75% | 45-65% | 30-45% |

- ## 🚀 Continuous Benchmarking Workflow

- 1. **Development**: Test with sample questions
- 2. **Validation**: Run batch evaluation (10-50 questions)
- 3. **Benchmarking**: Full evaluation (300 questions)
- 4. **Submission**: Upload to leaderboard
- 5. **Analysis**: Compare with other models
- 6. **Iteration**: Improve and re-benchmark

- ## 🔗 Resources
- - [GAIA Paper](https://arxiv.org/abs/2311.12983) - Original research paper
- - [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard) - Official rankings
- - [GAIA Dataset](https://huggingface.co/datasets/gaia-benchmark/GAIA) - Training/validation data
- - [Hugging Face Spaces](https://huggingface.co/docs/hub/spaces) - Deployment documentation

- ## 📋 Submission Format
  Results are saved in official GAIA leaderboard format:
  ```json
  {"task_id": "gaia_001", "model_answer": "[FULL RESPONSE]", "reasoning_trace": "[STEP-BY-STEP REASONING]"}
@@ -1064,20 +1097,26 @@ def create_gaia_app():

  ## ⚡ Pro Tips for Best Results

- ### Performance Optimization
- 1. **Start Small**: Always test with sample questions first
  2. **Choose Wisely**: Balance speed vs quality based on your goals
  3. **Monitor Resources**: Use GPU acceleration for larger models
  4. **Validate Format**: Ensure JSONL files are properly formatted

- ### Leaderboard Strategy
- 1. **Baseline First**: Get initial results with fast model
- 2. **Iterate Quickly**: Test improvements on small batches
  3. **Full Benchmark**: Run complete evaluation when ready
  4. **Compare Results**: Analyze performance across difficulty levels
  5. **Document Approach**: Include model details and methodology

- ### Common Pitfalls to Avoid
  - Don't run full benchmark on untested models
  - Ensure stable internet connection for long evaluations
  - Verify submission file format before uploading
@@ -1086,18 +1125,25 @@ def create_gaia_app():

  ## 🎯 Getting Started Checklist

- - [ ] Load and test a model in "Model Setup"
- - [ ] Try example questions in "Single Question"
- - [ ] Run small batch in "Batch Evaluation"
- - [ ] Review test questions in "Full Benchmark"
- - [ ] Run complete benchmark when ready
- - [ ] Download submission files
- - [ ] Upload to GAIA leaderboard
- - [ ] Compare your results with others!

  ---

- **Ready to start benchmarking?** Begin with the Model Setup tab and work your way through each stage. Good luck! 🚀
  """)

  return app
  from datasets import load_dataset
  from huggingface_hub import HfApi, hf_hub_download

+ # Import leaderboard integration (CORRECTED IMPORTS)
  from gaia_leaderboard_integration import (
  enhanced_gaia_agent,
+ run_custom_benchmark_interface, # ← FIXED: was run_leaderboard_benchmark_interface
  load_test_questions_interface,
+ preview_dataset_structure_interface, # ← NEW FUNCTION
+ get_leaderboard_info,
+ get_question_selection_info # ← NEW FUNCTION
  )

  # Setup logging
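For orientation, here is a minimal sketch of the interface the corrected imports assume `gaia_leaderboard_integration` to expose. The signatures and return shapes below are inferred only from the call sites later in this diff; they are assumptions, not the module's actual definitions.

```python
# Hypothetical stubs inferred from this diff's call sites -- not the real module.
from typing import Any, Optional, Tuple

def run_custom_benchmark_interface(
    num_questions: int, strategy: str = "balanced", progress: Optional[Any] = None
) -> Tuple[str, str, Optional[str], Optional[str]]:
    """Run the benchmark; return (status, report_markdown, submission_path, metadata_path)."""
    raise NotImplementedError

def load_test_questions_interface(max_questions: int = 10, selection_type: str = "balanced") -> str:
    """Return a text preview of the selected GAIA test questions."""
    raise NotImplementedError

def preview_dataset_structure_interface() -> str:
    """Return a description of the GAIA dataset layout (metadata.jsonl and attached files)."""
    raise NotImplementedError

def get_question_selection_info() -> str:
    """Return Markdown explaining the available selection strategies."""
    raise NotImplementedError

def get_leaderboard_info() -> str:
    """Return Markdown describing the official GAIA leaderboard."""
    raise NotImplementedError
```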
 
  outputs=[download_output]
  )

  # ===============================
+ # TAB 4: FULL BENCHMARK (ENHANCED FOR 300 QUESTIONS)
  # ===============================
  with gr.Tab("🏆 Full Benchmark"):
  gr.Markdown("## Official GAIA Leaderboard Benchmark")
 
  value="Click above to see actual GAIA dataset structure"
  )

+ # Quick benchmark options
+ gr.Markdown("### 🎯 Quick Benchmark Options")

  with gr.Row():
+ # Preset buttons for common configurations
+ quick_test_btn = gr.Button("🚀 Quick Test (20 questions)", variant="secondary")
+ medium_test_btn = gr.Button("📊 Medium Test (50 questions)", variant="secondary")
+ full_benchmark_btn = gr.Button("🏆 FULL BENCHMARK (300 questions)", variant="primary", size="lg")
+
+ # Advanced configuration (collapsible)
+ with gr.Accordion("🎛️ Advanced Configuration", open=False):
+ with gr.Row():
+ custom_count = gr.Slider(
  minimum=10,
+ maximum=300,
  value=50,
  step=10,
+ label="Custom Question Count"
  )

  selection_strategy = gr.Dropdown(
  choices=["balanced", "random", "sequential"],
  value="balanced",
+ label="Selection Strategy"
  )

+ custom_benchmark_btn = gr.Button("🎯 Run Custom Benchmark", variant="secondary")
+
+ # Show selection info
+ selection_info = gr.Markdown(get_question_selection_info())

  # Warning message for full benchmark
  gr.Markdown("""
 
  - **Recommendation**: Test with smaller batches first
  """)

+ # Results section
  benchmark_status = gr.Textbox(
  label="📊 Benchmark Status",
  value="Ready to run benchmark",

  with gr.Row():
  with gr.Column():
  benchmark_report = gr.Markdown(
+ label="📈 Benchmark Report",
  value="Run benchmark to see detailed results"
  )

 
  )

  metadata_file = gr.File(
+ label="📋 Download Metadata File",
  visible=False
  )

  gr.Markdown("""
+ ### 📤 Leaderboard Submission Steps
+ 1. **Download** the JSONL file above
+ 2. **Visit** [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard)
+ 3. **Upload** your submission file
+ 4. **View** your model's ranking!
  """)

+ # ================================
+ # EVENT HANDLERS (FIXED FUNCTION CALLS)
+ # ================================
+
+ # Preview functions
  test_preview_btn.click(
  fn=lambda: load_test_questions_interface(max_questions=10, selection_type="balanced"),
  outputs=[test_preview_output]
  )

  dataset_structure_btn.click(
  fn=preview_dataset_structure_interface,
+ outputs=[dataset_structure_output]
  )

+ # Quick benchmark functions
+ def run_quick_test(progress=gr.Progress()):
+ return run_custom_benchmark_interface(20, "balanced", progress)
+
+ def run_medium_test(progress=gr.Progress()):
+ return run_custom_benchmark_interface(50, "balanced", progress)

  def run_full_300_benchmark(progress=gr.Progress()):
  return run_custom_benchmark_interface(300, "balanced", progress)

+ def run_custom_benchmark_wrapper(count, strategy, progress=gr.Progress()):
+ return run_custom_benchmark_interface(count, strategy, progress)
+
+ # Helper function to show download files
+ def show_download_files(status, report, sub_file, meta_file):
  return (
+ status,
  report,
+ sub_file,
  meta_file,
  gr.update(visible=True), # Show submission file
  gr.update(visible=True) # Show metadata file
  )

+ # Quick test events
+ quick_test_btn.click(
+ fn=run_quick_test,
  outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
  ).then(
+ fn=show_download_files,
  inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
  outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
  )

+ medium_test_btn.click(
+ fn=run_medium_test,
+ outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
+ ).then(
+ fn=show_download_files,
+ inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
+ outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
+ )
+
+ # FULL 300-question benchmark
  full_benchmark_btn.click(
  fn=run_full_300_benchmark,
  outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
  ).then(
+ fn=show_download_files,
+ inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
+ outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
+ )
+
+ # Custom benchmark
+ custom_benchmark_btn.click(
+ fn=run_custom_benchmark_wrapper,
+ inputs=[custom_count, selection_strategy],
+ outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
+ ).then(
+ fn=show_download_files,
  inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
  outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
  )
+
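All of the benchmark buttons above are wired with the same two-step pattern: the first `.click()` handler fills the status, report, and file components, and the chained `.then()` step reveals the hidden `gr.File` components via `gr.update(visible=True)`. A self-contained sketch of that pattern (component names here are illustrative, not the app's):

```python
import gradio as gr

def run_job():
    # Stand-in for run_custom_benchmark_interface: write a result file and return a status plus its path.
    path = "submission.jsonl"
    with open(path, "w", encoding="utf-8") as f:
        f.write('{"task_id": "demo", "model_answer": "42", "reasoning_trace": "..."}\n')
    return "Benchmark finished", path

with gr.Blocks() as demo:
    run_btn = gr.Button("Run")
    status_box = gr.Textbox(label="Status", value="Ready")
    result_file = gr.File(label="Download submission", visible=False)

    # Step 1 fills the components; step 2 flips the hidden File visible once a value exists.
    run_btn.click(
        fn=run_job,
        outputs=[status_box, result_file],
    ).then(
        fn=lambda: gr.update(visible=True),
        outputs=[result_file],
    )

if __name__ == "__main__":
    demo.launch()
```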
  # ===============================
  # TAB 5: INFORMATION (UPDATED)
  # ===============================
 
  - Then try "GAIA Test Set" for real benchmark evaluation
  - Download results in JSONL format for submission

+ ### 4. Full Benchmark (Enhanced!)
+ - **Quick Tests**: 20 or 50 questions for rapid iteration
+ - **Custom Configuration**: Choose exact question count and strategy
+ - **Full 300-Question Benchmark**: Complete official evaluation
+ - **Leaderboard Ready**: Automatic JSONL generation for submission

  ## 📊 Model Recommendations

+ | Model | Best For | Memory | Speed | Quality | 300Q Time | Cost (T4) |
+ |-------|----------|---------|-------|---------|-----------|-----------|
+ | Fast & Light | Quick testing | Low | Fast | Good | 45-75 min | ~$0.60-1.00 |
+ | Balanced | General use | Medium | Medium | Better | 60-120 min | ~$1.00-2.00 |
+ | High Quality | Best results | High | Slow | Best | 90-180 min | ~$1.50-3.00 |
+ | Instruction Following | Complex reasoning | High | Medium | Excellent | 75-150 min | ~$1.25-2.50 |

  ## 🏅 Benchmark Performance Expectations

  | **Level 3** (Advanced) | 35-60% | 20-35% | 10-20% |
  | **Overall Average** | 65-75% | 45-65% | 30-45% |

+ ## 🚀 Flexible Benchmarking Features

+ ### 🎯 **Custom Question Selection**
+ - **Question Count**: Choose 10-300 questions
+ - **Selection Strategies**: Balanced, Random, Sequential
+ - **Level Distribution**: Automatic balancing across difficulties
+ - **Reproducible**: Consistent results with same settings

+ ### 📊 **Smart Sampling**
+ - **Balanced**: Realistic distribution (40% L1, 35% L2, 25% L3)
+ - **Representative**: Questions from all difficulty levels
+ - **Efficient**: Test fewer questions while maintaining quality

+ ### **Quick Options**
+ - **Quick Test (20Q)**: 5-15 minutes, ~$0.10-0.25
+ - **Medium Test (50Q)**: 15-30 minutes, ~$0.25-0.50
+ - **Full Benchmark (300Q)**: 1-3 hours, ~$1-3
+
+ ## 🔄 Continuous Benchmarking Workflow
+
+ 1. **Development**: Start with Quick Test (20 questions)
+ 2. **Validation**: Use Medium Test (50 questions) for validation
+ 3. **Optimization**: Iterate on model improvements
+ 4. **Benchmarking**: Run Full Benchmark (300 questions) when ready
+ 5. **Submission**: Upload to official GAIA leaderboard
+ 6. **Analysis**: Compare with other models and iterate
+
+ ## 📋 Official Dataset Integration
+
+ ### **Metadata.jsonl Structure**
+ - **Questions**: Stored in `2023/validation/metadata.jsonl` and `2023/test/metadata.jsonl`
+ - **Additional Files**: Some questions reference images, documents, or data files
+ - **Format**: Each line contains one question in JSON format
+ - **Fields**: `task_id`, `Question`, `Level`, `file_name` (optional), `Final answer` (validation only)
+
+ ### **Submission Format**
  Results are saved in official GAIA leaderboard format:
  ```json
  {"task_id": "gaia_001", "model_answer": "[FULL RESPONSE]", "reasoning_trace": "[STEP-BY-STEP REASONING]"}
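To make the dataset and submission formats above concrete, here is a hedged sketch that reads questions from a GAIA-style `metadata.jsonl`, draws a roughly 40/35/25 balanced subset by `Level`, and writes answers as leaderboard-format JSONL. The paths and the `answer_question` helper are illustrative placeholders, not names from this repository.

```python
import json
import random

def load_metadata(path):
    """Read a GAIA-style metadata.jsonl: one JSON object per line."""
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]

def balanced_sample(questions, total, seed=42):
    """Pick roughly 40% Level 1, 35% Level 2, 25% Level 3 questions, reproducibly."""
    shares = {1: 0.40, 2: 0.35, 3: 0.25}
    rng = random.Random(seed)
    picked = []
    for level, share in shares.items():
        pool = [q for q in questions if int(q.get("Level", 0)) == level]
        k = min(len(pool), round(total * share))
        picked.extend(rng.sample(pool, k))
    return picked

def write_submission(rows, path):
    """Write leaderboard-format JSONL: one {task_id, model_answer, reasoning_trace} per line."""
    with open(path, "w", encoding="utf-8") as f:
        for r in rows:
            f.write(json.dumps({
                "task_id": r["task_id"],
                "model_answer": r["model_answer"],
                "reasoning_trace": r["reasoning_trace"],
            }) + "\n")

# Example wiring (answer_question stands in for the actual agent call):
# questions = load_metadata("2023/validation/metadata.jsonl")
# subset = balanced_sample(questions, total=50)
# rows = [{"task_id": q["task_id"], **answer_question(q["Question"])} for q in subset]
# write_submission(rows, "submission.jsonl")
```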
 
  ## ⚡ Pro Tips for Best Results

+ ### **Performance Optimization**
+ 1. **Start Small**: Always test with Quick Test first
  2. **Choose Wisely**: Balance speed vs quality based on your goals
  3. **Monitor Resources**: Use GPU acceleration for larger models
  4. **Validate Format**: Ensure JSONL files are properly formatted

+ ### **Leaderboard Strategy**
+ 1. **Baseline First**: Get initial results with Quick Test
+ 2. **Iterate Quickly**: Test improvements on Medium Test
  3. **Full Benchmark**: Run complete evaluation when ready
  4. **Compare Results**: Analyze performance across difficulty levels
  5. **Document Approach**: Include model details and methodology

+ ### **Cost Management**
+ - **Development**: Use Quick Test (20Q) for rapid iteration (~$0.10-0.25)
+ - **Validation**: Use Medium Test (50Q) for validation (~$0.25-0.50)
+ - **Production**: Use Full Benchmark (300Q) for final submission (~$1-3)
+ - **Hardware**: T4 Small GPU recommended for best price/performance
+
+ ### **Common Pitfalls to Avoid**
  - Don't run full benchmark on untested models
  - Ensure stable internet connection for long evaluations
  - Verify submission file format before uploading
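Since a malformed submission file is one of the pitfalls listed above, a small pre-upload check catches most problems. A sketch using only the field names shown in the submission format earlier; the official leaderboard may enforce additional rules:

```python
import json

REQUIRED_KEYS = {"task_id", "model_answer", "reasoning_trace"}

def validate_submission(path):
    """Return a list of problems found in a leaderboard-style JSONL file (empty list = looks OK)."""
    problems = []
    seen_ids = set()
    with open(path, "r", encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            line = line.strip()
            if not line:
                continue
            try:
                row = json.loads(line)
            except json.JSONDecodeError as exc:
                problems.append(f"line {lineno}: not valid JSON ({exc})")
                continue
            missing = REQUIRED_KEYS - row.keys()
            if missing:
                problems.append(f"line {lineno}: missing keys {sorted(missing)}")
            task_id = row.get("task_id")
            if task_id in seen_ids:
                problems.append(f"line {lineno}: duplicate task_id {task_id!r}")
            seen_ids.add(task_id)
    return problems

if __name__ == "__main__":
    for issue in validate_submission("submission.jsonl"):
        print(issue)
```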
 
  ## 🎯 Getting Started Checklist

+ - [ ] **Load Model**: Choose and load a model in "Model Setup"
+ - [ ] **Test Single**: Try example questions in "Single Question"
+ - [ ] **Quick Test**: Run 20-question benchmark to verify setup
+ - [ ] **Preview Dataset**: Check "Preview Test Questions" in Full Benchmark
+ - [ ] **Medium Test**: Run 50-question validation benchmark
+ - [ ] **Full Benchmark**: Run complete 300-question evaluation when ready
+ - [ ] **Download Files**: Get JSONL submission and metadata files
+ - [ ] **Submit**: Upload to GAIA leaderboard
+ - [ ] **Compare**: Analyze your results against other models!
+
+ ## 🔗 Resources
+ - [GAIA Paper](https://arxiv.org/abs/2311.12983) - Original research paper
+ - [GAIA Leaderboard](https://huggingface.co/spaces/gaia-benchmark/leaderboard) - Official rankings
+ - [GAIA Dataset](https://huggingface.co/datasets/gaia-benchmark/GAIA) - Official dataset repository
+ - [Hugging Face Spaces](https://huggingface.co/docs/hub/spaces) - Deployment documentation

  ---

+ **Ready to start benchmarking?** Begin with the Model Setup tab, then progress through Quick Test → Medium Test → Full Benchmark. Good luck climbing the leaderboard! 🚀
  """)

  return app