Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
|
@@ -809,8 +809,10 @@ def create_gaia_app():
|
|
| 809 |
outputs=[download_output]
|
| 810 |
)
|
| 811 |
|
|
|
|
|
|
|
| 812 |
# ===============================
|
| 813 |
-
# TAB 4: FULL BENCHMARK (
|
| 814 |
# ===============================
|
| 815 |
with gr.Tab("🏆 Full Benchmark"):
|
| 816 |
gr.Markdown("## Official GAIA Leaderboard Benchmark")
|
|
@@ -826,20 +828,69 @@ def create_gaia_app():
|
|
| 826 |
value="Click above to preview official test questions"
|
| 827 |
)
|
| 828 |
|
| 829 |
-
#
|
| 830 |
-
gr.
|
| 831 |
-
gr.Markdown(
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 835 |
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
| 839 |
-
|
| 840 |
)
|
| 841 |
|
| 842 |
-
# Benchmark
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 843 |
benchmark_status = gr.Textbox(
|
| 844 |
label="📊 Benchmark Status",
|
| 845 |
value="Ready to run benchmark",
|
|
@@ -875,12 +926,26 @@ def create_gaia_app():
|
|
| 875 |
|
| 876 |
# Event handlers
|
| 877 |
test_preview_btn.click(
|
| 878 |
-
fn=load_test_questions_interface,
|
| 879 |
outputs=[test_preview_output]
|
| 880 |
)
|
| 881 |
|
| 882 |
-
|
| 883 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 884 |
return (
|
| 885 |
status,
|
| 886 |
report,
|
|
@@ -890,18 +955,26 @@ def create_gaia_app():
|
|
| 890 |
gr.update(visible=True) # Show metadata file
|
| 891 |
)
|
| 892 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 893 |
full_benchmark_btn.click(
|
| 894 |
-
fn=
|
| 895 |
-
outputs=[
|
| 896 |
-
|
| 897 |
-
|
| 898 |
-
|
| 899 |
-
|
| 900 |
-
submission_file, # Update visibility
|
| 901 |
-
metadata_file # Update visibility
|
| 902 |
-
]
|
| 903 |
)
|
| 904 |
-
|
| 905 |
# ===============================
|
| 906 |
# TAB 5: INFORMATION (UPDATED)
|
| 907 |
# ===============================
|
|
|
|
| 809 |
outputs=[download_output]
|
| 810 |
)
|
| 811 |
|
| 812 |
+
# Add this to your Full Benchmark tab in app.py
|
| 813 |
+
|
| 814 |
# ===============================
|
| 815 |
+
# TAB 4: FULL BENCHMARK (UPDATED FOR 300 QUESTIONS)
|
| 816 |
# ===============================
|
| 817 |
with gr.Tab("🏆 Full Benchmark"):
|
| 818 |
gr.Markdown("## Official GAIA Leaderboard Benchmark")
|
|
|
|
| 828 |
value="Click above to preview official test questions"
|
| 829 |
)
|
| 830 |
|
| 831 |
+
# Dataset structure preview (NEW)
|
| 832 |
+
dataset_structure_btn = gr.Button("📁 Preview Dataset Structure", variant="secondary")
|
| 833 |
+
dataset_structure_output = gr.Markdown(
|
| 834 |
+
value="Click above to see actual GAIA dataset structure"
|
| 835 |
+
)
|
| 836 |
+
|
| 837 |
+
# Benchmark Configuration Section (NEW)
|
| 838 |
+
gr.Markdown("### 🎛️ Benchmark Configuration")
|
| 839 |
+
|
| 840 |
+
with gr.Row():
|
| 841 |
+
with gr.Column():
|
| 842 |
+
# Question count selection
|
| 843 |
+
question_count = gr.Slider(
|
| 844 |
+
minimum=10,
|
| 845 |
+
maximum=300,
|
| 846 |
+
value=50,
|
| 847 |
+
step=10,
|
| 848 |
+
label="Number of Questions",
|
| 849 |
+
info="Choose how many questions to evaluate (300 = full benchmark)"
|
| 850 |
+
)
|
| 851 |
+
|
| 852 |
+
# Selection strategy
|
| 853 |
+
selection_strategy = gr.Dropdown(
|
| 854 |
+
choices=["balanced", "random", "sequential"],
|
| 855 |
+
value="balanced",
|
| 856 |
+
label="Question Selection Strategy",
|
| 857 |
+
info="Balanced recommended for representative evaluation"
|
| 858 |
+
)
|
| 859 |
|
| 860 |
+
with gr.Column():
|
| 861 |
+
# Configuration info
|
| 862 |
+
config_info = gr.Markdown(
|
| 863 |
+
value=get_question_selection_info()
|
| 864 |
)
|
| 865 |
|
| 866 |
+
# Benchmark execution
|
| 867 |
+
gr.Markdown("### 🚀 Run Benchmark")
|
| 868 |
+
|
| 869 |
+
with gr.Row():
|
| 870 |
+
# Custom benchmark button
|
| 871 |
+
custom_benchmark_btn = gr.Button(
|
| 872 |
+
"🎯 Start Custom Benchmark",
|
| 873 |
+
variant="primary",
|
| 874 |
+
size="lg"
|
| 875 |
+
)
|
| 876 |
+
|
| 877 |
+
# Full 300-question benchmark button
|
| 878 |
+
full_benchmark_btn = gr.Button(
|
| 879 |
+
"🏆 Start FULL 300-Question Benchmark",
|
| 880 |
+
variant="secondary",
|
| 881 |
+
size="lg"
|
| 882 |
+
)
|
| 883 |
+
|
| 884 |
+
# Warning message for full benchmark
|
| 885 |
+
gr.Markdown("""
|
| 886 |
+
**⚠️ Full 300-Question Benchmark Warning**:
|
| 887 |
+
- **Time**: 1-3 hours depending on model and hardware
|
| 888 |
+
- **Cost**: ~$1-3 on GPU (T4 Small recommended)
|
| 889 |
+
- **Purpose**: Official leaderboard submission
|
| 890 |
+
- **Recommendation**: Test with smaller batches first
|
| 891 |
+
""")
|
| 892 |
+
|
| 893 |
+
# Results section
|
| 894 |
benchmark_status = gr.Textbox(
|
| 895 |
label="📊 Benchmark Status",
|
| 896 |
value="Ready to run benchmark",
|
|
|
|
| 926 |
|
| 927 |
# Event handlers
|
| 928 |
test_preview_btn.click(
|
| 929 |
+
fn=lambda: load_test_questions_interface(max_questions=10, selection_type="balanced"),
|
| 930 |
outputs=[test_preview_output]
|
| 931 |
)
|
| 932 |
|
| 933 |
+
# NEW: Dataset structure preview
|
| 934 |
+
dataset_structure_btn.click(
|
| 935 |
+
fn=preview_dataset_structure_interface,
|
| 936 |
+
outputs=[dataset_structure_output]
|
| 937 |
+
)
|
| 938 |
+
|
| 939 |
+
# Custom benchmark with user settings
|
| 940 |
+
def run_custom_benchmark_with_settings(num_questions, strategy, progress=gr.Progress()):
|
| 941 |
+
return run_custom_benchmark_interface(num_questions, strategy, progress)
|
| 942 |
+
|
| 943 |
+
# Full 300-question benchmark
|
| 944 |
+
def run_full_300_benchmark(progress=gr.Progress()):
|
| 945 |
+
return run_custom_benchmark_interface(300, "balanced", progress)
|
| 946 |
+
|
| 947 |
+
def benchmark_with_files(*args):
|
| 948 |
+
status, report, sub_file, meta_file = args[0], args[1], args[2], args[3]
|
| 949 |
return (
|
| 950 |
status,
|
| 951 |
report,
|
|
|
|
| 955 |
gr.update(visible=True) # Show metadata file
|
| 956 |
)
|
| 957 |
|
| 958 |
+
# Custom benchmark event
|
| 959 |
+
custom_benchmark_btn.click(
|
| 960 |
+
fn=run_custom_benchmark_with_settings,
|
| 961 |
+
inputs=[question_count, selection_strategy],
|
| 962 |
+
outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
|
| 963 |
+
).then(
|
| 964 |
+
fn=benchmark_with_files,
|
| 965 |
+
inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
|
| 966 |
+
outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
|
| 967 |
+
)
|
| 968 |
+
|
| 969 |
+
# Full 300-question benchmark event
|
| 970 |
full_benchmark_btn.click(
|
| 971 |
+
fn=run_full_300_benchmark,
|
| 972 |
+
outputs=[benchmark_status, benchmark_report, submission_file, metadata_file]
|
| 973 |
+
).then(
|
| 974 |
+
fn=benchmark_with_files,
|
| 975 |
+
inputs=[benchmark_status, benchmark_report, submission_file, metadata_file],
|
| 976 |
+
outputs=[benchmark_status, benchmark_report, submission_file, metadata_file, submission_file, metadata_file]
|
|
|
|
|
|
|
|
|
|
| 977 |
)
|
|
|
|
| 978 |
# ===============================
|
| 979 |
# TAB 5: INFORMATION (UPDATED)
|
| 980 |
# ===============================
|