Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -68,7 +68,7 @@ def run_toy_evaluation():
|
|
| 68 |
# 3. MMLU Evaluation call
|
| 69 |
# ---------------------------------------------------------------------------
|
| 70 |
@spaces.GPU(duration=120) # Allow up to 2 minutes for full evaluation
|
| 71 |
-
def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, num_questions):
|
| 72 |
"""
|
| 73 |
Runs the MMLU evaluation with the specified parameters.
|
| 74 |
|
|
@@ -78,13 +78,15 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, nu
|
|
| 78 |
num_shots (int): Number of few-shot examples (0-5)
|
| 79 |
all_questions (bool): Whether to evaluate all questions per subject
|
| 80 |
num_questions (int): Number of examples per subject (1-20 or -1 for all)
|
|
|
|
| 81 |
"""
|
| 82 |
|
| 83 |
if not model_loaded:
|
| 84 |
load_model()
|
| 85 |
|
| 86 |
if not model_loaded:
|
| 87 |
-
return "⚠️ Model not loaded. Please load the model first.", None
|
|
|
|
| 88 |
|
| 89 |
# Convert num_subjects to -1 if all_subjects is True
|
| 90 |
if all_subjects:
|
|
@@ -142,7 +144,9 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, nu
|
|
| 142 |
f"* Evaluation completed in {elapsed_time:.2f} seconds\n"
|
| 143 |
)
|
| 144 |
|
| 145 |
-
|
|
|
|
|
|
|
| 146 |
|
| 147 |
# ---------------------------------------------------------------------------
|
| 148 |
# 4. Gradio Interface
|
|
@@ -200,20 +204,31 @@ with gr.Blocks() as demo:
|
|
| 200 |
value=False, # Default is unchecked
|
| 201 |
info="When checked, evaluates all available questions for each subject"
|
| 202 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 203 |
num_questions_slider = gr.Slider(
|
| 204 |
minimum=1,
|
| 205 |
maximum=20,
|
| 206 |
value=10, # Default is 10 questions
|
| 207 |
step=1,
|
| 208 |
label="Questions per Subject",
|
| 209 |
-
info="Choose a subset of questions (1-20)
|
| 210 |
interactive=True
|
| 211 |
)
|
| 212 |
|
| 213 |
with gr.Row():
|
| 214 |
-
|
|
|
|
|
|
|
| 215 |
results_output = gr.Markdown(label="Evaluation Results")
|
| 216 |
-
|
|
|
|
|
|
|
| 217 |
|
| 218 |
# Connect components
|
| 219 |
load_button.click(fn=load_model, inputs=None, outputs=load_status)
|
|
@@ -238,21 +253,61 @@ with gr.Blocks() as demo:
|
|
| 238 |
outputs=[num_subjects_slider]
|
| 239 |
)
|
| 240 |
|
| 241 |
-
# Update
|
| 242 |
-
def
|
| 243 |
if checked:
|
| 244 |
-
return gr.update(
|
| 245 |
else:
|
| 246 |
-
return gr.update(
|
| 247 |
|
| 248 |
all_questions_checkbox.change(
|
| 249 |
-
fn=
|
| 250 |
inputs=[all_questions_checkbox],
|
| 251 |
-
outputs=[
|
| 252 |
)
|
| 253 |
|
| 254 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 255 |
eval_mmlu_button.click(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 256 |
fn=run_mmlu_evaluation,
|
| 257 |
inputs=[
|
| 258 |
all_subjects_checkbox,
|
|
@@ -261,7 +316,30 @@ with gr.Blocks() as demo:
|
|
| 261 |
all_questions_checkbox,
|
| 262 |
num_questions_slider
|
| 263 |
],
|
| 264 |
-
outputs=[
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 265 |
)
|
| 266 |
|
| 267 |
-
demo.launch()
|
|
|
|
| 68 |
# 3. MMLU Evaluation call
|
| 69 |
# ---------------------------------------------------------------------------
|
| 70 |
@spaces.GPU(duration=120) # Allow up to 2 minutes for full evaluation
|
| 71 |
+
def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, num_questions, progress=gr.Progress()):
|
| 72 |
"""
|
| 73 |
Runs the MMLU evaluation with the specified parameters.
|
| 74 |
|
|
|
|
| 78 |
num_shots (int): Number of few-shot examples (0-5)
|
| 79 |
all_questions (bool): Whether to evaluate all questions per subject
|
| 80 |
num_questions (int): Number of examples per subject (1-20 or -1 for all)
|
| 81 |
+
progress (gr.Progress): Progress indicator
|
| 82 |
"""
|
| 83 |
|
| 84 |
if not model_loaded:
|
| 85 |
load_model()
|
| 86 |
|
| 87 |
if not model_loaded:
|
| 88 |
+
return "⚠️ Model not loaded. Please load the model first.", None, gr.update(interactive=True), gr.update(visible=False), \
|
| 89 |
+
[gr.update(interactive=True) for _ in range(5)]
|
| 90 |
|
| 91 |
# Convert num_subjects to -1 if all_subjects is True
|
| 92 |
if all_subjects:
|
|
|
|
| 144 |
f"* Evaluation completed in {elapsed_time:.2f} seconds\n"
|
| 145 |
)
|
| 146 |
|
| 147 |
+
# Return values that re-enable UI components after completion
|
| 148 |
+
return report, results_df, gr.update(interactive=True), gr.update(visible=False), \
|
| 149 |
+
[gr.update(interactive=True) for _ in range(5)]
|
| 150 |
|
| 151 |
# ---------------------------------------------------------------------------
|
| 152 |
# 4. Gradio Interface
|
|
|
|
| 204 |
value=False, # Default is unchecked
|
| 205 |
info="When checked, evaluates all available questions for each subject"
|
| 206 |
)
|
| 207 |
+
questions_info_text = gr.Markdown(visible=False, value="**All 14,042 questions across all subjects will be evaluated**")
|
| 208 |
+
|
| 209 |
+
with gr.Row(elem_id="questions_selection_row"):
|
| 210 |
+
questions_container = gr.Column(scale=1, elem_id="questions_slider_container")
|
| 211 |
+
|
| 212 |
+
# Move the slider into the container for easier visibility toggling
|
| 213 |
+
with questions_container:
|
| 214 |
num_questions_slider = gr.Slider(
|
| 215 |
minimum=1,
|
| 216 |
maximum=20,
|
| 217 |
value=10, # Default is 10 questions
|
| 218 |
step=1,
|
| 219 |
label="Questions per Subject",
|
| 220 |
+
info="Choose a subset of questions (1-20)",
|
| 221 |
interactive=True
|
| 222 |
)
|
| 223 |
|
| 224 |
with gr.Row():
|
| 225 |
+
with gr.Column(scale=1):
|
| 226 |
+
eval_mmlu_button = gr.Button("Run MMLU Evaluation", variant="primary", interactive=True)
|
| 227 |
+
cancel_mmlu_button = gr.Button("Cancel MMLU Evaluation", variant="stop", visible=False)
|
| 228 |
results_output = gr.Markdown(label="Evaluation Results")
|
| 229 |
+
|
| 230 |
+
with gr.Row():
|
| 231 |
+
results_table = gr.DataFrame(interactive=True, label="Detailed Results (Sortable)", visible=True)
|
| 232 |
|
| 233 |
# Connect components
|
| 234 |
load_button.click(fn=load_model, inputs=None, outputs=load_status)
|
|
|
|
| 253 |
outputs=[num_subjects_slider]
|
| 254 |
)
|
| 255 |
|
| 256 |
+
# Update interface based on all_questions checkbox
|
| 257 |
+
def update_questions_interface(checked):
|
| 258 |
if checked:
|
| 259 |
+
return gr.update(visible=False), gr.update(visible=True)
|
| 260 |
else:
|
| 261 |
+
return gr.update(visible=True), gr.update(visible=False)
|
| 262 |
|
| 263 |
all_questions_checkbox.change(
|
| 264 |
+
fn=update_questions_interface,
|
| 265 |
inputs=[all_questions_checkbox],
|
| 266 |
+
outputs=[questions_container, questions_info_text]
|
| 267 |
)
|
| 268 |
|
| 269 |
+
# Function to disable UI components during evaluation
|
| 270 |
+
def disable_ui_for_evaluation():
|
| 271 |
+
return [
|
| 272 |
+
gr.update(interactive=False, info="MMLU Evaluation currently in progress"), # all_subjects_checkbox
|
| 273 |
+
gr.update(interactive=False, info="MMLU Evaluation currently in progress"), # num_subjects_slider
|
| 274 |
+
gr.update(interactive=False, info="MMLU Evaluation currently in progress"), # num_shots_slider
|
| 275 |
+
gr.update(interactive=False, info="MMLU Evaluation currently in progress"), # all_questions_checkbox
|
| 276 |
+
gr.update(interactive=False, info="MMLU Evaluation currently in progress"), # num_questions_slider
|
| 277 |
+
gr.update(interactive=False), # eval_mmlu_button
|
| 278 |
+
gr.update(visible=True) # cancel_mmlu_button
|
| 279 |
+
]
|
| 280 |
+
|
| 281 |
+
# Function to handle cancel button click
|
| 282 |
+
def cancel_evaluation():
|
| 283 |
+
# This doesn't actually cancel the GPU job (which would require more backend support)
|
| 284 |
+
# But it does reset the UI state to be interactive again
|
| 285 |
+
return [
|
| 286 |
+
gr.update(interactive=True, info="When checked, evaluates all 57 MMLU subjects"), # all_subjects_checkbox
|
| 287 |
+
gr.update(interactive=True, info="Number of subjects to evaluate (1-57). They will be loaded in alphabetical order."), # num_subjects_slider
|
| 288 |
+
gr.update(interactive=True, info="Number of examples to use for few-shot learning (0-5). They will be loaded in alphabetical order."), # num_shots_slider
|
| 289 |
+
gr.update(interactive=True, info="When checked, evaluates all available questions for each subject"), # all_questions_checkbox
|
| 290 |
+
gr.update(interactive=True, info="Choose a subset of questions (1-20)"), # num_questions_slider
|
| 291 |
+
gr.update(interactive=True), # eval_mmlu_button
|
| 292 |
+
gr.update(visible=False), # cancel_mmlu_button
|
| 293 |
+
"⚠️ Evaluation canceled by user", # results_output
|
| 294 |
+
None # results_table
|
| 295 |
+
]
|
| 296 |
+
|
| 297 |
+
# Connect MMLU evaluation button - now disables UI and shows cancel button
|
| 298 |
eval_mmlu_button.click(
|
| 299 |
+
fn=disable_ui_for_evaluation,
|
| 300 |
+
inputs=None,
|
| 301 |
+
outputs=[
|
| 302 |
+
all_subjects_checkbox,
|
| 303 |
+
num_subjects_slider,
|
| 304 |
+
num_shots_slider,
|
| 305 |
+
all_questions_checkbox,
|
| 306 |
+
num_questions_slider,
|
| 307 |
+
eval_mmlu_button,
|
| 308 |
+
cancel_mmlu_button
|
| 309 |
+
]
|
| 310 |
+
).then(
|
| 311 |
fn=run_mmlu_evaluation,
|
| 312 |
inputs=[
|
| 313 |
all_subjects_checkbox,
|
|
|
|
| 316 |
all_questions_checkbox,
|
| 317 |
num_questions_slider
|
| 318 |
],
|
| 319 |
+
outputs=[
|
| 320 |
+
results_output,
|
| 321 |
+
results_table,
|
| 322 |
+
eval_mmlu_button,
|
| 323 |
+
cancel_mmlu_button,
|
| 324 |
+
[all_subjects_checkbox, num_subjects_slider, num_shots_slider, all_questions_checkbox, num_questions_slider]
|
| 325 |
+
]
|
| 326 |
+
)
|
| 327 |
+
|
| 328 |
+
# Connect cancel button
|
| 329 |
+
cancel_mmlu_button.click(
|
| 330 |
+
fn=cancel_evaluation,
|
| 331 |
+
inputs=None,
|
| 332 |
+
outputs=[
|
| 333 |
+
all_subjects_checkbox,
|
| 334 |
+
num_subjects_slider,
|
| 335 |
+
num_shots_slider,
|
| 336 |
+
all_questions_checkbox,
|
| 337 |
+
num_questions_slider,
|
| 338 |
+
eval_mmlu_button,
|
| 339 |
+
cancel_mmlu_button,
|
| 340 |
+
results_output,
|
| 341 |
+
results_table
|
| 342 |
+
]
|
| 343 |
)
|
| 344 |
|
| 345 |
+
demo.launch()
|