Update mmlu_pro_eval_adapted.py

mmlu_pro_eval_adapted.py (CHANGED, +20 -14)
@@ -310,17 +310,18 @@ def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5, debug_mode

     return correctness, accuracy

-
-@spaces.GPU(duration=240) # Extended to 4 minutes for larger evaluations
-def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5):
+def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5, specific_subjects=None, flash_attention=False, regex_pattern=None):
     """
     Main evaluation function for MMLU-Pro benchmark.

     Args:
         model_name: Name/path of model to evaluate
         num_subjects: Number of subjects to test (-1 for all)
-        num_questions: Number of questions per subject
+        num_questions: Number of questions per subject (-1 for all)
         num_shots: Number of examples to include in prompts
+        specific_subjects: List of specific subjects to evaluate (overrides num_subjects)
+        flash_attention: Whether to use flash attention (currently ignored)
+        regex_pattern: Regex pattern for answer extraction (currently ignored)
     """
     print(f"Is CUDA available: {torch.cuda.is_available()}")
     print(f"CUDA device: {torch.cuda.get_device_name(torch.cuda.current_device())}")
@@ -332,12 +333,13 @@ def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5
     # Sort DataFrames
     test_df = test_df.sort_values(['category', 'question_id'])
     val_df = val_df.sort_values(['category', 'question_id'])
-
     # Get unique subjects
     all_subjects = sorted(test_df['category'].unique())

-    # Select subjects based on num_subjects
-    if num_subjects == -1 or num_subjects >= len(all_subjects):
+    # Select subjects based on parameters
+    if specific_subjects is not None:
+        selected_subjects = [subject for subject in specific_subjects if subject in all_subjects]
+    elif num_subjects == -1 or num_subjects >= len(all_subjects):
         selected_subjects = all_subjects
     else:
         selected_subjects = all_subjects[:num_subjects]
@@ -348,13 +350,17 @@ def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5
     results = {}
     all_correctness = []
     results_table = []
-
     # Process each subject
     for subject in tqdm(selected_subjects, desc="Processing Selected Categories"):
         # Filter data for current subject
-        test_samples = test_df[test_df['category'] == subject].head(num_questions)
+        if num_questions == -1:
+            # Use all questions for this subject
+            test_samples = test_df[test_df['category'] == subject]
+        else:
+            # Use specified number of questions
+            test_samples = test_df[test_df['category'] == subject].head(num_questions)
+
         val_samples = val_df[val_df['category'] == subject].head(num_shots)
-
         # Run evaluation
         correctness, acc = eval_cot(
             subject,
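Note on the num_questions == -1 branch in the hunk above: pandas' DataFrame.head(-1) does not mean "all rows" (it returns every row except the last), so the explicit branch is what actually provides the "use all questions" behavior. A minimal sketch of that difference, using an illustrative frame:

import pandas as pd

df = pd.DataFrame({"question_id": [1, 2, 3]})
print(len(df.head(-1)))  # 2: head(-1) silently drops the last row
print(len(df))           # 3: the num_questions == -1 branch keeps every question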
@@ -377,13 +383,13 @@ def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5

     # Calculate overall metrics
     weighted_acc = np.mean(all_correctness)
-    min_acc_subject = min(results.items(), key=lambda x: x[1])
-    max_acc_subject = max(results.items(), key=lambda x: x[1])
+    min_acc_subject = min(results.items(), key=lambda x: x[1])
+    max_acc_subject = max(results.items(), key=lambda x: x[1])

     # Return results summary
     return {
         "overall_accuracy": weighted_acc,
-        "min_accuracy_subject":
-        "max_accuracy_subject":
+        "min_accuracy_subject": min_acc_subject,
+        "max_accuracy_subject": max_acc_subject,
         "full_accuracy_table": results_table,
     }
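Note on the summary hunk above: min(results.items(), key=lambda x: x[1]) and its max counterpart return (subject, accuracy) pairs, so the new min_accuracy_subject and max_accuracy_subject entries in the returned dict are tuples rather than bare subject names. A toy illustration with made-up accuracies:

results = {"math": 0.41, "physics": 0.55, "history": 0.62}  # made-up numbers
min_acc_subject = min(results.items(), key=lambda x: x[1])
max_acc_subject = max(results.items(), key=lambda x: x[1])
print(min_acc_subject)  # ('math', 0.41)
print(max_acc_subject)  # ('history', 0.62)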
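For reference, a minimal call sketch against the updated signature; the import path, model name, and subject names are assumptions for illustration, and flash_attention and regex_pattern are accepted but currently ignored, per the docstring:

from mmlu_pro_eval_adapted import evaluate_mmlu_pro  # assumes this module is importable

summary = evaluate_mmlu_pro(
    model_name="meta-llama/Llama-3.2-1B-Instruct",  # illustrative model choice
    num_questions=-1,                       # -1 now means all questions per subject
    num_shots=5,
    specific_subjects=["math", "physics"],  # overrides num_subjects when provided
    flash_attention=False,                  # accepted but currently ignored
    regex_pattern=None,                     # accepted but currently ignored
)
print(summary["overall_accuracy"])
print(summary["min_accuracy_subject"], summary["max_accuracy_subject"])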