Update run_evaluation.py
run_evaluation.py  CHANGED  (+17, -0)
@@ -115,6 +115,23 @@ def run_mmlu_evaluation(subject_selection_mode, num_subjects, selected_subjects,
         'Difference': abs(overall_diff),
         'Winner': overall_winner
     })
+
+    report = (
+        f"### Head-to-Head Comparison Results\n\n"
+        f"#### Model 1: {model1_config['name']}\n"
+        f"* Overall Accuracy: {model1_overall_acc:.3f}\n"
+        f"* Best Performance: {model1_max_subject} ({model1_max_acc:.3f})\n"
+        f"* Worst Performance: {model1_min_subject} ({model1_min_acc:.3f})\n"
+        f"* Evaluation completed in {model1_elapsed_time:.2f} seconds\n\n"
+        f"#### Model 2: {model2_config['name']}\n"
+        f"* Overall Accuracy: {model2_overall_acc:.3f}\n"
+        f"* Best Performance: {model2_max_subject} ({model2_max_acc:.3f})\n"
+        f"* Worst Performance: {model2_min_subject} ({model2_min_acc:.3f})\n"
+        f"* Evaluation completed in {model2_elapsed_time:.2f} seconds\n\n"
+        f"#### Overall Winner: {overall_winner}\n"
+        f"* Margin: {abs(overall_diff):.3f}\n"
+    )
+
 
     comparison_df = pd.DataFrame(comparison_data)
 
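For reference, here is a minimal, self-contained sketch of the report pattern this diff adds, with hypothetical placeholder values standing in for the metrics that run_mmlu_evaluation computes from the two evaluation runs (the variable names mirror the diff; the model names and numbers below are illustrative only, not real results):

# Sketch of the new report block; every value here is a hypothetical
# placeholder, not an actual evaluation result.
model1_config = {'name': 'model-one'}
model2_config = {'name': 'model-two'}
model1_overall_acc, model2_overall_acc = 0.712, 0.688
overall_diff = model1_overall_acc - model2_overall_acc
overall_winner = model1_config['name'] if overall_diff > 0 else model2_config['name']

report = (
    f"### Head-to-Head Comparison Results\n\n"
    f"#### Model 1: {model1_config['name']}\n"
    f"* Overall Accuracy: {model1_overall_acc:.3f}\n\n"
    f"#### Model 2: {model2_config['name']}\n"
    f"* Overall Accuracy: {model2_overall_acc:.3f}\n\n"
    f"#### Overall Winner: {overall_winner}\n"
    f"* Margin: {abs(overall_diff):.3f}\n"
)
print(report)

The resulting string is plain Markdown (headings and bullet lists), so it can be rendered directly by whatever Markdown output component the Space uses; that rendering step is an assumption and is not part of this diff.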