Update app.py

app.py CHANGED
@@ -7,6 +7,7 @@ from toy_dataset_eval import evaluate_toy_dataset
 from mmlu_eval_original import evaluate_mmlu_batched
 import spaces
 import pandas as pd
+import time  # Added for timing functionality
 
 # Read token and login
 hf_token = os.getenv("HF_TOKEN_READ_WRITE")
@@ -27,6 +28,7 @@ model_loaded = False
 def load_model():
     """Loads the Mistral model and tokenizer and updates the load status."""
     global tokenizer, model, model_loaded
+    start_time = time.time()  # Start timing
     try:
         if tokenizer is None:
             tokenizer = AutoTokenizer.from_pretrained(model_name, token=hf_token)
@@ -38,14 +40,15 @@ def load_model():
         )
         model.to('cuda')
         model_loaded = True
-
+        elapsed_time = time.time() - start_time  # Calculate elapsed time
+        return f"✅ Model Loaded in {elapsed_time:.2f} seconds!"
     except Exception as e:
         model_loaded = False
         return f"❌ Model Load Failed: {str(e)}"
 # ---------------------------------------------------------------------------
 # 2. Toy Evaluation
 # ---------------------------------------------------------------------------
-@spaces.GPU
+@spaces.GPU(duration=120)
 def run_toy_evaluation():
     """Runs the toy dataset evaluation."""
     if not model_loaded:
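The hunk above also switches the bare `@spaces.GPU` decorator to `@spaces.GPU(duration=120)`; on ZeroGPU Spaces the `duration` argument declares how long the decorated call is expected to hold the GPU, in seconds. A minimal sketch of the pattern, assuming the Hugging Face `spaces` package and a CUDA build of `torch` are available; `gpu_smoke_test` is an illustrative placeholder, not a function in this app:

```python
# Illustrative ZeroGPU usage, not code from this Space.
import spaces
import torch

@spaces.GPU(duration=120)  # expected upper bound, in seconds, for this GPU call
def gpu_smoke_test() -> str:
    # Work inside the decorated function runs on the GPU allocated by ZeroGPU.
    x = torch.ones(1024, 1024, device="cuda")
    return f"{torch.cuda.get_device_name(0)}: sum={x.sum().item():.0f}"
```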
@@ -54,8 +57,12 @@ def run_toy_evaluation():
     if not model_loaded:
         return "⚠️ Model not loaded. Please load the model first."
 
+    start_time = time.time()  # Start timing
     results = evaluate_toy_dataset(model, tokenizer)
-
+    elapsed_time = time.time() - start_time  # Calculate elapsed time
+
+    return f"{results}\n\nEvaluation completed in {elapsed_time:.2f} seconds.", \
+           f"<div>Time taken: {elapsed_time:.2f} seconds</div>"  # Return timing info
 
 # ---------------------------------------------------------------------------
 # 3. MMLU Evaluation call
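Because `run_toy_evaluation` now returns two values (the report text and an HTML timing snippet), its Gradio handler must point at two output components. The repeated `start_time` / `elapsed_time` bookkeeping in this and the following hunks could also be factored into a small helper; a possible refactor sketch, not part of this commit:

```python
# Optional refactor sketch (not in this commit): centralize the timing pattern.
import time
from contextlib import contextmanager

@contextmanager
def timed():
    """Context manager that records elapsed wall-clock time in a dict."""
    timing = {"elapsed": 0.0}
    start = time.time()
    try:
        yield timing
    finally:
        timing["elapsed"] = time.time() - start

# Usage sketch (evaluate_toy_dataset, model, tokenizer come from the app):
# with timed() as t:
#     results = evaluate_toy_dataset(model, tokenizer)
# report = f"{results}\n\nEvaluation completed in {t['elapsed']:.2f} seconds."
```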
@@ -77,7 +84,7 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, nu
     load_model()
 
     if not model_loaded:
-        return "⚠️ Model not loaded. Please load the model first."
+        return "⚠️ Model not loaded. Please load the model first.", None
 
     # Convert num_subjects to -1 if all_subjects is True
     if all_subjects:
@@ -87,7 +94,8 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, nu
     if all_questions:
         num_questions = -1
 
-    # Run evaluation
+    # Run evaluation with timing
+    start_time = time.time()  # Start timing
     results = evaluate_mmlu(
         model,
         tokenizer,
@@ -96,6 +104,7 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, nu
         num_shots=num_shots,
         auto_batch_size=True
     )
+    elapsed_time = time.time() - start_time  # Calculate elapsed time
 
     # Format results
     overall_acc = results["overall_accuracy"]
@@ -105,17 +114,35 @@ def run_mmlu_evaluation(all_subjects, num_subjects, num_shots, all_questions, nu
     # Create DataFrame from results table
     results_df = pd.DataFrame(results["full_accuracy_table"])
 
+    # Calculate totals for the overall row
+    total_samples = results_df['Num_samples'].sum()
+    total_correct = results_df['Num_correct'].sum()
+
+    # Create overall row
+    overall_row = pd.DataFrame({
+        'Subject': ['**Overall**'],
+        'Num_samples': [total_samples],
+        'Num_correct': [total_correct],
+        'Accuracy': [overall_acc]
+    })
+
+    # Concatenate overall row with results
+    results_df = pd.concat([overall_row, results_df], ignore_index=True)
+
+    # Verify that the overall accuracy is consistent with the total correct/total samples
+    assert abs(overall_acc - (total_correct / total_samples)) < 1e-6, \
+        "Overall accuracy calculation mismatch detected"
+
     # Format the report
     report = (
         f"### Overall Results\n"
         f"* Overall Accuracy: {overall_acc:.3f}\n"
         f"* Best Performance: {max_subject} ({max_acc:.3f})\n"
-        f"* Worst Performance: {min_subject} ({min_acc:.3f})\n"
-        f"
-        f"{results_df.to_markdown()}\n"
+        f"* Worst Performance: {min_subject} ({min_acc:.3f})\n"
+        f"* Evaluation completed in {elapsed_time:.2f} seconds\n"
     )
 
-    return report
+    return report, results_df  # Return both text report and dataframe
 
 # ---------------------------------------------------------------------------
 # 4. Gradio Interface
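The added block builds a micro-averaged '**Overall**' row (total correct over total samples) and asserts that it matches the evaluator's reported `overall_accuracy`. A stand-alone check of the same aggregation pattern, using invented numbers rather than real MMLU results:

```python
# Stand-alone check of the overall-row aggregation; all numbers are invented.
import math
import pandas as pd

results_df = pd.DataFrame({
    "Subject": ["astronomy", "law"],
    "Num_samples": [100, 50],
    "Num_correct": [80, 30],
    "Accuracy": [0.80, 0.60],
})

total_samples = results_df["Num_samples"].sum()
total_correct = results_df["Num_correct"].sum()
overall_acc = total_correct / total_samples  # micro-average: 110 / 150 ≈ 0.733

overall_row = pd.DataFrame({
    "Subject": ["**Overall**"],
    "Num_samples": [total_samples],
    "Num_correct": [total_correct],
    "Accuracy": [overall_acc],
})
results_df = pd.concat([overall_row, results_df], ignore_index=True)

# math.isclose expresses the same consistency check as the assert in the diff.
assert math.isclose(overall_acc, total_correct / total_samples)
print(results_df)
```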
@@ -186,6 +213,7 @@ with gr.Blocks() as demo:
     with gr.Row():
         eval_mmlu_button = gr.Button("Run MMLU Evaluation", variant="primary")
     results_output = gr.Markdown(label="Evaluation Results")
+    results_table = gr.DataFrame(interactive=True, label="Detailed Results (Sortable)")
 
     # Connect components
     load_button.click(fn=load_model, inputs=None, outputs=load_status)
@@ -233,7 +261,7 @@ with gr.Blocks() as demo:
             all_questions_checkbox,
             num_questions_slider
         ],
-        outputs=results_output
+        outputs=[results_output, results_table]
     )
 
-demo.launch()
+demo.launch()
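Since `run_mmlu_evaluation` now returns `(report, results_df)`, the click handler lists both the Markdown and the DataFrame components in `outputs`. A minimal self-contained sketch of this wiring, with a placeholder handler and made-up data standing in for the real evaluation:

```python
# Minimal Gradio sketch of one handler feeding a Markdown and a DataFrame output.
import gradio as gr
import pandas as pd

def fake_evaluation():
    # Placeholder results, not produced by the app's evaluators.
    df = pd.DataFrame({"Subject": ["astronomy"], "Accuracy": [0.80]})
    return "### Overall Results\n* Overall Accuracy: 0.800", df

with gr.Blocks() as demo:
    run_button = gr.Button("Run", variant="primary")
    results_output = gr.Markdown(label="Evaluation Results")
    results_table = gr.DataFrame(interactive=True, label="Detailed Results (Sortable)")
    run_button.click(fn=fake_evaluation, inputs=None, outputs=[results_output, results_table])

demo.launch()
```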