Update mmlu_pro_eval_adapted.py
mmlu_pro_eval_adapted.py  +70 -6  CHANGED
@@ -149,7 +149,7 @@ def extract_final(text):
 def batch_inference(llm, sampling_params, inference_batch):
     start = time.time()
     outputs = llm.generate(inference_batch, sampling_params)
-    logging.info(str(len(inference_batch)) + " size batch costing time: " + str(time.time() - start))
+    logging.info("Batch of size: %d. Time taken: %.2f s", len(inference_batch), time.time() - start)
     response_batch = []
     pred_batch = []
     for output in outputs:
@@ -157,9 +157,72 @@ def batch_inference(llm, sampling_params, inference_batch):
         response_batch.append(generated_text)
         pred = extract_answer(generated_text)
         pred_batch.append(pred)
-    logging.info("PRED BATCH: %s, RESPONSE BATCH: %s", pred_batch, response_batch)
     return pred_batch, response_batch

+def batch_inference_debug_mode(llm, sampling_params, inference_batch, tokenizer):
+    start = time.time()
+    outputs = llm.generate(inference_batch, sampling_params)
+    logging.info(str(len(inference_batch)) + " size batch costing time: " + str(time.time() - start))
+    response_batch = []
+    pred_batch = []
+    input_token_counts = []
+    output_token_counts = []
+
+    for i, output in enumerate(outputs):
+        generated_text = output.outputs[0].text
+        response_batch.append(generated_text)
+        pred = extract_answer(generated_text)
+        pred_batch.append(pred)
+
+        # Proper token count using tokenizer
+        input_tokens = len(tokenizer.encode(inference_batch[i]))
+        output_tokens = len(tokenizer.encode(generated_text))
+
+        input_token_counts.append(input_tokens)
+        output_token_counts.append(output_tokens)
+
+    logging.info("PRED BATCH: %s", pred_batch)
+    logging.info("RESPONSE BATCH: %s", response_batch)
+
+    # Convert to DataFrame for logging (handle cases with fewer than 40 requests)
+    num_samples = min(40, len(inference_batch))
+    summary_df = pd.DataFrame({
+        'Input': inference_batch[:num_samples],
+        'Response': response_batch[:num_samples]
+    })
+    logging.info("\nSummary of first %d requests and responses:\n%s", num_samples, summary_df.to_string())
+
+    # Total and average input/output token statistics
+    total_input_tokens = sum(input_token_counts)
+    total_output_tokens = sum(output_token_counts)
+    avg_input_tokens = total_input_tokens / len(input_token_counts)
+    avg_output_tokens = total_output_tokens / len(output_token_counts)
+
+    max_input_idx = np.argmax(input_token_counts)
+    max_output_idx = np.argmax(output_token_counts)
+    min_input_idx = np.argmin(input_token_counts)
+    min_output_idx = np.argmin(output_token_counts)
+
+    logging.info("\nTotal input tokens: %d", total_input_tokens)
+    logging.info("Total output tokens: %d", total_output_tokens)
+    logging.info("Average input tokens: %.2f", avg_input_tokens)
+    logging.info("Average output tokens: %.2f", avg_output_tokens)
+
+    logging.info("\nRequest with max input tokens: %d (Tokens: %d)\nInput: %s\nOutput: %s",
+                 max_input_idx, input_token_counts[max_input_idx], inference_batch[max_input_idx], response_batch[max_input_idx])
+
+    logging.info("\nRequest with max output tokens: %d (Tokens: %d)\nInput: %s\nOutput: %s",
+                 max_output_idx, output_token_counts[max_output_idx], inference_batch[max_output_idx], response_batch[max_output_idx])
+
+    logging.info("\nRequest with min input tokens: %d (Tokens: %d)\nInput: %s\nOutput: %s",
+                 min_input_idx, input_token_counts[min_input_idx], inference_batch[min_input_idx], response_batch[min_input_idx])
+
+    logging.info("\nRequest with min output tokens: %d (Tokens: %d)\nInput: %s\nOutput: %s",
+                 min_output_idx, output_token_counts[min_output_idx], inference_batch[min_output_idx], response_batch[min_output_idx])
+
+    return pred_batch, response_batch
+
+

 def calculate_accuracy(res):
     """
@@ -190,7 +253,7 @@ def calculate_accuracy(res):


 @torch.no_grad()
-def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5):
+def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5, debug_mode=True):
     """
     Evaluate model using chain-of-thought prompting.

@@ -231,8 +294,9 @@ def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5):

         inference_batches.append(prompt)

-    pred_batch, response_batch = batch_inference(llm, sampling_params, inference_batches)
-
+    batch_fn = batch_inference_debug_mode if debug_mode else batch_inference
+    extra_args = (tokenizer,) if debug_mode else ()
+    pred_batch, response_batch = batch_fn(llm, sampling_params, inference_batches, *extra_args)

     # Add predictions to test DataFrame
     results_df = test_df.copy()
@@ -247,7 +311,7 @@ def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5):
     return correctness, accuracy


-@spaces.GPU(duration=240)  # Extended to
+@spaces.GPU(duration=240)  # Extended to 4 minutes for larger evaluations
 def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5):
     """
     Main evaluation function for MMLU-Pro benchmark.
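The debug path added above measures prompt and completion length by re-encoding each string with the model tokenizer, then logs totals, averages, and the longest and shortest requests. The snippet below is a minimal standalone sketch of that bookkeeping, not code from the commit: it assumes a Hugging Face-style tokenizer exposing encode(), and the helper name summarize_token_stats is illustrative only.

def summarize_token_stats(prompts, responses, tokenizer):
    # Re-encode each prompt/response to count tokens, as the debug path does.
    input_counts = [len(tokenizer.encode(p)) for p in prompts]
    output_counts = [len(tokenizer.encode(r)) for r in responses]
    return {
        "total_input_tokens": sum(input_counts),
        "total_output_tokens": sum(output_counts),
        "avg_input_tokens": sum(input_counts) / len(input_counts),
        "avg_output_tokens": sum(output_counts) / len(output_counts),
        # Indices of the longest/shortest requests, mirroring the argmax/argmin logging.
        "max_input_idx": max(range(len(input_counts)), key=input_counts.__getitem__),
        "max_output_idx": max(range(len(output_counts)), key=output_counts.__getitem__),
        "min_input_idx": min(range(len(input_counts)), key=input_counts.__getitem__),
        "min_output_idx": min(range(len(output_counts)), key=output_counts.__getitem__),
    }

Keeping the statistics in one dict would allow a single logging.info("%s", summarize_token_stats(inference_batch, response_batch, tokenizer)) call; the separate per-field messages in the debug function trade that compactness for more readable log output.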
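Most of the new log statements use %-style placeholders with the values passed as extra arguments rather than pre-concatenated strings; the logging module then merges the values into the template only when a record is actually emitted, so the call stays cheap when the level is filtered out and does not break on non-string values. A short, self-contained illustration (not from the commit):

import logging
import time

logging.basicConfig(level=logging.INFO)
start = time.time()
batch = ["prompt one", "prompt two", "prompt three"]

# Values are passed as arguments and merged into the %-template lazily.
logging.info("Batch of size: %d. Time taken: %.2f s", len(batch), time.time() - start)

# Container objects (lists, DataFrames) can be logged the same way via %s.
logging.info("PRED BATCH: %s", ["A", "C", "B"])

String concatenation with str(...) also works, but it always pays the formatting cost even when the message is discarded by the configured log level.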