Update mmlu_pro_eval_adapted.py
mmlu_pro_eval_adapted.py  CHANGED  (+200 -330)
@@ -1,372 +1,242 @@
 import torch
-import
-
-
 import logging
-import
 import pandas as pd
-from tqdm import tqdm
 
-# Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-
-
-MAX_CONTEXT_WINDOW = 4096
-
-def load_dataset_from_hf(verbose=False):
-    mmlu_dataset = load_dataset("TIGER-Lab/MMLU-Pro")
-
-    if verbose:
-        for split in mmlu_dataset.keys():
-            dataset = mmlu_dataset[split]  # Access the dataset split
-
-            # Log number of rows and columns
-            num_rows = len(dataset)
-            num_cols = len(dataset.column_names)
-
-            logger.info(f"Dataset Split: {split}")
-            logger.info(f"Number of Rows: {num_rows}")
-            logger.info(f"Number of Columns: {num_cols}")
-
-            # Log column names and their types
-            column_types = {col: str(dataset.features[col].dtype) for col in dataset.column_names}
-            logger.info(f"Column Names: {dataset.column_names}")
-            logger.info(f"Column Types: {column_types}")
-
-            # Log a sample of 5 rows
-            sample_rows = dataset.select(range(min(5, num_rows)))  # Ensure we don't exceed available rows
-            logger.info("Sample Rows:")
-            for row in sample_rows:
-                logger.info(row)
-
-            logger.info("=" * 50)  # Separator for readability
-    return mmlu_dataset
-
 
- ...
-    for entry in l:
-        s += " " + entry
-    return s
 
 
-def
- ...
     return prompt
 
 
-def
-    prompt =
- ...
     return prompt
 
 
- ...
-    logger.info(f"Subject: {subject}, processing with batch_size={batch_size}")
-
-    cors = []
-    all_probs = []
-
-    if (train_shots < 0):
-        train_shots = 0  # Make positive.
-
-    # Generate the few-shot examples for this subject once
-    train_prompt = gen_prompt(dev_df, subject, train_shots)
-
-    # Process test examples in batches
-    for batch_start in range(0, test_df.shape[0], batch_size):
-        batch_end = min(batch_start + batch_size, test_df.shape[0])
-        batch_size_actual = batch_end - batch_start
-
-        # Prepare batch prompts
-        batch_prompts = []
-        batch_labels = []
-
-        for i in range(batch_start, batch_end):
-            prompt_end = format_example(test_df, i, include_answer=False)
-            prompt = train_prompt + prompt_end
-            batch_prompts.append(prompt)
-
-            label = test_df.iloc[i, 3]
-            label_letter = {0: "A", 1: "B", 2: "C", 3: "D"}[label]
-            batch_labels.append(label_letter)
-
-        # Tokenize all prompts in batch
-        tokenized_inputs = tokenizer(batch_prompts, padding=True, return_tensors="pt")
-        input_ids = tokenized_inputs.input_ids.to(model.device)
-        attention_mask = tokenized_inputs.attention_mask.to(model.device)
-
-        # Check if any example exceeds context window and adjust if needed
-        if input_ids.shape[1] > MAX_CONTEXT_WINDOW:
-            logger.warning(f"Some examples exceed max context window ({input_ids.shape[1]} > {MAX_CONTEXT_WINDOW})")
-            logger.warning(f"Reducing train_shots from {train_shots}")
-
-            # Find the lowest train_shots that fits
-            while train_shots > 0:
-                train_shots -= 1
-                train_prompt = gen_prompt(dev_df, subject, train_shots)
-
-                # Recalculate prompts with fewer shots
-                temp_prompt = train_prompt + format_example(test_df, batch_start, include_answer=False)
-                temp_tokens = tokenizer(temp_prompt, return_tensors="pt").input_ids
-
-                if temp_tokens.shape[1] <= MAX_CONTEXT_WINDOW:
-                    logger.info(f"Reduced to train_shots={train_shots}")
-
-                    # Regenerate all prompts in the batch with fewer shots
-                    batch_prompts = []
-                    for i in range(batch_start, batch_end):
-                        prompt_end = format_example(test_df, i, include_answer=False)
-                        prompt = train_prompt + prompt_end
-                        batch_prompts.append(prompt)
-
-                    # Retokenize with reduced shots
-                    tokenized_inputs = tokenizer(batch_prompts, padding=True, return_tensors="pt")
-                    input_ids = tokenized_inputs.input_ids.to(model.device)
-                    attention_mask = tokenized_inputs.attention_mask.to(model.device)
-                    break
-
-            # If we still can't fit even with 0 shots, we have to skip
-            if input_ids.shape[1] > MAX_CONTEXT_WINDOW:
-                logger.error(f"Even with 0 shots, context is too long ({input_ids.shape[1]} > {MAX_CONTEXT_WINDOW})")
-                # Process individually as fallback
-                for i in range(batch_start, batch_end):
-                    single_prompt = format_example(test_df, i, include_answer=False)
-                    single_tokens = tokenizer(single_prompt, return_tensors="pt").input_ids.to(model.device)
-                    if single_tokens.shape[1] <= MAX_CONTEXT_WINDOW:
-                        single_output = model(input_ids=single_tokens)
-                        single_logits = single_output.logits[0, -1]
-                        single_probs = get_option_probs(tokenizer, single_logits)
-                        pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(single_probs)]
-                        cors.append(pred == batch_labels[i-batch_start])
-                        all_probs.append(single_probs)
-                    else:
-                        logger.error(f"Example {i} is too long even by itself, skipping")
-                continue
-
-        # Run model on batch
-        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
-
-        # Extract predictions for each example in batch
-        for j in range(batch_size_actual):
-            # Get logits for the last token in each sequence
-            sequence_len = attention_mask[j].sum()
-            logits = outputs.logits[j, sequence_len-1]
-
-            # Calculate probabilities for A, B, C, D
-            probs = get_option_probs(tokenizer, logits)
-            pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)]
-
-            cor = pred == batch_labels[j]
-
-            # Log first example for debugging
-            if batch_start == 0 and j == 0:
-                logger.info(f"Prompt (truncated): {batch_prompts[j][:200]}...")
-                logger.info(f"Label_Letter: {batch_labels[j]}")
-                logger.info(f"Probabilities: {probs}")
-                logger.info(f"Prediction: {pred}")
-                logger.info(f"Correct: {cor}")
-
-            cors.append(cor)
-            all_probs.append(probs)
-
-    acc = np.mean(cors)
-    cors = np.array(cors)
-    all_probs = np.array(all_probs)
-
-    print("Average accuracy {:.3f} - {}".format(acc, subject))
-
-    return subject, cors, acc, all_probs
-
-
-def get_option_probs(tokenizer, logits):
-    """Helper function to extract option probabilities from logits"""
-    option_probs = torch.nn.functional.softmax(
-        torch.tensor(
-            [
-                logits[tokenizer("A").input_ids[-1]],
-                logits[tokenizer("B").input_ids[-1]],
-                logits[tokenizer("C").input_ids[-1]],
-                logits[tokenizer("D").input_ids[-1]],
-            ]
-        ).float(),
-        dim=0,
-    ).detach().cpu().numpy()
-
-    return option_probs
 
 
-def
- ...
-        tokenizer: The tokenizer to use
-        example_text: A sample text input
-        max_memory_fraction: Maximum fraction of GPU memory to use (0.8 = 80%)
-
-    Returns:
-        Estimated maximum batch size
-    """
-    import torch
-
-    # Get total GPU memory and currently allocated memory
-    total_memory = torch.cuda.get_device_properties(0).total_memory
-
-    # Keep a safe buffer to avoid OOM
-    safe_memory = int(total_memory * max_memory_fraction)
-
-    # Tokenize example to get size
-    example_tokens = tokenizer(example_text, return_tensors="pt").to(model.device)
-    example_len = example_tokens.input_ids.shape[1]
-
-    # Run a single forward pass to measure memory usage
-    torch.cuda.empty_cache()
-    torch.cuda.reset_peak_memory_stats()
-    _ = model(**example_tokens)
-    single_forward_memory = torch.cuda.max_memory_allocated()
-
-    # Calculate memory per example and estimate max batch size
-    estimated_max_batch = safe_memory // single_forward_memory
-
-    # Reduce by a factor for safety (activations, gradients, etc.)
-    safe_batch_size = max(1, int(estimated_max_batch * 0.8))
-
-    logger.info(f"Estimated max batch size: {safe_batch_size} for sequence length {example_len}")
-    logger.info(f"Memory usage: {single_forward_memory / 1e9:.2f} GB per example")
-    logger.info(f"Total memory: {total_memory / 1e9:.2f} GB, Safe memory: {safe_memory / 1e9:.2f} GB")
-
-    return safe_batch_size
 
-def evaluate_mmlu_batched(model, tokenizer, num_subjects=10, num_questions=10, num_shots=5, batch_size=8, auto_batch_size=False):
-    """
-    Evaluates the model on MMLU using batched GPU processing for faster inference.
-
-    Args:
-        model: The model to evaluate
-        tokenizer: The tokenizer to use
-        num_subjects (int): Number of subjects to evaluate. If -1, evaluates all subjects
-        num_questions (int): Number of questions per subject
-        num_shots (int): Number of few-shot examples to use
-        batch_size (int): Batch size for processing multiple examples at once
-        auto_batch_size (bool): If True, automatically determine the optimal batch size
-    """
-    model.eval()  # Ensure Dropout and BatchNorm behave appropriately for inference
 
- ...
-    if
- ...
 
 
- ...
 
 
- ...
 
- ...
     # Get all unique subjects
-    all_subjects = sorted(test_df['
 
     # Select subjects based on num_subjects parameter
     if num_subjects == -1 or num_subjects >= len(all_subjects):
- ...
     else:
         # Take the first num_subjects subjects
- ...
 
     results = {}
-
     results_table = []
-
-    for subject in tqdm(
-        test_samples = test_df[test_df['
- ...
-        logger.info(f"Subject: {subject}, Test Samples: {len(test_samples)}, Dev Samples: {len(dev_samples)}")
-
-        subject, cors, acc, probs = eval_batched(
-            subject,
-            model,
-            tokenizer,
-            dev_samples,
-            test_samples,
-            num_questions_per_subject=num_questions,
-            train_shots=num_shots,
-            batch_size=batch_size
-        )
-
         results[subject] = acc
- ...
         results_table.append({
-            'Subject': subject,
-            'Num_samples': len(test_samples),
-            'Num_correct':
             'Accuracy': acc
         })
 
- ...
 
     min_acc_subject = min(results.items(), key=lambda x: x[1])[0]
     max_acc_subject = max(results.items(), key=lambda x: x[1])[0]

+# Adapted from https://github.com/TIGER-AI-Lab/MMLU-Pro/blob/main/evaluate_from_local.py
+import csv
+import json
+import argparse
+import os
 import torch
+import random
+import transformers
+import time
+import re
+from vllm import LLM, SamplingParams
+from tqdm import tqdm
 import logging
+import sys
+from datasets import load_dataset
+import numpy as np
 import pandas as pd
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# Can be found at https://github.com/TIGER-AI-Lab/MMLU-Pro/blob/main/cot_prompt_lib/initial_prompt.txt
+initial_prompt = "The following are multiple choice questions (with answers) about {$}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
 
+choices = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P"]
+max_model_length = 4096
+max_new_tokens = 2048
 
 
+def preprocess(test_df):
+    res_df = []
+    for each in test_df:
+        options = []
+        for opt in each["options"]:
+            if opt == "N/A":
+                continue
+            options.append(opt)
+        each["options"] = options
+        res_df.append(each)
+    return res_df
+
+
+def load_mmlu_pro():
+    dataset = load_dataset("TIGER-Lab/MMLU-Pro")
+    test_df, val_df = dataset["test"], dataset["validation"]
+    test_df = preprocess(test_df)
+    val_df = preprocess(val_df)
+    return test_df, val_df
+
+
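For orientation, each record that load_mmlu_pro() yields is a plain dict; the fields this script touches later look roughly like the sketch below (values are invented for illustration, not taken from the dataset):

    example_record = {
        "question_id": 101,                     # used only for sorting
        "category": "physics",                  # MMLU-Pro subject name
        "question": "What is the SI unit of force?",
        "options": ["newton", "joule", "watt", "pascal"],
        "answer": "A",                          # letter of the correct option
        "answer_index": 0,                      # position of the correct option
        "cot_content": "A: Let's think step by step. ...",  # worked solution used in few-shot prompts
    }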
+def load_model(model_name, gpu_utilization=0.8):
+    llm = LLM(model=model_name, gpu_memory_utilization=float(gpu_utilization),
+              tensor_parallel_size=torch.cuda.device_count(),
+              max_model_len=max_model_length,
+              trust_remote_code=True)
+    logger.info(f"Torch Device CUDA Count: {torch.cuda.device_count()}")
+    sampling_params = SamplingParams(temperature=0, max_tokens=max_new_tokens,
+                                     stop=["Question:"])
+    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    return (llm, sampling_params), tokenizer
+
+
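A minimal usage sketch for the loader above; the checkpoint name is only a placeholder, and any Hugging Face model id that fits within max_model_length should behave the same way:

    model, tokenizer = load_model("meta-llama/Llama-3.1-8B-Instruct", gpu_utilization=0.8)  # placeholder model id
    llm, sampling_params = model
    # Greedy decoding (temperature=0) that stops when the next "Question:" header appears.
    outputs = llm.generate(["Question:\nWhat is 2 + 2?\nOptions:\nA. 3\nB. 4\nAnswer: Let's think step by step."],
                           sampling_params)
    print(outputs[0].outputs[0].text)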
+def format_cot_example(example, including_answer=True):
+    prompt = "Question:\n"
+    question = example["question"]
+    options = example["options"]
+    prompt += question + "\n"
+    prompt += "Options:\n"
+    for i, opt in enumerate(options):
+        prompt += "{}. {}\n".format(choices[i], opt)
+    if including_answer:
+        cot_content = example["cot_content"].replace("A: Let's think step by step.",
+                                                     "Answer: Let's think step by step.")
+        prompt += cot_content + "\n\n"
+    else:
+        prompt += "Answer: Let's think step by step."
     return prompt
 
 
+def generate_cot_prompt(val_df, curr, k):
+    prompt = initial_prompt
+    subject = curr["category"]
+    # Assert that all few-shot rows share the current question's category
+    assert all(row["category"] == subject for row in val_df), "Not all rows in val_df have the correct category"
+    val_df = val_df[:k]
+    prompt = prompt.replace("{$}", subject) + "\n"
+    for example in val_df:
+        prompt += format_cot_example(example, including_answer=True)
+    prompt += format_cot_example(curr, including_answer=False)
     return prompt
 
 
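To make the prompt layout concrete, here is a rough sketch of what generate_cot_prompt assembles; the two records are invented for illustration and carry only the fields the formatter reads:

    val_rows = [{
        "category": "math",
        "question": "What is 2 + 2?",
        "options": ["3", "4", "5"],
        "cot_content": "A: Let's think step by step. 2 + 2 = 4. The answer is (B).",
    }]
    curr = {"category": "math", "question": "What is 3 * 3?", "options": ["6", "9", "12"]}
    print(generate_cot_prompt(val_rows, curr, 1))
    # Output: the instruction header for "math", one worked Question/Options/Answer
    # example, then the current question ending with "Answer: Let's think step by step."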
+def extract_answer(text):
+    pattern = r"answer is \(?([A-J])\)?"
+    match = re.search(pattern, text)
+    if match:
+        return match.group(1)
+    else:
+        print("1st answer extract failed\n" + text)
+        return extract_again(text)
 
 
+def extract_again(text):
+    match = re.search(r'.*[aA]nswer:\s*([A-J])', text)
+    if match:
+        return match.group(1)
+    else:
+        return extract_final(text)
 
 
+def extract_final(text):
+    pattern = r"\b[A-J]\b(?!.*\b[A-J]\b)"
+    match = re.search(pattern, text, re.DOTALL)
+    if match:
+        return match.group(0)
+    else:
+        return None
 
 
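The three extractors above form a fallback chain; extract_answer prints a short failure notice before handing off. A quick sanity check on made-up model outputs (the patterns only cover A-J, which matches MMLU-Pro's maximum of ten options):

    print(extract_answer("... so the answer is (C)."))  # "C"  - primary pattern
    print(extract_answer("Final Answer: B"))            # "B"  - caught by extract_again
    print(extract_answer("I would guess D overall"))    # "D"  - extract_final keeps the last standalone letter
    print(extract_answer("no letter here"))             # None - all three stages fail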
+def batch_inference(llm, sampling_params, inference_batch):
+    start = time.time()
+    outputs = llm.generate(inference_batch, sampling_params)
+    logging.info(str(len(inference_batch)) + "-size batch costing time: " + str(time.time() - start))
+    response_batch = []
+    pred_batch = []
+    for output in outputs:
+        generated_text = output.outputs[0].text
+        response_batch.append(generated_text)
+        pred = extract_answer(generated_text)
+        pred_batch.append(pred)
+    # logging.info expects a format string, not print-style positional values
+    logging.info("PRED BATCH: %s, RESPONSE BATCH: %s", pred_batch, response_batch)
+    return pred_batch, response_batch
 
 
+def calculate_accuracy(res):
+    """
+    Calculate accuracy and return an array of correctness (1 if correct, 0 if wrong)
+    along with the overall accuracy.
+    """
+    correctness = []
+    for each in res:
+        if not each["pred"]:
+            # If prediction is None, use random choice with fixed seed
+            # This ensures reproducibility when handling missing predictions
+            random.seed(12345)
+            x = random.randint(0, len(each["options"]) - 1)
+            is_correct = 1 if x == each["answer_index"] else 0
+        else:
+            is_correct = 1 if each["pred"] == each["answer"] else 0
+        correctness.append(is_correct)
+
+    # Calculate accuracy from correctness array
+    if len(correctness) == 0:
+        return [], 0.0
 
+    accuracy = sum(correctness) / len(correctness)
+    return correctness, accuracy
+
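A small worked example of the scorer above, with invented records; a record whose pred is None falls back to a seeded random guess, so its score depends on that draw:

    demo = [
        {"pred": "B", "answer": "B", "answer_index": 1, "options": ["x", "y"]},      # scores 1
        {"pred": "A", "answer": "C", "answer_index": 2, "options": ["x", "y", "z"]}, # scores 0
        {"pred": None, "answer": "A", "answer_index": 0, "options": ["x", "y"]},     # seeded random guess
    ]
    correctness, accuracy = calculate_accuracy(demo)
    # correctness == [1, 0, g] with g in {0, 1}; accuracy == sum(correctness) / 3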
+@torch.no_grad()
+def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5):
+    llm, sampling_params = model
+    global choices
+    logging.info("evaluating " + subject)
+    inference_batches = []
+    k = num_shots
+
+    # The caller passes pandas DataFrames; convert them to lists of dicts so the
+    # per-question records can be indexed and mutated the way this loop expects.
+    if isinstance(test_df, pd.DataFrame):
+        test_df = test_df.to_dict("records")
+    if isinstance(val_df, pd.DataFrame):
+        val_df = val_df.to_dict("records")
+
+    for i in tqdm(range(len(test_df))):
+        curr = test_df[i]
+        prompt_length_ok = False
+        prompt = None
+        # Shrink the number of few-shot examples until the prompt leaves room
+        # for max_new_tokens inside the model's context window.
+        while not prompt_length_ok:
+            prompt = generate_cot_prompt(val_df, curr, k)
+            inputs = tokenizer(prompt, return_tensors="pt")
+            inputs = {key: value.cuda() for key, value in inputs.items()}
+            length = len(inputs["input_ids"][0])
+            if length < max_model_length - max_new_tokens:
+                prompt_length_ok = True
+            k -= 1
+        inference_batches.append(prompt)
+
+    pred_batch, response_batch = batch_inference(llm, sampling_params, inference_batches)
+    results = []
+    for j, curr in enumerate(test_df):
+        curr["pred"] = pred_batch[j]
+        curr["model_outputs"] = response_batch[j]
+        results.append(curr)
+
+    # Get array of correctness and overall accuracy
+    correctness, accuracy = calculate_accuracy(results)
+    logging.info("This batch accuracy is: {}, correct samples: {}/{}\n".format(
+        str(accuracy), str(sum(correctness)), str(len(correctness))))
+
+    return correctness, accuracy
+
+def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5):
+    model, tokenizer = load_model(model_name, gpu_utilization=0.8)
+
+    # vLLM runs generation in inference mode internally, so no explicit eval() call is needed.
+
+    test_df, val_df = load_mmlu_pro()
+
+    test_df = pd.DataFrame(test_df)
+    val_df = pd.DataFrame(val_df)  # Fixed: was 'val_def'
+    test_df = test_df.sort_values(['category', 'question_id'])
+    val_df = val_df.sort_values(['category', 'question_id'])  # Fixed: was 'dev_df'
 
     # Get all unique subjects
+    all_subjects = sorted(test_df['category'].unique())
+    selected_subjects = []
 
     # Select subjects based on num_subjects parameter
     if num_subjects == -1 or num_subjects >= len(all_subjects):
+        selected_subjects = all_subjects
     else:
         # Take the first num_subjects subjects
+        selected_subjects = all_subjects[:num_subjects]
+
+    logging.info("selected subjects:\n" + "\n".join(selected_subjects))
+
     results = {}
+    all_correctness = []
     results_table = []
+
+    for subject in tqdm(selected_subjects, desc="Processing Selected Categories"):
+        test_samples = test_df[test_df['category'] == subject].head(num_questions)
+        val_samples = val_df[val_df['category'] == subject].head(num_shots)
+
+        correctness, acc = eval_cot(subject, model, tokenizer, val_df=val_samples, test_df=test_samples, num_shots=num_shots)
         results[subject] = acc
+        all_correctness.extend(correctness)
         results_table.append({
+            'Subject': subject,
+            'Num_samples': len(test_samples),
+            'Num_correct': sum(correctness),
             'Accuracy': acc
         })
 
+    weighted_acc = np.mean(all_correctness)
 
     min_acc_subject = min(results.items(), key=lambda x: x[1])[0]
     max_acc_subject = max(results.items(), key=lambda x: x[1])[0]
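For reference, a minimal sketch of how the new entry point might be invoked; the model id is a placeholder rather than something this commit specifies, and a CUDA GPU is required because eval_cot moves the tokenized prompts onto the device:

    if __name__ == "__main__":
        evaluate_mmlu_pro("meta-llama/Llama-3.1-8B-Instruct",  # placeholder checkpoint
                          num_subjects=2, num_questions=5, num_shots=3)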