rohansampath committed on
Commit cbd1959 · verified · 1 Parent(s): fb92e40

Update mmlu_pro_eval_adapted.py

Files changed (1):
  1. mmlu_pro_eval_adapted.py +200 -330

mmlu_pro_eval_adapted.py CHANGED
@@ -1,372 +1,242 @@
  import torch
- import evaluate
- from datasets import load_dataset
- from transformers import AutoTokenizer, AutoModelForCausalLM
  import logging
- import numpy as np
  import pandas as pd
- from tqdm import tqdm

- # Set up logging
  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

- accuracy_metric = evaluate.load("accuracy")
- option_letters = ["A", "B", "C", "D"]
- MAX_CONTEXT_WINDOW = 4096
-
- def load_dataset_from_hf(verbose=False):
-     mmlu_dataset = load_dataset("TIGER-Lab/MMLU-Pro")
-
-     if verbose:
-         for split in mmlu_dataset.keys():
-             dataset = mmlu_dataset[split]  # Access the dataset split
-
-             # Log number of rows and columns
-             num_rows = len(dataset)
-             num_cols = len(dataset.column_names)
-
-             logger.info(f"Dataset Split: {split}")
-             logger.info(f"Number of Rows: {num_rows}")
-             logger.info(f"Number of Columns: {num_cols}")
-
-             # Log column names and their types
-             column_types = {col: str(dataset.features[col].dtype) for col in dataset.column_names}
-             logger.info(f"Column Names: {dataset.column_names}")
-             logger.info(f"Column Types: {column_types}")
-
-             # Log a sample of 5 rows
-             sample_rows = dataset.select(range(min(5, num_rows)))  # Ensure we don't exceed available rows
-             logger.info("Sample Rows:")
-             for row in sample_rows:
-                 logger.info(row)
-
-             logger.info("=" * 50)  # Separator for readability
-     return mmlu_dataset
-

- def format_subject(subject):
-     l = subject.split("_")
-     s = ""
-     for entry in l:
-         s += " " + entry
-     return s


- def format_example(df, idx, include_answer=True):
-     """
-     Format a single example for the prompt based on the actual dataset structure:
-     - Column 0: question text
-     - Column 1: subject
-     - Column 2: choices as a list of strings
-     - Column 3: answer as a numeric index (0-3)
-     """
-     # Get the question text
-     prompt = df.iloc[idx, 0]
-
-     # Get the choices from the dataframe
-     options_list = df.iloc[idx, 2]
-     assert(isinstance(options_list, list))
-
-
-     for j, option in enumerate(options_list):
-         prompt += f"\n{option_letters[j]}. {option}"
-
-     prompt += "\nAnswer:"
-     if include_answer:
-         # Convert numeric answer to letter
-         answer_num = df.iloc[idx, 3]
-         answer_letter = {0: "A", 1: "B", 2: "C", 3: "D"}[answer_num]
-         prompt += f" {answer_letter}\n\n"
-
      return prompt


- def gen_prompt(df, subject, k=-1):
-     prompt = "The following are multiple choice questions (with answers) about {}.\n\n".format(
-         format_subject(subject)
-     )
-     if k == -1:
-         k = df.shape[0]
-     for i in range(k):
-         prompt += format_example(df, i, include_answer=True)
      return prompt

- @torch.no_grad()
- def eval_batched(subject, model, tokenizer, dev_df, test_df, num_questions_per_subject=5, train_shots=5, batch_size=8):
-     """
-     Improved eval function that uses batched processing on GPU
-     """
-     assert all(dev_df['subject'] == subject), f"Not all items in dev_df match subject {subject}"
-     assert all(test_df['subject'] == subject), f"Not all items in test_df match subject {subject}"
-
-     logger.info(f"Subject: {subject}, processing with batch_size={batch_size}")
-
-     cors = []
-     all_probs = []
-
-     if (train_shots < 0):
-         train_shots = 0  # Make positive.
-
-     # Generate the few-shot examples for this subject once
-     train_prompt = gen_prompt(dev_df, subject, train_shots)
-
-     # Process test examples in batches
-     for batch_start in range(0, test_df.shape[0], batch_size):
-         batch_end = min(batch_start + batch_size, test_df.shape[0])
-         batch_size_actual = batch_end - batch_start
-
-         # Prepare batch prompts
-         batch_prompts = []
-         batch_labels = []
-
-         for i in range(batch_start, batch_end):
-             prompt_end = format_example(test_df, i, include_answer=False)
-             prompt = train_prompt + prompt_end
-             batch_prompts.append(prompt)
-
-             label = test_df.iloc[i, 3]
-             label_letter = {0: "A", 1: "B", 2: "C", 3: "D"}[label]
-             batch_labels.append(label_letter)
-
-         # Tokenize all prompts in batch
-         tokenized_inputs = tokenizer(batch_prompts, padding=True, return_tensors="pt")
-         input_ids = tokenized_inputs.input_ids.to(model.device)
-         attention_mask = tokenized_inputs.attention_mask.to(model.device)
-
-         # Check if any example exceeds context window and adjust if needed
-         if input_ids.shape[1] > MAX_CONTEXT_WINDOW:
-             logger.warning(f"Some examples exceed max context window ({input_ids.shape[1]} > {MAX_CONTEXT_WINDOW})")
-             logger.warning(f"Reducing train_shots from {train_shots}")
-
-             # Find the lowest train_shots that fits
-             while train_shots > 0:
-                 train_shots -= 1
-                 train_prompt = gen_prompt(dev_df, subject, train_shots)
-
-                 # Recalculate prompts with fewer shots
-                 temp_prompt = train_prompt + format_example(test_df, batch_start, include_answer=False)
-                 temp_tokens = tokenizer(temp_prompt, return_tensors="pt").input_ids
-
-                 if temp_tokens.shape[1] <= MAX_CONTEXT_WINDOW:
-                     logger.info(f"Reduced to train_shots={train_shots}")
-
-                     # Regenerate all prompts in the batch with fewer shots
-                     batch_prompts = []
-                     for i in range(batch_start, batch_end):
-                         prompt_end = format_example(test_df, i, include_answer=False)
-                         prompt = train_prompt + prompt_end
-                         batch_prompts.append(prompt)
-
-                     # Retokenize with reduced shots
-                     tokenized_inputs = tokenizer(batch_prompts, padding=True, return_tensors="pt")
-                     input_ids = tokenized_inputs.input_ids.to(model.device)
-                     attention_mask = tokenized_inputs.attention_mask.to(model.device)
-                     break
-
-             # If we still can't fit even with 0 shots, we have to skip
-             if input_ids.shape[1] > MAX_CONTEXT_WINDOW:
-                 logger.error(f"Even with 0 shots, context is too long ({input_ids.shape[1]} > {MAX_CONTEXT_WINDOW})")
-                 # Process individually as fallback
-                 for i in range(batch_start, batch_end):
-                     single_prompt = format_example(test_df, i, include_answer=False)
-                     single_tokens = tokenizer(single_prompt, return_tensors="pt").input_ids.to(model.device)
-                     if single_tokens.shape[1] <= MAX_CONTEXT_WINDOW:
-                         single_output = model(input_ids=single_tokens)
-                         single_logits = single_output.logits[0, -1]
-                         single_probs = get_option_probs(tokenizer, single_logits)
-                         pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(single_probs)]
-                         cors.append(pred == batch_labels[i-batch_start])
-                         all_probs.append(single_probs)
-                     else:
-                         logger.error(f"Example {i} is too long even by itself, skipping")
-                 continue
-
-         # Run model on batch
-         outputs = model(input_ids=input_ids, attention_mask=attention_mask)
-
-         # Extract predictions for each example in batch
-         for j in range(batch_size_actual):
-             # Get logits for the last token in each sequence
-             sequence_len = attention_mask[j].sum()
-             logits = outputs.logits[j, sequence_len-1]
-
-             # Calculate probabilities for A, B, C, D
-             probs = get_option_probs(tokenizer, logits)
-             pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)]
-
-             cor = pred == batch_labels[j]
-
-             # Log first example for debugging
-             if batch_start == 0 and j == 0:
-                 logger.info(f"Prompt (truncated): {batch_prompts[j][:200]}...")
-                 logger.info(f"Label_Letter: {batch_labels[j]}")
-                 logger.info(f"Probabilities: {probs}")
-                 logger.info(f"Prediction: {pred}")
-                 logger.info(f"Correct: {cor}")
-
-             cors.append(cor)
-             all_probs.append(probs)
-
-     acc = np.mean(cors)
-     cors = np.array(cors)
-     all_probs = np.array(all_probs)
-
-     print("Average accuracy {:.3f} - {}".format(acc, subject))
-
-     return subject, cors, acc, all_probs
-
-
- def get_option_probs(tokenizer, logits):
-     """Helper function to extract option probabilities from logits"""
-     option_probs = torch.nn.functional.softmax(
-         torch.tensor(
-             [
-                 logits[tokenizer("A").input_ids[-1]],
-                 logits[tokenizer("B").input_ids[-1]],
-                 logits[tokenizer("C").input_ids[-1]],
-                 logits[tokenizer("D").input_ids[-1]],
-             ]
-         ).float(),
-         dim=0,
-     ).detach().cpu().numpy()
-
-     return option_probs


- def get_max_batch_size(model, tokenizer, example_text, max_memory_fraction=0.8):
-     """
-     Estimate the maximum possible batch size based on available GPU memory
-
-     Args:
-         model: The model to evaluate
-         tokenizer: The tokenizer to use
-         example_text: A sample text input
-         max_memory_fraction: Maximum fraction of GPU memory to use (0.8 = 80%)
-
-     Returns:
-         Estimated maximum batch size
-     """
-     import torch
-
-     # Get total GPU memory and currently allocated memory
-     total_memory = torch.cuda.get_device_properties(0).total_memory
-
-     # Keep a safe buffer to avoid OOM
-     safe_memory = int(total_memory * max_memory_fraction)
-
-     # Tokenize example to get size
-     example_tokens = tokenizer(example_text, return_tensors="pt").to(model.device)
-     example_len = example_tokens.input_ids.shape[1]
-
-     # Run a single forward pass to measure memory usage
-     torch.cuda.empty_cache()
-     torch.cuda.reset_peak_memory_stats()
-     _ = model(**example_tokens)
-     single_forward_memory = torch.cuda.max_memory_allocated()
-
-     # Calculate memory per example and estimate max batch size
-     estimated_max_batch = safe_memory // single_forward_memory
-
-     # Reduce by a factor for safety (activations, gradients, etc.)
-     safe_batch_size = max(1, int(estimated_max_batch * 0.8))
-
-     logger.info(f"Estimated max batch size: {safe_batch_size} for sequence length {example_len}")
-     logger.info(f"Memory usage: {single_forward_memory / 1e9:.2f} GB per example")
-     logger.info(f"Total memory: {total_memory / 1e9:.2f} GB, Safe memory: {safe_memory / 1e9:.2f} GB")
-
-     return safe_batch_size

- def evaluate_mmlu_batched(model, tokenizer, num_subjects=10, num_questions=10, num_shots=5, batch_size=8, auto_batch_size=False):
-     """
-     Evaluates the model on MMLU using batched GPU processing for faster inference.
-
-     Args:
-         model: The model to evaluate
-         tokenizer: The tokenizer to use
-         num_subjects (int): Number of subjects to evaluate. If -1, evaluates all subjects
-         num_questions (int): Number of questions per subject
-         num_shots (int): Number of few-shot examples to use
-         batch_size (int): Batch size for processing multiple examples at once
-         auto_batch_size (bool): If True, automatically determine the optimal batch size
-     """
-     model.eval()  # Ensure Dropout and BatchNorm behave appropriately for inference

-     if tokenizer.pad_token is None:
-         logger.info("NO TOKENIZER PAD TOKEN")
-         tokenizer.pad_token = tokenizer.eos_token
-     if model.config.pad_token_id is None:
-         logger.info("NO PAD TOKEN ID")
-         model.config.pad_token_id = tokenizer.pad_token_id


-     dataset = load_dataset_from_hf(verbose=True)
-     test_df = pd.DataFrame(dataset['test'])
-     dev_df = pd.DataFrame(dataset['dev'])
-     test_df = test_df.sort_values(['subject', 'question'])
-     dev_df = dev_df.sort_values(['subject', 'question'])


-     # If auto_batch_size is enabled, estimate the optimal batch size
-     if auto_batch_size:
-         # Get a sample prompt
-         subject = test_df['subject'].iloc[0]
-         test_sample = test_df[test_df['subject'] == subject].head(1)
-         dev_sample = dev_df[dev_df['subject'] == subject].head(num_shots)
-
-         # Generate a sample prompt
-         train_prompt = gen_prompt(dev_sample, subject, num_shots)
-         sample_prompt = train_prompt + format_example(test_sample, 0, include_answer=False)

-         # Estimate the max batch size
-         batch_size = get_max_batch_size(model, tokenizer, sample_prompt)
-         logger.info(f"Auto-adjusted batch size: {batch_size}")

-
      # Get all unique subjects
-     all_subjects = sorted(test_df['subject'].unique())

      # Select subjects based on num_subjects parameter
      if num_subjects == -1 or num_subjects >= len(all_subjects):
-         subjects = all_subjects
      else:
          # Take the first num_subjects subjects
-         subjects = all_subjects[:num_subjects]
-

      results = {}
-     all_cors = []
      results_table = []
-
-     for subject in tqdm(subjects, desc="Processing subjects"):
-         test_samples = test_df[test_df['subject'] == subject].head(num_questions)
-         dev_samples = dev_df[dev_df['subject'] == subject].head(num_shots)
-
-         # Log subject and sample counts
-         logger.info(f"Subject: {subject}, Test Samples: {len(test_samples)}, Dev Samples: {len(dev_samples)}")
-
-         subject, cors, acc, probs = eval_batched(
-             subject,
-             model,
-             tokenizer,
-             dev_samples,
-             test_samples,
-             num_questions_per_subject=num_questions,
-             train_shots=num_shots,
-             batch_size=batch_size
-         )
-
          results[subject] = acc
-         all_cors.append(cors)
-
          results_table.append({
-             'Subject': subject,
-             'Num_samples': len(test_samples),
-             'Num_correct': int(np.sum(cors)),
              'Accuracy': acc
          })

-     weighted_acc = np.mean(np.concatenate(all_cors))

      min_acc_subject = min(results.items(), key=lambda x: x[1])[0]
      max_acc_subject = max(results.items(), key=lambda x: x[1])[0]

+ # Adapted from https://github.com/TIGER-AI-Lab/MMLU-Pro/blob/main/evaluate_from_local.py
+ import csv
+ import json
+ import argparse
+ import os
  import torch
+ import random
+ import transformers
+ import time
+ import re
+ from vllm import LLM, SamplingParams
+ from tqdm import tqdm
  import logging
+ import sys
+ from datasets import load_dataset
  import pandas as pd

  logging.basicConfig(level=logging.INFO)
  logger = logging.getLogger(__name__)

+ # Can be found at https://github.com/TIGER-AI-Lab/MMLU-Pro/blob/main/cot_prompt_lib/initial_prompt.txt
+ initial_prompt = "The following are multiple choice questions (with answers) about {$}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."

+ choices = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P"]
+ max_model_length = 4096
+ max_new_tokens = 2048


+ def preprocess(test_df):
+     res_df = []
+     for each in test_df:
+         options = []
+         for opt in each["options"]:
+             if opt == "N/A":
+                 continue
+             options.append(opt)
+         each["options"] = options
+         res_df.append(each)
+     return res_df
+
+
+ def load_mmlu_pro():
+     dataset = load_dataset("TIGER-Lab/MMLU-Pro")
+     test_df, val_df = dataset["test"], dataset["validation"]
+     test_df = preprocess(test_df)
+     val_df = preprocess(val_df)
+     return test_df, val_df
+
+
+ def load_model(model_name, gpu_utilization=0.8):
+     llm = LLM(model=model_name, gpu_memory_utilization=float(gpu_utilization),
+               tensor_parallel_size=torch.cuda.device_count(),
+               max_model_len=max_model_length,
+               trust_remote_code=True)
+     logger.info(f"Torch Device CUDA Count: {torch.cuda.device_count()}")
+     sampling_params = SamplingParams(temperature=0, max_tokens=max_new_tokens,
+                                      stop=["Question:"])
+     tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+     return (llm, sampling_params), tokenizer
+
+
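For reference, a minimal sketch of how the handles returned by `load_model` are meant to be used, assuming this file is importable as `mmlu_pro_eval_adapted`; the model name is a placeholder, not something this commit prescribes:

```python
from mmlu_pro_eval_adapted import load_model

# Greedy decoding (temperature=0) that stops when the model starts a new "Question:" block.
(llm, sampling_params), tokenizer = load_model("meta-llama/Meta-Llama-3-8B-Instruct")

prompt = "Question:\nWhat is 2 + 2?\nOptions:\nA. 3\nB. 4\nAnswer: Let's think step by step."
outputs = llm.generate([prompt], sampling_params)
print(outputs[0].outputs[0].text)
```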
+ def format_cot_example(example, including_answer=True):
+     prompt = "Question:\n"
+     question = example["question"]
+     options = example["options"]
+     prompt += question + "\n"
+     prompt += "Options:\n"
+     for i, opt in enumerate(options):
+         prompt += "{}. {}\n".format(choices[i], opt)
+     if including_answer:
+         cot_content = example["cot_content"].replace("A: Let's think step by step.",
+                                                      "Answer: Let's think step by step.")
+         prompt += cot_content + "\n\n"
+     else:
+         prompt += "Answer: Let's think step by step."
      return prompt


+ def generate_cot_prompt(val_df, curr, k):
+     prompt = initial_prompt
+     subject = curr["category"]
+     # Assert that all rows in val_df have 'category' equal to subject
+     assert all(example["category"] == subject for example in val_df), "Not all rows in val_df have the correct category"
+     val_df = val_df[: k]
+     prompt = prompt.replace("{$}", subject) + "\n"
+     for example in val_df:
+         prompt += format_cot_example(example, including_answer=True)
+     prompt += format_cot_example(curr, including_answer=False)
      return prompt


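To make the prompt layout concrete, here is a toy illustration of what `format_cot_example` produces; the example dict is fabricated and much shorter than a real MMLU-Pro record, and it assumes the functions above are in scope:

```python
example = {
    "question": "What is 2 + 2?",
    "options": ["3", "4", "5", "22"],
    "cot_content": "A: Let's think step by step. 2 + 2 = 4. The answer is (B).",
}

print(format_cot_example(example, including_answer=True))
# Question:
# What is 2 + 2?
# Options:
# A. 3
# B. 4
# C. 5
# D. 22
# Answer: Let's think step by step. 2 + 2 = 4. The answer is (B).
```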
+ def extract_answer(text):
+     pattern = r"answer is \(?([A-J])\)?"
+     match = re.search(pattern, text)
+     if match:
+         return match.group(1)
+     else:
+         print("1st answer extract failed\n" + text)
+         return extract_again(text)


+ def extract_again(text):
+     match = re.search(r'.*[aA]nswer:\s*([A-J])', text)
+     if match:
+         return match.group(1)
+     else:
+         return extract_final(text)


+ def extract_final(text):
+     pattern = r"\b[A-J]\b(?!.*\b[A-J]\b)"
+     match = re.search(pattern, text, re.DOTALL)
+     if match:
+         return match.group(0)
+     else:
+         return None


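A quick, illustrative check of the three-tier extraction fallback on fabricated model outputs, assuming the three functions above are in scope:

```python
print(extract_answer("... so the answer is (C)."))    # matched by extract_answer -> "C"
print(extract_answer("Final Answer: B"))              # falls through to extract_again -> "B"
print(extract_answer("I would pick option D here."))  # falls back to extract_final -> "D"
print(extract_answer("no uppercase letter choices"))  # nothing matches -> None
```

Note that the fallback cases also print the "1st answer extract failed" diagnostic from `extract_answer` before a later tier returns.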
+ def batch_inference(llm, sampling_params, inference_batch):
+     start = time.time()
+     outputs = llm.generate(inference_batch, sampling_params)
+     logging.info(str(len(inference_batch)) + "size batch costing time: " + str(time.time() - start))
+     response_batch = []
+     pred_batch = []
+     for output in outputs:
+         generated_text = output.outputs[0].text
+         response_batch.append(generated_text)
+         pred = extract_answer(generated_text)
+         pred_batch.append(pred)
+     logging.info("PRED BATCH: %s RESPONSE BATCH: %s", pred_batch, response_batch)
+     return pred_batch, response_batch


+ def calculate_accuracy(res):
+     """
+     Calculate accuracy and return an array of correctness (1 if correct, 0 if wrong)
+     along with the overall accuracy.
+     """
+     correctness = []
+     for each in res:
+         if not each["pred"]:
+             # If prediction is None, use random choice with fixed seed
+             # This ensures reproducibility when handling missing predictions
+             random.seed(12345)
+             x = random.randint(0, len(each["options"]) - 1)
+             is_correct = 1 if x == each["answer_index"] else 0
+         else:
+             is_correct = 1 if each["pred"] == each["answer"] else 0
+         correctness.append(is_correct)
+
+     # Calculate accuracy from correctness array
+     if len(correctness) == 0:
+         return [], 0.0

+     accuracy = sum(correctness) / len(correctness)
+     return correctness, accuracy
+
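As a small worked illustration of `calculate_accuracy` (fabricated records, assuming the function above is in scope): a parsed prediction is compared against `answer`, while a missing prediction falls back to a seeded random guess over the option indices:

```python
records = [
    {"pred": "B", "answer": "B", "answer_index": 1, "options": ["3", "4", "5", "22"]},   # scored 1
    {"pred": "A", "answer": "B", "answer_index": 1, "options": ["3", "4", "5", "22"]},   # scored 0
    {"pred": None, "answer": "B", "answer_index": 1, "options": ["3", "4", "5", "22"]},  # seeded random fallback
]

correctness, accuracy = calculate_accuracy(records)
print(correctness, accuracy)  # first two entries are [1, 0]; the third depends on the seeded draw
```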
+ @torch.no_grad()
+ def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5):
+     llm, sampling_params = model
+     global choices
+     logging.info("evaluating " + subject)
+     inference_batches = []
+     k = num_shots
+
+     for i in tqdm(range(len(test_df))):
+         curr = test_df[i]
+         prompt_length_ok = False
+         prompt = None
+         while not prompt_length_ok:
+             prompt = generate_cot_prompt(val_df, curr, k)
+             inputs = tokenizer(prompt, return_tensors="pt")
+             inputs = {key: value.cuda() for key, value in inputs.items()}
+             length = len(inputs["input_ids"][0])
+             if length < max_model_length - max_new_tokens:
+                 prompt_length_ok = True
+             k -= 1
+         inference_batches.append(prompt)
+
+     pred_batch, response_batch = batch_inference(llm, sampling_params, inference_batches)
+     results = []
+     for j, curr in enumerate(test_df):
+         curr["pred"] = pred_batch[j]
+         curr["model_outputs"] = response_batch[j]
+         results.append(curr)
+
+     # Get array of correctness and overall accuracy
+     correctness, accuracy = calculate_accuracy(results)
+     logging.info("This batch accuracy is: {}, correct samples: {}/{}\n".format(
+         str(accuracy), str(sum(correctness)), str(len(correctness))))
+
+     return correctness, accuracy
+
+ def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5):
+     model, tokenizer = load_model(model_name, gpu_utilization=0.8)
+
+     # Note: vLLM's LLM engine already runs generation in inference mode,
+     # so no explicit eval() call on the underlying model is needed here.
+
+     test_df, val_df = load_mmlu_pro()
+
+     test_df = pd.DataFrame(test_df)
+     val_df = pd.DataFrame(val_df)  # Fixed: was 'val_def'
+     test_df = test_df.sort_values(['category', 'question_id'])
+     val_df = val_df.sort_values(['category', 'question_id'])  # Fixed: was 'dev_df'

      # Get all unique subjects
+     all_subjects = sorted(test_df['category'].unique())
+     selected_subjects = []

      # Select subjects based on num_subjects parameter
      if num_subjects == -1 or num_subjects >= len(all_subjects):
+         selected_subjects = all_subjects
      else:
          # Take the first num_subjects subjects
+         selected_subjects = all_subjects[:num_subjects]
+
+     logging.info("selected subjects:\n" + "\n".join(selected_subjects))
+
      results = {}
+     all_correctness = []
      results_table = []
+
+     for subject in tqdm(selected_subjects, desc="Processing Selected Categories"):
+         # Convert the per-subject slices back to lists of dicts, which is what
+         # eval_cot and generate_cot_prompt index into row by row
+         test_samples = test_df[test_df['category'] == subject].head(num_questions).to_dict('records')
+         val_samples = val_df[val_df['category'] == subject].head(num_shots).to_dict('records')
+
+         correctness, acc = eval_cot(subject, model, tokenizer, val_df=val_samples, test_df=test_samples, num_shots=num_shots)
          results[subject] = acc
+         all_correctness.extend(correctness)
          results_table.append({
+             'Subject': subject,
+             'Num_samples': len(test_samples),
+             'Num_correct': sum(correctness),
              'Accuracy': acc
          })

+     import numpy as np  # Added: missing import
+     weighted_acc = np.mean(all_correctness)

      min_acc_subject = min(results.items(), key=lambda x: x[1])[0]
      max_acc_subject = max(results.items(), key=lambda x: x[1])[0]
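For completeness, a hypothetical driver for the new entry point, assuming this file is importable as `mmlu_pro_eval_adapted` and that the (truncated) tail of `evaluate_mmlu_pro` keeps the signature shown above; the model name and slice sizes are placeholders:

```python
from mmlu_pro_eval_adapted import evaluate_mmlu_pro

if __name__ == "__main__":
    # Small smoke-test slice: 2 categories, 10 questions each, 5-shot CoT prompts
    evaluate_mmlu_pro(
        model_name="meta-llama/Meta-Llama-3-8B-Instruct",
        num_subjects=2,
        num_questions=10,
        num_shots=5,
    )
```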