Update mmlu_pro_eval_adapted.py
mmlu_pro_eval_adapted.py  CHANGED  (+200 -330)
@@ -1,372 +1,242 @@
 import torch
-import
-
-
 import logging
-import
 import pandas as pd
-from tqdm import tqdm
 
-# Set up logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
-
-
-MAX_CONTEXT_WINDOW = 4096
-
-def load_dataset_from_hf(verbose=False):
-    mmlu_dataset = load_dataset("TIGER-Lab/MMLU-Pro")
-
-    if verbose:
-        for split in mmlu_dataset.keys():
-            dataset = mmlu_dataset[split]  # Access the dataset split
-
-            # Log number of rows and columns
-            num_rows = len(dataset)
-            num_cols = len(dataset.column_names)
-
-            logger.info(f"Dataset Split: {split}")
-            logger.info(f"Number of Rows: {num_rows}")
-            logger.info(f"Number of Columns: {num_cols}")
-
-            # Log column names and their types
-            column_types = {col: str(dataset.features[col].dtype) for col in dataset.column_names}
-            logger.info(f"Column Names: {dataset.column_names}")
-            logger.info(f"Column Types: {column_types}")
-
-            # Log a sample of 5 rows
-            sample_rows = dataset.select(range(min(5, num_rows)))  # Ensure we don't exceed available rows
-            logger.info("Sample Rows:")
-            for row in sample_rows:
-                logger.info(row)
-
-            logger.info("=" * 50)  # Separator for readability
-    return mmlu_dataset
-
 
- ...
-    for entry in l:
-        s += " " + entry
-    return s
 
 
-def
- ...
     return prompt
 
 
-def
-    prompt =
- ...
     return prompt
 
 
- ...
-    logger.info(f"Subject: {subject}, processing with batch_size={batch_size}")
-
-    cors = []
-    all_probs = []
-
-    if (train_shots < 0):
-        train_shots = 0  # Make positive.
-
-    # Generate the few-shot examples for this subject once
-    train_prompt = gen_prompt(dev_df, subject, train_shots)
-
-    # Process test examples in batches
-    for batch_start in range(0, test_df.shape[0], batch_size):
-        batch_end = min(batch_start + batch_size, test_df.shape[0])
-        batch_size_actual = batch_end - batch_start
-
-        # Prepare batch prompts
-        batch_prompts = []
-        batch_labels = []
-
-        for i in range(batch_start, batch_end):
-            prompt_end = format_example(test_df, i, include_answer=False)
-            prompt = train_prompt + prompt_end
-            batch_prompts.append(prompt)
-
-            label = test_df.iloc[i, 3]
-            label_letter = {0: "A", 1: "B", 2: "C", 3: "D"}[label]
-            batch_labels.append(label_letter)
-
-        # Tokenize all prompts in batch
-        tokenized_inputs = tokenizer(batch_prompts, padding=True, return_tensors="pt")
-        input_ids = tokenized_inputs.input_ids.to(model.device)
-        attention_mask = tokenized_inputs.attention_mask.to(model.device)
-
-        # Check if any example exceeds context window and adjust if needed
-        if input_ids.shape[1] > MAX_CONTEXT_WINDOW:
-            logger.warning(f"Some examples exceed max context window ({input_ids.shape[1]} > {MAX_CONTEXT_WINDOW})")
-            logger.warning(f"Reducing train_shots from {train_shots}")
-
-            # Find the lowest train_shots that fits
-            while train_shots > 0:
-                train_shots -= 1
-                train_prompt = gen_prompt(dev_df, subject, train_shots)
-
-                # Recalculate prompts with fewer shots
-                temp_prompt = train_prompt + format_example(test_df, batch_start, include_answer=False)
-                temp_tokens = tokenizer(temp_prompt, return_tensors="pt").input_ids
-
-                if temp_tokens.shape[1] <= MAX_CONTEXT_WINDOW:
-                    logger.info(f"Reduced to train_shots={train_shots}")
-
-                    # Regenerate all prompts in the batch with fewer shots
-                    batch_prompts = []
-                    for i in range(batch_start, batch_end):
-                        prompt_end = format_example(test_df, i, include_answer=False)
-                        prompt = train_prompt + prompt_end
-                        batch_prompts.append(prompt)
-
-                    # Retokenize with reduced shots
-                    tokenized_inputs = tokenizer(batch_prompts, padding=True, return_tensors="pt")
-                    input_ids = tokenized_inputs.input_ids.to(model.device)
-                    attention_mask = tokenized_inputs.attention_mask.to(model.device)
-                    break
-
-            # If we still can't fit even with 0 shots, we have to skip
-            if input_ids.shape[1] > MAX_CONTEXT_WINDOW:
-                logger.error(f"Even with 0 shots, context is too long ({input_ids.shape[1]} > {MAX_CONTEXT_WINDOW})")
-                # Process individually as fallback
-                for i in range(batch_start, batch_end):
-                    single_prompt = format_example(test_df, i, include_answer=False)
-                    single_tokens = tokenizer(single_prompt, return_tensors="pt").input_ids.to(model.device)
-                    if single_tokens.shape[1] <= MAX_CONTEXT_WINDOW:
-                        single_output = model(input_ids=single_tokens)
-                        single_logits = single_output.logits[0, -1]
-                        single_probs = get_option_probs(tokenizer, single_logits)
-                        pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(single_probs)]
-                        cors.append(pred == batch_labels[i-batch_start])
-                        all_probs.append(single_probs)
-                    else:
-                        logger.error(f"Example {i} is too long even by itself, skipping")
-                continue
-
-        # Run model on batch
-        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
-
-        # Extract predictions for each example in batch
-        for j in range(batch_size_actual):
-            # Get logits for the last token in each sequence
-            sequence_len = attention_mask[j].sum()
-            logits = outputs.logits[j, sequence_len-1]
-
-            # Calculate probabilities for A, B, C, D
-            probs = get_option_probs(tokenizer, logits)
-            pred = {0: "A", 1: "B", 2: "C", 3: "D"}[np.argmax(probs)]
-
-            cor = pred == batch_labels[j]
-
-            # Log first example for debugging
-            if batch_start == 0 and j == 0:
-                logger.info(f"Prompt (truncated): {batch_prompts[j][:200]}...")
-                logger.info(f"Label_Letter: {batch_labels[j]}")
-                logger.info(f"Probabilities: {probs}")
-                logger.info(f"Prediction: {pred}")
-                logger.info(f"Correct: {cor}")
-
-            cors.append(cor)
-            all_probs.append(probs)
-
-    acc = np.mean(cors)
-    cors = np.array(cors)
-    all_probs = np.array(all_probs)
-
-    print("Average accuracy {:.3f} - {}".format(acc, subject))
-
-    return subject, cors, acc, all_probs
-
-
-def get_option_probs(tokenizer, logits):
-    """Helper function to extract option probabilities from logits"""
-    option_probs = torch.nn.functional.softmax(
-        torch.tensor(
-            [
-                logits[tokenizer("A").input_ids[-1]],
-                logits[tokenizer("B").input_ids[-1]],
-                logits[tokenizer("C").input_ids[-1]],
-                logits[tokenizer("D").input_ids[-1]],
-            ]
-        ).float(),
-        dim=0,
-    ).detach().cpu().numpy()
-
-    return option_probs
 
 
-def
- ...
-        tokenizer: The tokenizer to use
-        example_text: A sample text input
-        max_memory_fraction: Maximum fraction of GPU memory to use (0.8 = 80%)
-
-    Returns:
-        Estimated maximum batch size
-    """
-    import torch
-
-    # Get total GPU memory and currently allocated memory
-    total_memory = torch.cuda.get_device_properties(0).total_memory
-
-    # Keep a safe buffer to avoid OOM
-    safe_memory = int(total_memory * max_memory_fraction)
-
-    # Tokenize example to get size
-    example_tokens = tokenizer(example_text, return_tensors="pt").to(model.device)
-    example_len = example_tokens.input_ids.shape[1]
-
-    # Run a single forward pass to measure memory usage
-    torch.cuda.empty_cache()
-    torch.cuda.reset_peak_memory_stats()
-    _ = model(**example_tokens)
-    single_forward_memory = torch.cuda.max_memory_allocated()
-
-    # Calculate memory per example and estimate max batch size
-    estimated_max_batch = safe_memory // single_forward_memory
-
-    # Reduce by a factor for safety (activations, gradients, etc.)
-    safe_batch_size = max(1, int(estimated_max_batch * 0.8))
-
-    logger.info(f"Estimated max batch size: {safe_batch_size} for sequence length {example_len}")
-    logger.info(f"Memory usage: {single_forward_memory / 1e9:.2f} GB per example")
-    logger.info(f"Total memory: {total_memory / 1e9:.2f} GB, Safe memory: {safe_memory / 1e9:.2f} GB")
-
-    return safe_batch_size
 
-def evaluate_mmlu_batched(model, tokenizer, num_subjects=10, num_questions=10, num_shots=5, batch_size=8, auto_batch_size=False):
-    """
-    Evaluates the model on MMLU using batched GPU processing for faster inference.
-
-    Args:
-        model: The model to evaluate
-        tokenizer: The tokenizer to use
-        num_subjects (int): Number of subjects to evaluate. If -1, evaluates all subjects
-        num_questions (int): Number of questions per subject
-        num_shots (int): Number of few-shot examples to use
-        batch_size (int): Batch size for processing multiple examples at once
-        auto_batch_size (bool): If True, automatically determine the optimal batch size
-    """
-    model.eval()  # Ensure Dropout and BatchNorm behave appropriately for inference
 
- ...
-    if
- ...
 
 
- ...
 
 
- ...
 
- ...
     # Get all unique subjects
-    all_subjects = sorted(test_df['
 
     # Select subjects based on num_subjects parameter
     if num_subjects == -1 or num_subjects >= len(all_subjects):
- ...
     else:
         # Take the first num_subjects subjects
- ...
 
     results = {}
-
     results_table = []
-
-    for subject in tqdm(
-        test_samples = test_df[test_df['
- ...
-        logger.info(f"Subject: {subject}, Test Samples: {len(test_samples)}, Dev Samples: {len(dev_samples)}")
-
-        subject, cors, acc, probs = eval_batched(
-            subject,
-            model,
-            tokenizer,
-            dev_samples,
-            test_samples,
-            num_questions_per_subject=num_questions,
-            train_shots=num_shots,
-            batch_size=batch_size
-        )
-
         results[subject] = acc
- ...
         results_table.append({
-            'Subject': subject,
-            'Num_samples': len(test_samples),
-            'Num_correct':
             'Accuracy': acc
         })
 
- ...
 
     min_acc_subject = min(results.items(), key=lambda x: x[1])[0]
     max_acc_subject = max(results.items(), key=lambda x: x[1])[0]

+# Adapted from https://github.com/TIGER-AI-Lab/MMLU-Pro/blob/main/evaluate_from_local.py
+import csv
+import json
+import argparse
+import os
 import torch
+import random
+import transformers
+import time
+import re
+from vllm import LLM, SamplingParams
+from tqdm import tqdm
 import logging
+import sys
+from datasets import load_dataset
+import numpy as np
 import pandas as pd
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
 
+# Can be found at https://github.com/TIGER-AI-Lab/MMLU-Pro/blob/main/cot_prompt_lib/initial_prompt.txt
+initial_prompt = "The following are multiple choice questions (with answers) about {$}. Think step by step and then finish your answer with \"the answer is (X)\" where X is the correct letter choice."
 
+choices = ["A", "B", "C", "D", "E", "F", "G", "H", "I", "J", "K", "L", "M", "N", "O", "P"]
+max_model_length = 4096
+max_new_tokens = 2048
 
 
+def preprocess(test_df):
+    res_df = []
+    for each in test_df:
+        options = []
+        for opt in each["options"]:
+            if opt == "N/A":
+                continue
+            options.append(opt)
+        each["options"] = options
+        res_df.append(each)
+    return res_df
+
+
+def load_mmlu_pro():
+    dataset = load_dataset("TIGER-Lab/MMLU-Pro")
+    test_df, val_df = dataset["test"], dataset["validation"]
+    test_df = preprocess(test_df)
+    val_df = preprocess(val_df)
+    return test_df, val_df
+
+
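For orientation, each record that load_mmlu_pro() yields is a plain dict; the fields this script touches later look roughly like the sketch below (values are invented for illustration, not taken from the dataset):

    example_record = {
        "question_id": 101,                     # used only for sorting
        "category": "physics",                  # MMLU-Pro subject name
        "question": "What is the SI unit of force?",
        "options": ["newton", "joule", "watt", "pascal"],
        "answer": "A",                          # letter of the correct option
        "answer_index": 0,                      # position of the correct option
        "cot_content": "A: Let's think step by step. ...",  # worked solution used in few-shot prompts
    }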
+def load_model(model_name, gpu_utilization=0.8):
+    llm = LLM(model=model_name, gpu_memory_utilization=float(gpu_utilization),
+              tensor_parallel_size=torch.cuda.device_count(),
+              max_model_len=max_model_length,
+              trust_remote_code=True)
+    logger.info(f"Torch Device CUDA Count: {torch.cuda.device_count()}")
+    sampling_params = SamplingParams(temperature=0, max_tokens=max_new_tokens,
+                                     stop=["Question:"])
+    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+    return (llm, sampling_params), tokenizer
+
+
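A minimal usage sketch for the loader above; the checkpoint name is only a placeholder, and any Hugging Face model id that fits within max_model_length should behave the same way:

    model, tokenizer = load_model("meta-llama/Llama-3.1-8B-Instruct", gpu_utilization=0.8)  # placeholder model id
    llm, sampling_params = model
    # Greedy decoding (temperature=0) that stops when the next "Question:" header appears.
    outputs = llm.generate(["Question:\nWhat is 2 + 2?\nOptions:\nA. 3\nB. 4\nAnswer: Let's think step by step."],
                           sampling_params)
    print(outputs[0].outputs[0].text)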
+def format_cot_example(example, including_answer=True):
+    prompt = "Question:\n"
+    question = example["question"]
+    options = example["options"]
+    prompt += question + "\n"
+    prompt += "Options:\n"
+    for i, opt in enumerate(options):
+        prompt += "{}. {}\n".format(choices[i], opt)
+    if including_answer:
+        cot_content = example["cot_content"].replace("A: Let's think step by step.",
+                                                     "Answer: Let's think step by step.")
+        prompt += cot_content + "\n\n"
+    else:
+        prompt += "Answer: Let's think step by step."
     return prompt
 
 
+def generate_cot_prompt(val_df, curr, k):
+    prompt = initial_prompt
+    subject = curr["category"]
+    # Assert that all few-shot rows share the current question's category
+    assert all(row["category"] == subject for row in val_df), "Not all rows in val_df have the correct category"
+    val_df = val_df[:k]
+    prompt = prompt.replace("{$}", subject) + "\n"
+    for example in val_df:
+        prompt += format_cot_example(example, including_answer=True)
+    prompt += format_cot_example(curr, including_answer=False)
     return prompt
 
 
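To make the prompt layout concrete, here is a rough sketch of what generate_cot_prompt assembles; the two records are invented for illustration and carry only the fields the formatter reads:

    val_rows = [{
        "category": "math",
        "question": "What is 2 + 2?",
        "options": ["3", "4", "5"],
        "cot_content": "A: Let's think step by step. 2 + 2 = 4. The answer is (B).",
    }]
    curr = {"category": "math", "question": "What is 3 * 3?", "options": ["6", "9", "12"]}
    print(generate_cot_prompt(val_rows, curr, 1))
    # Output: the instruction header for "math", one worked Question/Options/Answer
    # example, then the current question ending with "Answer: Let's think step by step."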
+def extract_answer(text):
+    pattern = r"answer is \(?([A-J])\)?"
+    match = re.search(pattern, text)
+    if match:
+        return match.group(1)
+    else:
+        print("1st answer extract failed\n" + text)
+        return extract_again(text)
 
 
+def extract_again(text):
+    match = re.search(r'.*[aA]nswer:\s*([A-J])', text)
+    if match:
+        return match.group(1)
+    else:
+        return extract_final(text)
 
 
+def extract_final(text):
+    pattern = r"\b[A-J]\b(?!.*\b[A-J]\b)"
+    match = re.search(pattern, text, re.DOTALL)
+    if match:
+        return match.group(0)
+    else:
+        return None
 
 
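The three extractors above form a fallback chain; extract_answer prints a short failure notice before handing off. A quick sanity check on made-up model outputs (the patterns only cover A-J, which matches MMLU-Pro's maximum of ten options):

    print(extract_answer("... so the answer is (C)."))  # "C"  - primary pattern
    print(extract_answer("Final Answer: B"))            # "B"  - caught by extract_again
    print(extract_answer("I would guess D overall"))    # "D"  - extract_final keeps the last standalone letter
    print(extract_answer("no letter here"))             # None - all three stages fail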
+def batch_inference(llm, sampling_params, inference_batch):
+    start = time.time()
+    outputs = llm.generate(inference_batch, sampling_params)
+    logging.info(str(len(inference_batch)) + "-size batch costing time: " + str(time.time() - start))
+    response_batch = []
+    pred_batch = []
+    for output in outputs:
+        generated_text = output.outputs[0].text
+        response_batch.append(generated_text)
+        pred = extract_answer(generated_text)
+        pred_batch.append(pred)
+    # logging.info expects a format string, not print-style positional values
+    logging.info("PRED BATCH: %s, RESPONSE BATCH: %s", pred_batch, response_batch)
+    return pred_batch, response_batch
 
 
+def calculate_accuracy(res):
+    """
+    Calculate accuracy and return an array of correctness (1 if correct, 0 if wrong)
+    along with the overall accuracy.
+    """
+    correctness = []
+    for each in res:
+        if not each["pred"]:
+            # If prediction is None, use random choice with fixed seed
+            # This ensures reproducibility when handling missing predictions
+            random.seed(12345)
+            x = random.randint(0, len(each["options"]) - 1)
+            is_correct = 1 if x == each["answer_index"] else 0
+        else:
+            is_correct = 1 if each["pred"] == each["answer"] else 0
+        correctness.append(is_correct)
+
+    # Calculate accuracy from correctness array
+    if len(correctness) == 0:
+        return [], 0.0
 
+    accuracy = sum(correctness) / len(correctness)
+    return correctness, accuracy
+
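A small worked example of the scorer above, with invented records; a record whose pred is None falls back to a seeded random guess, so its score depends on that draw:

    demo = [
        {"pred": "B", "answer": "B", "answer_index": 1, "options": ["x", "y"]},      # scores 1
        {"pred": "A", "answer": "C", "answer_index": 2, "options": ["x", "y", "z"]}, # scores 0
        {"pred": None, "answer": "A", "answer_index": 0, "options": ["x", "y"]},     # seeded random guess
    ]
    correctness, accuracy = calculate_accuracy(demo)
    # correctness == [1, 0, g] with g in {0, 1}; accuracy == sum(correctness) / 3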
+@torch.no_grad()
+def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5):
+    llm, sampling_params = model
+    global choices
+    logging.info("evaluating " + subject)
+    inference_batches = []
+    k = num_shots
+
+    # The caller passes pandas DataFrames; convert them to lists of dicts so the
+    # per-question records can be indexed and mutated the way this loop expects.
+    if isinstance(test_df, pd.DataFrame):
+        test_df = test_df.to_dict("records")
+    if isinstance(val_df, pd.DataFrame):
+        val_df = val_df.to_dict("records")
+
+    for i in tqdm(range(len(test_df))):
+        curr = test_df[i]
+        prompt_length_ok = False
+        prompt = None
+        # Shrink the number of few-shot examples until the prompt leaves room
+        # for max_new_tokens inside the model's context window.
+        while not prompt_length_ok:
+            prompt = generate_cot_prompt(val_df, curr, k)
+            inputs = tokenizer(prompt, return_tensors="pt")
+            inputs = {key: value.cuda() for key, value in inputs.items()}
+            length = len(inputs["input_ids"][0])
+            if length < max_model_length - max_new_tokens:
+                prompt_length_ok = True
+            k -= 1
+        inference_batches.append(prompt)
+
+    pred_batch, response_batch = batch_inference(llm, sampling_params, inference_batches)
+    results = []
+    for j, curr in enumerate(test_df):
+        curr["pred"] = pred_batch[j]
+        curr["model_outputs"] = response_batch[j]
+        results.append(curr)
+
+    # Get array of correctness and overall accuracy
+    correctness, accuracy = calculate_accuracy(results)
+    logging.info("This batch accuracy is: {}, correct samples: {}/{}\n".format(
+        str(accuracy), str(sum(correctness)), str(len(correctness))))
+
+    return correctness, accuracy
+
+def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5):
+    model, tokenizer = load_model(model_name, gpu_utilization=0.8)
+
+    # vLLM runs generation in inference mode internally, so no explicit eval() call is needed.
+
+    test_df, val_df = load_mmlu_pro()
+
+    test_df = pd.DataFrame(test_df)
+    val_df = pd.DataFrame(val_df)  # Fixed: was 'val_def'
+    test_df = test_df.sort_values(['category', 'question_id'])
+    val_df = val_df.sort_values(['category', 'question_id'])  # Fixed: was 'dev_df'
 
     # Get all unique subjects
+    all_subjects = sorted(test_df['category'].unique())
+    selected_subjects = []
 
     # Select subjects based on num_subjects parameter
     if num_subjects == -1 or num_subjects >= len(all_subjects):
+        selected_subjects = all_subjects
     else:
         # Take the first num_subjects subjects
+        selected_subjects = all_subjects[:num_subjects]
+
+    logging.info("selected subjects:\n" + "\n".join(selected_subjects))
+
     results = {}
+    all_correctness = []
     results_table = []
+
+    for subject in tqdm(selected_subjects, desc="Processing Selected Categories"):
+        test_samples = test_df[test_df['category'] == subject].head(num_questions)
+        val_samples = val_df[val_df['category'] == subject].head(num_shots)
+
+        correctness, acc = eval_cot(subject, model, tokenizer, val_df=val_samples, test_df=test_samples, num_shots=num_shots)
         results[subject] = acc
+        all_correctness.extend(correctness)
         results_table.append({
+            'Subject': subject,
+            'Num_samples': len(test_samples),
+            'Num_correct': sum(correctness),
             'Accuracy': acc
         })
 
+    weighted_acc = np.mean(all_correctness)
 
     min_acc_subject = min(results.items(), key=lambda x: x[1])[0]
     max_acc_subject = max(results.items(), key=lambda x: x[1])[0]
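For reference, a minimal sketch of how the new entry point might be invoked; the model id is a placeholder rather than something this commit specifies, and a CUDA GPU is required because eval_cot moves the tokenized prompts onto the device:

    if __name__ == "__main__":
        evaluate_mmlu_pro("meta-llama/Llama-3.1-8B-Instruct",  # placeholder checkpoint
                          num_subjects=2, num_questions=5, num_shots=3)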