H2H-eval-comparator

Sleeping

App Files Files Community

rohansampath committed on Feb 19

Commit

a3cb7ba

verified ·

1 Parent(s): 0e843f9

Update mmlu_pro_eval_adapted.py

Browse files

Files changed (1) hide show

mmlu_pro_eval_adapted.py +111 -38

mmlu_pro_eval_adapted.py CHANGED Viewed

@@ -15,7 +15,7 @@ import logging
 import sys
 from datasets import load_dataset
 import pandas as pd
-import numpy as mnp
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -46,6 +46,11 @@ def load_mmlu_pro():
     test_df, val_df = dataset["test"], dataset["validation"]
     test_df = preprocess(test_df)
     val_df = preprocess(val_df)
     return test_df, val_df
@@ -62,6 +67,10 @@ def load_model(model_name, gpu_utilization=0.8):
 def format_cot_example(example, including_answer=True):
     prompt = "Question:\n"
     question = example["question"]
     options = example["options"]
@@ -79,15 +88,34 @@ def format_cot_example(example, including_answer=True):
 def generate_cot_prompt(val_df, curr, k):
     prompt = initial_prompt
-    subject = curr["category"]
-    # Assert that all rows in val_df have 'category' equal to subject
-    assert (val_df["category"] == subject).all(), "Not all rows in val_df have the correct category"
-    val_df = val_df[: k]
     prompt = prompt.replace("{$}", subject) + "\n"
-    for example in val_df:
         prompt += format_cot_example(example, including_answer=True)
     prompt += format_cot_example(curr, including_answer=False)
     return prompt
@@ -121,7 +149,7 @@ def extract_final(text):
 def batch_inference(llm, sampling_params, inference_batch):
     start = time.time()
     outputs = llm.generate(inference_batch, sampling_params)
-    logging.info(str(len(inference_batch)) + "size batch costing time: " + str(time.time() - start))
     response_batch = []
     pred_batch = []
     for output in outputs:
@@ -139,15 +167,17 @@ def calculate_accuracy(res):
     along with the overall accuracy.
     """
     correctness = []
-    for each in res:
-        if not each["pred"]:
             # If prediction is None, use random choice with fixed seed
-            # This ensures reproducibility when handling missing predictions
             random.seed(12345)
-            x = random.randint(0, len(each["options"]) - 1)
-            is_correct = 1 if x == each["answer_index"] else 0
         else:
-            is_correct = 1 if each["pred"] == each["answer"] else 0
         correctness.append(is_correct)
     # Calculate accuracy from correctness array
@@ -157,77 +187,119 @@ def calculate_accuracy(res):
     accuracy = sum(correctness) / len(correctness)
     return correctness, accuracy
 @torch.no_grad()
 def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5):
     llm, sampling_params = model
     global choices
     logging.info("evaluating " + subject)
     inference_batches = []
-    k = num_shots
-    for i in tqdm(range(len(test_df))):
-        curr = test_df[i]
         prompt_length_ok = False
         prompt = None
-        while not prompt_length_ok:
             prompt = generate_cot_prompt(val_df, curr, k)
             inputs = tokenizer(prompt, return_tensors="pt")
             inputs = {key: value.cuda() for key, value in inputs.items()}
             length = len(inputs["input_ids"][0])
             if length < max_model_length - max_new_tokens:
                 prompt_length_ok = True
-            k -= 1
         inference_batches.append(prompt)
     pred_batch, response_batch = batch_inference(llm, sampling_params, inference_batches)
-    results = []
-    for j, curr in enumerate(test_df):
-        curr["pred"] = pred_batch[j]
-        curr["model_outputs"] = response_batch[j]
-        results.append(curr)
-    # Get array of correctness and overall accuracy
-    correctness, accuracy = calculate_accuracy(results)
     logging.info("This batch accuracy is: {}, correct samples: {}/{}\n".format(
         str(accuracy), str(sum(correctness)), str(len(correctness))))
     return correctness, accuracy
 @spaces.GPU(duration=240)  # Extended to 4 minutes (240 s) for larger evaluations
 def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5):
-    print ("IS CUDA AVAILABLE: ", torch.cuda.is_available())
-    model, tokenizer = load_model(model_name, gpu_utilization=0.8)
     test_df, val_df = load_mmlu_pro()
-    test_df = pd.DataFrame(test_df)
-    val_df = pd.DataFrame(val_df)  # Fixed: was 'val_def'
     test_df = test_df.sort_values(['category', 'question_id'])
-    val_df = val_df.sort_values(['category', 'question_id'])  # Fixed: was 'dev_df'
-    # Get all unique subjects
     all_subjects = sorted(test_df['category'].unique())
-    selected_subjects = []
     # Select subjects based on num_subjects parameter
     if num_subjects == -1 or num_subjects >= len(all_subjects):
         selected_subjects = all_subjects
     else:
-        # Take the first num_subjects subjects
         selected_subjects = all_subjects[:num_subjects]
     logging.info("selected subjects:\n" + "\n".join(selected_subjects))
     results = {}
     all_correctness = []
     results_table = []
     for subject in tqdm(selected_subjects, desc="Processing Selected Categories"):
         test_samples = test_df[test_df['category'] == subject].head(num_questions)
         val_samples = val_df[val_df['category'] == subject].head(num_shots)
-        correctness, acc = eval_cot(subject, model, tokenizer, val_df=val_samples, test_df=test_samples, num_shots=num_shots)
         results[subject] = acc
         all_correctness.extend(correctness)
         results_table.append({
@@ -237,11 +309,12 @@ def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5
             'Accuracy': acc
         })
     weighted_acc = np.mean(all_correctness)
     min_acc_subject = min(results.items(), key=lambda x: x[1])[0]
     max_acc_subject = max(results.items(), key=lambda x: x[1])[0]
     return {
         "overall_accuracy": weighted_acc,
         "min_accuracy_subject": (min_acc_subject, results[min_acc_subject]),

 import sys
 from datasets import load_dataset
 import pandas as pd
+import numpy as np
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
     test_df, val_df = dataset["test"], dataset["validation"]
     test_df = preprocess(test_df)
     val_df = preprocess(val_df)
+    # Convert to DataFrames right after loading and preprocessing
+    test_df = pd.DataFrame(test_df)
+    val_df = pd.DataFrame(val_df)
     return test_df, val_df
 def format_cot_example(example, including_answer=True):
+    # Handle both Series and dict inputs
+    if isinstance(example, pd.Series):
+        example = example.to_dict()
     prompt = "Question:\n"
     question = example["question"]
     options = example["options"]
 def generate_cot_prompt(val_df, curr, k):
     """
     Build a few-shot chain-of-thought prompt for a single question.

     The prompt is the module-level ``initial_prompt`` with its ``{$}``
     placeholder replaced by the question's category, followed by up to ``k``
     worked validation examples from the same category, followed by the
     current question formatted without its answer.

     Args:
         val_df: DataFrame of validation examples; must have a "category" column.
         curr: Series or dict for the current question; must have a "category" key.
         k: Maximum number of same-category validation examples to include.
             Values <= 0 produce a zero-shot prompt.

     Returns:
         The assembled prompt string.
     """
     # Both pd.Series and dict support key lookup, so no type branching is needed.
     subject = curr["category"]

     # Clamp k: DataFrame.head() with a negative n returns "all but the last |n|
     # rows", which would silently add examples instead of removing them.
     shots = val_df[val_df["category"] == subject].head(max(k, 0))

     prompt = initial_prompt.replace("{$}", subject) + "\n"

     # Worked examples (with answers) first, then the question to be answered.
     for _, example in shots.iterrows():
         prompt += format_cot_example(example, including_answer=True)
     prompt += format_cot_example(curr, including_answer=False)
     return prompt
 def batch_inference(llm, sampling_params, inference_batch):
     start = time.time()
     outputs = llm.generate(inference_batch, sampling_params)
+    logging.info(str(len(inference_batch)) + " size batch costing time: " + str(time.time() - start))
     response_batch = []
     pred_batch = []
     for output in outputs:
     along with the overall accuracy.
     """
     correctness = []
+    # Process predictions and compute correctness
+    for i, row in res.iterrows():
+        if not row["pred"]:
             # If prediction is None, use random choice with fixed seed
             random.seed(12345)
+            options_len = len(row["options"]) if isinstance(row["options"], list) else 4
+            x = random.randint(0, options_len - 1)
+            is_correct = 1 if x == row["answer_index"] else 0
         else:
+            is_correct = 1 if row["pred"] == row["answer"] else 0
         correctness.append(is_correct)
     # Calculate accuracy from correctness array
     accuracy = sum(correctness) / len(correctness)
     return correctness, accuracy
 @torch.no_grad()
 def eval_cot(subject, model, tokenizer, val_df, test_df, num_shots=5):
     """
     Evaluate one subject with few-shot chain-of-thought prompting.

     For every row of ``test_df`` a prompt is built with as many validation
     examples (at most ``num_shots``) as fit in the token budget
     ``max_model_length - max_new_tokens``; if not even one example fits, a
     zero-shot prompt is used. All prompts are then sent to the model in one
     batch and scored with ``calculate_accuracy``.

     Args:
         subject: Subject category being evaluated (used for logging).
         model: Tuple of (llm, sampling_params).
         tokenizer: Model tokenizer, called as ``tokenizer(prompt, return_tensors="pt")``.
         val_df: DataFrame with validation examples.
         test_df: DataFrame with test examples.
         num_shots: Maximum number of examples to include in each prompt.

     Returns:
         Tuple ``(correctness, accuracy)`` as returned by ``calculate_accuracy``.
     """
     llm, sampling_params = model
     logging.info("evaluating %s", subject)

     # Loop-invariant; both names are module-level settings defined elsewhere
     # in this file.
     token_budget = max_model_length - max_new_tokens

     inference_batches = []
     for _, curr in test_df.iterrows():
         prompt = None
         # Start from num_shots for every question and drop one example at a
         # time until the prompt fits.
         for k in range(num_shots, 0, -1):
             candidate = generate_cot_prompt(val_df, curr, k)
             # Only the token count is needed, so the tensors stay on the CPU
             # (no .cuda() transfer per candidate prompt).
             inputs = tokenizer(candidate, return_tensors="pt")
             if len(inputs["input_ids"][0]) < token_budget:
                 prompt = candidate
                 break
         if prompt is None:
             # No few-shot variant fit (or num_shots <= 0): fall back to the
             # bare question. NOTE(review): this prompt is not length-checked.
             if num_shots > 0:
                 logging.warning(
                     "no few-shot prompt fits the token budget for a %s question; "
                     "falling back to zero-shot", subject)
             prompt = generate_cot_prompt(val_df.head(0), curr, 0)
         inference_batches.append(prompt)

     # Single batched generation call for the whole subject.
     pred_batch, response_batch = batch_inference(llm, sampling_params, inference_batches)

     # Attach predictions to a copy so the caller's DataFrame is not mutated.
     results_df = test_df.copy()
     results_df["pred"] = pred_batch
     results_df["model_outputs"] = response_batch

     correctness, accuracy = calculate_accuracy(results_df)
     logging.info("This batch accuracy is: %s, correct samples: %s/%s\n",
                  accuracy, sum(correctness), len(correctness))
     return correctness, accuracy
 @spaces.GPU(duration=240)  # Extended to 4 minutes (240 s) for larger evaluations
 def evaluate_mmlu_pro(model_name, num_subjects=-1, num_questions=10, num_shots=5):
+    """
+    Main evaluation function for MMLU-Pro benchmark.
+    Args:
+        model_name: Name/path of model to evaluate
+        num_subjects: Number of subjects to test (-1 for all)
+        num_questions: Number of questions per subject
+        num_shots: Number of examples to include in prompts
+    """
+    print("IS CUDA AVAILABLE: ", torch.cuda.is_available())
+    # Load model and data
+    model, tokenizer = load_model(model_name, gpu_utilization=0.8)
     test_df, val_df = load_mmlu_pro()
+    # Sort DataFrames
     test_df = test_df.sort_values(['category', 'question_id'])
+    val_df = val_df.sort_values(['category', 'question_id'])
+    # Get unique subjects
     all_subjects = sorted(test_df['category'].unique())
     # Select subjects based on num_subjects parameter
     if num_subjects == -1 or num_subjects >= len(all_subjects):
         selected_subjects = all_subjects
     else:
         selected_subjects = all_subjects[:num_subjects]
     logging.info("selected subjects:\n" + "\n".join(selected_subjects))
+    # Prepare results tracking
     results = {}
     all_correctness = []
     results_table = []
+    # Process each subject
     for subject in tqdm(selected_subjects, desc="Processing Selected Categories"):
+        # Filter data for current subject
         test_samples = test_df[test_df['category'] == subject].head(num_questions)
         val_samples = val_df[val_df['category'] == subject].head(num_shots)
+        # Run evaluation
+        correctness, acc = eval_cot(
+            subject,
+            model,
+            tokenizer,
+            val_df=val_samples,
+            test_df=test_samples,
+            num_shots=num_shots
+        )
+        # Store results
         results[subject] = acc
         all_correctness.extend(correctness)
         results_table.append({
             'Accuracy': acc
         })
+    # Calculate overall metrics
     weighted_acc = np.mean(all_correctness)
     min_acc_subject = min(results.items(), key=lambda x: x[1])[0]
     max_acc_subject = max(results.items(), key=lambda x: x[1])[0]
+    # Return results summary
     return {
         "overall_accuracy": weighted_acc,
         "min_accuracy_subject": (min_acc_subject, results[min_acc_subject]),