Create toy-dataset-eval.py
A simple file to verify that evals are working properly.
toy-dataset-eval.py (ADDED, +153 lines)
import torch
import evaluate
import re
import base64
import io
import matplotlib.pyplot as plt
from transformers import AutoModelForCausalLM, AutoTokenizer
import spaces  # Hugging Face Spaces ZeroGPU helper for GPU-decorated functions

# ---------------------------------------------------------------------------
# 1. Simple test dataset to run GPU calls on
# ---------------------------------------------------------------------------
test_data = [
    {"question": "What is 2+2?", "answer": "4"},
    {"question": "What is 3*3?", "answer": "9"},
    {"question": "What is 10/2?", "answer": "5"},
]

# ---------------------------------------------------------------------------
# 2. Load metric
# ---------------------------------------------------------------------------
# Note: the hub "accuracy" metric casts labels to integers, so predictions
# and references must be integer-like strings (e.g. "4", not "four").
accuracy_metric = evaluate.load("accuracy")

# ---------------------------------------------------------------------------
# 3. Model loading
# ---------------------------------------------------------------------------
# NOTE: the original file calls load_model() without defining it; this cached
# loader is an assumed minimal implementation. The [INST] prompt format below
# suggests a Mistral instruct checkpoint, so one is used here as a placeholder.
_model_cache = {}

def load_model():
    if "model" not in _model_cache:
        model_id = "mistralai/Mistral-7B-Instruct-v0.2"  # assumed checkpoint
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id, torch_dtype=torch.float16, device_map="cuda"
        )
        _model_cache["model"] = model
        _model_cache["tokenizer"] = tokenizer
    return _model_cache["model"], _model_cache["tokenizer"]

# ---------------------------------------------------------------------------
# 4. Inference helper functions
# ---------------------------------------------------------------------------
@spaces.GPU
def generate_answer(question):
    """
    Generates an answer using Mistral's instruction format.
    """
    model, tokenizer = load_model()

    # Mistral instruction format; the prompt already contains the <s> BOS
    # token, so special tokens are not added again during tokenization.
    prompt = f"<s>[INST] {question} Provide only the numerical answer. [/INST]"

    inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to("cuda")
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=50,
            # Mistral tokenizers often define no pad token; fall back to EOS.
            pad_token_id=tokenizer.pad_token_id or tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )
    # Decode only the newly generated tokens so the prompt is not echoed back.
    new_tokens = outputs[0][inputs["input_ids"].shape[1]:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

def parse_answer(model_output):
    """
    Extract numeric answer from model's text output.
    """
    # Look for the first number (including negatives and decimals)
    match = re.search(r"(-?\d*\.?\d+)", model_output)
    if match:
        return match.group(1)
    return model_output.strip()


@spaces.GPU(duration=120)  # Allow up to 2 minutes for full evaluation
def run_evaluation():
    predictions = []
    references = []
    raw_outputs = []  # Store full model outputs for display

    for sample in test_data:
        question = sample["question"]
        reference_answer = sample["answer"]

        # Model inference
        model_output = generate_answer(question)
        predicted_answer = parse_answer(model_output)

        predictions.append(predicted_answer)
        references.append(reference_answer)
        raw_outputs.append({
            "question": question,
            "model_output": model_output,
            "parsed_answer": predicted_answer,
            "reference": reference_answer,
        })

    # Normalize answers before comparison
    def normalize_answer(ans):
        return str(ans).lower().strip()

    norm_preds = [normalize_answer(p) for p in predictions]
    norm_refs = [normalize_answer(r) for r in references]

    # Compute accuracy
    results = accuracy_metric.compute(predictions=norm_preds, references=norm_refs)
    accuracy = results["accuracy"]

    # Create visualization
    fig, ax = plt.subplots(figsize=(8, 6))
    correct_count = sum(p == r for p, r in zip(norm_preds, norm_refs))
    incorrect_count = len(test_data) - correct_count

    bars = ax.bar(["Correct", "Incorrect"],
                  [correct_count, incorrect_count],
                  color=["#2ecc71", "#e74c3c"])

    # Add value labels on bars
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width() / 2., height,
                f"{int(height)}",
                ha="center", va="bottom")

    ax.set_title("Evaluation Results")
    ax.set_ylabel("Count")
    ax.set_ylim([0, len(test_data) + 0.5])

    # Convert plot to base64 so it can be embedded in the HTML report
    buf = io.BytesIO()
    plt.savefig(buf, format="png", bbox_inches="tight", dpi=300)
    buf.seek(0)
    plt.close(fig)
    data = base64.b64encode(buf.read()).decode("utf-8")

    # Create detailed results HTML
    details_html = """
    <div style="margin-top: 20px;">
        <h3>Detailed Results:</h3>
        <table style="width:100%; border-collapse: collapse;">
            <tr style="background-color: #f5f5f5;">
                <th style="padding: 8px; border: 1px solid #ddd;">Question</th>
                <th style="padding: 8px; border: 1px solid #ddd;">Model Output</th>
                <th style="padding: 8px; border: 1px solid #ddd;">Parsed Answer</th>
                <th style="padding: 8px; border: 1px solid #ddd;">Reference</th>
            </tr>
    """

    for result in raw_outputs:
        details_html += f"""
            <tr>
                <td style="padding: 8px; border: 1px solid #ddd;">{result['question']}</td>
                <td style="padding: 8px; border: 1px solid #ddd;">{result['model_output']}</td>
                <td style="padding: 8px; border: 1px solid #ddd;">{result['parsed_answer']}</td>
                <td style="padding: 8px; border: 1px solid #ddd;">{result['reference']}</td>
            </tr>
        """

    details_html += "</table></div>"

    full_html = f"""
    <div>
        <img src="data:image/png;base64,{data}" style="width:100%; max-width:600px;">
        {details_html}
    </div>
    """

    return f"Accuracy: {accuracy:.2f}", full_html
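
For reference, this is how parse_answer behaves on a few plausible model outputs (illustrative strings, not taken from the file). It returns the first number in the string, so an output that echoes the arithmetic expression can be mis-parsed:

# Illustrative sanity checks for parse_answer (assumed inputs).
assert parse_answer("The answer is 4") == "4"
assert parse_answer("-3.5") == "-3.5"
# First-number behavior: an echoed expression wins over the actual answer.
assert parse_answer("2+2 equals 4") == "2"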
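
The file defines run_evaluation but shows no UI entry point. Since spaces.GPU is the Hugging Face ZeroGPU decorator, which wraps functions called from a Gradio app, a minimal sketch of how the eval could be exposed follows; the Blocks layout and component names are assumptions, not part of the file:

# Assumed Gradio entry point; the original file does not include one.
import gradio as gr

with gr.Blocks() as demo:
    run_btn = gr.Button("Run evaluation")
    score_box = gr.Textbox(label="Accuracy")
    report = gr.HTML()
    # run_evaluation takes no inputs and returns (accuracy string, HTML report)
    run_btn.click(fn=run_evaluation, inputs=None, outputs=[score_box, report])

demo.launch()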