H2H-eval-comparator

Sleeping

App Files Files Community

rohansampath committed on Feb 11

Commit

3195f7f

verified ·

1 Parent(s): c5224d3

Update app.py with a basic demonstration of loading Llama-3.2-1B-Instruct and running a simple evaluation on a few math questions

Browse files

Files changed (1) hide show

app.py +130 -3

app.py CHANGED Viewed

@@ -1,7 +1,134 @@
 import gradio as gr
-def greet(name):
-    return "Hello " + name + "!!"
-demo = gr.Interface(fn=greet, inputs="text", outputs="text")
 demo.launch()

 import gradio as gr
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+import evaluate
+import re
+import matplotlib
+matplotlib.use('Agg')  # for non-interactive envs
+import matplotlib.pyplot as plt
+import io
+import base64
+# ---------------------------------------------------------------------------
+# 1. Define model name and load model/tokenizer (done once, at import time)
+# ---------------------------------------------------------------------------
+model_name = "meta-llama/Llama-3.2-1B-Instruct"  # NOTE(review): previously labelled "fictional placeholder"; this looks like a real Hub repo id - confirm access before deploying
+tokenizer = AutoTokenizer.from_pretrained(model_name)
+model = AutoModelForCausalLM.from_pretrained(model_name)
+# ---------------------------------------------------------------------------
+# 2. Define a tiny "dataset" for demonstration: question/answer string pairs.
+#    In reality, you'll load a real dataset from HF or custom code.
+# ---------------------------------------------------------------------------
+test_data = [
+    {"question": "What is 2+2?", "answer": "4"},
+    {"question": "What is 3*3?", "answer": "9"},
+    {"question": "What is 10/2?", "answer": "5"},
+]
+# ---------------------------------------------------------------------------
+# 3. Load a metric (accuracy) from the Hugging Face evaluate library
+# ---------------------------------------------------------------------------
+accuracy_metric = evaluate.load("accuracy")
+# ---------------------------------------------------------------------------
+# 4. Inference helper functions
+# ---------------------------------------------------------------------------
+def generate_answer(question):
+    """
+    Generate the model's answer text for a single question.
+
+    Args:
+        question: Question text that is inserted into the prompt.
+
+    Returns:
+        The decoded continuation only. The echoed prompt is dropped so that
+        digits appearing in the question itself cannot be mistaken for the
+        model's answer by downstream parsing.
+    """
+    prompt = f"Question: {question}\nAnswer:"
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    # Number of prompt tokens; used below to slice them off the output.
+    prompt_length = inputs["input_ids"].shape[-1]
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=30,
+            # Greedy decoding gives deterministic output; a temperature of 0.0
+            # is not a usable sampling temperature, so sampling is disabled
+            # explicitly instead.
+            do_sample=False,
+        )
+    # For a causal LM the generated sequence starts with the prompt tokens;
+    # keep only the newly generated ones.
+    completion_ids = outputs[0][prompt_length:]
+    text_output = tokenizer.decode(completion_ids, skip_special_tokens=True)
+    return text_output
+def parse_answer(model_output):
+    """
+    Pull a candidate answer out of the raw model text.
+
+    The first run of digits found anywhere in the text is returned as a
+    string. When the text contains no digits at all, the whole text is
+    returned with surrounding whitespace removed.
+    """
+    first_number = re.search(r"(\d+)", model_output)
+    if first_number is None:
+        # No digits anywhere: hand back the trimmed text unchanged.
+        return model_output.strip()
+    return first_number.group(1)
+# ---------------------------------------------------------------------------
+# 5. Evaluation routine
+# ---------------------------------------------------------------------------
+def _normalize_answer(ans):
+    """Lower-case an answer string and trim surrounding whitespace."""
+    return ans.lower().strip()
+def _results_chart_html(correct_count, incorrect_count):
+    """Render a correct-vs-incorrect bar chart and return it as an HTML <img> tag."""
+    total = correct_count + incorrect_count
+    fig, ax = plt.subplots()
+    try:
+        ax.bar(["Correct", "Incorrect"], [correct_count, incorrect_count], color=["green", "red"])
+        ax.set_title("Evaluation Results")
+        ax.set_ylabel("Count")
+        # Keep a non-degenerate axis even when there are no samples.
+        ax.set_ylim([0, max(total, 1)])
+        buf = io.BytesIO()
+        # Save this specific figure rather than whatever pyplot considers current.
+        fig.savefig(buf, format="png")
+    finally:
+        # Always release the figure, even if rendering fails.
+        plt.close(fig)
+    data = base64.b64encode(buf.getvalue()).decode("utf-8")
+    # The result is shown in an HTML component, so it must be markup; a bare
+    # data URL would be displayed as text rather than as an image.
+    return f'<img src="data:image/png;base64,{data}" alt="Evaluation Results"/>'
+def run_evaluation():
+    """
+    Run the model over `test_data` and summarise how many answers match.
+
+    Each question is answered with `generate_answer`, reduced to a candidate
+    answer with `parse_answer`, and compared (after normalisation) with the
+    reference answer.
+
+    Returns:
+        A tuple `(summary, plot_html)` where `summary` is a string of the
+        form "Accuracy: 0.67" and `plot_html` is an HTML `<img>` tag holding
+        a base64-encoded PNG bar chart of correct vs. incorrect counts.
+    """
+    norm_preds = []
+    norm_refs = []
+    for sample in test_data:
+        model_output = generate_answer(sample["question"])
+        norm_preds.append(_normalize_answer(parse_answer(model_output)))
+        norm_refs.append(_normalize_answer(sample["answer"]))
+    # The accuracy metric works on integer labels, so give it a 1/0 "matched"
+    # flag per sample instead of the raw answer strings; parse_answer can fall
+    # back to free text, which is not an integer.
+    matches = [int(pred == ref) for pred, ref in zip(norm_preds, norm_refs)]
+    correct_count = sum(matches)
+    incorrect_count = len(matches) - correct_count
+    if matches:
+        results = accuracy_metric.compute(predictions=matches, references=[1] * len(matches))
+        accuracy = results["accuracy"]
+    else:
+        # Nothing to evaluate: report zero rather than calling the metric on empty input.
+        accuracy = 0.0
+    return f"Accuracy: {accuracy:.2f}", _results_chart_html(correct_count, incorrect_count)
+# ---------------------------------------------------------------------------
+# 6. Gradio App: one button that runs the evaluation and shows its two outputs
+# ---------------------------------------------------------------------------
+with gr.Blocks() as demo:
+    gr.Markdown("# Simple Math Evaluation with 'Llama 3.2'")
+    eval_button = gr.Button("Run Evaluation")
+    output_text = gr.Textbox(label="Results")  # receives the first value returned by run_evaluation
+    output_plot = gr.HTML(label="Plot")  # receives the second value returned by run_evaluation
+    eval_button.click(
+        fn=run_evaluation,
+        inputs=None,  # run_evaluation takes no arguments
+        outputs=[output_text, output_plot]  # order matches run_evaluation's return tuple
+    )
 demo.launch()