H2H-eval-comparator

Sleeping

App Files Files Community

rohansampath committed on Feb 12

Commit

ee60006

verified ·

1 Parent(s): b0fd62c

Update app.py

Browse files

Files changed (1) hide show

app.py +29 -1

app.py CHANGED Viewed

@@ -11,6 +11,7 @@ import base64
 import os
 from huggingface_hub import login
 import spaces
 # Read token and login
 hf_token = os.getenv("HF_TOKEN_READ_WRITE")
@@ -188,13 +189,31 @@ def run_evaluation():
     return f"Accuracy: {accuracy:.2f}", full_html
 # ---------------------------------------------------------------------------
 # 6. Gradio Interface
 # ---------------------------------------------------------------------------
 with gr.Blocks() as demo:
     gr.Markdown("# Mistral-7B Math Evaluation Demo")
     gr.Markdown("""
-    This demo evaluates Mistral-7B on basic math problems.
     Press the button below to run the evaluation.
     """)
@@ -208,4 +227,13 @@ with gr.Blocks() as demo:
         outputs=[output_text, output_plot]
     )
 demo.launch()

 import os
 from huggingface_hub import login
 import spaces
+from mmlu_eval import evaluate_mmlu
 # Read token and login
 hf_token = os.getenv("HF_TOKEN_READ_WRITE")
     return f"Accuracy: {accuracy:.2f}", full_html
+# ---------------------------------------------------------------------------
+# 5. MMLU Evaluation call
+# ---------------------------------------------------------------------------
+def run_mmlu_evaluation(num_questions):
+    """
+    Runs the MMLU evaluation with the specified number of questions per task
+    and summarises the outcome as display text.
+
+    Calls ``evaluate_mmlu`` with the module-level ``model`` and ``tokenizer``
+    (both defined outside this function) and ``num_questions``.
+
+    Args:
+        num_questions: Number of questions per task; forwarded unchanged to
+            ``evaluate_mmlu``.
+
+    Returns:
+        A three-line string: the overall accuracy, then the minimum and the
+        maximum task accuracy, each followed by the task it was measured on.
+        All accuracies are formatted to two decimal places.
+    """
+    results = evaluate_mmlu(model, tokenizer, num_questions)
+    # The min/max entries are indexed as [0] = task, [1] = accuracy.
+    report = (
+        f"Overall Accuracy: {results['overall_accuracy']:.2f}\n"
+        f"Min Accuracy: {results['min_accuracy_task'][1]:.2f} on {results['min_accuracy_task'][0]}\n"
+        f"Max Accuracy: {results['max_accuracy_task'][1]:.2f} on {results['max_accuracy_task'][0]}"
+    )
+    return report
 # ---------------------------------------------------------------------------
 # 6. Gradio Interface
 # ---------------------------------------------------------------------------
 with gr.Blocks() as demo:
     gr.Markdown("# Mistral-7B Math Evaluation Demo")
     gr.Markdown("""
+    This demo evaluates Mistral-7B on three very simple math problems to get started.
     Press the button below to run the evaluation.
     """)
         outputs=[output_text, output_plot]
     )
+    gr.Markdown("### MMLU Evaluation")
+    num_questions_input = gr.Number(label="Questions per Task (there are 57 total Tasks)", value=5, precision=0)
+    eval_mmlu_button = gr.Button("Run MMLU Evaluation")
+    mmlu_output = gr.Textbox(label="MMLU Evaluation Results")
+    eval_mmlu_button.click(fn=run_mmlu_evaluation, inputs=[num_questions_input], outputs=[mmlu_output])
 demo.launch()