import gradio as gr
import pandas as pd
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM
import json
import os
from datetime import datetime
import time

# --- Configuration ---
QA_FILE = "qa.txt"
RESULTS_FILE = "Eval_results.jsonl"
JUDGE_MODEL_REPO = "google/flan-t5-base"  # A capable but relatively small model for judging

# --- Setup: Ensure files exist ---
if not os.path.exists(RESULTS_FILE):
    with open(RESULTS_FILE, "w") as f:
        pass  # Create an empty file if it doesn't exist

if not os.path.exists(QA_FILE):
    # Create a dummy qa.txt if it's missing, with a few example questions
    dummy_data = """ID,Question_Type,Question,Golden_Answer_Summary
1,Code,"Create a Python function that implements the Bubble Sort algorithm.","The function should take a list, use nested loops to compare adjacent elements, and swap them if they are in the wrong order. The outer loop runs n times, and the inner loop runs n-i-1 times."
2,Common Chat,"What is the capital of France?","The answer must be Paris."
3,Advanced Code,"Write a Python script that connects to a public FTP server, lists the files in the root directory, and then disconnects.","The script must import the `ftplib` library. It should create an FTP object, for example `FTP('ftp.dlptest.com')`, call the `login()` method, then `retrlines('LIST')` to print the directory listing, and finally `quit()` to close the connection."
"""
    with open(QA_FILE, "w") as f:
        f.write(dummy_data)
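# Note (deployment assumption, not part of the app logic): on Hugging Face Spaces, files
# written at runtime such as Eval_results.jsonl live on ephemeral storage and are lost when
# the Space restarts, unless persistent storage or an external dataset backs them up.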
# --- AI Judge Logic ---
def get_ai_judge_verdict(judge_pipeline, question, golden_summary, ai_answer):
    """
    Uses the AI Judge model to give a verdict on the tested model's answer.
    """
    system_instruction = f"""
You are an expert evaluator for an AI model benchmark. Your task is to determine if the AI's answer is a correct and satisfactory response to the user's question. You must only respond with a single character: '1' for a correct/passing answer, or '0' for an incorrect/failing answer.
A '1' means the AI's answer correctly addresses the main components of the question and is similar in spirit to the expected golden answer summary.
A '0' means the AI's answer is factually wrong, does not address the question, is a refusal to answer, or is fundamentally incomplete.
---
User Question:
{question}
Expected Golden Answer Summary:
{golden_summary}
---
AI Model's Answer:
{ai_answer}
---
Based on this, is the AI Model's Answer correct? Respond with only '1' or '0'.
"""
    try:
        response = judge_pipeline(system_instruction, max_new_tokens=5)
        # Extract the generated text and clean it up
        verdict = response[0]['generated_text'].strip()
        # Ensure the verdict is either '1' or '0'
        if '1' in verdict:
            return 1
        else:
            return 0
    except Exception:
        # If the judge fails for any reason, default to a failing grade
        return 0
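# Optional sanity check for the judge prompt: a minimal sketch that loads the judge and
# scores one known-good answer. It is guarded behind an environment variable so it never
# runs inside the Space; the variable name NPFL_JUDGE_SELFTEST is an assumption and is not
# read anywhere else in the app.
if os.environ.get("NPFL_JUDGE_SELFTEST") == "1":
    _selftest_judge = pipeline("text2text-generation", model=JUDGE_MODEL_REPO)
    print(get_ai_judge_verdict(
        _selftest_judge,
        "What is the capital of France?",
        "The answer must be Paris.",
        "The capital of France is Paris.",
    ))  # A correct answer should typically print 1; a wrong answer or refusal should print 0.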
# --- Core Evaluation Logic ---
def run_evaluation(model_repo, model_nickname, progress=gr.Progress()):
    """
    Loads a user-specified model, runs it against the benchmark, evaluates the answers
    using an AI judge, and saves the results.
    """
    if not model_repo or not model_nickname:
        gr.Warning("Model Repository and Nickname cannot be empty.")
        return pd.DataFrame(), None

    # Load benchmark questions
    try:
        questions_df = pd.read_csv(QA_FILE)
        # Use a small subset for quick demos if needed
        # questions_df = questions_df.head(3)
    except Exception as e:
        # gr.Error must be raised (not just instantiated) for the message to reach the UI
        raise gr.Error(f"Failed to load benchmark questions from {QA_FILE}: {e}")

    # --- Load Models ---
    progress(0, desc="Loading AI Judge Model...")
    try:
        judge_pipeline = pipeline("text2text-generation", model=JUDGE_MODEL_REPO, device_map="auto", torch_dtype=torch.bfloat16)
    except Exception as e:
        raise gr.Error(f"Failed to load AI Judge model '{JUDGE_MODEL_REPO}': {e}")

    progress(0.1, desc=f"Loading test model: {model_repo}")
    try:
        model_to_test_tokenizer = AutoTokenizer.from_pretrained(model_repo)
        model_to_test = AutoModelForCausalLM.from_pretrained(
            model_repo,
            device_map="auto",
            torch_dtype=torch.bfloat16  # bfloat16 is good for ZeroGPU
        )
        test_pipeline = pipeline(
            "text-generation",
            model=model_to_test,
            tokenizer=model_to_test_tokenizer,
            max_new_tokens=1024,  # Set a reasonable limit for code generation
            do_sample=True,
            temperature=0.7,
            top_p=0.95
        )
    except Exception as e:
        raise gr.Error(f"Failed to load the specified test model '{model_repo}': {e}")
    # --- Run Benchmark Loop ---
    detailed_results = []
    total_score = 0
    total_questions = len(questions_df)

    for i, row in enumerate(questions_df.itertuples()):
        progress_val = 0.1 + (0.8 * (i / total_questions))
        progress(progress_val, desc=f"Running Q{row.ID}/{total_questions}")

        # Generate answer from the model being tested
        try:
            prompt = f"Question: {row.Question}\n\nAnswer:"
            response = test_pipeline(prompt)
            ai_answer = response[0]['generated_text'].replace(prompt, "").strip()
        except Exception as e:
            ai_answer = f"Error during generation: {e}"

        # Get verdict from the AI Judge
        score = get_ai_judge_verdict(judge_pipeline, row.Question, row.Golden_Answer_Summary, ai_answer)
        total_score += score

        detailed_results.append({
            "ID": row.ID,
            "Question": row.Question,
            "AI_Answer": ai_answer,
            "Score": score
        })
        time.sleep(0.1)  # Small delay to allow UI to update

    # --- Finalize and Save Results ---
    progress(0.95, desc="Finalizing and saving...")
    final_score_percent = (total_score / total_questions) * 100 if total_questions > 0 else 0

    run_summary = {
        "model_nickname": model_nickname,
        "model_repo": model_repo,
        "score_percent": round(final_score_percent, 2),
        "timestamp": datetime.utcnow().isoformat(),
        "detailed_results": detailed_results
    }

    try:
        with open(RESULTS_FILE, "a") as f:
            f.write(json.dumps(run_summary) + "\n")
    except Exception as e:
        gr.Warning(f"Could not save results to {RESULTS_FILE}: {e}")

    progress(1, desc="Evaluation Complete!")
    return pd.DataFrame(detailed_results), gr.Markdown(f"**Overall Score: {final_score_percent:.2f}%**")
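# For reference, each evaluation run appends one JSON line to Eval_results.jsonl shaped like
# the run_summary dict above. Illustrative example (values are made up, not real results):
# {"model_nickname": "Gemma-2B-v1", "model_repo": "google/gemma-2b-it", "score_percent": 66.67,
#  "timestamp": "2024-01-01T12:00:00", "detailed_results": [{"ID": 1, "Question": "...",
#  "AI_Answer": "...", "Score": 1}]}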
# --- Leaderboard Logic ---
def load_leaderboard():
    """
    Loads and displays the leaderboard from the results file.
    """
    if not os.path.exists(RESULTS_FILE) or os.path.getsize(RESULTS_FILE) == 0:
        return pd.DataFrame(columns=["Rank", "Model Nickname", "Score (%)", "Date"])

    results_data = []
    with open(RESULTS_FILE, "r") as f:
        for line in f:
            try:
                data = json.loads(line)
                results_data.append({
                    "Model Nickname": data.get("model_nickname"),
                    "Score (%)": data.get("score_percent"),
                    "Model Repo": data.get("model_repo"),
                    "Date": datetime.fromisoformat(data.get("timestamp")).strftime('%Y-%m-%d %H:%M:%S')
                })
            except (json.JSONDecodeError, KeyError, TypeError, ValueError):
                # Skip corrupted or malformed lines (including missing or invalid timestamps)
                continue

    if not results_data:
        return pd.DataFrame(columns=["Rank", "Model Nickname", "Score (%)", "Date"])

    leaderboard_df = pd.DataFrame(results_data)
    leaderboard_df = leaderboard_df.sort_values(by="Score (%)", ascending=False).reset_index(drop=True)
    leaderboard_df["Rank"] = leaderboard_df.index + 1
    # Reorder columns for display
    leaderboard_df = leaderboard_df[["Rank", "Model Nickname", "Score (%)", "Date", "Model Repo"]]
    return leaderboard_df
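# Note: load_leaderboard keeps one row per run, so the same model can appear several times.
# If only each nickname's best run should be shown (an assumption about the desired
# behaviour), a filter like this could be applied before the column reordering above:
#   leaderboard_df = leaderboard_df.loc[leaderboard_df.groupby("Model Nickname")["Score (%)"].idxmax()]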
# --- Gradio UI ---
with gr.Blocks(theme=gr.themes.Soft(), title="NPFL Benchmark") as demo:
    gr.Markdown("# NPFL (No Placeholders, Full Logic) AI Benchmark")

    with gr.Tabs():
        with gr.TabItem("Run Evaluation"):
            with gr.Row():
                with gr.Column(scale=2):
                    model_repo_input = gr.Textbox(
                        label="Hugging Face Model Repository",
                        placeholder="e.g., google/gemma-2b-it",
                        info="The model to be tested. Must be compatible with the text-generation pipeline."
                    )
                    model_nickname_input = gr.Textbox(
                        label="Model Nickname",
                        placeholder="e.g., Gemma-2B-v1",
                        info="A unique name to display on the leaderboard."
                    )
                    run_button = gr.Button("Start Evaluation", variant="primary")
                with gr.Column(scale=1):
                    final_score_output = gr.Markdown("**Overall Score: --**")

            gr.Markdown("---")
            gr.Markdown("### Detailed Run Results")
            results_output = gr.DataFrame(
                headers=["ID", "Question", "AI_Answer", "Score"],
                wrap=True,
                height=600
            )

        with gr.TabItem("Leaderboard"):
            leaderboard_refresh_button = gr.Button("Refresh Leaderboard")
            leaderboard_output = gr.DataFrame(
                headers=["Rank", "Model Nickname", "Score (%)", "Date", "Model Repo"],
                wrap=True,
                height=700
            )

    # --- Event Handlers ---
    run_button.click(
        fn=run_evaluation,
        inputs=[model_repo_input, model_nickname_input],
        outputs=[results_output, final_score_output]
    )
    leaderboard_refresh_button.click(
        fn=load_leaderboard,
        inputs=[],
        outputs=[leaderboard_output]
    )
    # Load leaderboard once on startup
    demo.load(load_leaderboard, None, leaderboard_output)

if __name__ == "__main__":
    demo.launch(debug=True)
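# Deployment note (assumptions about the Space environment, not part of the app logic):
# requirements.txt for this Space would need at least gradio, pandas, torch, transformers
# and accelerate; device_map="auto" in the model and pipeline loading above relies on the
# accelerate package being installed.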