import os
import re
import time
import csv
import tempfile
import requests
import pandas as pd
import gradio as gr

######################################
# Environment / Secrets
######################################
#OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
#if not OPENAI_API_KEY:
#    raise Exception("OPENAI_API_KEY not found in environment variables. Please add it as a secret in your Space.")

COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
if not COHERE_API_KEY:
    raise Exception("COHERE_API_KEY not found in environment variables. Please add it as a secret in your Space.")

HF_API_TOKEN = os.environ.get("HF_TOKEN")
hf_headers = {}
if HF_API_TOKEN:
    hf_headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
######################################
# Load System Instructions
######################################
#with open("system_instructions.txt", "r", encoding="utf-8") as f:
#    system_instructions = f.read()

system_instructions = os.environ.get("ecosophy")

######################################
# Helper Functions
######################################
def call_judge(prompt: str, max_tokens=200, temperature=0.7) -> str:
    """
    Calls the judge model via the Cohere Chat API
    and returns the model's text output.
    """
    url = "https://api.cohere.ai/v1/chat"
    headers = {
        "Authorization": f"Bearer {COHERE_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "command-r-plus-08-2024",
        "message": prompt,  # The Chat API expects "message" rather than "prompt"
        "max_tokens": max_tokens,
        "temperature": temperature
    }
    response = requests.post(url, json=payload, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Cohere API error: {response.text}")
    result = response.json()
    return result["text"]  # The Chat API returns the reply under "text"
def call_judge_old(prompt: str, max_tokens=200, temperature=0.7) -> str:
    """
    Legacy (currently unused) variant that calls the judge via Cohere's Generate API
    and returns the model's text output.
    """
    url = "https://api.cohere.ai/v1/generate"  # Generate endpoint matches the "prompt"/"generations" format below
    headers = {
        "Authorization": f"Bearer {COHERE_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "command-r-plus",  # Adjust based on the desired Cohere model
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature
    }
    response = requests.post(url, json=payload, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Cohere API error: {response.text}")
    result = response.json()
    return result["generations"][0]["text"]
def call_hf(model: str, prompt: str, max_new_tokens=200, max_retries=10, delay=5) -> str:
    """
    Calls a Hugging Face Inference endpoint for text generation.
    Retries if the model is still loading.
    """
    api_url = f"https://api-inference.huggingface.co/models/{model}"
    payload = {
        "inputs": prompt,
        "parameters": {
            "do_sample": False,
            "max_new_tokens": max_new_tokens
        }
    }
    for attempt in range(max_retries):
        resp = requests.post(api_url, json=payload, headers=hf_headers)
        data = resp.json()
        if isinstance(data, dict) and data.get("error"):
            if "loading" in data["error"].lower():
                print(f"Attempt {attempt+1}/{max_retries}: Model is loading. Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                raise Exception(f"Error from model {model}: {data['error']}")
        else:
            # Data should be a list like [{"generated_text": "..."}]
            return data[0]["generated_text"]
    raise Exception(f"Model {model} is still loading after {max_retries} attempts.")
def generate_answer(question: str, evaluated_model: str) -> str:
    """
    Generates an answer for the question with the specified evaluated model,
    or returns a placeholder answer if no real model name was provided.
    """
    model = evaluated_model.strip().lower()
    # Treat an empty box or the UI's placeholder text as "no model selected".
    if not model or model == "please enter model to evaluate" or model.startswith("----"):
        return f"Placeholder answer for: {question}"
    return call_hf(evaluated_model, question)
def judge_answer(question: str, answer: str) -> int:
    """
    Sends question + answer to the judge with the system instructions
    to produce a numeric score (0 to 5).
    """
    prompt = (
        f"{system_instructions}\n\n"
        f"Question: {question}\n"
        f"Answer: {answer}\n\n"
        "Please provide a score from 0 to 5, where 5 is perfect and 0 is entirely incorrect. "
        "Provide only the numeric score in your response."
    )
    output = call_judge(prompt, max_tokens=200, temperature=0.7)
    match = re.search(r"\b([0-5])\b", output)
    if match:
        return int(match.group(1))
    return 0
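# The regex above tolerates extra wording from the judge: a reply such as
# "Score: 4" parses as 4, while a reply with no standalone 0-5 digit falls back to 0.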
######################################
# Main Evaluation
######################################
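# Expected input CSV layout (illustrative example row, not real data):
#
#   question,answer
#   "What does ecosophy mean?","A philosophy of ecological harmony."
#
# The 'answer' column is optional; if it is missing, answers are generated
# with generate_answer() using the evaluated model name.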
def evaluate_csv(csv_file, evaluated_model_name):
    """
    Reads a CSV with a 'question' column and, optionally, an 'answer' column.
    Scores each Q&A with the judge model (0..5).
    Returns (avg_score_percent, csv_temp_path).
    """
    df = pd.read_csv(csv_file)
    if "question" not in df.columns:
        raise ValueError("CSV must contain a 'question' column.")
    has_answer_col = ("answer" in df.columns)

    results = []
    for _, row in df.iterrows():
        q = str(row["question"])
        if has_answer_col:
            a = str(row["answer"])
        else:
            a = generate_answer(q, evaluated_model_name)
        score = judge_answer(q, a)
        results.append({"question": q, "answer": a, "score": score})

    if len(results) == 0:
        return 0.0, None

    total_score = sum(item["score"] for item in results)
    max_possible = len(results) * 5
    avg_score_percent = (total_score / max_possible) * 100

    # Build output CSV (comma-separated, fully quoted)
    out_df = pd.DataFrame(results)
    csv_str = out_df.to_csv(
        index=False,
        sep=',',
        quotechar='"',
        quoting=csv.QUOTE_ALL
    )
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".csv", encoding="utf-8-sig") as tmp_file:
        tmp_file.write(csv_str)
        tmp_file_path = tmp_file.name

    return avg_score_percent, tmp_file_path
def run_evaluation(csv_file, evaluated_model_name):
    """
    Gradio callback:
      1) Evaluates the Q&A pairs from the CSV.
      2) Returns a score box with the percentage and a downloadable CSV.
    """
    avg_percentage, csv_path = evaluate_csv(csv_file, evaluated_model_name)
    # Build the same style of box as the single Q&A evaluation uses
    score_box = f"""
    <div style="width:200px; height:200px; border:2px solid #333;
                display:flex; align-items:center; justify-content:center; font-size:30px;">
        {avg_percentage:.2f}%
    </div>
    """
    return score_box, csv_path
######################################
# Gradio Interface
######################################
with gr.Blocks() as demo:
    ####################################
    # Top row: Logo (left), Title + instructions (right)
    ####################################
    with gr.Row():
        with gr.Column(scale=1, min_width=220):
            gr.Image("logo.png", show_label=False, interactive=False, width=220, height=220)
        with gr.Column(scale=5):
            gr.Markdown("## H4rmony Eval")
            gr.Markdown(
                "- The evaluation can be requested by CSV or by a single prompt/completion.\n"
                "- The CSV, if present, should have **both a 'question' and an 'answer'** column.\n\n"
                "The judge model scores each Q&A on a **0–5** scale, and you'll see the final percentage score."
            )
    ####################################
    # Middle row:
    #   1) Upload CSV
    #   2) Download Results
    #   3) Score (big box)
    ####################################
    with gr.Row(equal_height=True):
        # Square #1: Upload CSV
        with gr.Column(scale=1):
            gr.Markdown("#### Upload CSV")
            csv_in = gr.File(label="CSV File", type="filepath")

        # Square #2: Download Results
        with gr.Column(scale=1):
            gr.Markdown("#### Download Results")
            csv_out = gr.File(label="Scored CSV", interactive=False)

        # Square #3: Score
        with gr.Column(scale=1):
            gr.Markdown("#### Score")
            score_html = gr.HTML(
                value="""
                <div style="width:200px; height:200px; border:2px solid #333;
                            display:flex; align-items:center; justify-content:center; font-size:30px;">
                    --
                </div>
                """,
                label="Final Score"
            )
    ####################################
    # Single Q&A
    ####################################
    gr.Markdown(
        """
---
### Single Q&A Evaluation
Enter one question and one answer below, then click **Evaluate Single Q&A** to get a 0–5 score
in the same box on the right.
"""
    )
    with gr.Row():
        single_q = gr.Textbox(
            lines=3,
            label="Single Question / Prompt"
        )
        single_a = gr.Textbox(
            lines=3,
            label="Single Answer"
        )

    def on_single_evaluate(q, a):
        score = judge_answer(q, a)
        # Show the numeric score in the same style of box as the CSV evaluation
        box = f"""
        <div style="width:200px; height:200px; border:2px solid #333;
                    display:flex; align-items:center; justify-content:center; font-size:30px;">
            {score}
        </div>
        """
        return box
    ####################################
    # Bottom row: Model + 2 Buttons (CSV & Single)
    ####################################
    with gr.Row():
        with gr.Column():
            model_in = gr.Textbox(
                label="Evaluated Model (WIP)",
                value="---- Feature not yet available ---------"
            )
            # Two buttons side by side:
            with gr.Row():
                submit_btn = gr.Button("Submit CSV")
                single_btn = gr.Button("Evaluate Single Q&A")

    ####################################
    # Define both callbacks
    ####################################
    def on_submit(csv_path, model_name):
        box, out_path = run_evaluation(csv_path, model_name)
        return box, out_path

    # Link the two callbacks:
    # 1) CSV evaluation
    submit_btn.click(
        fn=on_submit,
        inputs=[csv_in, model_in],
        outputs=[score_html, csv_out]
    )
    # 2) Single Q&A evaluation
    single_btn.click(
        fn=on_single_evaluate,
        inputs=[single_q, single_a],
        outputs=score_html
    )

demo.launch()