import os
import re
import time
import csv
import tempfile
import requests
import pandas as pd
import gradio as gr

######################################
# Environment / Secrets
######################################
#OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
#if not OPENAI_API_KEY:
#    raise Exception("OPENAI_API_KEY not found in environment variables. Please add it as a secret in your Space.")

COHERE_API_KEY = os.environ.get("COHERE_API_KEY")
if not COHERE_API_KEY:
    raise Exception("COHERE_API_KEY not found in environment variables. Please add it as a secret in your Space.")

HF_API_TOKEN = os.environ.get("HF_TOKEN")
hf_headers = {}
if HF_API_TOKEN:
    hf_headers = {"Authorization": f"Bearer {HF_API_TOKEN}"}
######################################
# Load System Instructions
######################################
#with open("system_instructions.txt", "r", encoding="utf-8") as f:
#    system_instructions = f.read()

system_instructions = os.environ.get("ecosophy")

######################################
# Helper Functions
######################################
def call_judge(prompt: str, max_tokens=200, temperature=0.7) -> str:
    """
    Calls the judge model via the Cohere Chat API
    and returns the model's text output.
    """
    url = "https://api.cohere.ai/v1/chat"
    headers = {
        "Authorization": f"Bearer {COHERE_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "command-r-plus-08-2024",
        "message": prompt,  # The Chat API expects "message" rather than "prompt"
        "max_tokens": max_tokens,
        "temperature": temperature
    }
    response = requests.post(url, json=payload, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Cohere API error: {response.text}")
    result = response.json()
    return result["text"]  # The Chat API returns the reply under "text"
def call_judge_old(prompt: str, max_tokens=200, temperature=0.7) -> str:
    """
    Legacy (currently unused) variant that calls the judge via Cohere's Generate API
    and returns the model's text output.
    """
    url = "https://api.cohere.ai/v1/generate"  # Generate endpoint matches the "prompt"/"generations" format below
    headers = {
        "Authorization": f"Bearer {COHERE_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "command-r-plus",  # Adjust based on the desired Cohere model
        "prompt": prompt,
        "max_tokens": max_tokens,
        "temperature": temperature
    }
    response = requests.post(url, json=payload, headers=headers)
    if response.status_code != 200:
        raise Exception(f"Cohere API error: {response.text}")
    result = response.json()
    return result["generations"][0]["text"]
def call_hf(model: str, prompt: str, max_new_tokens=200, max_retries=10, delay=5) -> str:
    """
    Calls a Hugging Face Inference endpoint for text generation.
    Retries if the model is still loading.
    """
    api_url = f"https://api-inference.huggingface.co/models/{model}"
    payload = {
        "inputs": prompt,
        "parameters": {
            "do_sample": False,
            "max_new_tokens": max_new_tokens
        }
    }
    for attempt in range(max_retries):
        resp = requests.post(api_url, json=payload, headers=hf_headers)
        data = resp.json()
        if isinstance(data, dict) and data.get("error"):
            if "loading" in data["error"].lower():
                print(f"Attempt {attempt+1}/{max_retries}: Model is loading. Retrying in {delay} seconds...")
                time.sleep(delay)
            else:
                raise Exception(f"Error from model {model}: {data['error']}")
        else:
            # Data should be a list like [{"generated_text": "..."}]
            return data[0]["generated_text"]
    raise Exception(f"Model {model} is still loading after {max_retries} attempts.")
def generate_answer(question: str, evaluated_model: str) -> str:
    """
    Generates an answer for the question with the specified evaluated model,
    or returns a placeholder answer if no real model name was provided.
    """
    model = evaluated_model.strip().lower()
    # Treat an empty box or the UI's placeholder text as "no model selected".
    if not model or model == "please enter model to evaluate" or model.startswith("----"):
        return f"Placeholder answer for: {question}"
    return call_hf(evaluated_model, question)
def judge_answer(question: str, answer: str) -> int:
    """
    Sends question + answer to the judge with the system instructions
    to produce a numeric score (0 to 5).
    """
    prompt = (
        f"{system_instructions}\n\n"
        f"Question: {question}\n"
        f"Answer: {answer}\n\n"
        "Please provide a score from 0 to 5, where 5 is perfect and 0 is entirely incorrect. "
        "Provide only the numeric score in your response."
    )
    output = call_judge(prompt, max_tokens=200, temperature=0.7)
    match = re.search(r"\b([0-5])\b", output)
    if match:
        return int(match.group(1))
    return 0
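# The regex above tolerates extra wording from the judge: a reply such as
# "Score: 4" parses as 4, while a reply with no standalone 0-5 digit falls back to 0.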
######################################
# Main Evaluation
######################################
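# Expected input CSV layout (illustrative example row, not real data):
#
#   question,answer
#   "What does ecosophy mean?","A philosophy of ecological harmony."
#
# The 'answer' column is optional; if it is missing, answers are generated
# with generate_answer() using the evaluated model name.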
def evaluate_csv(csv_file, evaluated_model_name):
    """
    Reads a CSV with a 'question' column and, optionally, an 'answer' column.
    Scores each Q&A with the judge model (0..5).
    Returns (avg_score_percent, csv_temp_path).
    """
    df = pd.read_csv(csv_file)
    if "question" not in df.columns:
        raise ValueError("CSV must contain a 'question' column.")
    has_answer_col = ("answer" in df.columns)

    results = []
    for _, row in df.iterrows():
        q = str(row["question"])
        if has_answer_col:
            a = str(row["answer"])
        else:
            a = generate_answer(q, evaluated_model_name)
        score = judge_answer(q, a)
        results.append({"question": q, "answer": a, "score": score})

    if len(results) == 0:
        return 0.0, None

    total_score = sum(item["score"] for item in results)
    max_possible = len(results) * 5
    avg_score_percent = (total_score / max_possible) * 100

    # Build output CSV (comma-separated, fully quoted)
    out_df = pd.DataFrame(results)
    csv_str = out_df.to_csv(
        index=False,
        sep=',',
        quotechar='"',
        quoting=csv.QUOTE_ALL
    )
    with tempfile.NamedTemporaryFile(mode="w", delete=False, suffix=".csv", encoding="utf-8-sig") as tmp_file:
        tmp_file.write(csv_str)
        tmp_file_path = tmp_file.name

    return avg_score_percent, tmp_file_path
def run_evaluation(csv_file, evaluated_model_name):
    """
    Gradio callback:
      1) Evaluates the Q&A pairs from the CSV.
      2) Returns a score box with the percentage and a downloadable CSV.
    """
    avg_percentage, csv_path = evaluate_csv(csv_file, evaluated_model_name)
    # Build the same style of box as the single Q&A evaluation uses
    score_box = f"""
    <div style="width:200px; height:200px; border:2px solid #333;
                display:flex; align-items:center; justify-content:center; font-size:30px;">
        {avg_percentage:.2f}%
    </div>
    """
    return score_box, csv_path
######################################
# Gradio Interface
######################################
with gr.Blocks() as demo:
    ####################################
    # Top row: Logo (left), Title + instructions (right)
    ####################################
    with gr.Row():
        with gr.Column(scale=1, min_width=220):
            gr.Image("logo.png", show_label=False, interactive=False, width=220, height=220)
        with gr.Column(scale=5):
            gr.Markdown("## H4rmony Eval")
            gr.Markdown(
                "- The evaluation can be requested by CSV or by a single prompt/completion.\n"
                "- The CSV, if present, should have **both a 'question' and an 'answer'** column.\n\n"
                "The judge model scores each Q&A on a **0–5** scale, and you'll see the final percentage score."
            )
    ####################################
    # Middle row:
    #   1) Upload CSV
    #   2) Download Results
    #   3) Score (big box)
    ####################################
    with gr.Row(equal_height=True):
        # Square #1: Upload CSV
        with gr.Column(scale=1):
            gr.Markdown("#### Upload CSV")
            csv_in = gr.File(label="CSV File", type="filepath")

        # Square #2: Download Results
        with gr.Column(scale=1):
            gr.Markdown("#### Download Results")
            csv_out = gr.File(label="Scored CSV", interactive=False)

        # Square #3: Score
        with gr.Column(scale=1):
            gr.Markdown("#### Score")
            score_html = gr.HTML(
                value="""
                <div style="width:200px; height:200px; border:2px solid #333;
                            display:flex; align-items:center; justify-content:center; font-size:30px;">
                    --
                </div>
                """,
                label="Final Score"
            )
    ####################################
    # Single Q&A
    ####################################
    gr.Markdown(
        """
---
### Single Q&A Evaluation
Enter one question and one answer below, then click **Evaluate Single Q&A** to get a 0–5 score
in the same box on the right.
"""
    )
    with gr.Row():
        single_q = gr.Textbox(
            lines=3,
            label="Single Question / Prompt"
        )
        single_a = gr.Textbox(
            lines=3,
            label="Single Answer"
        )

    def on_single_evaluate(q, a):
        score = judge_answer(q, a)
        # Show the numeric score in the same style of box as the CSV evaluation
        box = f"""
        <div style="width:200px; height:200px; border:2px solid #333;
                    display:flex; align-items:center; justify-content:center; font-size:30px;">
            {score}
        </div>
        """
        return box
    ####################################
    # Bottom row: Model + 2 Buttons (CSV & Single)
    ####################################
    with gr.Row():
        with gr.Column():
            model_in = gr.Textbox(
                label="Evaluated Model (WIP)",
                value="---- Feature not yet available ---------"
            )
            # Two buttons side by side:
            with gr.Row():
                submit_btn = gr.Button("Submit CSV")
                single_btn = gr.Button("Evaluate Single Q&A")

    ####################################
    # Define both callbacks
    ####################################
    def on_submit(csv_path, model_name):
        box, out_path = run_evaluation(csv_path, model_name)
        return box, out_path

    # Link the two callbacks:
    # 1) CSV evaluation
    submit_btn.click(
        fn=on_submit,
        inputs=[csv_in, model_in],
        outputs=[score_html, csv_out]
    )
    # 2) Single Q&A evaluation
    single_btn.click(
        fn=on_single_evaluate,
        inputs=[single_q, single_a],
        outputs=score_html
    )

demo.launch()