Spaces:

SaylorTwift
/

OpenEvalsModelDetails

Runtime error

App Files Files Community

OpenEvalsModelDetails / app.py

Linker1907

init

c5bf87e 8 months ago

raw

history blame

9.56 kB

	import html
	import json
	from collections import defaultdict

	import gradio as gr
	from datasets import load_dataset

	# Load models and experiments.
	# experiments.json is parsed into a mapping whose keys are model identifiers.
	# The encoding is given explicitly so decoding does not depend on the
	# platform's locale default.
	with open("experiments.json", encoding="utf-8") as f:
	    experiments = json.load(f)

	# Keep only the models whose identifier starts with "google/gemma-3".
	MODELS = [m for m in experiments if m.startswith("google/gemma-3")]

	def load_details_and_results(model, benchmark, experiment_tag):
	    """Load per-example details and aggregate results for one evaluation run.

	    Args:
	        model: Model identifier used as a key into ``experiments``; "/" is
	            replaced by "__" to build the dataset repository name.
	        benchmark: Benchmark key under ``experiments[model]["benchmarks"]``.
	        experiment_tag: Key into that benchmark's ``"tags"`` mapping; the
	            mapped value (with "-" replaced by "_") is used as the split.

	    Returns:
	        A ``(details, results)`` tuple. ``details`` is the dataset reduced to
	        the ``full_prompt``, ``gold``, ``metrics`` and ``predictions``
	        columns with ``gold`` unwrapped to its first element. ``results`` is
	        the evaluated ``"results"`` field of the first row of the "results"
	        configuration.

	    Raises:
	        KeyError: If ``model``, ``benchmark`` or ``experiment_tag`` is not
	            present in ``experiments``.
	    """

	    def worker(example):
	        # Keep only the first gold reference; an empty gold value becomes ""
	        # instead of raising IndexError.
	        gold = example["gold"]
	        example["gold"] = gold[0] if gold else ""
	        return example

	    benchmark_config = experiments[model]["benchmarks"][benchmark]
	    repo = f"OpenEvals/details_{model.replace('/', '__')}_private"
	    # Normalise the subset name by turning "|" and ":" into "_". The pipe is
	    # matched as a plain "|": the previous "\|" literal is an invalid escape
	    # sequence that only matches a backslash followed by a pipe.
	    subset = benchmark_config["subset"].replace("|", "_").replace(":", "_")
	    split = benchmark_config["tags"][experiment_tag].replace("-", "_")

	    details = load_dataset(repo, subset, split=split)
	    results = load_dataset(repo, "results", split=split)

	    # SECURITY(review): eval() executes whatever string is stored in the
	    # dataset's "results" field. Kept because the serialisation format is
	    # not visible here; replace with json.loads / ast.literal_eval once the
	    # format is confirmed.
	    results = eval(results[0]["results"])

	    columns_to_keep = ["full_prompt", "gold", "metrics", "predictions"]
	    details = details.select_columns(columns_to_keep).map(worker)

	    return details, results

	# Load all experiment details.
	# Maps model identifier -> benchmark subset name -> details dataset.
	experiment_details = defaultdict(dict)

	for model in MODELS:
	    for benchmark, benchmark_details in experiments[model]["benchmarks"].items():
	        subset = benchmark_details["subset"]
	        tags = list(benchmark_details["tags"])
	        if not tags:
	            continue
	        # Each (model, subset) slot holds a single dataset, so loading every
	        # tag in turn only overwrote the previous load. Fetch just the last
	        # tag, which is the one that used to survive, and skip the
	        # discarded loads.
	        experiment_tag = tags[-1]
	        details, _ = load_details_and_results(model, benchmark, experiment_tag)
	        experiment_details[model][subset] = details

	def _escape_html(value):
	    """Return ``value`` converted to text with ``&``, ``<`` and ``>`` escaped."""
	    return html.escape(str(value), quote=False)


	def _pre_code(escaped_text, margin):
	    """Wrap already-escaped text in the ``<pre><code>`` markup shared by all sections."""
	    return (
	        "<pre style='white-space: pre-wrap; word-wrap: break-word; "
	        f"margin: {margin};'><code>{escaped_text}</code></pre>\n"
	    )


	def _render_gold(gold):
	    """Build the highlighted "Ground Truth" box shown above the model cards."""
	    return "".join([
	        "<div style='background: #e6f3e6; padding: 20px; border-radius: 10px; margin-bottom: 20px;'>\n",
	        "<h3 style='margin-top: 0;'>Ground Truth</h3>\n",
	        "<div style='overflow-x: auto; max-width: 100%;'>\n",
	        _pre_code(_escape_html(gold), "0"),
	        "</div>\n",
	        "</div>\n",
	    ])


	def _render_metrics(metrics):
	    """Build the collapsible "Metrics" table for one model."""
	    if isinstance(metrics, str):
	        # SECURITY(review): eval() executes whatever string the dataset
	        # stores in "metrics". Kept because the serialisation format is not
	        # visible here; prefer json.loads / ast.literal_eval once confirmed.
	        metrics = eval(metrics)
	    if metrics is None:
	        metrics = {}

	    parts = [
	        "<details open style='margin-bottom: 15px;'>\n",
	        "<summary><h3 style='display: inline; margin: 0;'>Metrics</h3></summary>\n",
	        "<div style='overflow-x: auto;'>\n",
	        "<table style='width: 100%; margin: 10px 0; border-collapse: collapse;'>\n",
	    ]
	    for key, value in metrics.items():
	        if isinstance(value, float):
	            value = f"{value:.3f}"
	        parts.append(
	            "<tr><td style='padding: 5px; border-bottom: 1px solid #ddd;'>"
	            f"<strong>{_escape_html(key)}</strong></td>"
	            "<td style='padding: 5px; border-bottom: 1px solid #ddd;'>"
	            f"{_escape_html(value)}</td></tr>\n"
	        )
	    parts.extend(["</table>\n", "</div>\n", "</details>\n\n"])
	    return "".join(parts)


	def _render_prompt(prompt):
	    """Build the collapsible "Prompt" section from a message list, a dict or plain text."""
	    parts = [
	        "<details style='margin-bottom: 15px;'>\n",
	        "<summary><h3 style='display: inline; margin: 0;'>Prompt</h3></summary>\n",
	        "<div style='background: #ffffff; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n",
	    ]

	    if isinstance(prompt, list):
	        for msg in prompt:
	            parts.append("<div style='margin-bottom: 10px;'>\n")
	            if isinstance(msg, dict) and 'content' in msg:
	                role = str(msg.get('role', 'message')).title()
	                parts.append(f"<strong>{_escape_html(role)}:</strong>\n")
	                body = _escape_html(msg['content'])
	            else:
	                # Entries without a "content" key are shown as JSON.
	                body = _escape_html(json.dumps(msg, indent=2))
	            parts.append("<div style='overflow-x: auto;'>\n")
	            parts.append(_pre_code(body, "5px 0"))
	            parts.append("</div>\n")
	            parts.append("</div>\n")
	    else:
	        if isinstance(prompt, dict) and 'content' in prompt:
	            body = _escape_html(prompt['content'])
	        else:
	            body = _escape_html(prompt)
	        parts.append("<div style='overflow-x: auto;'>\n")
	        parts.append(_pre_code(body, "5px 0"))
	        parts.append("</div>\n")

	    parts.extend(["</div>\n", "</details>\n\n"])
	    return "".join(parts)


	def _render_prediction(prediction):
	    """Build the collapsible "Prediction" section, including a word count."""
	    text = str(prediction)
	    word_count = len(text.split())
	    return "".join([
	        "<details open style='margin-bottom: 15px;'>\n",
	        "<summary><h3 style='display: inline; margin: 0;'>Prediction</h3>",
	        f"<span style='color: #666; font-size: 0.8em; margin-left: 10px;'>({word_count} words)</span>",
	        "</summary>\n",
	        "<div style='background: #ffffff; padding: 15px; border-radius: 5px; margin-top: 10px;'>\n",
	        "<div style='overflow-x: auto;'>\n",
	        _pre_code(_escape_html(text), "0"),
	        "</div>\n",
	        "</div>\n",
	        "</details>\n",
	    ])


	def display_model_comparison(selected_models, benchmark, example_index):
	    """Render an HTML comparison of several models on one benchmark example.

	    Args:
	        selected_models: Model identifiers to compare; models without an
	            entry for ``benchmark`` in ``experiment_details`` are skipped.
	        benchmark: Key selecting the details dataset within each model's
	            entry of ``experiment_details``.
	        example_index: Position of the example to show. Any value ``int()``
	            can convert is accepted (for instance a float coming from a
	            numeric input) and is truncated to an integer.

	    Returns:
	        An HTML string with the gold answer of the first available model
	        followed by one card per model (metrics, prompt, prediction), or a
	        plain message when no model is selected, no model has the benchmark,
	        or the example cannot be found.
	    """
	    if not selected_models:
	        return "Please select at least one model to compare."

	    # Filter out models that don't have the selected benchmark
	    available_models = [model for model in selected_models if benchmark in experiment_details[model]]
	    if not available_models:
	        return f"No models have results for benchmark: {benchmark}"

	    # The index may arrive as a float or None; only an int is a valid index.
	    try:
	        index = int(example_index)
	    except (TypeError, ValueError):
	        return "No results found for the selected combination."

	    outputs = []
	    for model in available_models:
	        # Models whose dataset lacks this index (or an expected column) are
	        # skipped rather than failing the whole comparison.
	        try:
	            example = experiment_details[model][benchmark][index]
	            predictions = example['predictions']
	            outputs.append({
	                'Model': model.split('/')[-1],
	                'Prediction': predictions[0] if predictions else '',
	                'Prompt': example['full_prompt'],
	                'Metrics': example['metrics'],
	                'Gold': example['gold'],
	            })
	        except (KeyError, IndexError):
	            continue

	    if not outputs:
	        return "No results found for the selected combination."

	    # Collect fragments and join once instead of repeated string +=.
	    parts = ["<div style='max-width: 800px; margin: 0 auto;'>\n\n"]

	    # Show gold answer at the top with distinct styling
	    parts.append(_render_gold(outputs[0]['Gold']))

	    for output in outputs:
	        parts.append("<div style='background: #f5f5f5; padding: 20px; margin-bottom: 20px; border-radius: 10px;'>\n")
	        parts.append(f"<h2 style='margin-top: 0;'>{_escape_html(output['Model'])}</h2>\n")
	        parts.append(_render_metrics(output['Metrics']))
	        parts.append(_render_prompt(output['Prompt']))
	        parts.append(_render_prediction(output['Prediction']))
	        parts.append("</div>\n\n")

	    parts.append("</div>")
	    return "".join(parts)

	# De-duplicated benchmark names gathered across every model's loaded details
	available_benchmarks = list({
	    benchmark
	    for model in MODELS
	    for benchmark in experiment_details[model]
	})

	# Helper for narrowing the model dropdown to a given benchmark
	def update_model_choices(benchmark):
	    """Return a ``gr.Dropdown`` listing the models that have ``benchmark``.

	    The matching models, taken from ``MODELS`` and sorted, are used both as
	    the dropdown's choices and as its selected value.
	    """
	    matching = sorted(
	        candidate
	        for candidate in MODELS
	        if benchmark in experiment_details[candidate]
	    )
	    return gr.Dropdown(choices=matching, value=list(matching))

	# Create the Gradio interface: three inputs (models, benchmark, example
	# index) feed display_model_comparison, whose HTML string is rendered as-is.
	demo = gr.Interface(
	    fn=display_model_comparison,
	    inputs=[
	        gr.Dropdown(
	            choices=sorted(MODELS),
	            label="Models",
	            multiselect=True,
	            value=MODELS,
	            info="Select models to compare",
	        ),
	        gr.Dropdown(
	            choices=sorted(available_benchmarks),
	            label="Benchmark",
	            # Default to the first benchmark alphabetically, if any exist.
	            value=sorted(available_benchmarks)[0] if available_benchmarks else None,
	            info="Choose the evaluation benchmark",
	        ),
	        gr.Number(
	            label="Example Index",
	            value=0,
	            step=1,
	            # The value is used as a dataset index, so have Gradio round it
	            # to the nearest integer and hand over an int, not a float.
	            precision=0,
	            info="Navigate through different examples",
	        ),
	    ],
	    outputs=gr.HTML(),
	    title="Model Generation Comparison",
	    description="Compare model outputs across different benchmarks and prompts",
	    theme=gr.themes.Soft(),
	    css="button { margin: 0 10px; padding: 5px 15px; }",
	)

	if __name__ == "__main__":
	    demo.launch()