| """Main script to run AI model evaluation benchmarks""" | |
| import argparse | |
| import asyncio | |
| import json | |
| import os | |
| import yaml | |
| from datetime import datetime | |
| from typing import List, Dict, Any | |
| from dotenv import load_dotenv | |
| import pandas as pd | |
| from apis.api_factory import APIFactory | |
| from benchmarks import get_benchmark, BenchmarkResult | |
| # Load environment variables | |
| load_dotenv() | |


def load_config(config_path: str = 'official_config.yaml') -> dict:
    """Load configuration from a YAML file, expanding ${ENV_VAR} placeholders."""
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)

    # Recursively replace string values of the form ${ENV_VAR} with the value of
    # that environment variable; if the variable is unset, the placeholder is kept.
    def replace_env_vars(obj):
        if isinstance(obj, str) and obj.startswith('${') and obj.endswith('}'):
            env_var = obj[2:-1]
            return os.getenv(env_var, obj)
        elif isinstance(obj, dict):
            return {k: replace_env_vars(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [replace_env_vars(item) for item in obj]
        return obj

    return replace_env_vars(config)
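
# Illustrative config.yaml layout for this script. Only the sections read by the
# code (models.<provider>.models, benchmarks.<name>.enabled / sample_size, and
# evaluation) come from the script itself; the provider names, the api_key field,
# and the concrete values below are assumptions shown for illustration.
#
#   models:
#     openai:
#       api_key: ${OPENAI_API_KEY}   # expanded from the environment by load_config
#       models:
#         - gpt-4o
#     anthropic:
#       api_key: ${ANTHROPIC_API_KEY}
#       models:
#         - claude-3-opus
#   benchmarks:
#     mmlu:
#       enabled: true
#       sample_size: 100
#     gsm8k:
#       enabled: true
#   evaluation:
#     concurrent_requests: 5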


def save_results(results: List[BenchmarkResult], output_dir: str):
    """Save results as a JSON summary, a CSV summary, and raw per-run dumps."""
    os.makedirs(output_dir, exist_ok=True)

    # Timestamp shared by all files written for this run
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Save detailed results as JSON
    detailed_results = []
    for result in results:
        detailed_results.append({
            'benchmark': result.benchmark_name,
            'model': result.model_name,
            'total_questions': result.total_questions,
            'correct': result.correct,
            'accuracy': result.accuracy,
            'avg_response_time': result.avg_response_time,
            'timestamp': timestamp
        })

    json_path = os.path.join(output_dir, f'results_{timestamp}.json')
    with open(json_path, 'w') as f:
        json.dump(detailed_results, f, indent=2)

    # Save summary as CSV
    df = pd.DataFrame(detailed_results)
    csv_path = os.path.join(output_dir, f'summary_{timestamp}.csv')
    df.to_csv(csv_path, index=False)

    # Save raw results for debugging
    for result in results:
        raw_path = os.path.join(
            output_dir,
            f'{result.model_name}_{result.benchmark_name}_{timestamp}_raw.json'
        )
        with open(raw_path, 'w') as f:
            json.dump(result.raw_results, f, indent=2)

    return json_path, csv_path
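
# For a run started at (say) 2024-01-01 12:00:00 with the default --output-dir,
# save_results writes files like the following (timestamp illustrative, file name
# patterns taken from the format strings above):
#   results/results_20240101_120000.json            # summary record per model/benchmark pair
#   results/summary_20240101_120000.csv             # the same records as a CSV table
#   results/gpt-4o_mmlu_20240101_120000_raw.json    # raw_results dump per model/benchmark pair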


def print_results_table(results: List[BenchmarkResult]):
    """Print a summary table with models as rows and benchmarks as columns."""
    if not results:
        return

    # Group results by model
    model_results = {}
    for result in results:
        if result.model_name not in model_results:
            model_results[result.model_name] = {}
        model_results[result.model_name][result.benchmark_name] = result

    # Sorted benchmark columns
    benchmarks = sorted(set(r.benchmark_name for r in results))

    print("\n" + "=" * 80)
    print("EVALUATION RESULTS")
    print("=" * 80)

    # Header row
    print(f"{'Model':<20}", end="")
    for bench in benchmarks:
        print(f"{bench:<15}", end="")
    print(f"{'Average':<10}")
    print("-" * 80)

    # One row per model
    for model, bench_results in model_results.items():
        print(f"{model:<20}", end="")
        scores = []
        for bench in benchmarks:
            if bench in bench_results:
                score = bench_results[bench].accuracy * 100
                scores.append(score)
                cell = f"{score:.1f}%"
                print(f"{cell:<15}", end="")
            else:
                print(f"{'N/A':<15}", end="")

        # Average over the benchmarks this model actually completed
        if scores:
            avg = sum(scores) / len(scores)
            avg_cell = f"{avg:.1f}%"
            print(f"{avg_cell:<10}")
        else:
            print("N/A")

    print("=" * 80)


async def run_single_evaluation(api, benchmark_name: str, config: dict) -> BenchmarkResult:
    """Run a single benchmark evaluation"""
    benchmark = get_benchmark(benchmark_name)

    # Get benchmark-specific config
    bench_config = config['benchmarks'].get(benchmark_name, {})
    eval_config = config['evaluation']

    # Start from the global evaluation settings, defaulting concurrency to 5
    kwargs = {
        **eval_config,
        'concurrent_requests': eval_config.get('concurrent_requests', 5)
    }

    # Layer benchmark-specific settings on top, except sample_size,
    # which is passed explicitly below
    for key, value in bench_config.items():
        if key != 'sample_size':
            kwargs[key] = value

    # Run benchmark
    result = await benchmark.run_benchmark(
        api,
        sample_size=bench_config.get('sample_size'),
        **kwargs
    )
    return result
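
# Sketch of the keyword merge above, with hypothetical config keys for illustration
# (max_tokens and subjects are assumptions, not keys this script requires): given
#   evaluation: {max_tokens: 512}
#   benchmarks.mmlu: {sample_size: 100, subjects: [...]}
# the resulting call is roughly
#   benchmark.run_benchmark(api, sample_size=100, max_tokens=512,
#                           concurrent_requests=5, subjects=[...])
# Benchmark-level keys override evaluation-level ones, sample_size is passed
# explicitly, and concurrent_requests falls back to 5 when the config omits it.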


async def main():
    parser = argparse.ArgumentParser(description='Run AI benchmark evaluation')
    parser.add_argument('--models', nargs='+', help='Models to evaluate (e.g., gpt-4o claude-3-opus)')
    parser.add_argument('--benchmarks', nargs='+', help='Benchmarks to run (e.g., mmlu gsm8k)')
    parser.add_argument('--config', default='config.yaml', help='Config file path')
    parser.add_argument('--output-dir', default='results', help='Output directory for results')
    parser.add_argument('--no-save', action='store_true', help='Do not save results to files')
    args = parser.parse_args()

    # Load configuration
    config = load_config(args.config)

    # Determine which models to evaluate
    if args.models:
        models_to_eval = args.models
    else:
        # Default to every model listed under each provider in the config
        models_to_eval = []
        for provider_config in config['models'].values():
            for model in provider_config.get('models', []):
                models_to_eval.append(model)

    # Determine which benchmarks to run
    if args.benchmarks:
        benchmarks_to_run = args.benchmarks
    else:
        # Default to the benchmarks marked as enabled in the config
        benchmarks_to_run = [
            name for name, bench_config in config['benchmarks'].items()
            if bench_config.get('enabled', True)
        ]

    print(f"Models to evaluate: {models_to_eval}")
    print(f"Benchmarks to run: {benchmarks_to_run}")

    # Run evaluations
    all_results = []
    for model_name in models_to_eval:
        print(f"\n{'=' * 60}")
        print(f"Evaluating model: {model_name}")
        print(f"{'=' * 60}")

        try:
            # Create API instance
            api = APIFactory.create_api(model_name, config)

            # Run each benchmark
            for benchmark_name in benchmarks_to_run:
                print(f"\nRunning {benchmark_name} benchmark...")
                try:
                    result = await run_single_evaluation(api, benchmark_name, config)
                    all_results.append(result)
                    print(f"[OK] {benchmark_name}: {result.accuracy * 100:.1f}% accuracy")
                except Exception as e:
                    print(f"[ERROR] {benchmark_name}: {e}")
        except Exception as e:
            print(f"Failed to create API for {model_name}: {e}")
            continue

    # Print results table
    print_results_table(all_results)

    # Save results
    if not args.no_save and all_results:
        json_path, csv_path = save_results(all_results, args.output_dir)
        print("\nResults saved to:")
        print(f"  - {json_path}")
        print(f"  - {csv_path}")


if __name__ == "__main__":
    asyncio.run(main())
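
# Example invocations (the script's file name is assumed here; adjust to the actual name):
#   python run_benchmarks.py --models gpt-4o claude-3-opus --benchmarks mmlu gsm8k
#   python run_benchmarks.py --config config.yaml --output-dir results
# With no --models/--benchmarks flags, every model listed in the config and every
# benchmark marked as enabled is run; pass --no-save to skip writing result files.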