| """Main script to run AI model evaluation benchmarks""" | |
| import argparse | |
| import asyncio | |
| import json | |
| import os | |
| import yaml | |
| from datetime import datetime | |
| from typing import List, Dict, Any | |
| from dotenv import load_dotenv | |
| import pandas as pd | |
| from apis.api_factory import APIFactory | |
| from benchmarks import get_benchmark, BenchmarkResult | |
| # Load environment variables | |
| load_dotenv() | |


def load_config(config_path: str = 'official_config.yaml') -> dict:
    """Load configuration from a YAML file, expanding ${ENV_VAR} placeholders."""
    with open(config_path, 'r') as f:
        config = yaml.safe_load(f)

    # Recursively replace string values of the form ${ENV_VAR} with the value of
    # that environment variable; if the variable is unset, the placeholder is kept.
    def replace_env_vars(obj):
        if isinstance(obj, str) and obj.startswith('${') and obj.endswith('}'):
            env_var = obj[2:-1]
            return os.getenv(env_var, obj)
        elif isinstance(obj, dict):
            return {k: replace_env_vars(v) for k, v in obj.items()}
        elif isinstance(obj, list):
            return [replace_env_vars(item) for item in obj]
        return obj

    return replace_env_vars(config)
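
# Illustrative config.yaml layout for this script. Only the sections read by the
# code (models.<provider>.models, benchmarks.<name>.enabled / sample_size, and
# evaluation) come from the script itself; the provider names, the api_key field,
# and the concrete values below are assumptions shown for illustration.
#
#   models:
#     openai:
#       api_key: ${OPENAI_API_KEY}   # expanded from the environment by load_config
#       models:
#         - gpt-4o
#     anthropic:
#       api_key: ${ANTHROPIC_API_KEY}
#       models:
#         - claude-3-opus
#   benchmarks:
#     mmlu:
#       enabled: true
#       sample_size: 100
#     gsm8k:
#       enabled: true
#   evaluation:
#     concurrent_requests: 5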


def save_results(results: List[BenchmarkResult], output_dir: str):
    """Save results as a JSON summary, a CSV summary, and raw per-run dumps."""
    os.makedirs(output_dir, exist_ok=True)

    # Timestamp shared by all files written for this run
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')

    # Save detailed results as JSON
    detailed_results = []
    for result in results:
        detailed_results.append({
            'benchmark': result.benchmark_name,
            'model': result.model_name,
            'total_questions': result.total_questions,
            'correct': result.correct,
            'accuracy': result.accuracy,
            'avg_response_time': result.avg_response_time,
            'timestamp': timestamp
        })

    json_path = os.path.join(output_dir, f'results_{timestamp}.json')
    with open(json_path, 'w') as f:
        json.dump(detailed_results, f, indent=2)

    # Save summary as CSV
    df = pd.DataFrame(detailed_results)
    csv_path = os.path.join(output_dir, f'summary_{timestamp}.csv')
    df.to_csv(csv_path, index=False)

    # Save raw results for debugging
    for result in results:
        raw_path = os.path.join(
            output_dir,
            f'{result.model_name}_{result.benchmark_name}_{timestamp}_raw.json'
        )
        with open(raw_path, 'w') as f:
            json.dump(result.raw_results, f, indent=2)

    return json_path, csv_path
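
# For a run started at (say) 2024-01-01 12:00:00 with the default --output-dir,
# save_results writes files like the following (timestamp illustrative, file name
# patterns taken from the format strings above):
#   results/results_20240101_120000.json            # summary record per model/benchmark pair
#   results/summary_20240101_120000.csv             # the same records as a CSV table
#   results/gpt-4o_mmlu_20240101_120000_raw.json    # raw_results dump per model/benchmark pair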


def print_results_table(results: List[BenchmarkResult]):
    """Print a summary table with models as rows and benchmarks as columns."""
    if not results:
        return

    # Group results by model
    model_results = {}
    for result in results:
        if result.model_name not in model_results:
            model_results[result.model_name] = {}
        model_results[result.model_name][result.benchmark_name] = result

    # Sorted benchmark columns
    benchmarks = sorted(set(r.benchmark_name for r in results))

    print("\n" + "=" * 80)
    print("EVALUATION RESULTS")
    print("=" * 80)

    # Header row
    print(f"{'Model':<20}", end="")
    for bench in benchmarks:
        print(f"{bench:<15}", end="")
    print(f"{'Average':<10}")
    print("-" * 80)

    # One row per model
    for model, bench_results in model_results.items():
        print(f"{model:<20}", end="")
        scores = []
        for bench in benchmarks:
            if bench in bench_results:
                score = bench_results[bench].accuracy * 100
                scores.append(score)
                cell = f"{score:.1f}%"
                print(f"{cell:<15}", end="")
            else:
                print(f"{'N/A':<15}", end="")

        # Average over the benchmarks this model actually completed
        if scores:
            avg = sum(scores) / len(scores)
            avg_cell = f"{avg:.1f}%"
            print(f"{avg_cell:<10}")
        else:
            print("N/A")

    print("=" * 80)


async def run_single_evaluation(api, benchmark_name: str, config: dict) -> BenchmarkResult:
    """Run a single benchmark evaluation"""
    benchmark = get_benchmark(benchmark_name)

    # Get benchmark-specific config
    bench_config = config['benchmarks'].get(benchmark_name, {})
    eval_config = config['evaluation']

    # Start from the global evaluation settings, defaulting concurrency to 5
    kwargs = {
        **eval_config,
        'concurrent_requests': eval_config.get('concurrent_requests', 5)
    }

    # Layer benchmark-specific settings on top, except sample_size,
    # which is passed explicitly below
    for key, value in bench_config.items():
        if key != 'sample_size':
            kwargs[key] = value

    # Run benchmark
    result = await benchmark.run_benchmark(
        api,
        sample_size=bench_config.get('sample_size'),
        **kwargs
    )
    return result
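
# Sketch of the keyword merge above, with hypothetical config keys for illustration
# (max_tokens and subjects are assumptions, not keys this script requires): given
#   evaluation: {max_tokens: 512}
#   benchmarks.mmlu: {sample_size: 100, subjects: [...]}
# the resulting call is roughly
#   benchmark.run_benchmark(api, sample_size=100, max_tokens=512,
#                           concurrent_requests=5, subjects=[...])
# Benchmark-level keys override evaluation-level ones, sample_size is passed
# explicitly, and concurrent_requests falls back to 5 when the config omits it.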


async def main():
    parser = argparse.ArgumentParser(description='Run AI benchmark evaluation')
    parser.add_argument('--models', nargs='+', help='Models to evaluate (e.g., gpt-4o claude-3-opus)')
    parser.add_argument('--benchmarks', nargs='+', help='Benchmarks to run (e.g., mmlu gsm8k)')
    parser.add_argument('--config', default='config.yaml', help='Config file path')
    parser.add_argument('--output-dir', default='results', help='Output directory for results')
    parser.add_argument('--no-save', action='store_true', help='Do not save results to files')
    args = parser.parse_args()

    # Load configuration
    config = load_config(args.config)

    # Determine which models to evaluate
    if args.models:
        models_to_eval = args.models
    else:
        # Default to every model listed under each provider in the config
        models_to_eval = []
        for provider_config in config['models'].values():
            for model in provider_config.get('models', []):
                models_to_eval.append(model)

    # Determine which benchmarks to run
    if args.benchmarks:
        benchmarks_to_run = args.benchmarks
    else:
        # Default to the benchmarks marked as enabled in the config
        benchmarks_to_run = [
            name for name, bench_config in config['benchmarks'].items()
            if bench_config.get('enabled', True)
        ]

    print(f"Models to evaluate: {models_to_eval}")
    print(f"Benchmarks to run: {benchmarks_to_run}")

    # Run evaluations
    all_results = []
    for model_name in models_to_eval:
        print(f"\n{'=' * 60}")
        print(f"Evaluating model: {model_name}")
        print(f"{'=' * 60}")

        try:
            # Create API instance
            api = APIFactory.create_api(model_name, config)

            # Run each benchmark
            for benchmark_name in benchmarks_to_run:
                print(f"\nRunning {benchmark_name} benchmark...")
                try:
                    result = await run_single_evaluation(api, benchmark_name, config)
                    all_results.append(result)
                    print(f"[OK] {benchmark_name}: {result.accuracy * 100:.1f}% accuracy")
                except Exception as e:
                    print(f"[ERROR] {benchmark_name}: {e}")
        except Exception as e:
            print(f"Failed to create API for {model_name}: {e}")
            continue

    # Print results table
    print_results_table(all_results)

    # Save results
    if not args.no_save and all_results:
        json_path, csv_path = save_results(all_results, args.output_dir)
        print("\nResults saved to:")
        print(f"  - {json_path}")
        print(f"  - {csv_path}")


if __name__ == "__main__":
    asyncio.run(main())
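
# Example invocations (the script's file name is assumed here; adjust to the actual name):
#   python run_benchmarks.py --models gpt-4o claude-3-opus --benchmarks mmlu gsm8k
#   python run_benchmarks.py --config config.yaml --output-dir results
# With no --models/--benchmarks flags, every model listed in the config and every
# benchmark marked as enabled is run; pass --no-save to skip writing result files.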