---
# Official benchmark configuration matching lm-eval settings

models:
  openai:
    api_key: "${OPENAI_API_KEY}"
    models:
      - "gpt-4o"
      - "gpt-4-turbo"
      - "gpt-3.5-turbo"
  anthropic:
    api_key: "${ANTHROPIC_API_KEY}"
    models:
      - "claude-3-5-sonnet-20241022"
      - "claude-3-opus-20240229"
      - "claude-3-haiku-20240307"
  grok:
    api_key: "${GROK_API_KEY}"
    base_url: "https://api.x.ai/v1"
    models:
      - "grok-4-0709"
      - "grok-beta"
      - "grok-2-latest"

benchmarks:
  mmlu:
    enabled: true
    sample_size: null  # Use full dataset
    subjects: ["all"]
    # Official settings
    num_fewshot: 5
    doc_to_choice: ["A", "B", "C", "D"]
  gsm8k:
    enabled: true
    sample_size: null  # Full test set (1319 samples)
    # Official settings
    num_fewshot: 8  # 8-shot CoT
    use_cot: true
  humaneval:
    enabled: true
    sample_size: null  # Full test set (164 samples)
    # Official settings
    pass_at_k: [1]  # Calculate Pass@1
    do_sample: false  # Deterministic generation
  gpqa:
    enabled: true
    sample_size: null
    subset: "gpqa_main"  # or "gpqa_diamond" for harder subset
  math:
    enabled: true
    sample_size: null  # Full test set (5000 samples)
    # Official settings
    use_sympy: true  # Use SymPy for equivalence checking

evaluation:
  # Generation settings matching lm-eval
  temperature: 0.0  # Deterministic for evaluation
  max_tokens: 2048
  top_p: 1.0
  # For HumanEval code generation
  humaneval_max_tokens: 1024
  # System settings
  timeout: 60  # Increased for complex problems
  max_retries: 3
  concurrent_requests: 5
  rate_limit_delay: 0.5

output:
  save_results: true
  results_dir: "results"
  generate_report: true
  plot_graphs: true
  save_raw_outputs: true  # Save all model outputs for debugging