Spaces:
Running
Running
| #!/usr/bin/env python3 | |
"""
Setup script for Hugging Face Space deployment.

Ensures the GPQA benchmark can run successfully on Hugging Face (HF).
"""
| import os | |
| import sys | |
| import subprocess | |
| from pathlib import Path | |
def _requirement_name(line: str) -> str:
    """Return the normalized package name at the start of a requirement line.

    Comments are ignored, and runs of ``-``, ``_`` and ``.`` are folded to a
    single ``-`` and lower-cased, so ``huggingface_hub`` and
    ``Huggingface-Hub`` compare equal. Returns ``""`` when the line does not
    start with a package name (blank line, comment, ``-r other.txt``, ...).
    """
    import re  # local import: the module header does not import ``re``

    spec = line.split("#", 1)[0].strip()
    match = re.match(r"[A-Za-z0-9][A-Za-z0-9._-]*", spec)
    if match is None:
        return ""
    return re.sub(r"[-_.]+", "-", match.group(0)).lower()


def _merge_requirements(existing_reqs: str, deps) -> str:
    """Append every entry of *deps* whose package is not already listed.

    Packages are compared by normalized name, one requirement per line, so
    ``gradio_client`` no longer hides a missing ``gradio`` and
    ``huggingface-hub`` is recognised as ``huggingface_hub``.

    Returns the merged text, stripped and terminated by one newline.
    """
    present = {_requirement_name(line) for line in existing_reqs.splitlines()}
    present.discard("")
    for dep in deps:
        name = _requirement_name(dep)
        if name not in present:
            existing_reqs += f"\n{dep}"
            present.add(name)
    return existing_reqs.strip() + "\n"


def _write_text_file(filename: str, content: str, executable: bool = False) -> None:
    """Write *content* to *filename* as UTF-8, optionally marking it executable."""
    # Explicit UTF-8: the templates contain non-ASCII characters and the
    # platform default encoding may be unable to encode them.
    Path(filename).write_text(content, encoding="utf-8")
    if executable:
        os.chmod(filename, 0o755)


def create_deployment_files():
    """Create the files needed to deploy this project as a Hugging Face Space.

    Everything is written relative to the current working directory:

    1. ``requirements.txt`` is created or extended with the HF dependencies.
    2. ``.env.example`` documents the expected environment variables.
    3. ``run_hf_space.py`` is the executable entry point for the Space.
    4. ``README_HF.md`` holds the Space metadata and usage notes.
    5. ``check_deployment.py`` is an executable pre-flight checklist.

    Files from steps 2-5 are overwritten if they exist. Progress is printed
    to stdout. Returns ``None``.
    """
    print("π Setting up Hugging Face Space deployment...")

    # 1. Update requirements.txt with HF dependencies
    requirements_path = Path("requirements.txt")
    if requirements_path.exists():
        existing_reqs = requirements_path.read_text(encoding="utf-8")
    else:
        existing_reqs = ""
    hf_deps = [
        "huggingface_hub>=0.20.0",
        "gradio>=4.31.0",
        "python-dotenv>=0.19.0",
    ]
    requirements_path.write_text(
        _merge_requirements(existing_reqs, hf_deps), encoding="utf-8"
    )
    print("β Updated requirements.txt")

    # 2. Create .env.example
    env_example = """# Hugging Face Space Configuration
# Copy this to .env or set in HF Secrets
# Required: Your Grok API key from x.ai
GROK_API_KEY=your_grok_api_key_here
# Required: Your Hugging Face token for GPQA dataset access
# Get it from: https://huggingface.co/settings/tokens
HF_TOKEN=your_hugging_face_token_here
# Optional: OpenAI and Anthropic keys for comparison
# OPENAI_API_KEY=your_openai_key_here
# ANTHROPIC_API_KEY=your_anthropic_key_here
"""
    _write_text_file(".env.example", env_example)
    print("β Created .env.example")

    # 3. Create HF-specific run script
    # NOTE(review): the indentation inside this template was reconstructed;
    # confirm that start_evaluation_safe() is meant to run only in the
    # "all configured" branch rather than unconditionally.
    run_script = """#!/usr/bin/env python3
\"\"\"
Hugging Face Space entry point for GPQA evaluation
\"\"\"
import os
import sys
from dotenv import load_dotenv
# Load environment variables
load_dotenv()
# Set HF token if available
hf_token = os.getenv('HF_TOKEN')
if hf_token:
    os.environ['HUGGING_FACE_HUB_TOKEN'] = hf_token
    print("β HF Token configured")
# Import and run the app
from app import create_ui, start_evaluation_safe, check_environment
if __name__ == "__main__":
    # Check environment
    issues = check_environment()
    if issues:
        print("\\nβ οΈ Configuration issues:")
        for issue in issues:
            print(f" - {issue}")
        print("\\nThe app will run in demo mode.")
        print("To enable GPQA evaluation, please set the required secrets in HF Space settings.")
    else:
        print("β All environment variables configured")
        # Start evaluation in background
        start_evaluation_safe()
    # Create and launch UI
    ui = create_ui()
    ui.launch()
"""
    _write_text_file("run_hf_space.py", run_script, executable=True)
    print("β Created run_hf_space.py")

    # 4. Create README for HF Space
    readme_content = """---
title: Grok-4 GPQA Evaluation
emoji: π§
colorFrom: blue
colorTo: green
sdk: gradio
sdk_version: "4.31.0"
app_file: run_hf_space.py
pinned: false
---
# Grok-4 GPQA Evaluation Dashboard
Real-time evaluation of Grok-4 model on the GPQA (Graduate-Level Google-Proof Q&A) benchmark.
## π§ Configuration
This Space requires the following secrets to be set in your HF Space settings:
1. **GROK_API_KEY** (Required)
- Get from: https://x.ai
- Your Grok API key for running evaluations
2. **HF_TOKEN** (Required)
- Get from: https://huggingface.co/settings/tokens
- Required for accessing the GPQA dataset
- Make sure you have requested access to: https://huggingface.co/datasets/Idavidrein/gpqa
## π Features
- Real-time progress tracking
- Accuracy metrics and performance stats
- Detailed results export
- Support for full GPQA dataset (448 questions)
## π Quick Start
1. Fork this Space
2. Set the required secrets in your Space settings
3. The evaluation will start automatically
4. Monitor progress in the dashboard
## β οΈ Known Issues
- GPQA dataset requires access approval (usually 1-2 days)
- Grok-4-0709 uses extensive reasoning tokens (~2500-3000 per question)
- Full evaluation takes ~3-4 hours due to model response times
## π Expected Performance
Based on our testing:
- Accuracy: ~80-90% (excluding timeouts)
- Avg Response Time: ~50s per question
- Total Runtime: ~3-4 hours for full dataset
"""
    _write_text_file("README_HF.md", readme_content)
    print("β Created README_HF.md")

    # 5. Create pre-flight check script
    check_script = """#!/usr/bin/env python3
\"\"\"
Pre-deployment checklist for HF Space
\"\"\"
import os
import sys
from pathlib import Path
def check_deployment_ready():
    \"\"\"Check if everything is ready for HF deployment\"\"\"
    print("π Pre-deployment checklist:\\n")
    checks = []
    # Check files exist
    required_files = [
        "app.py",
        "run_evaluation.py",
        "requirements.txt",
        ".env.example",
        "run_hf_space.py",
        "official_config.yaml"
    ]
    for file in required_files:
        if Path(file).exists():
            checks.append((f"β {file} exists", True))
        else:
            checks.append((f"β {file} missing", False))
    # Check API directories
    if Path("apis").is_dir() and list(Path("apis").glob("*.py")):
        checks.append(("β APIs directory configured", True))
    else:
        checks.append(("β APIs directory missing or empty", False))
    # Check benchmarks directory
    if Path("benchmarks").is_dir() and Path("benchmarks/gpqa_benchmark.py").exists():
        checks.append(("β GPQA benchmark implementation found", True))
    else:
        checks.append(("β GPQA benchmark missing", False))
    # Check for sensitive data
    if Path(".env").exists():
        checks.append(("β οΈ .env file exists - make sure it's in .gitignore!", None))
    # Print results
    for check, status in checks:
        print(check)
    all_good = all(status is not False for _, status in checks)
    if all_good:
        print("\\nβ Ready for deployment!")
        print("\\nNext steps:")
        print("1. Set GROK_API_KEY and HF_TOKEN in HF Space secrets")
        print("2. Make sure you have GPQA dataset access")
        print("3. Push to Hugging Face")
    else:
        print("\\nβ Issues found - please fix before deploying")
    return all_good
if __name__ == "__main__":
    check_deployment_ready()
"""
    _write_text_file("check_deployment.py", check_script, executable=True)
    print("β Created check_deployment.py")

    print("\nπ Deployment files created successfully!")
    print("\nNext steps:")
    print("1. Run: python check_deployment.py")
    print("2. Set your API keys in HF Space secrets")
    print("3. Push to Hugging Face")
if __name__ == "__main__":
    # Generate the deployment files only when run as a script, not on import.
    create_deployment_files()