Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import pandas as pd | |
| import json | |
| import os | |
| from datetime import datetime | |
| from dotenv import load_dotenv | |
| import time | |
# Load environment variables (python-dotenv); the startup messages further
# down point users at `.env` or Hugging Face Secrets for configuration.
load_dotenv()

# Directory that holds evaluation output, relative to the working directory.
RESULTS_DIR = "results"

# JSON progress file that load_progress() reads to build the dashboard.
# NOTE(review): presumably written by the background run_evaluation.py
# process -- confirm against that script.
PROGRESS_FILE = os.path.join(RESULTS_DIR, "gpqa_progress.json")
def load_progress(progress_file=None, total_expected=448):
    """Read the evaluation progress file and summarize it for the dashboard.

    Args:
        progress_file: Path of the JSON progress file to read. When omitted,
            the module-level ``PROGRESS_FILE`` is used.
        total_expected: Denominator shown on the "Questions Processed" line.
            Defaults to 448, the value the dashboard displayed before this
            became configurable.

    Returns:
        A ``(df, summary_text, last_updated)`` tuple. On success ``df`` holds
        the parsed records, ``summary_text`` is a Markdown summary and
        ``last_updated`` is the file's modification time as text. When the
        file is missing, empty, lacks a required column, or cannot be parsed,
        ``df`` is an empty DataFrame, ``summary_text`` explains why and
        ``last_updated`` is ``"N/A"``. The function never raises.
    """
    path = PROGRESS_FILE if progress_file is None else progress_file

    if not os.path.exists(path):
        return pd.DataFrame(), "No progress file found. The evaluation might be starting up.", "N/A"

    try:
        df = pd.read_json(path)
        if df.empty:
            return pd.DataFrame(), "Progress file is empty.", "N/A"

        # Report a missing column explicitly; otherwise the KeyError below
        # would surface as an unhelpful message such as "'is_correct'".
        missing = [col for col in ("is_correct", "response_time") if col not in df.columns]
        if missing:
            return (
                pd.DataFrame(),
                f"Error loading progress file: missing column(s) {', '.join(missing)}",
                "N/A",
            )

        # Calculate metrics. df is non-empty here, so the division is safe.
        total_questions = len(df)
        correct_answers = int(df["is_correct"].sum())
        accuracy = (correct_answers / total_questions) * 100
        avg_response_time = df["response_time"].mean()

        summary_text = (
            "\n"
            "## Evaluation Progress\n"
            f"- **Questions Processed:** {total_questions} / {total_expected}\n"
            f"- **Current Accuracy:** {accuracy:.2f}%\n"
            f"- **Correct Answers:** {correct_answers}\n"
            f"- **Average Response Time:** {avg_response_time:.2f} seconds/question\n"
        )

        # Get last modified time
        last_modified_time = datetime.fromtimestamp(os.path.getmtime(path)).strftime('%Y-%m-%d %H:%M:%S')
        return df, summary_text, f"Last updated: {last_modified_time}"
    except Exception as e:
        # Deliberately broad: the file may be unreadable or malformed at the
        # moment it is read, and the dashboard should show a message rather
        # than crash.
        return pd.DataFrame(), f"Error loading progress file: {e}", "N/A"
def create_ui():
    """Build the Gradio Blocks dashboard from one snapshot of the progress file.

    load_progress() is called once, when the UI is constructed; the summary,
    the bar plot and the raw-results table all show that snapshot.

    Returns:
        The assembled ``gr.Blocks`` app, ready for ``.launch()``.
    """
    df, summary, last_updated = load_progress()

    with gr.Blocks(theme=gr.themes.Soft(), title="GPQA Evaluation Progress") as demo:
        gr.Markdown("# Real-Time GPQA Evaluation Dashboard")
        gr.Markdown("This dashboard shows the progress of the GPQA benchmark evaluation for the `grok-4-0709` model.")

        with gr.Row():
            gr.Markdown(summary)
            gr.Markdown(last_updated)

        with gr.Row():
            # Create a simple plot: number of correct vs incorrect answers
            if not df.empty:
                # BarPlot looks up x/y by column name, so turn the
                # value_counts() Series into a two-column frame whose columns
                # are exactly "Answer Status" and "Count".
                status_counts = (
                    df['is_correct']
                    .value_counts()
                    .rename({True: 'Correct', False: 'Incorrect'})
                    .rename_axis("Answer Status")
                    .reset_index(name="Count")
                )
                gr.BarPlot(
                    status_counts,
                    x="Answer Status",
                    y="Count",
                    title="Correct vs. Incorrect Answers",
                    interactive=False,
                )

        gr.Markdown("## Raw Results")
        gr.DataFrame(df, wrap=True)

    return demo
def check_environment():
    """Report which required environment variables are unset or empty.

    Returns:
        A list with one message per missing variable, GROK_API_KEY first and
        HF_TOKEN second; the list is empty when both are set.
    """
    # (variable name, message reported when it is missing), in report order.
    required = (
        ('GROK_API_KEY', "GROK_API_KEY not found in environment"),
        ('HF_TOKEN', "HF_TOKEN not found (required for GPQA dataset access)"),
    )
    return [message for variable, message in required if not os.getenv(variable)]
def start_evaluation_safe():
    """Launch the GPQA evaluation script as a background process.

    The environment is checked first; if anything is missing, the problems
    are printed and nothing is started.

    Returns:
        The ``subprocess.Popen`` handle of the started process, or ``None``
        when the environment check failed or the process could not be started.
    """
    import subprocess
    import sys

    problems = check_environment()
    if problems:
        print("⚠️ Environment issues detected:")
        for problem in problems:
            print(f" - {problem}")
        print("\nPlease set the required environment variables in .env or Hugging Face Secrets")
        return None

    print("Starting background evaluation process...")

    # Run the evaluation script with the interpreter that is running this app.
    launch_args = [sys.executable, "run_evaluation.py"]
    launch_args += ["--config", "official_config.yaml"]
    launch_args += ["--models", "grok-4-0709"]
    launch_args += ["--benchmarks", "gpqa"]

    try:
        # Popen returns immediately, so the evaluation runs in the background.
        child = subprocess.Popen(launch_args)
        print(f"Evaluation process started with PID: {child.pid}")
        return child
    except Exception as err:
        print(f"Failed to start evaluation: {err}")
        return None
if __name__ == "__main__":
    # An empty list means every required environment variable is set.
    issues = check_environment()

    if not issues:
        # Fully configured: start the background evaluation, then build the UI.
        process = start_evaluation_safe()
        ui = create_ui()
    else:
        # Missing configuration: build the UI only and note it on the console.
        ui = create_ui()
        print("\n⚠️ Running in demo mode due to missing configuration")

    # Serve on all interfaces, port 7860.
    ui.launch(server_name="0.0.0.0", server_port=7860)