burtenshaw committed on
Commit
061fdd4
Β·
1 Parent(s): df41d15

fix wandb integration

Browse files
Files changed (1) hide show
  1. app.py +53 -37
app.py CHANGED
@@ -9,7 +9,6 @@ This single Gradio app:
9
 
10
  import os
11
  import json
12
- import time
13
  import uuid
14
  import threading
15
  from datetime import datetime
@@ -18,7 +17,6 @@ import socket
18
 
19
  import gradio as gr
20
  import pandas as pd
21
- import wandb
22
  from autotrain.project import AutoTrainProject
23
  from autotrain.params import (
24
  LLMTrainingParams,
@@ -189,24 +187,19 @@ def run_training_background(run_id: str, params: Any, backend: str):
189
  save_runs(runs)
190
 
191
  try:
192
- # Initialize W&B
193
- wandb_run = wandb.init(
194
- project=WANDB_PROJECT,
195
- name=f"{params.project_name}-{int(time.time())}",
196
- tags=["autotrain", "mcp"],
197
- config={
198
- "base_model": params.model,
199
- "dataset": params.data_path,
200
- "epochs": params.epochs,
201
- "batch_size": params.batch_size,
202
- "learning_rate": params.lr,
203
- "backend": backend,
204
- },
205
- )
206
 
207
- wandb_url = (
208
- wandb_run.url if wandb_run.url else f"https://wandb.ai/{WANDB_PROJECT}"
209
- )
 
 
 
 
 
 
 
210
 
211
  # Update with W&B URL
212
  runs = load_runs()
@@ -216,14 +209,12 @@ def run_training_background(run_id: str, params: Any, backend: str):
216
  break
217
  save_runs(runs)
218
 
219
- # Create and start AutoTrain project
220
- project = AutoTrainProject(params=params, backend=backend, process=True)
221
- job_id = project.create()
222
-
223
- print(f"Training started for run {run_id} with job ID: {job_id}")
224
 
225
- # For demo purposes, simulate training completion after a short delay
226
- time.sleep(10) # In real implementation, monitor actual training
227
 
228
  # Update status to completed
229
  runs = load_runs()
@@ -231,13 +222,16 @@ def run_training_background(run_id: str, params: Any, backend: str):
231
  if run["run_id"] == run_id:
232
  run["status"] = "completed"
233
  run["completed_at"] = datetime.utcnow().isoformat()
 
 
234
  break
235
  save_runs(runs)
236
 
237
- wandb.finish()
238
-
239
  except Exception as e:
240
  print(f"Training failed for run {run_id}: {str(e)}")
 
 
 
241
 
242
  # Update status to failed
243
  runs = load_runs()
@@ -249,9 +243,6 @@ def run_training_background(run_id: str, params: Any, backend: str):
249
  break
250
  save_runs(runs)
251
 
252
- if wandb.run:
253
- wandb.finish()
254
-
255
 
256
  # MCP Tool Functions (these automatically become MCP tools)
257
  def start_training_job(
@@ -633,13 +624,19 @@ def get_system_status(random_string: str = "") -> str:
633
  }
634
 
635
  πŸ’‘ **Access Points:**
636
- β€’ Gradio UI: http://localhost:7860
637
- β€’ MCP Server: http://localhost:7860/gradio_api/mcp/sse
638
- β€’ MCP Schema: http://localhost:7860/gradio_api/mcp/schema
639
 
640
  πŸ› οΈ **W&B Integration:**
641
  β€’ Project: {WANDB_PROJECT}
642
- β€’ Set WANDB_PROJECT environment variable to customize"""
 
 
 
 
 
 
643
 
644
  return status_text
645
 
@@ -864,8 +861,8 @@ with gr.Blocks(
864
 
865
  This Gradio app automatically serves as an MCP server.
866
 
867
- **MCP Endpoint:** `http://localhost:7860/gradio_api/mcp/sse`
868
- **MCP Schema:** `http://localhost:7860/gradio_api/mcp/schema`
869
 
870
  ### Available MCP Tools:
871
 
@@ -875,6 +872,24 @@ with gr.Blocks(
875
  - `get_task_recommendations` - Get training recommendations
876
  - `get_system_status` - Check system status
877
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
878
  ### πŸ€— Hugging Face Hub Integration:
879
 
880
  To push models to the Hub, set these environment variables:
@@ -906,6 +921,7 @@ with gr.Blocks(
906
 
907
  Total Runs: {len(load_runs())}
908
  W&B Project: {WANDB_PROJECT}
 
909
  Hub Auth: {"βœ… Configured" if os.environ.get("HF_TOKEN") else "❌ Missing HF_TOKEN"}
910
  """)
911
 
 
9
 
10
  import os
11
  import json
 
12
  import uuid
13
  import threading
14
  from datetime import datetime
 
17
 
18
  import gradio as gr
19
  import pandas as pd
 
20
  from autotrain.project import AutoTrainProject
21
  from autotrain.params import (
22
  LLMTrainingParams,
 
187
  save_runs(runs)
188
 
189
  try:
190
+ # Set W&B environment variables for AutoTrain to use
191
+ os.environ["WANDB_PROJECT"] = WANDB_PROJECT
 
 
 
 
 
 
 
 
 
 
 
 
192
 
193
+ print(f"Starting real training for run {run_id}")
194
+ print(f"Model: {params.model}")
195
+ print(f"Dataset: {params.data_path}")
196
+ print(f"Backend: {backend}")
197
+
198
+ # Create AutoTrain project - this will handle W&B internally
199
+ project = AutoTrainProject(params=params, backend=backend, process=True)
200
+
201
+ # Generate approximate W&B URL
202
+ wandb_url = f"https://wandb.ai/{WANDB_PROJECT}"
203
 
204
  # Update with W&B URL
205
  runs = load_runs()
 
209
  break
210
  save_runs(runs)
211
 
212
+ # Actually run the training - this blocks until completion
213
+ print(f"Executing training job for run {run_id}...")
214
+ result = project.create()
 
 
215
 
216
+ print(f"Training completed successfully for run {run_id}")
217
+ print(f"Result: {result}")
218
 
219
  # Update status to completed
220
  runs = load_runs()
 
222
  if run["run_id"] == run_id:
223
  run["status"] = "completed"
224
  run["completed_at"] = datetime.utcnow().isoformat()
225
+ if result:
226
+ run["result"] = str(result)
227
  break
228
  save_runs(runs)
229
 
 
 
230
  except Exception as e:
231
  print(f"Training failed for run {run_id}: {str(e)}")
232
+ import traceback
233
+
234
+ traceback.print_exc()
235
 
236
  # Update status to failed
237
  runs = load_runs()
 
243
  break
244
  save_runs(runs)
245
 
 
 
 
246
 
247
  # MCP Tool Functions (these automatically become MCP tools)
248
  def start_training_job(
 
624
  }
625
 
626
  πŸ’‘ **Access Points:**
627
+ β€’ Gradio UI: http://SPACE_URL
628
+ β€’ MCP Server: http://SPACE_URL/gradio_api/mcp/sse
629
+ β€’ MCP Schema: http://SPACE_URL/gradio_api/mcp/schema
630
 
631
  πŸ› οΈ **W&B Integration:**
632
  β€’ Project: {WANDB_PROJECT}
633
+ β€’ API Key: {"βœ… Configured" if os.environ.get("WANDB_API_KEY") else "❌ Missing"}
634
+ β€’ Training Metrics: {
635
+ "βœ… Enabled"
636
+ if os.environ.get("WANDB_API_KEY")
637
+ else "❌ System metrics only"
638
+ }
639
+ β€’ Set WANDB_API_KEY for complete training metrics logging"""
640
 
641
  return status_text
642
 
 
861
 
862
  This Gradio app automatically serves as an MCP server.
863
 
864
+ **MCP Endpoint:** `http://SPACE_URL/gradio_api/mcp/sse`
865
+ **MCP Schema:** `http://SPACE_URL/gradio_api/mcp/schema`
866
 
867
  ### Available MCP Tools:
868
 
 
872
  - `get_task_recommendations` - Get training recommendations
873
  - `get_system_status` - Check system status
874
 
875
+ ### πŸ“Š Weights & Biases Integration:
876
+
877
+ For **complete training metrics** (loss, accuracy, etc.), set:
878
+
879
+ ```bash
880
+ export WANDB_API_KEY="your-wandb-api-key"
881
+ export WANDB_PROJECT="autotrain-mcp" # Optional: custom project name
882
+ ```
883
+
884
+ Get your API key from: https://wandb.ai/authorize
885
+
886
+ **What gets logged by AutoTrain:**
887
+ - βœ… Training/validation loss
888
+ - βœ… Learning rate schedule
889
+ - βœ… Gradient norms
890
+ - βœ… Model checkpoints
891
+ - βœ… System metrics (GPU, CPU, memory)
892
+
893
  ### πŸ€— Hugging Face Hub Integration:
894
 
895
  To push models to the Hub, set these environment variables:
 
921
 
922
  Total Runs: {len(load_runs())}
923
  W&B Project: {WANDB_PROJECT}
924
+ W&B Auth: {"βœ… Configured" if os.environ.get("WANDB_API_KEY") else "❌ Missing WANDB_API_KEY"}
925
  Hub Auth: {"βœ… Configured" if os.environ.get("HF_TOKEN") else "❌ Missing HF_TOKEN"}
926
  """)
927