Spaces:

Fred808
/

control1

Paused

App Files Files Community

Fred808 committed on Oct 14

Commit

f3a8698

verified ·

1 Parent(s): 336d73a

Update app.py

Browse files

Files changed (1) hide show

app.py +125 -14

app.py CHANGED Viewed

@@ -136,26 +136,39 @@ class ControllerState:
         self.chunks_dir: str = ""              # Directory containing chunk files
         self.operation_results: Dict[str, Dict] = {}  # Track operation results from tensor servers
         self.pending_operations: Dict[str, asyncio.Task] = {}  # Track ongoing operations
 state = ControllerState()
 # ===== Helper Functions =====
 async def split_model_weights():
-    """Split model files into chunks based on available servers without loading into memory"""
     try:
         import os
         import math
         import shutil
         from pathlib import Path
-        # Find model file (safetensors or pytorch)
         try:
-            model_file = next(f for f in state.model_files.values() if f.endswith('.safetensors'))
-            print(f"[INFO] Found safetensors file: {model_file}")
         except StopIteration:
             try:
-                model_file = next(f for f in state.model_files.values() if f.endswith('.bin'))
-                print(f"[INFO] Found PyTorch file: {model_file}")
             except StopIteration:
                 raise Exception("No model weight files found")
@@ -397,6 +410,7 @@ async def send_chunk_to_server(server_url: str, chunk_id: int, chunk_info: Dict)
         }
         async with aiohttp.ClientSession() as session:
             async with session.post(
                 f"{server_url}/load_chunk",
                 json=chunk_data,
@@ -404,11 +418,58 @@ async def send_chunk_to_server(server_url: str, chunk_id: int, chunk_info: Dict)
             ) as response:
                 if response.status != 200:
                     error_msg = await response.text()
-                    raise Exception(f"Failed to load chunk: {error_msg}")
                 result = await response.json()
-                print(f"[INFO] Successfully loaded chunk {chunk_id} to {server_url}")
-                return True
     except Exception as e:
         print(f"[ERROR] Failed to send chunk {chunk_id} to {server_url}: {str(e)}")
@@ -923,16 +984,66 @@ async def redistribute_chunks():
 @app.get("/chunks/{chunk_id}/status")
 async def get_chunk_status(chunk_id: int):
-    """Get the status and assignments of a specific chunk"""
     if chunk_id not in state.model_chunks:
         raise HTTPException(status_code=404, detail="Chunk not found")
     chunk = state.model_chunks[chunk_id]
     return {
         "chunk_id": chunk_id,
         "status": chunk.status,
-        "server_assignments": chunk.server_assignments,
-        "metrics": chunk.metrics
     }
 @app.post("/initialize")
@@ -1042,7 +1153,7 @@ if __name__ == "__main__":
     print(f"[INFO] API Documentation available at http://localhost:{port}/docs")
     uvicorn.run(
-        "app:app",
         host="0.0.0.0",
         port=port,
         reload=False

         self.chunks_dir: str = ""              # Directory containing chunk files
         self.operation_results: Dict[str, Dict] = {}  # Track operation results from tensor servers
         self.pending_operations: Dict[str, asyncio.Task] = {}  # Track ongoing operations
+        self.chunk_assignments: Dict[int, List[Dict[str, any]]] = {}  # Track which chunks are on which servers
+        self.chunk_distribution_history: List[Dict[str, any]] = []  # Track distribution history with timestamps
 state = ControllerState()
 # ===== Helper Functions =====
 async def split_model_weights():
+    """Split model files into chunks and convert to safetensors format"""
     try:
         import os
         import math
         import shutil
+        import torch
+        from safetensors.torch import save_file, load_file
         from pathlib import Path
+        # Find model file and convert to safetensors if needed
         try:
+            model_file = next(f for f in state.model_files.values() if f.endswith('.bin'))
+            print(f"[INFO] Found PyTorch file: {model_file}")
+            # Convert to safetensors
+            print("[INFO] Converting model to safetensors format...")
+            weights = torch.load(model_file, map_location='cpu')
+            safetensors_path = os.path.join(state.model_path, "model.safetensors")
+            save_file(weights, safetensors_path)
+            model_file = safetensors_path
+            print(f"[INFO] Converted model to safetensors format: {model_file}")
         except StopIteration:
             try:
+                model_file = next(f for f in state.model_files.values() if f.endswith('.safetensors'))
+                print(f"[INFO] Found existing safetensors file: {model_file}")
             except StopIteration:
                 raise Exception("No model weight files found")
         }
         async with aiohttp.ClientSession() as session:
+            # Step 1: Send chunk configuration
             async with session.post(
                 f"{server_url}/load_chunk",
                 json=chunk_data,
             ) as response:
                 if response.status != 200:
                     error_msg = await response.text()
+                    raise Exception(f"Failed to register chunk: {error_msg}")
                 result = await response.json()
+                if not result.get("ready_for_data", False):
+                    raise Exception("Server not ready for chunk data")
+                # Step 2: Upload chunk data
+                with open(chunk_path, 'rb') as f:
+                    chunk_file = f.read()
+                form = aiohttp.FormData()
+                form.add_field('file',
+                             chunk_file,
+                             filename=os.path.basename(chunk_path),
+                             content_type='application/octet-stream')
+                async with session.post(
+                    f"{server_url}/upload_chunk_data/{chunk_id}",
+                    data=form,
+                    timeout=Settings.TENSOR_SERVER_TIMEOUT
+                ) as upload_response:
+                    if upload_response.status != 200:
+                        error_msg = await upload_response.text()
+                        raise Exception(f"Failed to upload chunk data: {error_msg}")
+                    upload_result = await upload_response.json()
+                    # Track the assignment
+                    if chunk_id not in state.chunk_assignments:
+                        state.chunk_assignments[chunk_id] = []
+                    assignment = {
+                        "server_url": server_url,
+                        "timestamp": datetime.now().isoformat(),
+                        "status": "loaded",
+                        "size_bytes": upload_result.get('size_bytes', 0)
+                    }
+                    state.chunk_assignments[chunk_id].append(assignment)
+                    # Add to history
+                    state.chunk_distribution_history.append({
+                        "chunk_id": chunk_id,
+                        "server_url": server_url,
+                        "timestamp": datetime.now().isoformat(),
+                        "action": "upload",
+                        "status": "success",
+                        "size_bytes": upload_result.get('size_bytes', 0)
+                    })
+                    print(f"[INFO] Successfully uploaded chunk {chunk_id} to {server_url} ({upload_result.get('size_bytes', 0)} bytes)")
+                    print(f"[INFO] Current assignments for chunk {chunk_id}: {len(state.chunk_assignments[chunk_id])} servers")
+                    return True
     except Exception as e:
         print(f"[ERROR] Failed to send chunk {chunk_id} to {server_url}: {str(e)}")
 @app.get("/chunks/{chunk_id}/status")
 async def get_chunk_status(chunk_id: int):
+    """Get detailed status and assignments of a specific chunk"""
     if chunk_id not in state.model_chunks:
         raise HTTPException(status_code=404, detail="Chunk not found")
     chunk = state.model_chunks[chunk_id]
+    assignments = state.chunk_assignments.get(chunk_id, [])
+    # Get current server status for each assignment
+    current_status = []
+    for assignment in assignments:
+        server_url = assignment["server_url"]
+        if server_url in state.tensor_servers:
+            server = state.tensor_servers[server_url]
+            current_status.append({
+                "server_url": server_url,
+                "server_status": server.status,
+                "last_heartbeat": server.last_heartbeat.isoformat(),
+                "metrics": server.metrics.dict(),
+                "assignment_time": assignment["timestamp"]
+            })
     return {
         "chunk_id": chunk_id,
         "status": chunk.status,
+        "size_bytes": chunk.size_bytes,
+        "current_assignments": current_status,
+        "assignment_history": [
+            h for h in state.chunk_distribution_history
+            if h["chunk_id"] == chunk_id
+        ],
+        "metrics": chunk.metrics,
+        "config": chunk.config
+    }
+@app.get("/distribution/status")
+async def get_distribution_status():
+    """Get overall distribution status of all chunks"""
+    distribution_summary = {}
+    for chunk_id, chunk in state.model_chunks.items():
+        assignments = state.chunk_assignments.get(chunk_id, [])
+        active_servers = [
+            a["server_url"] for a in assignments
+            if a["server_url"] in state.tensor_servers and
+            state.tensor_servers[a["server_url"]].status in ["ready", "busy"]
+        ]
+        distribution_summary[chunk_id] = {
+            "total_assignments": len(assignments),
+            "active_servers": len(active_servers),
+            "server_urls": active_servers,
+            "size_bytes": chunk.size_bytes,
+            "status": chunk.status
+        }
+    return {
+        "total_chunks": len(state.model_chunks),
+        "total_servers": len(state.tensor_servers),
+        "chunks": distribution_summary,
+        "history": state.chunk_distribution_history[-10:]  # Last 10 events
     }
 @app.post("/initialize")
     print(f"[INFO] API Documentation available at http://localhost:{port}/docs")
     uvicorn.run(
+        "controller_server_new:app",
         host="0.0.0.0",
         port=port,
         reload=False