Update app.py
app.py CHANGED

@@ -18,10 +18,12 @@ class Settings:
     CONTROLLER_BASE_URL = os.getenv("CONTROLLER_BASE_URL", "http://192.168.1.100:8000")

     # List of tensor server URLs - should be actual IP addresses or hostnames
-    TENSOR_SERVER_URLS =
+    TENSOR_SERVER_URLS = [
+        url for url in os.getenv("TENSOR_SERVER_URLS", "").split(",") if url
+    ] or [
         "https://fred808-ilob.hf.space",
-        "https://fred808-tserv.hf.space",
-        "https://fred808-tserve2.hf.space"
+        "https://fred808-tserv.hf.space",
+        "https://fred808-tserve2.hf.space",
     ]
     AGGREGATOR_URL = os.getenv("AGGREGATOR_URL", "http://192.168.1.104:8002")

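The new default lets a comma-separated TENSOR_SERVER_URLS environment variable override the hard-coded Space URLs, falling back to them when the variable is unset or empty. A minimal, self-contained sketch of that fallback pattern; only the variable name and the three default URLs come from the diff, the helper function is illustrative:

import os

# Defaults mirroring the diff; a comma-separated env var overrides them.
_DEFAULT_TENSOR_SERVER_URLS = [
    "https://fred808-ilob.hf.space",
    "https://fred808-tserv.hf.space",
    "https://fred808-tserve2.hf.space",
]

def load_tensor_server_urls() -> list:
    """Parse TENSOR_SERVER_URLS as a comma-separated list; fall back to the defaults."""
    raw = os.getenv("TENSOR_SERVER_URLS", "")
    urls = [u.strip() for u in raw.split(",") if u.strip()]
    return urls or _DEFAULT_TENSOR_SERVER_URLS

# Example: TENSOR_SERVER_URLS="http://10.0.0.5:8001,http://10.0.0.6:8001" -> two URLs;
# unset or empty -> the three defaults above.
print(load_tensor_server_urls())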
@@ -881,71 +883,49 @@ async def initialize_system():
 # ===== Main Execution =====
 @app.on_event("startup")
 async def startup_event():
-    """Initialize the server and start
+    """Initialize the server and start distribution"""
     print("[INFO] Initializing system...")
     try:
         # Initialize system and download model
         await initialize_system()
         print("[INFO] Model initialization complete")

-        #
-
-
-        for url in Settings.TENSOR_SERVER_URLS:
-            try:
-                print(f"[INFO] Testing connection to {url}...")
-                if await check_tensor_server_health(url):
-                    server = TensorServer(url=url)
-                    state.tensor_servers[str(url)] = server
-                    connected_servers.append(server)
-                    print(f"[INFO] Successfully connected to tensor server at {url}")
-            except Exception as e:
-                print(f"[WARN] Failed to connect to tensor server {url}: {str(e)}")
-
-        if connected_servers:
-            print(f"[INFO] Connected to {len(connected_servers)} tensor servers")
+        # Split model into chunks
+        if await split_model_weights():
+            print(f"[INFO] Successfully split model into {len(state.model_chunks)} chunks")

-            #
-            print("[INFO]
-
-
-
-
-
-
-
-            for chunk_id, chunk in state.model_chunks.items():
-                # Send each chunk to at least 2 servers if available
-                target_servers = connected_servers[:2]
-                for server in target_servers:
-                    print(f"[INFO] Preparing to send chunk {chunk_id} to {server.url}")
-                    task = asyncio.create_task(
-                        send_chunk_to_server(str(server.url), chunk_id, chunk)
-                    )
-                    distribution_tasks.append(task)
-
-                    # Update assignments
-                    if str(server.url) not in chunk.server_assignments:
-                        chunk.server_assignments.append(str(server.url))
-                    if chunk_id not in server.model_chunks:
-                        server.model_chunks.append(chunk_id)
+            # Distribute chunks to tensor servers
+            print("[INFO] Starting chunk distribution...")
+            distribution_tasks = []
+
+            # Round-robin distribution to tensor servers
+            for chunk_id, chunk in state.model_chunks.items():
+                # Determine target servers (distribute each chunk to 2 servers for redundancy)
+                server_indices = [i % len(Settings.TENSOR_SERVER_URLS) for i in range(chunk_id * 2, chunk_id * 2 + 2)]
+                target_servers = [Settings.TENSOR_SERVER_URLS[i] for i in server_indices]

-
-            print(f"[INFO]
-
-
-
-
-
+                for server_url in target_servers:
+                    print(f"[INFO] Sending chunk {chunk_id} to {server_url}")
+                    task = asyncio.create_task(
+                        send_chunk_to_server(server_url, chunk_id, chunk)
+                    )
+                    distribution_tasks.append(task)
+
+                    # Track assignments for future reference
+                    chunk.server_assignments.append(server_url)
+
+            if distribution_tasks:
+                print(f"[INFO] Distributing {len(distribution_tasks)} chunks...")
+                results = await asyncio.gather(*distribution_tasks, return_exceptions=True)
+                success_count = sum(1 for r in results if r is True)
+                print(f"[INFO] Successfully distributed {success_count} chunks out of {len(distribution_tasks)} attempts")
         else:
-            print("[
+            print("[ERROR] Failed to split model weights")

     except Exception as e:
         print(f"[ERROR] Startup error: {str(e)}")

-
-    asyncio.create_task(monitor_tensor_servers())
-    print("[INFO] Server monitoring started")
+    print("[INFO] Startup complete")

 if __name__ == "__main__":
     port = int(os.getenv("PORT", 8000))
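The new distribution loop picks two target servers per chunk by taking consecutive indices starting at chunk_id * 2 and wrapping them around the server list with a modulo. A standalone sketch of just that index arithmetic, assuming integer chunk ids and the three default URLs; nothing else from app.py is used:

# Hypothetical stand-in for Settings.TENSOR_SERVER_URLS; only the index math is illustrated.
SERVERS = [
    "https://fred808-ilob.hf.space",
    "https://fred808-tserv.hf.space",
    "https://fred808-tserve2.hf.space",
]

REPLICAS = 2  # each chunk goes to two servers for redundancy

def targets_for_chunk(chunk_id: int) -> list:
    """Wrap consecutive indices around the server list, as in the startup loop."""
    indices = [i % len(SERVERS) for i in range(chunk_id * REPLICAS, chunk_id * REPLICAS + REPLICAS)]
    return [SERVERS[i] for i in indices]

for cid in range(4):
    print(cid, targets_for_chunk(cid))
# chunk 0 -> servers 0 and 1, chunk 1 -> servers 2 and 0,
# chunk 2 -> servers 1 and 2, chunk 3 -> servers 0 and 1, and so on.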
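The distribution step creates one task per (chunk, server) pair and awaits them together with return_exceptions=True, so a single failed upload is reported rather than aborting startup. A minimal sketch of that fan-out/collect pattern; fake_send is a hypothetical stand-in for send_chunk_to_server, which is assumed to return True on success:

import asyncio

async def fake_send(url: str, chunk_id: int) -> bool:
    """Hypothetical stand-in for send_chunk_to_server: odd chunk ids fail."""
    await asyncio.sleep(0)
    if chunk_id % 2:
        raise RuntimeError(f"upload of chunk {chunk_id} to {url} failed")
    return True

async def main() -> None:
    tasks = [
        asyncio.create_task(fake_send("https://example-tensor-server", cid))
        for cid in range(4)
    ]
    # return_exceptions=True keeps failures as values instead of raising,
    # so the success count can be reported the same way the startup hook does.
    results = await asyncio.gather(*tasks, return_exceptions=True)
    ok = sum(1 for r in results if r is True)
    print(f"{ok}/{len(tasks)} uploads succeeded")

asyncio.run(main())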