Spaces:

Fred808
/

control1

Paused

App Files Community

Fred808 committed on Oct 13

Commit

d3e88f3

verified ·

1 Parent(s): 6b04d23

Update app.py

Browse files

Files changed (1) hide show

app.py +80 -5

app.py CHANGED Viewed

@@ -28,7 +28,7 @@ class Settings:
     AGGREGATOR_URL = os.getenv("AGGREGATOR_URL", "http://192.168.1.104:8002")
     # Model settings
-    MODEL_REPO = "https://huggingface.co/microsoft/Florence-2-large"
     # Server settings
     TENSOR_SERVER_TIMEOUT = 30  # seconds
@@ -634,10 +634,31 @@ async def register_tensor_server(server_url: HttpUrl):
     state.tensor_servers[str(server_url)] = TensorServer(url=server_url)
     print(f"[INFO] Registered new tensor server at {server_url}")
     return {
         "status": "registered",
         "registered_servers": len(state.tensor_servers),
-        "server_id": str(server_url)
     }
 @app.delete("/unregister_tensor_server")
@@ -721,6 +742,30 @@ async def initialize_system():
         else:
             files_status[filename] = {"exists": exists, "size_bytes": 0}
     return {
         "status": "initialized",
         "model_loaded": state.is_model_loaded,
@@ -728,7 +773,10 @@ async def initialize_system():
         "total_size_bytes": total_size,
         "config_loaded": bool(state.model_config),
         "model_type": state.model_config.get("model_type", "unknown"),
-        "architecture": state.model_config.get("architectures", ["unknown"])[0]
     }
 # ===== Main Execution =====
@@ -736,8 +784,35 @@ async def initialize_system():
 async def startup_event():
     """Initialize the server and start background tasks"""
     print("[INFO] Initializing system...")
-    await initialize_system()
-    print("[INFO] Model initialization complete")
     # Start monitoring task
     asyncio.create_task(monitor_tensor_servers())

     AGGREGATOR_URL = os.getenv("AGGREGATOR_URL", "http://192.168.1.104:8002")
     # Model settings
+    MODEL_REPO = "https://huggingface.co/facebook/opt-125m"
     # Server settings
     TENSOR_SERVER_TIMEOUT = 30  # seconds
     state.tensor_servers[str(server_url)] = TensorServer(url=server_url)
     print(f"[INFO] Registered new tensor server at {server_url}")
+    # If model is loaded, automatically distribute chunks
+    if state.is_model_loaded:
+        print(f"[INFO] Model is loaded, starting distribution for new server {server_url}")
+        try:
+            # Create chunks if they don't exist
+            if not state.model_chunks:
+                if await split_model_weights():
+                    print(f"[INFO] Successfully split model into {len(state.model_chunks)} chunks")
+                else:
+                    print("[ERROR] Failed to split model weights")
+            # Distribute chunks
+            if await distribute_model_chunks():
+                print("[INFO] Successfully distributed chunks to tensor servers")
+            else:
+                print("[ERROR] Failed to distribute chunks")
+        except Exception as e:
+            print(f"[ERROR] Distribution error during server registration: {str(e)}")
     return {
         "status": "registered",
         "registered_servers": len(state.tensor_servers),
+        "server_id": str(server_url),
+        "model_loaded": state.is_model_loaded,
+        "chunks_distributed": len(state.model_chunks) if state.model_chunks else 0
     }
 @app.delete("/unregister_tensor_server")
         else:
             files_status[filename] = {"exists": exists, "size_bytes": 0}
+    # Start model distribution if we have tensor servers
+    distribution_status = "not_started"
+    if state.tensor_servers:
+        print("[INFO] Starting automatic model distribution...")
+        try:
+            # Split model into chunks
+            if await split_model_weights():
+                print(f"[INFO] Successfully split model into {len(state.model_chunks)} chunks")
+                # Distribute chunks to servers
+                if await distribute_model_chunks():
+                    print("[INFO] Successfully distributed chunks to tensor servers")
+                    distribution_status = "completed"
+                else:
+                    print("[ERROR] Failed to distribute chunks")
+                    distribution_status = "distribution_failed"
+            else:
+                print("[ERROR] Failed to split model weights")
+                distribution_status = "split_failed"
+        except Exception as e:
+            print(f"[ERROR] Distribution error: {str(e)}")
+            distribution_status = f"error: {str(e)}"
+    else:
+        print("[INFO] No tensor servers registered yet. Will distribute when servers register.")
     return {
         "status": "initialized",
         "model_loaded": state.is_model_loaded,
         "total_size_bytes": total_size,
         "config_loaded": bool(state.model_config),
         "model_type": state.model_config.get("model_type", "unknown"),
+        "architecture": state.model_config.get("architectures", ["unknown"])[0],
+        "distribution_status": distribution_status,
+        "registered_servers": len(state.tensor_servers),
+        "chunks_created": len(state.model_chunks) if state.model_chunks else 0
     }
 # ===== Main Execution =====
 async def startup_event():
     """Initialize the server and start background tasks"""
     print("[INFO] Initializing system...")
+    try:
+        # Initialize system and download model
+        await initialize_system()
+        print("[INFO] Model initialization complete")
+        # If we have pre-configured tensor servers, try to connect to them
+        if Settings.TENSOR_SERVER_URLS:
+            print(f"[INFO] Attempting to connect to {len(Settings.TENSOR_SERVER_URLS)} pre-configured tensor servers...")
+            for url in Settings.TENSOR_SERVER_URLS:
+                try:
+                    if await check_tensor_server_health(url):
+                        state.tensor_servers[str(url)] = TensorServer(url=url)
+                        print(f"[INFO] Successfully registered pre-configured server at {url}")
+                except Exception as e:
+                    print(f"[WARN] Failed to connect to pre-configured server {url}: {str(e)}")
+        # If we have both model and servers, start distribution
+        if state.is_model_loaded and state.tensor_servers:
+            print("[INFO] Starting initial model distribution...")
+            if await split_model_weights():
+                print(f"[INFO] Split model into {len(state.model_chunks)} chunks")
+                if await distribute_model_chunks():
+                    print("[INFO] Successfully completed initial distribution")
+                else:
+                    print("[WARN] Initial distribution failed")
+            else:
+                print("[WARN] Failed to split model weights")
+    except Exception as e:
+        print(f"[ERROR] Startup error: {str(e)}")
     # Start monitoring task
     asyncio.create_task(monitor_tensor_servers())