Update app.py
Browse files
app.py
CHANGED
|
@@ -139,46 +139,94 @@ state = ControllerState()
|
|
| 139 |
|
| 140 |
# ===== Helper Functions =====
|
| 141 |
async def split_model_weights():
|
| 142 |
-
"""Split model
|
| 143 |
try:
|
| 144 |
-
import
|
| 145 |
import math
|
|
|
|
|
|
|
| 146 |
|
| 147 |
-
#
|
| 148 |
-
try:
|
| 149 |
-
import safetensors
|
| 150 |
-
except ImportError:
|
| 151 |
-
print("[INFO] Installing required packages...")
|
| 152 |
-
import subprocess
|
| 153 |
-
subprocess.check_call(["pip", "install", "safetensors", "packaging"])
|
| 154 |
-
|
| 155 |
-
# Load the full model weights
|
| 156 |
-
import torch
|
| 157 |
-
from safetensors.torch import load_file as load_safetensors
|
| 158 |
-
|
| 159 |
-
# Try safetensors first with chunked loading, then fallback to pytorch
|
| 160 |
try:
|
| 161 |
model_file = next(f for f in state.model_files.values() if f.endswith('.safetensors'))
|
| 162 |
-
print(f"[INFO]
|
| 163 |
-
try:
|
| 164 |
-
# Try direct loading first
|
| 165 |
-
weights = load_safetensors(model_file)
|
| 166 |
-
except Exception as e:
|
| 167 |
-
if "header too large" in str(e):
|
| 168 |
-
print("[INFO] Large header detected, attempting chunked loading...")
|
| 169 |
-
from safetensors import safe_open
|
| 170 |
-
weights = {}
|
| 171 |
-
with safe_open(model_file, framework="pt") as f:
|
| 172 |
-
for key in f.keys():
|
| 173 |
-
weights[key] = f.get_tensor(key)
|
| 174 |
-
print("[INFO] Successfully loaded weights using chunked loading")
|
| 175 |
-
else:
|
| 176 |
-
raise e
|
| 177 |
except StopIteration:
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
|
| 183 |
# Calculate total model size and chunks
|
| 184 |
total_size_bytes = sum(p.nelement() * p.element_size() for p in weights.values())
|
|
|
|
| 139 |
|
| 140 |
# ===== Helper Functions =====
async def split_model_weights():
    """Split the model weight file into chunks, one per tensor server, without
    loading the whole model (or any whole chunk) into memory.

    Locates the weight file in ``state.model_files`` (preferring
    ``.safetensors``, falling back to ``.bin``), byte-splits it into a
    ``chunks/`` directory next to the original file, and records a
    ``ModelChunk`` (offset, size, ordering metadata) in
    ``state.model_chunks`` for each piece.

    Returns:
        bool: True on success, False on any failure (the error is printed,
        never raised to the caller).
    """
    try:
        import os
        import math

        # Find model file (safetensors or pytorch)
        try:
            model_file = next(f for f in state.model_files.values() if f.endswith('.safetensors'))
            print(f"[INFO] Found safetensors file: {model_file}")
        except StopIteration:
            try:
                model_file = next(f for f in state.model_files.values() if f.endswith('.bin'))
                print(f"[INFO] Found PyTorch file: {model_file}")
            except StopIteration:
                # Caught by the outer handler below and surfaced as a False return.
                raise FileNotFoundError("No model weight files found") from None

        # Get file size and calculate chunks — one chunk per server initially.
        file_size = os.path.getsize(model_file)
        num_servers = len(state.tensor_servers) or len(Settings.TENSOR_SERVER_URLS)
        num_chunks = num_servers

        chunk_size = math.ceil(file_size / num_chunks)
        print(f"[INFO] Model file size: {file_size / (1024*1024*1024):.2f} GB")
        print(f"[INFO] Creating {num_chunks} chunks of {chunk_size / (1024*1024):.2f} MB each")

        # Create chunks directory if it doesn't exist
        chunks_dir = os.path.join(os.path.dirname(model_file), "chunks")
        os.makedirs(chunks_dir, exist_ok=True)

        # Stream each chunk in bounded pieces so peak memory stays at
        # buffer_size instead of a whole (possibly multi-GB) chunk.
        buffer_size = 64 * 1024 * 1024  # 64 MiB copy buffer

        chunk_sizes = []  # Bytes actually written per chunk, for verification.
        with open(model_file, 'rb') as f:
            for chunk_id in range(num_chunks):
                chunk_path = os.path.join(chunks_dir, f"chunk_{chunk_id}.bin")

                # Calculate chunk boundaries
                start_pos = chunk_id * chunk_size
                remaining = file_size - start_pos
                current_chunk_size = min(chunk_size, remaining)

                if current_chunk_size <= 0:
                    break

                # Copy the byte range [start_pos, start_pos + current_chunk_size)
                # into the chunk file, buffer_size bytes at a time.
                f.seek(start_pos)
                to_copy = current_chunk_size
                with open(chunk_path, 'wb') as chunk_file:
                    while to_copy > 0:
                        data = f.read(min(buffer_size, to_copy))
                        if not data:
                            # Unexpected EOF; the size check below will report it.
                            break
                        chunk_file.write(data)
                        to_copy -= len(data)

                chunk_sizes.append(current_chunk_size - to_copy)

                # Create chunk metadata
                state.model_chunks[chunk_id] = ModelChunk(
                    chunk_id=chunk_id,
                    files=[f"chunk_{chunk_id}.bin"],
                    config={
                        "start_offset": start_pos,
                        "size_bytes": current_chunk_size,
                        "is_last_chunk": chunk_id == num_chunks - 1,
                        "total_chunks": num_chunks,
                        "original_file": os.path.basename(model_file)
                    },
                    size_bytes=current_chunk_size,
                    status="ready"
                )

                print(f"[INFO] Created chunk {chunk_id}: {current_chunk_size / (1024*1024):.2f} MB")

        # Verify distribution: every byte of the original must be accounted for.
        total_size_actual = sum(chunk_sizes)
        if total_size_actual != file_size:
            print(f"[WARN] Total chunk size ({total_size_actual}) differs from original file size ({file_size})")

        print("\n[INFO] Distribution Summary:")
        print(f"- Original file: {os.path.basename(model_file)}")
        print(f"- Total size: {file_size / (1024*1024*1024):.2f} GB")
        print(f"- Number of chunks: {len(state.model_chunks)}")
        print(f"- Chunks directory: {chunks_dir}")
        print(f"- Chunk size: {chunk_size / (1024*1024):.2f} MB")

        return True

    except Exception as e:
        print(f"[ERROR] Failed to split model weights: {str(e)}")
        return False
|
| 230 |
|
| 231 |
# Calculate total model size and chunks
|
| 232 |
total_size_bytes = sum(p.nelement() * p.element_size() for p in weights.values())
|