Update app.py
app.py CHANGED
@@ -28,7 +28,8 @@ class Settings:
     AGGREGATOR_URL = os.getenv("AGGREGATOR_URL", "http://192.168.1.104:8002")
 
     # Model settings
-    MODEL_REPO = "https://huggingface.co/
+    MODEL_REPO = "https://huggingface.co/facebook/opt-125m"
+
     # Server settings
     TENSOR_SERVER_TIMEOUT = 30 # seconds
     MAX_ERROR_THRESHOLD = 5 # maximum number of errors
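The new MODEL_REPO points at the facebook/opt-125m checkpoint but is hard-coded. Since AGGREGATOR_URL directly above is already resolved from the environment, the same pattern would make the checkpoint swappable per deployment; a minimal sketch (the MODEL_REPO environment variable is an assumption, not something this commit introduces):

```python
import os

# Hypothetical variant: fall back to the committed default, but let a
# deployment override the repo URL the same way AGGREGATOR_URL is handled.
MODEL_REPO = os.getenv("MODEL_REPO", "https://huggingface.co/facebook/opt-125m")
```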
@@ -159,7 +160,24 @@ async def split_model_weights():
         raise Exception("No model weight files found")
 
     # Get file size and calculate chunks
-
+    try:
+        with open(model_file, 'rb') as f:
+            # Get actual file size by seeking to end
+            f.seek(0, 2) # Seek to end
+            file_size = f.tell() # Get position (total size)
+            f.seek(0) # Reset to beginning
+
+            # Read first few bytes to verify file isn't corrupted
+            header = f.read(8)
+            if len(header) == 0:
+                raise ValueError(f"File is empty: {model_file}")
+    except Exception as e:
+        raise Exception(f"Failed to read model file {model_file}: {str(e)}")
+
+    # Verify file size is reasonable
+    if file_size < 1024: # Less than 1KB
+        raise ValueError(f"Model file suspiciously small ({file_size} bytes). Possible corruption or incomplete download.")
+
     num_servers = len(state.tensor_servers) or len(Settings.TENSOR_SERVER_URLS)
     num_chunks = num_servers # One chunk per server initially
 
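Sizing the file with seek(0, 2) plus tell() works on any seekable binary handle and doubles as a readability check, since open() raises if the file cannot be read at all. For comparison, the standard library can report the size without opening the file; a standalone sketch (the helper name is illustrative, not from app.py):

```python
import os

def probe_model_file(path: str) -> int:
    """Return the file size in bytes, failing fast on an empty file."""
    size = os.path.getsize(path)  # same value as f.seek(0, 2); f.tell()
    if size == 0:
        raise ValueError(f"File is empty: {path}")
    return size
```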
@@ -168,13 +186,13 @@ async def split_model_weights():
     # Format sizes for display
     def format_size(size_bytes):
         if size_bytes >= 1024*1024*1024: # GB
-            return f"{size_bytes / (1024*1024*1024):.2f} GB"
+            return f"{size_bytes / (1024*1024*1024):.2f} GB ({size_bytes:,} bytes)"
         elif size_bytes >= 1024*1024: # MB
-            return f"{size_bytes / (1024*1024):.2f} MB"
+            return f"{size_bytes / (1024*1024):.2f} MB ({size_bytes:,} bytes)"
         elif size_bytes >= 1024: # KB
-            return f"{size_bytes / 1024:.2f} KB"
+            return f"{size_bytes / 1024:.2f} KB ({size_bytes:,} bytes)"
         else:
-            return f"{size_bytes} bytes"
+            return f"{size_bytes:,} bytes"
 
     print(f"[INFO] Model file size: {format_size(file_size)}")
     print(f"[INFO] Creating {num_chunks} chunks of approximately {format_size(chunk_size)} each")
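Appending the exact, comma-grouped byte count to each rounded figure makes small truncations visible that a two-decimal value would hide. Expected outputs of the patched helper:

```python
format_size(512)          # -> "512 bytes"
format_size(2048)         # -> "2.00 KB (2,048 bytes)"
format_size(139_000_000)  # -> "132.56 MB (139,000,000 bytes)"
```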
@@ -197,13 +215,21 @@ async def split_model_weights():
                 break
 
             # Read and write chunk
-
-
-
-
-
-
-
+            try:
+                f.seek(start_pos)
+                chunk_data = f.read(current_chunk_size)
+                actual_chunk_size = len(chunk_data)
+
+                if actual_chunk_size != current_chunk_size:
+                    print(f"[WARN] Chunk {chunk_id} size mismatch. Expected: {current_chunk_size}, Got: {actual_chunk_size}")
+
+                with open(chunk_path, 'wb') as chunk_file:
+                    chunk_file.write(chunk_data)
+
+                chunk_sizes.append(actual_chunk_size)
+                print(f"[DEBUG] Chunk {chunk_id} data: First few bytes: {chunk_data[:20].hex()}")
+            except Exception as e:
+                raise Exception(f"Failed to process chunk {chunk_id} at offset {start_pos}: {str(e)}")
 
             # Create chunk metadata
             state.model_chunks[chunk_id] = ModelChunk(
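The new block records the number of bytes actually read, warns when it differs from the requested chunk size, and logs a short hex preview of each chunk. A natural end-to-end check, not part of this commit, is to confirm the written chunks concatenate back to the source file; a sketch using only the standard library (the helper name and signature are hypothetical):

```python
import hashlib

def chunks_match_source(source_path, chunk_paths):
    """Compare a SHA-256 of the source file against a SHA-256 of the
    chunks streamed in order; equal digests mean no chunk was truncated,
    duplicated, or written out of sequence."""
    def digest(paths):
        h = hashlib.sha256()
        for p in paths:
            with open(p, 'rb') as f:
                for block in iter(lambda: f.read(1 << 20), b''):
                    h.update(block)
        return h.hexdigest()
    return digest([source_path]) == digest(chunk_paths)
```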
@@ -1065,7 +1091,7 @@ if __name__ == "__main__":
     print(f"[INFO] API Documentation available at http://localhost:{port}/docs")
 
     uvicorn.run(
-        "
+        "app:app",
         host="0.0.0.0",
         port=port,
         reload=False
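uvicorn.run accepts either an application object or a "module:attribute" import string; "app:app" names the app attribute in app.py. The string form is what uvicorn requires when reload or multiple workers are enabled, so with reload=False the object form would be equivalent (assuming the ASGI instance in app.py is named app):

```python
import uvicorn

# Equivalent direct-object form; fine here because reload=False.
uvicorn.run(app, host="0.0.0.0", port=port, reload=False)
```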