Fred808 committed
Commit 1eb637f · verified · 1 Parent(s): 1e2f78e

Update app.py

Files changed (1): app.py +40 -14
app.py CHANGED
@@ -28,7 +28,8 @@ class Settings:
     AGGREGATOR_URL = os.getenv("AGGREGATOR_URL", "http://192.168.1.104:8002")
 
     # Model settings
-    MODEL_REPO = "https://huggingface.co/microsoft/florence-2-large"
+    MODEL_REPO = "https://huggingface.co/facebook/opt-125m"
+
     # Server settings
     TENSOR_SERVER_TIMEOUT = 30  # seconds
     MAX_ERROR_THRESHOLD = 5  # maximum number of errors
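
Note: the MODEL_REPO default now points at facebook/opt-125m, a far smaller checkpoint than microsoft/florence-2-large, which keeps the chunk-splitting path quick to exercise. The download logic itself is outside this diff; a minimal sketch of how the URL could be resolved with huggingface_hub (fetch_model_weights is a hypothetical helper, not code from app.py):

    from huggingface_hub import snapshot_download

    def fetch_model_weights(model_repo: str) -> str:
        # Hypothetical helper: reduce the URL to a repo id, then pull a
        # local snapshot of the repository; returns the local directory.
        repo_id = model_repo.removeprefix("https://huggingface.co/")
        return snapshot_download(repo_id=repo_id)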
@@ -159,7 +160,24 @@ async def split_model_weights():
         raise Exception("No model weight files found")
 
     # Get file size and calculate chunks
-    file_size = os.path.getsize(model_file)
+    try:
+        with open(model_file, 'rb') as f:
+            # Get actual file size by seeking to end
+            f.seek(0, 2)  # Seek to end
+            file_size = f.tell()  # Get position (total size)
+            f.seek(0)  # Reset to beginning
+
+            # Read first few bytes to verify file isn't corrupted
+            header = f.read(8)
+            if len(header) == 0:
+                raise ValueError(f"File is empty: {model_file}")
+    except Exception as e:
+        raise Exception(f"Failed to read model file {model_file}: {str(e)}")
+
+    # Verify file size is reasonable
+    if file_size < 1024:  # Less than 1KB
+        raise ValueError(f"Model file suspiciously small ({file_size} bytes). Possible corruption or incomplete download.")
+
     num_servers = len(state.tensor_servers) or len(Settings.TENSOR_SERVER_URLS)
     num_chunks = num_servers  # One chunk per server initially
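
Note: the seek/tell sequence yields the same number as the os.path.getsize(model_file) call it replaces; the gain is that an unreadable, locked, or empty file now fails loudly before chunking starts. A standalone sketch of the equivalence (size_via_seek is illustrative, not from app.py):

    import os

    def size_via_seek(path: str) -> int:
        # Opening the file exercises read permissions; seek(0, 2) moves to
        # the end, and tell() reports the offset there, i.e. the byte length.
        with open(path, 'rb') as f:
            f.seek(0, 2)
            return f.tell()

    assert size_via_seek(__file__) == os.path.getsize(__file__)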
 
@@ -168,13 +186,13 @@ async def split_model_weights():
     # Format sizes for display
     def format_size(size_bytes):
         if size_bytes >= 1024*1024*1024:  # GB
-            return f"{size_bytes / (1024*1024*1024):.2f} GB"
+            return f"{size_bytes / (1024*1024*1024):.2f} GB ({size_bytes:,} bytes)"
         elif size_bytes >= 1024*1024:  # MB
-            return f"{size_bytes / (1024*1024):.2f} MB"
+            return f"{size_bytes / (1024*1024):.2f} MB ({size_bytes:,} bytes)"
         elif size_bytes >= 1024:  # KB
-            return f"{size_bytes / 1024:.2f} KB"
+            return f"{size_bytes / 1024:.2f} KB ({size_bytes:,} bytes)"
         else:
-            return f"{size_bytes} bytes"
+            return f"{size_bytes:,} bytes"
 
     print(f"[INFO] Model file size: {format_size(file_size)}")
     print(f"[INFO] Creating {num_chunks} chunks of approximately {format_size(chunk_size)} each")
@@ -197,13 +215,21 @@ async def split_model_weights():
             break
 
         # Read and write chunk
-        f.seek(start_pos)
-        chunk_data = f.read(current_chunk_size)
-
-        with open(chunk_path, 'wb') as chunk_file:
-            chunk_file.write(chunk_data)
-
-        chunk_sizes.append(current_chunk_size)
+        try:
+            f.seek(start_pos)
+            chunk_data = f.read(current_chunk_size)
+            actual_chunk_size = len(chunk_data)
+
+            if actual_chunk_size != current_chunk_size:
+                print(f"[WARN] Chunk {chunk_id} size mismatch. Expected: {current_chunk_size}, Got: {actual_chunk_size}")
+
+            with open(chunk_path, 'wb') as chunk_file:
+                chunk_file.write(chunk_data)
+
+            chunk_sizes.append(actual_chunk_size)
+            print(f"[DEBUG] Chunk {chunk_id} data: First few bytes: {chunk_data[:20].hex()}")
+        except Exception as e:
+            raise Exception(f"Failed to process chunk {chunk_id} at offset {start_pos}: {str(e)}")
 
         # Create chunk metadata
         state.model_chunks[chunk_id] = ModelChunk(
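
Note: start_pos and current_chunk_size are computed earlier in the loop, outside this hunk. A minimal sketch of the presumed arithmetic (chunk_bounds is hypothetical; only the variable names are taken from the diff):

    import math

    def chunk_bounds(file_size: int, num_chunks: int):
        # Even split, rounded up so the chunks always cover the whole file;
        # the final chunk absorbs the remainder and may be smaller.
        chunk_size = math.ceil(file_size / num_chunks)
        for chunk_id in range(num_chunks):
            start_pos = chunk_id * chunk_size
            current_chunk_size = min(chunk_size, file_size - start_pos)
            if current_chunk_size <= 0:
                break
            yield chunk_id, start_pos, current_chunk_size

With arithmetic like this, the requested and actual read sizes only diverge if the file is truncated mid-read, which is exactly what the [WARN] branch above surfaces.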
@@ -1065,7 +1091,7 @@ if __name__ == "__main__":
     print(f"[INFO] API Documentation available at http://localhost:{port}/docs")
 
     uvicorn.run(
-        "controller_server_new:app",
+        "app:app",
         host="0.0.0.0",
         port=port,
         reload=False
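
Note: uvicorn resolves the "module:attribute" string at startup by importing the module and reading the attribute, so the old "controller_server_new:app" target would fail to import now that the server lives in app.py. The import string form is equivalent to passing the object directly (app is presumably a FastAPI instance, given the /docs endpoint above):

    import uvicorn
    from app import app  # the ASGI application defined in this file

    port = 8000  # placeholder; the real value comes from the surrounding code
    uvicorn.run(app, host="0.0.0.0", port=port, reload=False)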
 