Removed scale options

app.py CHANGED
@@ -118,7 +118,7 @@ def download_and_upload_non_model_files(base_model_name, output_repo_name):
         shutil.rmtree(temp_config_dir, ignore_errors=True)
 
 def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo_name,
-                         lora_scale, lm_head_scale, multiplicative_lora, progress=gr.Progress()):
+                         multiplicative_lora, progress=gr.Progress()):
     temp_lora_dir = None
     try:
         login(hf_token)
@@ -127,11 +127,8 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
         info_fn("Loading LoRA adapter...")
 
         # Load LoRA state (this downloads the adapter)
-        lora_state, base_scale, temp_lora_dir = load_lora_state(lora_model_name)
-
-        # Apply LoRA scale multiplier
-        scale = base_scale * lora_scale
-        info_fn(f"Using LoRA scale: {scale} (base: {base_scale}, multiplier: {lora_scale})")
+        lora_state, scale, temp_lora_dir = load_lora_state(lora_model_name)
+        info_fn(f"Using LoRA scale: {scale}")
 
         progress(0.2, desc="Creating output repository...")
 
@@ -161,7 +158,6 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
         info_fn(f"Found {len(shard_files)} model shards to process")
 
         merged_tensors = 0
-        scaled_lm_heads = 0
         total_shards = len(shard_files)
 
         # Process each shard individually
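For orientation, the "process each shard individually" loop that the hunks above and below edit follows a download → merge → upload → delete cycle. A minimal sketch under stated assumptions: `hf_hub_download`, `safe_open`, `save_file`, and `HfApi.upload_file` are the real Hub/safetensors APIs, but the repo ids, `shard_files` list, and `temp_shard_dir` here are illustrative placeholders, and the actual app's loop differs in detail.

```python
# Sketch of the streaming shard loop (assumed glue code, not the app's exact source).
import os
import tempfile

from huggingface_hub import HfApi, hf_hub_download
from safetensors import safe_open
from safetensors.torch import save_file

base_model_name = "org/base-model"      # hypothetical repo ids for illustration
output_repo_name = "you/merged-model"
shard_files = ["model-00001-of-00002.safetensors",
               "model-00002-of-00002.safetensors"]
temp_shard_dir = tempfile.mkdtemp()

api = HfApi()
for shard_filename in shard_files:
    # Download one shard at a time so peak disk/RAM stays near one shard's size.
    shard_path = hf_hub_download(base_model_name, shard_filename)

    tensors = {}
    with safe_open(shard_path, framework="pt", device="cpu") as f:
        metadata = f.metadata()
        for key in f.keys():
            tensors[key] = f.get_tensor(key)  # the app merges LoRA here, per key

    output_shard_path = os.path.join(temp_shard_dir, f"processed_{shard_filename}")
    save_file(tensors, output_shard_path, metadata=metadata)

    # Upload the processed shard, then free local disk before the next one.
    api.upload_file(path_or_fileobj=output_shard_path,
                    path_in_repo=shard_filename,
                    repo_id=output_repo_name)
    os.remove(output_shard_path)
```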
@@ -186,7 +182,6 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
             # Process the shard
             tensors = {}
             shard_merged_count = 0
-            shard_lm_head_count = 0
 
             with safe_open(shard_path, framework='pt', device='cpu') as f:
                 # Get metadata if available
@@ -195,16 +190,6 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
                 for key in f.keys():
                     tensor = f.get_tensor(key)
 
-                    # Apply lm_head scaling if applicable
-                    if key.endswith('lm_head.weight') and lm_head_scale != 1.0:
-                        info_fn(f"Scaling {key} by {lm_head_scale}")
-                        original_dtype = tensor.dtype
-                        tensor = tensor.to(torch.float32)
-                        tensor = tensor * lm_head_scale
-                        tensor = tensor.to(original_dtype)
-                        shard_lm_head_count += 1
-                        scaled_lm_heads += 1
-
                     # Try to find corresponding LoRA weights
                     lora_A, lora_B = find_lora_weights(lora_state, key)
 
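The per-key merge that follows `find_lora_weights` applies the two formulas from DETAILS_TEXT. A hypothetical sketch: `apply_lora` and the shapes are assumptions, not the app's code; the README writes the delta as `B^T @ A`, which corresponds to `lora_B @ lora_A` under the usual PEFT storage layout.

```python
# Hypothetical per-tensor merge, computed in float32 and cast back (the removed
# lm_head block used the same dtype round-trip). Assumed shapes follow the usual
# PEFT layout: W (out, in), lora_A (r, in), lora_B (out, r).
import torch

def apply_lora(W, lora_A, lora_B, scale, multiplicative=False):
    original_dtype = W.dtype
    W32 = W.to(torch.float32)
    delta = scale * (lora_B.to(torch.float32) @ lora_A.to(torch.float32))
    if multiplicative:
        # Multiplicative LoRA: W_new = W + scale * (B @ A) @ W
        # (requires an adapter whose delta is square, i.e. (out, out))
        W32 = W32 + delta @ W32
    else:
        # Standard additive LoRA: W_new = W + scale * B @ A
        W32 = W32 + delta
    return W32.to(original_dtype)

W = torch.randn(16, 32, dtype=torch.bfloat16)
A, B = torch.randn(4, 32), torch.randn(16, 4)
merged = apply_lora(W, A, B, scale=2.0)   # same shape and dtype as W
```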
@@ -241,7 +226,7 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
             output_shard_path = os.path.join(temp_shard_dir, f"processed_{shard_filename}")
             save_file(tensors, output_shard_path, metadata=metadata)
 
-            info_fn(f"Shard {shard_filename}
+            info_fn(f"Shard {shard_filename}: Merged {shard_merged_count} tensors")
 
             # Upload the processed shard
             api.upload_file(
@@ -261,7 +246,7 @@ def merge_lora_efficient(hf_token, base_model_name, lora_model_name, output_repo
 
         progress(1.0, desc="Upload completed!")
 
-        success_msg = f"✅ Successfully merged and uploaded model!\nModel URL: https://huggingface.co/{output_repo_name}\nProcessed {total_shards} shards\nMerged {merged_tensors} layers with LoRA weights
+        success_msg = f"✅ Successfully merged and uploaded model!\nModel URL: https://huggingface.co/{output_repo_name}\nProcessed {total_shards} shards\nMerged {merged_tensors} layers with LoRA weights"
         info_fn("Merge completed successfully!")
 
         return success_msg
@@ -287,24 +272,16 @@ This tool merges LoRA (Low-Rank Adaptation) adapters with base models using a me
 - **Streaming Processing**: Downloads → Processes → Uploads → Deletes each shard sequentially
 - **Automatic Cleanup**: Temporary files are automatically removed after processing
 - **Progress Tracking**: Real-time status updates throughout the merge process
-- **Advanced Options**:
+- **Advanced Options**: Multiplicative LoRA support
 """
 
 DETAILS_TEXT = """
 ### How It Works
-LoRA enables efficient fine-tuning by adding small adapter weights rather than modifying the entire model. This tool applies the LoRA transformation
+LoRA enables efficient fine-tuning by adding small adapter weights rather than modifying the entire model. This tool applies the LoRA transformation:
 
 - **Standard Additive-LoRA**: `W_new = W + scale × B^T @ A`
 - **Multiplicative LoRA**: `W_new = W + scale × B^T @ A @ W`
 
-Additionally, the model's default temperature behavior can be adjusted by scaling the `lm_head.weight` tensor:
-
-- **Up-scaling**: Makes the model's outputs more peaked, requiring lower temperature settings for the same output distribution
-- **Down-scaling**: Makes the model's outputs flatter, requiring higher temperature settings for the same output distribution
-- **Examples**:
-  - Scaling `lm_head.weight` by `1.25` makes the new model with `temperature = 1.0` act like the old model with `temperature = 0.8`
-  - Scaling `lm_head.weight` by `0.667` makes the new model with `temperature = 1.0` act like the old model with `temperature = 1.5`
-
 ### Memory Efficiency
 - **Traditional approach**: Loads entire model (~15GB+ for 7B parameter models)
 - **This approach**: Peak usage determined by largest shard size, not total model size
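The temperature notes removed in this hunk rest on a simple identity: logits are linear in `lm_head.weight`, so scaling it by `k` is equivalent to dividing the sampling temperature by `k`. A quick check of the removed `1.25 ↔ 0.8` example, with stand-in logits:

```python
# softmax(1.25 * z) == softmax(z / 0.8), since 1.25 == 1 / 0.8:
# a head scaled by 1.25 at T = 1.0 matches the original head at T = 0.8.
import torch

z = torch.randn(100)  # stand-in logits for one next-token distribution
scaled_head = torch.softmax(z * 1.25, dim=-1)   # lm_head scaled, T = 1.0
low_temp    = torch.softmax(z / 0.8,  dim=-1)   # original lm_head, T = 0.8
assert torch.allclose(scaled_head, low_temp, atol=1e-6)
```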
@@ -351,22 +328,6 @@ with gr.Blocks(title="Memory-Efficient LoRA Merge", theme=gr.themes.Soft()) as d
         )
 
         gr.Markdown("### Advanced Options")
-        lora_scale = gr.Number(
-            label="LoRA Scale",
-            value=1.0,
-            minimum=0.0,
-            maximum=10.0,
-            step=0.1,
-            info="Multiplier for LoRA strength (1.0 = default)"
-        )
-        lm_head_scale = gr.Number(
-            label="LM Head Scale",
-            value=1.0,
-            minimum=0.1,
-            maximum=5.0,
-            step=0.05,
-            info="Multiplier for lm_head weights (1.0 = default)"
-        )
         multiplicative_lora = gr.Checkbox(
             label="Multiplicative LoRA",
             value=False,
@@ -387,8 +348,7 @@ with gr.Blocks(title="Memory-Efficient LoRA Merge", theme=gr.themes.Soft()) as d
 
     submit_btn.click(
         fn=merge_lora_efficient,
-        inputs=[hf_token, base_model_name, lora_model_name, output_repo_name,
-                lora_scale, lm_head_scale, multiplicative_lora],
+        inputs=[hf_token, base_model_name, lora_model_name, output_repo_name, multiplicative_lora],
         outputs=output_text
     )
 