Spaces:
Running
Running
Commit
·
608088a
1
Parent(s):
3f5a4ba
adding some logs to investigate
Browse files
vms/ui/project/services/training.py
CHANGED
|
@@ -1664,25 +1664,25 @@ class TrainingService:
|
|
| 1664 |
# Check in lora_weights directory
|
| 1665 |
lora_weights_dir = self.app.output_path / "lora_weights"
|
| 1666 |
if lora_weights_dir.exists():
|
| 1667 |
-
|
| 1668 |
|
| 1669 |
# Look for the latest checkpoint directory in lora_weights
|
| 1670 |
lora_checkpoints = [d for d in lora_weights_dir.glob("*") if d.is_dir() and d.name.isdigit()]
|
| 1671 |
if lora_checkpoints:
|
| 1672 |
latest_lora_checkpoint = max(lora_checkpoints, key=lambda x: int(x.name))
|
| 1673 |
-
|
| 1674 |
|
| 1675 |
# Extract step count from directory name
|
| 1676 |
result["steps"] = int(latest_lora_checkpoint.name)
|
| 1677 |
|
| 1678 |
# List contents of the latest checkpoint directory
|
| 1679 |
checkpoint_contents = list(latest_lora_checkpoint.glob("*"))
|
| 1680 |
-
|
| 1681 |
|
| 1682 |
# Check for weights in the latest LoRA checkpoint
|
| 1683 |
lora_safetensors = latest_lora_checkpoint / "pytorch_lora_weights.safetensors"
|
| 1684 |
if lora_safetensors.exists():
|
| 1685 |
-
|
| 1686 |
result["path"] = str(lora_safetensors)
|
| 1687 |
return result
|
| 1688 |
|
|
@@ -1697,14 +1697,14 @@ class TrainingService:
|
|
| 1697 |
for weight_file in possible_weight_files:
|
| 1698 |
weight_path = latest_lora_checkpoint / weight_file
|
| 1699 |
if weight_path.exists():
|
| 1700 |
-
|
| 1701 |
result["path"] = str(weight_path)
|
| 1702 |
return result
|
| 1703 |
|
| 1704 |
# Check if any .safetensors files exist
|
| 1705 |
safetensors_files = list(latest_lora_checkpoint.glob("*.safetensors"))
|
| 1706 |
if safetensors_files:
|
| 1707 |
-
|
| 1708 |
# Return the first .safetensors file found
|
| 1709 |
result["path"] = str(safetensors_files[0])
|
| 1710 |
return result
|
|
@@ -1712,7 +1712,7 @@ class TrainingService:
|
|
| 1712 |
# Fallback: check for direct safetensors file in lora_weights root
|
| 1713 |
lora_safetensors = lora_weights_dir / "pytorch_lora_weights.safetensors"
|
| 1714 |
if lora_safetensors.exists():
|
| 1715 |
-
|
| 1716 |
result["path"] = str(lora_safetensors)
|
| 1717 |
return result
|
| 1718 |
else:
|
|
@@ -1753,7 +1753,10 @@ class TrainingService:
|
|
| 1753 |
Returns:
|
| 1754 |
Path to safetensors file or None if not found
|
| 1755 |
"""
|
| 1756 |
-
|
|
|
|
|
|
|
|
|
|
| 1757 |
|
| 1758 |
def create_training_dataset_zip(self) -> str:
|
| 1759 |
"""Create a ZIP file containing all training data
|
|
|
|
| 1664 |
# Check in lora_weights directory
|
| 1665 |
lora_weights_dir = self.app.output_path / "lora_weights"
|
| 1666 |
if lora_weights_dir.exists():
|
| 1667 |
+
logger.info(f"Found lora_weights directory: {lora_weights_dir}")
|
| 1668 |
|
| 1669 |
# Look for the latest checkpoint directory in lora_weights
|
| 1670 |
lora_checkpoints = [d for d in lora_weights_dir.glob("*") if d.is_dir() and d.name.isdigit()]
|
| 1671 |
if lora_checkpoints:
|
| 1672 |
latest_lora_checkpoint = max(lora_checkpoints, key=lambda x: int(x.name))
|
| 1673 |
+
logger.info(f"Found latest LoRA checkpoint: {latest_lora_checkpoint}")
|
| 1674 |
|
| 1675 |
# Extract step count from directory name
|
| 1676 |
result["steps"] = int(latest_lora_checkpoint.name)
|
| 1677 |
|
| 1678 |
# List contents of the latest checkpoint directory
|
| 1679 |
checkpoint_contents = list(latest_lora_checkpoint.glob("*"))
|
| 1680 |
+
logger.info(f"Contents of LoRA checkpoint {latest_lora_checkpoint.name}: {checkpoint_contents}")
|
| 1681 |
|
| 1682 |
# Check for weights in the latest LoRA checkpoint
|
| 1683 |
lora_safetensors = latest_lora_checkpoint / "pytorch_lora_weights.safetensors"
|
| 1684 |
if lora_safetensors.exists():
|
| 1685 |
+
logger.info(f"Found weights in latest LoRA checkpoint: {lora_safetensors}")
|
| 1686 |
result["path"] = str(lora_safetensors)
|
| 1687 |
return result
|
| 1688 |
|
|
|
|
| 1697 |
for weight_file in possible_weight_files:
|
| 1698 |
weight_path = latest_lora_checkpoint / weight_file
|
| 1699 |
if weight_path.exists():
|
| 1700 |
+
logger.info(f"Found weights file {weight_file} in latest LoRA checkpoint: {weight_path}")
|
| 1701 |
result["path"] = str(weight_path)
|
| 1702 |
return result
|
| 1703 |
|
| 1704 |
# Check if any .safetensors files exist
|
| 1705 |
safetensors_files = list(latest_lora_checkpoint.glob("*.safetensors"))
|
| 1706 |
if safetensors_files:
|
| 1707 |
+
logger.info(f"Found .safetensors files in LoRA checkpoint: {safetensors_files}")
|
| 1708 |
# Return the first .safetensors file found
|
| 1709 |
result["path"] = str(safetensors_files[0])
|
| 1710 |
return result
|
|
|
|
| 1712 |
# Fallback: check for direct safetensors file in lora_weights root
|
| 1713 |
lora_safetensors = lora_weights_dir / "pytorch_lora_weights.safetensors"
|
| 1714 |
if lora_safetensors.exists():
|
| 1715 |
+
logger.info(f"Found weights in lora_weights directory: {lora_safetensors}")
|
| 1716 |
result["path"] = str(lora_safetensors)
|
| 1717 |
return result
|
| 1718 |
else:
|
|
|
|
| 1753 |
Returns:
|
| 1754 |
Path to safetensors file (a gr.Error is raised if no model weights are found)
|
| 1755 |
"""
|
| 1756 |
+
path = self.get_model_output_info()["path"]
|
| 1757 |
+
if not path:
|
| 1758 |
+
raise gr.Error("No model weights found. Please train a model first.")
|
| 1759 |
+
return path
|
| 1760 |
|
| 1761 |
def create_training_dataset_zip(self) -> str:
|
| 1762 |
"""Create a ZIP file containing all training data
|