Update app.py
app.py CHANGED
@@ -110,7 +110,16 @@ def log_message(output_log, msg):
 # ==== Main Training ====
 @spaces.GPU(duration=300)
 def train_model(base_model, dataset_name, num_epochs, batch_size, learning_rate, hf_repo):
+    """
+    Fine-tune a base model using LoRA with train/test split and async upload.
+    """
     output_log = []
+    test_split = 0.2
+
+    def log_message(log_list, msg):
+        print(msg)
+        log_list.append(msg)
+
     try:
         log_message(output_log, "🔍 Initializing training sequence...")
 
@@ -122,9 +131,15 @@ def train_model(base_model, dataset_name, num_epochs, batch_size, learning_rate,
 
         # ===== Load dataset =====
         log_message(output_log, f"\n📚 Loading dataset: {dataset_name} ...")
-        dataset = load_dataset(dataset_name
-
-
+        dataset = load_dataset(dataset_name)
+        # Determine train/test split
+        dataset = dataset["train"].train_test_split(test_size=test_split)
+        train_dataset = dataset["train"]
+        test_dataset = dataset["test"]
+
+        log_message(output_log, f"   Training samples: {len(train_dataset)}")
+        log_message(output_log, f"   Test samples: {len(test_dataset)}")
+        log_message(output_log, f"   Columns: {train_dataset.column_names}")
 
         # ===== Format examples =====
         def format_example(item):
@@ -136,14 +151,17 @@ You are a wise teacher interpreting Bhagavad Gita with deep insights.
 <|assistant|>
 """
         return {"text": prompt}
-
-
+
+        train_dataset = train_dataset.map(format_example)
+        test_dataset = test_dataset.map(format_example)
+        log_message(output_log, f"✅ Formatted {len(train_dataset)} train + {len(test_dataset)} test examples")
 
         # ===== Load model & tokenizer =====
         log_message(output_log, f"\n🤖 Loading model: {base_model}")
         tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
+
         model = AutoModelForCausalLM.from_pretrained(
             base_model,
             trust_remote_code=True,
@@ -179,7 +197,9 @@ You are a wise teacher interpreting Bhagavad Gita with deep insights.
             )
             tokenized["labels"] = tokenized["input_ids"].copy()
             return tokenized
-
+
+        train_dataset = train_dataset.map(tokenize_fn, batched=True)
+        test_dataset = test_dataset.map(tokenize_fn, batched=True)
         log_message(output_log, "✅ Tokenization + labels done")
 
         # ===== Training arguments =====
@@ -192,15 +212,18 @@ You are a wise teacher interpreting Bhagavad Gita with deep insights.
             warmup_steps=10,
             logging_steps=5,
             save_strategy="epoch",
+            evaluation_strategy="epoch",
             fp16=device == "cuda",
             optim="adamw_torch",
             learning_rate=learning_rate,
             max_steps=100,
         )
+
         trainer = Trainer(
             model=model,
             args=training_args,
-            train_dataset=
+            train_dataset=train_dataset,
+            eval_dataset=test_dataset,
             tokenizer=tokenizer,
         )
 
@@ -211,7 +234,7 @@ You are a wise teacher interpreting Bhagavad Gita with deep insights.
         trainer.save_model(output_dir)
         tokenizer.save_pretrained(output_dir)
 
-        # ===== Async upload
+        # ===== Async upload =====
         log_message(output_log, f"\n☁️ Initiating async upload to {hf_repo}")
         start_async_upload(output_dir, hf_repo, output_log)
 
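The substantive change in this commit is the switch from training on the full dataset to an 80/20 train/test split with per-epoch evaluation wired into the Trainer. Below is a minimal, self-contained sketch of that pattern only, not the Space's actual code: the in-memory dataset, the distilgpt2 checkpoint, and the hyperparameters are placeholders chosen so the example runs quickly on CPU. Depending on the installed transformers version, the TrainingArguments field may be spelled eval_strategy rather than evaluation_strategy.

# Sketch of the split + per-epoch-eval pattern this commit introduces.
# Dataset and checkpoint below are placeholders, not the Space's real inputs.
from datasets import Dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
)

# Synthetic in-memory dataset standing in for load_dataset(dataset_name)["train"]
raw = Dataset.from_dict({"text": [f"Example passage number {i}." for i in range(100)]})
splits = raw.train_test_split(test_size=0.2, seed=42)   # same 80/20 split as the commit
train_ds, test_ds = splits["train"], splits["test"]

checkpoint = "distilgpt2"                                # small placeholder causal LM
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token            # GPT-2 family ships no pad token
model = AutoModelForCausalLM.from_pretrained(checkpoint)

def tokenize_fn(batch):
    out = tokenizer(batch["text"], truncation=True, padding="max_length", max_length=64)
    out["labels"] = out["input_ids"].copy()              # causal LM: labels mirror inputs
    return out

train_ds = train_ds.map(tokenize_fn, batched=True)
test_ds = test_ds.map(tokenize_fn, batched=True)

args = TrainingArguments(
    output_dir="demo-out",
    per_device_train_batch_size=4,
    num_train_epochs=1,
    evaluation_strategy="epoch",   # newer transformers releases name this eval_strategy
    save_strategy="no",
    logging_steps=5,
    report_to="none",
)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,        # the commit passes these two datasets in place of
    eval_dataset=test_ds,          # the old single train_dataset
    tokenizer=tokenizer,
)
trainer.train()                    # eval_loss on the held-out split is logged per epoch

The labels = input_ids.copy() step mirrors the commit's tokenize_fn: for causal language modeling the model shifts the labels internally, so copying the inputs is enough for the loss. Supplying eval_dataset together with a per-epoch evaluation strategy is what makes the held-out 20% show up as eval_loss in the training logs.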