Commit 4f107f2 (verified) · rahul7star committed · 1 parent: b1fceea

Update app.py

Files changed (1)
  1. app.py +31 -8
app.py CHANGED
@@ -110,7 +110,16 @@ def log_message(output_log, msg):
 # ==== Main Training ====
 @spaces.GPU(duration=300)
 def train_model(base_model, dataset_name, num_epochs, batch_size, learning_rate, hf_repo):
+    """
+    Fine-tune a base model using LoRA with train/test split and async upload.
+    """
     output_log = []
+    test_split=0.2
+
+    def log_message(log_list, msg):
+        print(msg)
+        log_list.append(msg)
+
     try:
         log_message(output_log, "🔍 Initializing training sequence...")
 
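Note: the nested `log_message` added here shadows the module-level helper named in the hunk header, so calls inside `train_model` resolve to the inner definition. A minimal sketch of the pattern (the joined-string return for the UI is an assumption, not shown in this diff):

```python
# Sketch of the logging pattern after this commit: the nested helper
# shadows the module-level log_message for everything inside train_model.
def log_message(log_list, msg):       # module-level helper (see hunk header)
    print(msg)
    log_list.append(msg)


def train_model_logging_demo():
    output_log = []

    def log_message(log_list, msg):   # nested copy added by this commit
        print(msg)
        log_list.append(msg)

    log_message(output_log, "🔍 Initializing training sequence...")
    return "\n".join(output_log)      # assumption: the Space joins the log for display


if __name__ == "__main__":
    print(train_model_logging_demo())
```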
 
@@ -122,9 +131,15 @@ def train_model(base_model, dataset_name, num_epochs, batch_size, learning_rate,
 
         # ===== Load dataset =====
         log_message(output_log, f"\n📚 Loading dataset: {dataset_name} ...")
-        dataset = load_dataset(dataset_name, split="train")
-        log_message(output_log, f" Loaded {len(dataset)} samples")
-        log_message(output_log, f" Columns: {dataset.column_names}")
+        dataset = load_dataset(dataset_name)
+        # Determine train/test split
+        dataset = dataset["train"].train_test_split(test_size=test_split)
+        train_dataset = dataset["train"]
+        test_dataset = dataset["test"]
+
+        log_message(output_log, f" Training samples: {len(train_dataset)}")
+        log_message(output_log, f" Test samples: {len(test_dataset)}")
+        log_message(output_log, f" Columns: {train_dataset.column_names}")
 
         # ===== Format examples =====
         def format_example(item):
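Note: `load_dataset(dataset_name)` without `split=` returns a `DatasetDict`, and `train_test_split` on its `"train"` split produces a new `DatasetDict` with `"train"` and `"test"` keys. A minimal sketch of the new split logic in isolation, with a hypothetical dataset id standing in for the UI-supplied `dataset_name`; the commit does not set a seed, one is added here only to make the split reproducible:

```python
from datasets import load_dataset

dataset = load_dataset("username/gita-qa")  # hypothetical id; returns a DatasetDict with a "train" split
splits = dataset["train"].train_test_split(test_size=0.2, seed=42)

train_dataset = splits["train"]
test_dataset = splits["test"]
print(len(train_dataset), len(test_dataset), train_dataset.column_names)
```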
@@ -136,14 +151,17 @@ You are a wise teacher interpreting Bhagavad Gita with deep insights.
 <|assistant|>
 """
             return {"text": prompt}
-        dataset = dataset.map(format_example)
-        log_message(output_log, f"✅ Formatted {len(dataset)} examples")
+
+        train_dataset = train_dataset.map(format_example)
+        test_dataset = test_dataset.map(format_example)
+        log_message(output_log, f"✅ Formatted {len(train_dataset)} train + {len(test_dataset)} test examples")
 
         # ===== Load model & tokenizer =====
         log_message(output_log, f"\n🤖 Loading model: {base_model}")
         tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)
         if tokenizer.pad_token is None:
             tokenizer.pad_token = tokenizer.eos_token
+
         model = AutoModelForCausalLM.from_pretrained(
             base_model,
             trust_remote_code=True,
@@ -179,7 +197,9 @@ You are a wise teacher interpreting Bhagavad Gita with deep insights.
             )
             tokenized["labels"] = tokenized["input_ids"].copy()
             return tokenized
-        dataset = dataset.map(tokenize_fn, batched=True)
+
+        train_dataset = train_dataset.map(tokenize_fn, batched=True)
+        test_dataset = test_dataset.map(tokenize_fn, batched=True)
         log_message(output_log, "✅ Tokenization + labels done")
 
         # ===== Training arguments =====
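Note: copying `input_ids` into `labels` trains on every token of the formatted prompt, the usual setup for causal-LM fine-tuning without prompt masking; the model shifts the labels internally when computing the loss. A sketch of the batched map, continuing from the tokenizer and split datasets above and assuming the `"text"` column plus a `max_length` of 512 (the real tokenizer arguments are defined earlier in app.py and elided in this hunk):

```python
def tokenize_fn(batch):
    tokenized = tokenizer(
        batch["text"],
        truncation=True,
        padding="max_length",
        max_length=512,  # assumption: the actual limit is set elsewhere in app.py
    )
    # For causal-LM training, the labels are the input ids themselves.
    tokenized["labels"] = tokenized["input_ids"].copy()
    return tokenized


train_dataset = train_dataset.map(tokenize_fn, batched=True)
test_dataset = test_dataset.map(tokenize_fn, batched=True)
```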
@@ -192,15 +212,18 @@ You are a wise teacher interpreting Bhagavad Gita with deep insights.
             warmup_steps=10,
             logging_steps=5,
             save_strategy="epoch",
+            evaluation_strategy="epoch",
             fp16=device == "cuda",
             optim="adamw_torch",
             learning_rate=learning_rate,
             max_steps=100,
         )
+
         trainer = Trainer(
             model=model,
             args=training_args,
-            train_dataset=dataset,
+            train_dataset=train_dataset,
+            eval_dataset=test_dataset,
             tokenizer=tokenizer,
         )
 
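Note: pairing `eval_dataset` with `evaluation_strategy="epoch"` is what enables the per-epoch evaluation pass. Two hedged caveats, since the Space's pinned `transformers` version is not visible in this diff: recent releases rename this argument to `eval_strategy`, and `max_steps=100` overrides `num_train_epochs`, so the UI's epoch count only caps training indirectly. A small compatibility sketch with placeholder values for the paths and UI parameters:

```python
import inspect
from transformers import TrainingArguments

# Use whichever evaluation-strategy keyword this transformers version accepts.
params = inspect.signature(TrainingArguments.__init__).parameters
eval_key = "eval_strategy" if "eval_strategy" in params else "evaluation_strategy"

training_args = TrainingArguments(
    output_dir="./results",          # placeholder; the real path is set earlier in app.py
    per_device_train_batch_size=2,   # placeholder for the UI's batch_size
    learning_rate=2e-4,              # placeholder for the UI's learning_rate
    num_train_epochs=1,              # effectively ignored while max_steps is set
    warmup_steps=10,
    logging_steps=5,
    save_strategy="epoch",
    optim="adamw_torch",
    max_steps=100,
    **{eval_key: "epoch"},
)
```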
 
@@ -211,7 +234,7 @@ You are a wise teacher interpreting Bhagavad Gita with deep insights.
         trainer.save_model(output_dir)
         tokenizer.save_pretrained(output_dir)
 
-        # ===== Async upload to repo from UI input =====
+        # ===== Async upload =====
         log_message(output_log, f"\n☁️ Initiating async upload to {hf_repo}")
         start_async_upload(output_dir, hf_repo, output_log)
 
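Note: `start_async_upload` is defined elsewhere in app.py and is not part of this diff. As a rough sketch only, a helper with this call signature could wrap `huggingface_hub.upload_folder` in a background thread so the upload does not block the Gradio response; everything beyond the name and arguments visible at the call site is an assumption:

```python
import threading
from huggingface_hub import upload_folder

def start_async_upload(output_dir, hf_repo, output_log):
    """Hypothetical sketch: push the saved model folder to the Hub in the background."""
    def _upload():
        try:
            upload_folder(
                folder_path=output_dir,
                repo_id=hf_repo,
                repo_type="model",
                commit_message="Upload fine-tuned model",
            )
            output_log.append(f"✅ Upload to {hf_repo} finished")
        except Exception as exc:  # surface failures in the same log the UI reads
            output_log.append(f"❌ Upload failed: {exc}")

    threading.Thread(target=_upload, daemon=True).start()
```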