Spaces:
Running
Running
File size: 11,273 Bytes
557c6b6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 |
"""
Functions for fine-tuning Gemma models
"""
import os
import time
import json
import threading
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datetime import datetime
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling
)
from peft import get_peft_model, LoraConfig, TaskType
from data_processing import create_train_val_split, format_for_training
from model_utils import load_model
from datasets import Dataset
# Module-level training state. TrainingThread.run() rebinds this name to a
# fresh dict and then mutates that dict as training progresses;
# load_training_state() returns whatever is currently bound here.
# It stays None until a TrainingThread has started running.
_TRAINING_STATE = None
class TrainingThread(threading.Thread):
    """Daemon thread that runs a LoRA fine-tuning job in the background.

    Progress is published through the module-level ``_TRAINING_STATE`` dict
    (status, step/epoch counters, elapsed time, plots, log text, sample
    generations and any error message).

    Args:
        model_name: Identifier passed to ``load_model``.
        dataset: Dataset passed to ``create_train_val_split``.
        params: Dict of hyper-parameters. Keys read by ``run`` are
            ``lora_r``, ``lora_alpha``, ``lora_dropout``, ``max_seq_length``,
            ``batch_size``, ``gradient_accumulation_steps``, ``num_epochs``,
            ``learning_rate``, ``weight_decay`` and ``warmup_steps``; each
            has a default when absent.
    """

    def __init__(self, model_name, dataset, params):
        # daemon=True: the thread will not keep the interpreter alive on exit.
        super().__init__(daemon=True)
        self.model_name = model_name
        self.dataset = dataset
        self.params = params
        self.stop_flag = False

    def run(self):
        """Run the training process.

        Loads the model, wraps it with LoRA adapters, prepares the train and
        validation sets, trains with a ``Trainer`` and saves the result under
        ``outputs/<timestamp>/final``. Any exception is caught, recorded in
        ``_TRAINING_STATE["error"]`` and printed; it is not re-raised because
        this is the top-level boundary of the thread.
        """
        global _TRAINING_STATE

        # Publish a fresh state dict before any fallible work so the error
        # handler below always has a dict to write into.
        _TRAINING_STATE = {
            "status": "initializing",
            "current_epoch": 0,
            "current_step": 0,
            "total_steps": 0,
            "elapsed_time": 0,
            "loss_plot": None,
            "eval_plot": None,
            "log": "",
            "samples": None,
            "error": None
        }

        try:
            # Imported locally: the callback below must subclass
            # TrainerCallback, which the file-level imports do not provide.
            from transformers import TrainerCallback

            # Create output directory
            output_dir = os.path.join("outputs", datetime.now().strftime("%Y%m%d_%H%M%S"))
            os.makedirs(output_dir, exist_ok=True)

            # Load the model and tokenizer
            model, tokenizer = load_model(self.model_name)

            # Apply LoRA configuration
            lora_config = LoraConfig(
                r=self.params.get("lora_r", 16),
                lora_alpha=self.params.get("lora_alpha", 32),
                lora_dropout=self.params.get("lora_dropout", 0.05),
                bias="none",
                task_type=TaskType.CAUSAL_LM
            )
            model = get_peft_model(model, lora_config)

            # Split dataset into train and validation
            train_data, val_data = create_train_val_split(self.dataset)

            # Format data for training
            max_length = self.params.get("max_seq_length", 512)
            train_formatted = format_for_training(train_data, tokenizer, max_length)
            val_formatted = format_for_training(val_data, tokenizer, max_length)

            # Convert to HF Datasets
            train_dataset = Dataset.from_dict(train_formatted)
            val_dataset = Dataset.from_dict(val_formatted)

            # Create data collator (mlm=False selects causal-LM collation)
            data_collator = DataCollatorForLanguageModeling(
                tokenizer=tokenizer,
                mlm=False
            )

            # Set up training arguments
            batch_size = self.params.get("batch_size", 4)
            gradient_accumulation_steps = self.params.get("gradient_accumulation_steps", 1)
            num_epochs = self.params.get("num_epochs", 3)

            # Preliminary estimate only; it is replaced by the Trainer's own
            # step count in UICallback.on_train_begin once training starts.
            train_steps = len(train_dataset) // batch_size // gradient_accumulation_steps * num_epochs
            _TRAINING_STATE["total_steps"] = train_steps

            # NOTE(review): newer transformers releases appear to rename
            # `evaluation_strategy` to `eval_strategy` -- confirm against the
            # version this project pins.
            training_args = TrainingArguments(
                output_dir=output_dir,
                learning_rate=self.params.get("learning_rate", 2e-5),
                per_device_train_batch_size=batch_size,
                per_device_eval_batch_size=batch_size,
                gradient_accumulation_steps=gradient_accumulation_steps,
                num_train_epochs=num_epochs,
                weight_decay=self.params.get("weight_decay", 0.01),
                warmup_steps=self.params.get("warmup_steps", 100),
                logging_dir=os.path.join(output_dir, "logs"),
                logging_steps=10,
                evaluation_strategy="epoch",
                save_strategy="epoch",
                save_total_limit=2,
                load_best_model_at_end=True,
                report_to="none"  # Disable wandb, tensorboard, etc.
            )

            class UICallback(TrainerCallback):
                """Trainer callback that mirrors progress into _TRAINING_STATE.

                Subclassing TrainerCallback supplies no-op handlers for every
                event the Trainer dispatches; only the ones needed here are
                overridden.
                """

                def __init__(self, thread):
                    self.thread = thread
                    self.start_time = time.time()
                    self.losses = []
                    self.eval_metrics = []
                    self.log_buffer = ""

                def _request_stop(self, control):
                    """Ask the Trainer to stop and mark the run as stopped."""
                    control.should_training_stop = True
                    _TRAINING_STATE["status"] = "stopped"

                @staticmethod
                def _line_plot(points, xlabel, ylabel, title):
                    """Build a line plot from (x, y) pairs and return the figure."""
                    fig, ax = plt.subplots(figsize=(10, 6))
                    xs, ys = zip(*points)
                    ax.plot(xs, ys)
                    ax.set_xlabel(xlabel)
                    ax.set_ylabel(ylabel)
                    ax.set_title(title)
                    ax.grid(True)
                    # Unregister the figure from pyplot's global registry so
                    # figures do not pile up over a long run; the returned
                    # Figure object itself is still usable by the caller.
                    plt.close(fig)
                    return fig

                def _append_log(self, entry):
                    """Append one line to the log and publish the full text."""
                    self.log_buffer += entry
                    _TRAINING_STATE["log"] = self.log_buffer

                def on_train_begin(self, args, state, control, **kwargs):
                    # Prefer the Trainer's own step count over the estimate.
                    if state.max_steps:
                        _TRAINING_STATE["total_steps"] = state.max_steps

                def on_step_end(self, args, state, control, **kwargs):
                    # Check every step so a stop request does not have to
                    # wait for the next logging event.
                    if self.thread.stop_flag:
                        self._request_stop(control)

                def on_log(self, args, state, control, logs=None, **kwargs):
                    if self.thread.stop_flag:
                        self._request_stop(control)
                        return
                    if logs is None:
                        return

                    # Update training state
                    _TRAINING_STATE["elapsed_time"] = time.time() - self.start_time

                    # Handle training logs
                    if "loss" in logs:
                        _TRAINING_STATE["current_step"] = state.global_step
                        loss = logs["loss"]
                        self.losses.append((state.global_step, loss))
                        _TRAINING_STATE["loss_plot"] = self._line_plot(
                            self.losses, "Steps", "Loss", "Training Loss"
                        )
                        self._append_log(f"Step {state.global_step}: loss={loss:.4f}\n")

                    # Handle evaluation logs
                    if "eval_loss" in logs:
                        _TRAINING_STATE["current_epoch"] = state.epoch
                        eval_loss = logs["eval_loss"]
                        self.eval_metrics.append((state.epoch, eval_loss))
                        _TRAINING_STATE["eval_plot"] = self._line_plot(
                            self.eval_metrics, "Epochs", "Evaluation Loss", "Validation Loss"
                        )
                        # Generate sample outputs for visualization
                        _TRAINING_STATE["samples"] = self.generate_samples(model, tokenizer)
                        self._append_log(f"Epoch {state.epoch}: eval_loss={eval_loss:.4f}\n")

                def generate_samples(self, model, tokenizer, num_samples=3):
                    """Generate sample outputs from the current model.

                    Picks up to ``num_samples`` random validation examples and
                    returns a DataFrame with columns "Prompt",
                    "Generated Text" and "Reference".
                    """
                    count = min(num_samples, len(val_data))
                    val_indices = np.random.choice(len(val_data), count, replace=False)
                    # int(): hand the container a plain Python index rather
                    # than a NumPy integer.
                    samples = [val_data[int(i)] for i in val_indices]

                    results = []
                    for sample in samples:
                        prompt = sample["prompt"]
                        reference = sample["completion"]

                        inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
                        with torch.no_grad():
                            outputs = model.generate(
                                **inputs,
                                max_new_tokens=100,
                                # temperature only takes effect when sampling
                                # is enabled.
                                do_sample=True,
                                temperature=0.7,
                                num_return_sequences=1
                            )

                        # Decode only the newly generated tokens. Matching
                        # the prompt as a string prefix is fragile because
                        # decoding need not reproduce the prompt exactly.
                        prompt_len = inputs["input_ids"].shape[1]
                        generated = tokenizer.decode(
                            outputs[0][prompt_len:], skip_special_tokens=True
                        ).strip()

                        results.append({
                            "Prompt": prompt,
                            "Generated Text": generated,
                            "Reference": reference
                        })
                    return pd.DataFrame(results)

            # Create trainer
            ui_callback = UICallback(self)
            trainer = Trainer(
                model=model,
                args=training_args,
                train_dataset=train_dataset,
                eval_dataset=val_dataset,
                data_collator=data_collator,
                callbacks=[ui_callback]
            )

            # Update training state
            _TRAINING_STATE["status"] = "training"

            # Start training
            trainer.train()

            # Save final model
            final_dir = os.path.join(output_dir, "final")
            trainer.save_model(final_dir)
            tokenizer.save_pretrained(final_dir)

            # Publish the path before the terminal status so a poller never
            # sees a finished run without a model path, and keep "stopped"
            # when the user interrupted training.
            _TRAINING_STATE["fine_tuned_model_path"] = final_dir
            _TRAINING_STATE["status"] = "stopped" if self.stop_flag else "completed"
        except Exception as e:
            # Top-level boundary of the thread: record the failure for the UI
            # instead of letting the thread die silently.
            _TRAINING_STATE["status"] = "error"
            _TRAINING_STATE["error"] = str(e)
            print(f"Training error: {str(e)}")

    def stop(self):
        """Signal the thread to stop training.

        Only sets ``stop_flag``; the training callback reads the flag and
        asks the Trainer to stop.
        """
        self.stop_flag = True
def start_fine_tuning(model_name, dataset, params):
    """
    Launch fine-tuning on a background thread and hand the thread back.

    Args:
        model_name: Name of the model to fine-tune
        dataset: Processed dataset
        params: Training parameters

    Returns:
        The started TrainingThread; call its stop() method to request
        that training stop.
    """
    worker = TrainingThread(model_name, dataset, params)
    # start() returns immediately; the work happens in TrainingThread.run().
    worker.start()
    return worker
def load_training_state():
    """
    Return the module-level training state.

    Returns:
        The object currently bound to _TRAINING_STATE: None until a
        training thread has started, afterwards the state dictionary
        itself (not a copy).
    """
    current_state = _TRAINING_STATE
    return current_state
|