# File: Qwen-Training / app.py (9.17 kB)
# Last commit: 8ce7a7d (verified), "Update app.py", by rahul7star
"""
PromptWizard — Qwen2.5-0.5B Fine-tuning on Bhagavad Gita Dataset
Downloads CSV locally before training (for Hugging Face Spaces)
"""
import gradio as gr
import spaces
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
Trainer,
TrainingArguments,
)
from peft import LoraConfig, get_peft_model, TaskType
from huggingface_hub import snapshot_download, HfApi
import os
import shutil
# ------------------------------------------------------
# 🧠 GPU check
# ------------------------------------------------------
def check_gpu_status():
    """Return the static status line displayed in the "GPU Status" textbox.

    No hardware is queried here; the function always returns the same text.
    """
    status_line = "🟢 Ready — GPU will be assigned at runtime (Zero GPU mode)"
    return status_line
# ------------------------------------------------------
# 🧩 Download Dataset to /tmp/
# ------------------------------------------------------
def download_gita_dataset():
    """Download the Gita dataset repo to /tmp and return the path of a CSV in it.

    The local directory is wiped first so files from an earlier run cannot
    be picked up by mistake. The directory tree is then searched top-down,
    in sorted order, and the first ``.csv`` file (case-insensitive) wins.

    Returns:
        Path of the first CSV file found under ``/tmp/gita_data``.

    Raises:
        FileNotFoundError: If the downloaded snapshot contains no CSV file.
    """
    repo_id = "rahul7star/Gita"
    local_dir = "/tmp/gita_data"

    # Start from an empty directory on every call.
    if os.path.exists(local_dir):
        shutil.rmtree(local_dir)
    os.makedirs(local_dir, exist_ok=True)

    print(f"📥 Downloading dataset from {repo_id} ...")
    snapshot_download(repo_id=repo_id, local_dir=local_dir, repo_type="dataset")

    # Locate the CSV file. Stop at the first directory that contains one;
    # breaking out of only an inner per-file loop would let a later
    # directory overwrite the match.
    csv_path = None
    for root, dirs, files in os.walk(local_dir):
        # Sorting in place makes the top-down traversal order deterministic.
        dirs.sort()
        csv_names = sorted(f for f in files if f.lower().endswith(".csv"))
        if csv_names:
            csv_path = os.path.join(root, csv_names[0])
            break

    if not csv_path:
        raise FileNotFoundError("No CSV file found in the Gita dataset repository.")

    print(f"✅ Found CSV: {csv_path}")
    return csv_path
# ------------------------------------------------------
# 🚀 Training function
# ------------------------------------------------------
@spaces.GPU(duration=300)
def train_model(model_name, num_epochs, batch_size, learning_rate, progress=gr.Progress()):
    """Fine-tune a causal language model on the Gita Q&A CSV using LoRA.

    Runs the whole pipeline in order: download the dataset, build the
    training texts, load the base model and tokenizer, attach LoRA
    adapters, tokenize, train, save the adapter and tokenizer, and upload
    them to the Hub.

    Args:
        model_name: Hub id of the base model. Blank or ``None`` falls back
            to ``"Qwen/Qwen2.5-0.5B"``.
        num_epochs: Requested number of epochs. Note that ``max_steps=100``
            is also set below and takes precedence over the epoch count.
        batch_size: Per-device train batch size (coerced to ``int``).
        learning_rate: Learning rate (coerced to ``float``).
        progress: Gradio progress tracker used to report each stage.

    Returns:
        All log lines joined with newlines. Exceptions are not propagated;
        they are appended to the log as an ``❌ Error`` line instead.
    """
    logs = []
    try:
        progress(0.05, desc="Initializing...")
        device = "cuda" if torch.cuda.is_available() else "cpu"
        logs.append(f"🎮 Device: {device}")

        # ------------------------------------------------------
        # 📂 Step 1: Download dataset
        # ------------------------------------------------------
        progress(0.1, desc="Downloading dataset...")
        logs.append("\n📥 Downloading Gita dataset from HF Hub...")
        csv_path = download_gita_dataset()

        # ------------------------------------------------------
        # 🧾 Step 2: Load dataset from CSV
        # ------------------------------------------------------
        progress(0.2, desc="Loading dataset...")
        df = pd.read_csv(csv_path)
        if not all(c in df.columns for c in ["question", "answer"]):
            raise ValueError("CSV must contain 'question' and 'answer' columns.")

        # Rows with a missing question or answer would otherwise be rendered
        # as the literal text "nan" in the training prompt.
        df = df.dropna(subset=["question", "answer"]).reset_index(drop=True)
        if df.empty:
            raise ValueError("CSV contains no rows with both 'question' and 'answer'.")

        hf_dataset = Dataset.from_pandas(df)

        def format_row(row):
            return {
                "text": f"<|system|>\nYou are a spiritual guide explaining Gita concepts.\n"
                        f"<|user|>\n{row['question']}\n"
                        f"<|assistant|>\n{row['answer']}"
            }

        hf_dataset = hf_dataset.map(format_row)
        logs.append(f"✅ Loaded {len(hf_dataset)} examples from {csv_path}")

        # ------------------------------------------------------
        # 🤖 Step 3: Load model + tokenizer
        # ------------------------------------------------------
        progress(0.3, desc="Loading Qwen model...")
        # Honour the caller's choice; only fall back to the default when the
        # value is blank (previously the argument was always overwritten).
        model_name = (model_name or "").strip() or "Qwen/Qwen2.5-0.5B"
        logs.append(f"\n🔍 Loading base model: {model_name}")

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        # NOTE(review): weights are loaded in float16 on GPU *and* fp16 mixed
        # precision is enabled below; with some PEFT/Transformers versions
        # this combination fails when unscaling gradients — confirm against
        # the pinned versions.
        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        )
        if device == "cuda":
            model = model.to(device)
        logs.append("✅ Model and tokenizer ready")

        # ------------------------------------------------------
        # ⚙️ Step 4: Apply LoRA config
        # ------------------------------------------------------
        progress(0.4, desc="Configuring LoRA...")
        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            r=8,
            lora_alpha=16,
            lora_dropout=0.1,
            target_modules=["q_proj", "v_proj"],
        )
        model = get_peft_model(model, lora_config)

        # ------------------------------------------------------
        # ✂️ Step 5: Tokenize dataset
        # ------------------------------------------------------
        progress(0.5, desc="Tokenizing data...")

        def tokenize_fn(batch):
            encoded = tokenizer(
                batch["text"],
                truncation=True,
                padding="max_length",
                max_length=512,
            )
            # Without labels the model returns no loss and training cannot
            # proceed. Padding positions are set to -100 so they are
            # excluded from the loss.
            encoded["labels"] = [
                [tok if keep else -100 for tok, keep in zip(ids, mask)]
                for ids, mask in zip(encoded["input_ids"], encoded["attention_mask"])
            ]
            return encoded

        # Drop the raw text columns so only model inputs reach the collator.
        tokenized = hf_dataset.map(
            tokenize_fn,
            batched=True,
            remove_columns=hf_dataset.column_names,
        )
        logs.append("🧾 Dataset tokenized successfully")

        # ------------------------------------------------------
        # 🎯 Step 6: Training arguments
        # ------------------------------------------------------
        progress(0.6, desc="Configuring training...")
        training_args = TrainingArguments(
            output_dir="/tmp/qwen-gita-output",
            # UI widgets may deliver numbers as int or float; coerce to the
            # types TrainingArguments expects.
            num_train_epochs=float(num_epochs),
            per_device_train_batch_size=int(batch_size),
            gradient_accumulation_steps=2,
            learning_rate=float(learning_rate),
            warmup_steps=5,
            logging_steps=5,
            save_strategy="no",
            fp16=device == "cuda",
            # A positive max_steps takes precedence over num_train_epochs.
            max_steps=100,
            report_to="none",
        )

        # ------------------------------------------------------
        # 🏋️ Step 7: Train model
        # ------------------------------------------------------
        progress(0.7, desc="Training in progress...")
        logs.append("\n🚀 Starting fine-tuning...")
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized,
            tokenizer=tokenizer,
        )
        trainer.train()

        # ------------------------------------------------------
        # 💾 Step 8: Save + Upload
        # ------------------------------------------------------
        progress(0.9, desc="Saving and uploading...")
        output_dir = "/tmp/qwen-gita-lora"
        os.makedirs(output_dir, exist_ok=True)
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)

        logs.append("\n📤 Uploading fine-tuned LoRA model to Hugging Face Hub...")
        repo_id = "rahul7star/Qwen0.5-3B-Gita"
        api = HfApi()
        api.upload_folder(folder_path=output_dir, repo_id=repo_id)
        logs.append(f"✅ Uploaded fine-tuned model to {repo_id}")

        progress(1.0, desc="Complete!")
        logs.append("\n🎉 Training complete!")
    except Exception as e:
        # Top-level UI boundary: surface the failure in the log box rather
        # than crashing the request.
        logs.append(f"\n❌ Error: {str(e)}")
    return "\n".join(logs)
# ------------------------------------------------------
# 🎨 Gradio Interface
# ------------------------------------------------------
def create_interface():
    """Build the Gradio Blocks UI for the trainer and return it unlaunched.

    Layout: a header, then one row with two columns (controls on the left,
    training log on the right), then a notes section. The button click
    runs ``train_model`` with the control values and writes its return
    value into the log textbox.
    """
    header_md = (
        "\n"
        "# 🧘 PromptWizard — Qwen2.5-0.5B Gita Trainer\n"
        "Fine-tune Qwen 0.5B on your **Bhagavad Gita CSV dataset**\n"
        "Automatically uploads LoRA weights to `rahul7star/Qwen0.5-3B-Gita`\n"
    )
    notes_md = (
        "\n"
        "---\n"
        "**Notes:**\n"
        "- Downloads dataset: `rahul7star/Gita` → `/tmp/gita_data/Gita.csv`\n"
        "- Trains using LoRA for efficiency\n"
        "- Uploads to `rahul7star/Qwen0.5-3B-Gita`\n"
    )

    with gr.Blocks(title="PromptWizard — Qwen Gita Trainer") as app:
        gr.Markdown(header_md)

        with gr.Row():
            # Left column: status line and training controls.
            with gr.Column():
                gr.Textbox(
                    label="GPU Status",
                    value=check_gpu_status(),
                    interactive=False,
                )
                # Hidden field that carries the base model id to train_model.
                model_box = gr.Textbox(value="Qwen/Qwen2.5-0.5B", visible=False)
                epochs_slider = gr.Slider(
                    minimum=1, maximum=3, value=1, step=1, label="Epochs"
                )
                batch_slider = gr.Slider(
                    minimum=1, maximum=4, value=2, step=1, label="Batch Size"
                )
                lr_box = gr.Number(value=5e-5, label="Learning Rate")
                start_button = gr.Button("🚀 Start Fine-tuning", variant="primary")

            # Right column: training log output.
            with gr.Column():
                log_box = gr.Textbox(
                    label="Training Logs",
                    lines=25,
                    max_lines=40,
                    value="Click 'Start Fine-tuning' to train on Bhagavad Gita dataset...",
                )

        start_button.click(
            fn=train_model,
            inputs=[model_box, epochs_slider, batch_slider, lr_box],
            outputs=log_box,
        )

        gr.Markdown(notes_md)

    return app
# ------------------------------------------------------
# 🚪 Launch app
# ------------------------------------------------------
if __name__ == "__main__":
    # Build the UI and start the Gradio server only when this file is run
    # as a script, not when it is imported.
    demo = create_interface()
    demo.launch()