rahul7star commited on
Commit
88fb0c6
·
verified ·
1 Parent(s): 61f9c52

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +119 -114
app.py CHANGED
@@ -1,188 +1,186 @@
1
  """
2
- PromptWizard Qwen Training Gita Edition (Fixed Tokenizer Import)
3
- Fine-tunes Qwen using rahul7star/Gita dataset (.csv)
4
- Uploads trained model to rahul7star/Qwen0.5-3B-Gita
5
  """
6
 
7
  import gradio as gr
8
  import spaces
9
  import torch
 
 
10
  from transformers import (
11
- AutoModelForCausalLM,
12
  AutoTokenizer,
 
13
  Trainer,
14
  TrainingArguments,
15
  )
16
- from datasets import load_dataset
17
  from peft import LoraConfig, get_peft_model, TaskType
18
- from huggingface_hub import HfApi, HfFolder, Repository
19
- import os, tempfile, shutil
20
 
21
 
22
- # === GPU check ===
 
 
23
  def check_gpu_status():
24
- return "🚀 Zero GPU Ready - GPU will be allocated when training starts"
25
 
26
 
27
- # === Training function ===
28
- @spaces.GPU(duration=300)
 
 
29
  def train_model(model_name, num_epochs, batch_size, learning_rate, progress=gr.Progress()):
30
- progress(0, desc="Initializing...")
31
- output_log = []
32
-
33
  try:
34
- # ==== Device ====
35
  device = "cuda" if torch.cuda.is_available() else "cpu"
36
- output_log.append(f"🎮 Using device: {device}")
37
- if device == "cuda":
38
- output_log.append(f"✅ GPU: {torch.cuda.get_device_name(0)}")
39
-
40
- # ==== Load dataset ====
41
- progress(0.1, desc="Loading rahul7star/Gita dataset...")
42
- output_log.append("\n📚 Loading dataset from rahul7star/Gita...")
43
-
44
- dataset = load_dataset("rahul7star/Gita", split="train")
45
- output_log.append(f" Loaded {len(dataset)} samples")
46
- output_log.append(f" Columns: {dataset.column_names}")
47
-
48
- # ==== Format dataset ====
49
- def format_example(item):
50
- text = (
51
- item.get("text")
52
- or item.get("content")
53
- or " ".join(str(v) for v in item.values())
54
- )
55
- prompt = f"""<|system|>
56
- You are a wise teacher interpreting Bhagavad Gita with deep insights.
57
- <|user|>
58
- {text}
59
- <|assistant|>
60
- """
61
- return {"text": prompt}
62
 
63
- dataset = dataset.map(format_example)
64
- output_log.append(f" ✅ Formatted {len(dataset)} examples")
 
 
 
 
 
 
 
 
65
 
66
- # ==== Model & Tokenizer ====
67
- # Load model and tokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
68
  progress(0.3, desc="Loading model and tokenizer...")
69
- output_log.append(f"\n🤖 Loading {model_name}...")
70
 
71
- # Always use smaller model for Zero GPU demo
72
- model_name = "Qwen/Qwen2.5-0.5B" # Use smallest model for Zero GPU
73
- output_log.append(" Note: Using 0.5B model for Zero GPU compatibility")
74
 
75
- # ✅ Correct tokenizer and model loading
76
  tokenizer = AutoTokenizer.from_pretrained(model_name)
77
-
78
- # Fix for padding token (Qwen sometimes lacks it)
79
  if tokenizer.pad_token is None:
80
  tokenizer.pad_token = tokenizer.eos_token
81
 
82
  model = AutoModelForCausalLM.from_pretrained(
83
- model_name,
84
- torch_dtype=torch.float16 if device == "cuda" else torch.float32,
85
  )
86
 
87
- # Move to GPU if available
88
- if device == "cuda":
89
- model = model.to(device)
90
 
91
- output_log.append(" Model loaded successfully with correct tokenizer and model setup")
92
- # ==== LoRA ====
93
- progress(0.4, desc="Configuring LoRA...")
94
- output_log.append("\n⚙️ Setting up LoRA for efficient fine-tuning...")
95
 
 
 
 
 
96
  lora_config = LoraConfig(
97
  task_type=TaskType.CAUSAL_LM,
98
  r=8,
99
  lora_alpha=16,
100
  lora_dropout=0.1,
101
  target_modules=["q_proj", "v_proj"],
102
- bias="none",
103
  )
104
  model = get_peft_model(model, lora_config)
105
 
106
  trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
107
- output_log.append(f" Trainable parameters: {trainable_params:,}")
 
108
 
109
- # ==== Tokenization ====
 
 
110
  progress(0.5, desc="Tokenizing dataset...")
 
111
  def tokenize_fn(examples):
112
  return tokenizer(
113
  examples["text"],
114
  padding="max_length",
115
  truncation=True,
116
- max_length=256,
117
  )
118
 
119
- dataset = dataset.map(tokenize_fn, batched=True)
120
- output_log.append(" ✅ Tokenization done")
121
 
122
- # ==== Training setup ====
123
- progress(0.6, desc="Setting up Trainer...")
124
- output_dir = "./qwen-gita-lora"
 
125
  training_args = TrainingArguments(
126
- output_dir=output_dir,
127
  num_train_epochs=num_epochs,
128
  per_device_train_batch_size=batch_size,
129
  gradient_accumulation_steps=2,
130
- warmup_steps=10,
 
131
  logging_steps=5,
132
- save_strategy="epoch",
133
  fp16=device == "cuda",
134
- optim="adamw_torch",
135
- learning_rate=learning_rate,
136
- max_steps=100,
137
  )
138
 
 
 
 
 
 
 
139
  trainer = Trainer(
140
  model=model,
141
  args=training_args,
142
- train_dataset=dataset,
143
  tokenizer=tokenizer,
144
  )
 
145
 
146
- # ==== Train ====
147
- progress(0.7, desc="Training...")
148
- output_log.append("\n🚀 Starting training...\n" + "=" * 50)
149
- train_result = trainer.train()
150
 
151
- progress(0.85, desc="Saving model...")
152
- output_log.append("\n💾 Saving model locally...")
153
- trainer.save_model(output_dir)
 
 
 
154
  tokenizer.save_pretrained(output_dir)
155
 
156
- # ==== Upload to HF Hub ====
157
- progress(0.9, desc="Uploading to Hugging Face Hub...")
158
- hf_repo = "rahul7star/Qwen0.5-3B-Gita"
159
- output_log.append(f"\n☁️ Uploading fine-tuned model to: {hf_repo}")
160
-
161
  api = HfApi()
162
- token = HfFolder.get_token()
163
- api.create_repo(repo_id=hf_repo, exist_ok=True)
164
-
165
- with tempfile.TemporaryDirectory() as tmpdir:
166
- repo = Repository(local_dir=tmpdir, clone_from=hf_repo, use_auth_token=token)
167
- shutil.copytree(output_dir, tmpdir, dirs_exist_ok=True)
168
- repo.push_to_hub(commit_message="Upload fine-tuned Qwen-Gita model")
169
 
170
  progress(1.0, desc="Complete!")
171
- output_log.append("\n Training complete & model uploaded successfully!")
172
 
173
  except Exception as e:
174
- output_log.append(f"\n❌ Error: {e}")
175
 
176
- return "\n".join(output_log)
177
 
178
 
179
- # === Gradio UI ===
 
 
180
  def create_interface():
181
  with gr.Blocks(title="PromptWizard — Qwen Gita Trainer") as demo:
182
  gr.Markdown("""
183
- # 🧘 Qwen Fine-tuning Gita Edition
184
- Fine-tune **Qwen 0.5B** using your [rahul7star/Gita](https://huggingface.co/datasets/rahul7star/Gita) dataset
185
- and automatically upload to your repo **rahul7star/Qwen0.5-3B-Gita**.
186
  """)
187
 
188
  with gr.Row():
@@ -190,38 +188,45 @@ def create_interface():
190
  gpu_status = gr.Textbox(
191
  label="GPU Status",
192
  value=check_gpu_status(),
193
- interactive=False,
194
  )
195
-
196
- # hidden constant input for model name
197
  model_name = gr.Textbox(
198
  value="Qwen/Qwen2.5-0.5B",
199
- label="Base Model",
200
- interactive=False,
201
  visible=False
202
  )
203
-
204
- num_epochs = gr.Slider(1, 3, value=1, step=1, label="Epochs")
205
- batch_size = gr.Slider(1, 4, value=2, step=1, label="Batch Size")
206
  learning_rate = gr.Number(value=5e-5, label="Learning Rate")
207
  train_btn = gr.Button("🚀 Start Fine-tuning", variant="primary")
208
 
209
  with gr.Column():
210
  output = gr.Textbox(
211
- label="Training Log",
212
  lines=25,
213
  max_lines=40,
214
- value="Click 'Start Fine-tuning' to train on the Gita dataset and upload to your model repo.",
215
  )
216
 
217
- # ✅ Fixed: pass components, not strings
218
  train_btn.click(
219
  fn=train_model,
220
  inputs=[model_name, num_epochs, batch_size, learning_rate],
221
  outputs=output,
222
  )
223
 
 
 
 
 
 
 
 
 
224
  return demo
 
 
 
 
 
225
  if __name__ == "__main__":
226
  demo = create_interface()
227
  demo.launch()
 
1
  """
2
+ PromptWizard Qwen2.5-0.5B Fine-tuning on Bhagavad Gita Dataset
3
+ Optimized for Hugging Face Spaces with Zero GPU Support
 
4
  """
5
 
6
  import gradio as gr
7
  import spaces
8
  import torch
9
+ import pandas as pd
10
+ from datasets import Dataset
11
  from transformers import (
 
12
  AutoTokenizer,
13
+ AutoModelForCausalLM,
14
  Trainer,
15
  TrainingArguments,
16
  )
 
17
  from peft import LoraConfig, get_peft_model, TaskType
18
+ from huggingface_hub import HfApi
19
+ import os
20
 
21
 
22
# ------------------------------------------------------
# 🧠 Helper: Check GPU availability
# ------------------------------------------------------
def check_gpu_status():
    """Return the static readiness banner shown in the UI.

    Zero GPU Spaces allocate the GPU lazily when a @spaces.GPU function
    runs, so there is nothing to probe here — the message is constant.
    """
    status_message = "🟢 Ready GPU will be auto-assigned when training starts."
    return status_message
 
28
 
29
# ------------------------------------------------------
# 🚀 Main Training Function (runs on GPU)
# ------------------------------------------------------
@spaces.GPU(duration=600)  # 10 minutes GPU allocation
def train_model(model_name, num_epochs, batch_size, learning_rate, progress=gr.Progress()):
    """Fine-tune Qwen2.5-0.5B with LoRA on the rahul7star/Gita CSV dataset.

    Parameters
    ----------
    model_name : str
        Base model id from the UI (overridden below to the 0.5B model,
        the safest size for Zero GPU).
    num_epochs : int
        Number of training epochs (capped in practice by ``max_steps``).
    batch_size : int
        Per-device train batch size.
    learning_rate : float
        Optimizer learning rate.
    progress : gr.Progress
        Gradio progress callback used to update the UI.

    Returns
    -------
    str
        A newline-joined log of every stage. Exceptions are caught and
        appended to the log so the UI always receives readable feedback.
    """
    log = []
    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        log.append(f"🎮 Device: {device}")

        # ------------------------------------------------------
        # 📂 Load Dataset from Hugging Face Repo (rahul7star/Gita)
        # ------------------------------------------------------
        progress(0.1, desc="Loading Bhagavad Gita dataset...")
        log.append("\n📚 Loading dataset from rahul7star/Gita ...")

        # Load CSV straight from the dataset repo's resolve URL.
        dataset = Dataset.from_pandas(
            pd.read_csv("https://huggingface.co/datasets/rahul7star/Gita/resolve/main/Gita.csv")
        )

        # Expected columns: 'question', 'answer'
        if not all(c in dataset.column_names for c in ["question", "answer"]):
            raise ValueError("CSV must contain columns: 'question' and 'answer'")

        def format_row(row):
            # Build one chat-style training prompt from a Q/A pair.
            return {
                "text": f"<|system|>\nYou are a spiritual teacher summarizing Gita verses.\n"
                        f"<|user|>\n{row['question']}\n"
                        f"<|assistant|>\n{row['answer']}"
            }

        dataset = dataset.map(format_row)
        log.append(f"✅ Loaded {len(dataset)} examples from Bhagavad Gita CSV")

        # ------------------------------------------------------
        # 🤖 Load Qwen Model and Tokenizer
        # ------------------------------------------------------
        progress(0.3, desc="Loading model and tokenizer...")
        log.append(f"\n🔍 Loading model: {model_name}")

        model_name = "Qwen/Qwen2.5-0.5B"  # safest base model for Zero GPU

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            # Qwen tokenizers can ship without a pad token; reuse EOS.
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        )

        if device == "cuda":
            model = model.to(device)

        log.append("Model & tokenizer loaded successfully")

        # ------------------------------------------------------
        # ⚙️ Configure LoRA for lightweight fine-tuning
        # ------------------------------------------------------
        progress(0.4, desc="Configuring LoRA...")
        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            r=8,
            lora_alpha=16,
            lora_dropout=0.1,
            target_modules=["q_proj", "v_proj"],
        )
        model = get_peft_model(model, lora_config)

        trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
        total_params = sum(p.numel() for p in model.parameters())
        log.append(f"🧩 Trainable params: {trainable_params:,} / {total_params:,}")

        # ------------------------------------------------------
        # ✂️ Tokenize Dataset
        # ------------------------------------------------------
        progress(0.5, desc="Tokenizing dataset...")

        def tokenize_fn(examples):
            tokens = tokenizer(
                examples["text"],
                padding="max_length",
                truncation=True,
                max_length=512,
            )
            # BUGFIX: a causal-LM Trainer only computes a loss when the
            # batch contains "labels"; without them trainer.train() fails
            # with "model did not return a loss". Labels mirror input_ids
            # (Trainer shifts them internally for next-token prediction).
            tokens["labels"] = [ids.copy() for ids in tokens["input_ids"]]
            return tokens

        tokenized = dataset.map(tokenize_fn, batched=True)

        # ------------------------------------------------------
        # 🎯 Setup Training Arguments
        # ------------------------------------------------------
        progress(0.6, desc="Configuring training...")
        training_args = TrainingArguments(
            output_dir="./qwen-gita-output",
            num_train_epochs=num_epochs,
            per_device_train_batch_size=batch_size,
            gradient_accumulation_steps=2,
            learning_rate=learning_rate,
            warmup_steps=5,
            logging_steps=5,
            save_strategy="no",
            fp16=device == "cuda",
            max_steps=100,  # short demo run
            report_to="none",
        )

        # ------------------------------------------------------
        # 🏋️ Train
        # ------------------------------------------------------
        progress(0.7, desc="Training model...")
        log.append("\n🚀 Starting training on Bhagavad Gita dataset...")

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized,
            tokenizer=tokenizer,
        )
        trainer.train()

        progress(0.9, desc="Finalizing and saving...")

        # ------------------------------------------------------
        # 💾 Save and Upload to HF Repo
        # ------------------------------------------------------
        output_dir = "./qwen-gita-lora"
        os.makedirs(output_dir, exist_ok=True)
        model.save_pretrained(output_dir)  # saves only the LoRA adapter weights
        tokenizer.save_pretrained(output_dir)

        log.append("\n📤 Uploading model to 🤗 Hugging Face Hub...")
        api = HfApi()
        repo_id = "rahul7star/Qwen0.5-3B-Gita"
        # BUGFIX: upload_folder does not create the target repo; create it
        # first (exist_ok=True makes this a no-op on re-runs).
        api.create_repo(repo_id=repo_id, exist_ok=True)
        api.upload_folder(folder_path=output_dir, repo_id=repo_id)
        log.append(f"✅ Uploaded LoRA fine-tuned model to {repo_id}")

        progress(1.0, desc="Complete!")
        log.append("\n🎉 Training finished successfully!")

    except Exception as e:
        log.append(f"\n❌ Error: {e}")

    return "\n".join(log)
173
 
174
 
175
+ # ------------------------------------------------------
176
+ # 🎨 Gradio Interface
177
+ # ------------------------------------------------------
178
  def create_interface():
179
  with gr.Blocks(title="PromptWizard — Qwen Gita Trainer") as demo:
180
  gr.Markdown("""
181
+ # 🧘 Qwen Gita Trainer — Fine-tune Qwen 0.5B
182
+ Train Qwen2.5-0.5B on your **Bhagavad Gita dataset (CSV)**
183
+ and auto-upload LoRA weights to your repo: **rahul7star/Qwen0.5-3B-Gita**
184
  """)
185
 
186
  with gr.Row():
 
188
  gpu_status = gr.Textbox(
189
  label="GPU Status",
190
  value=check_gpu_status(),
191
+ interactive=False
192
  )
 
 
193
  model_name = gr.Textbox(
194
  value="Qwen/Qwen2.5-0.5B",
 
 
195
  visible=False
196
  )
197
+ num_epochs = gr.Slider(1, 3, 1, step=1, label="Epochs")
198
+ batch_size = gr.Slider(1, 4, 2, step=1, label="Batch Size")
 
199
  learning_rate = gr.Number(value=5e-5, label="Learning Rate")
200
  train_btn = gr.Button("🚀 Start Fine-tuning", variant="primary")
201
 
202
  with gr.Column():
203
  output = gr.Textbox(
204
+ label="Training Logs",
205
  lines=25,
206
  max_lines=40,
207
+ value="Press Start to fine-tune Qwen on your Gita dataset.\nZero GPU will allocate automatically.",
208
  )
209
 
 
210
  train_btn.click(
211
  fn=train_model,
212
  inputs=[model_name, num_epochs, batch_size, learning_rate],
213
  outputs=output,
214
  )
215
 
216
+ gr.Markdown("""
217
+ ---
218
+ 💡 **Notes**
219
+ - Requires your CSV on HF dataset: `rahul7star/Gita/Gita.csv`
220
+ - Columns: `question`, `answer`
221
+ - Model uploads automatically to: `rahul7star/Qwen0.5-3B-Gita`
222
+ """)
223
+
224
  return demo
225
+
226
+
227
# ------------------------------------------------------
# 🚪 Launch App
# ------------------------------------------------------
if __name__ == "__main__":
    # Build the Gradio UI and start serving it when run as a script.
    demo = create_interface()
    demo.launch()