rahul7star commited on
Commit
8ce7a7d
·
verified ·
1 Parent(s): 88fb0c6

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +92 -67
app.py CHANGED
@@ -1,6 +1,6 @@
1
  """
2
  PromptWizard — Qwen2.5-0.5B Fine-tuning on Bhagavad Gita Dataset
3
- Optimized for Hugging Face Spaces with Zero GPU Support
4
  """
5
 
6
  import gradio as gr
@@ -15,59 +15,90 @@ from transformers import (
15
  TrainingArguments,
16
  )
17
  from peft import LoraConfig, get_peft_model, TaskType
18
- from huggingface_hub import HfApi
19
  import os
 
20
 
21
 
22
  # ------------------------------------------------------
23
- # 🧠 Helper: Check GPU availability
24
  # ------------------------------------------------------
25
  def check_gpu_status():
26
- return "🟢 Ready — GPU will be auto-assigned when training starts."
27
 
28
 
29
  # ------------------------------------------------------
30
- # 🚀 Main Training Function (runs on GPU)
31
  # ------------------------------------------------------
32
- @spaces.GPU(duration=600) # 10 minutes GPU allocation
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
  def train_model(model_name, num_epochs, batch_size, learning_rate, progress=gr.Progress()):
34
- log = []
35
  try:
 
36
  device = "cuda" if torch.cuda.is_available() else "cpu"
37
- log.append(f"🎮 Device: {device}")
38
 
39
  # ------------------------------------------------------
40
- # 📂 Load Dataset from Hugging Face Repo (rahul7star/Gita)
41
  # ------------------------------------------------------
42
- progress(0.1, desc="Loading Bhagavad Gita dataset...")
43
- log.append("\n📚 Loading dataset from rahul7star/Gita ...")
 
44
 
45
- # Load CSV from your repo
46
- dataset = Dataset.from_pandas(
47
- pd.read_csv("https://huggingface.co/datasets/rahul7star/Gita/resolve/main/Gita.csv")
48
- )
 
 
 
49
 
50
- # Expected columns: 'question', 'answer'
51
- if not all(c in dataset.column_names for c in ["question", "answer"]):
52
- raise ValueError("CSV must contain columns: 'question' and 'answer'")
53
 
54
  def format_row(row):
55
  return {
56
- "text": f"<|system|>\nYou are a spiritual teacher summarizing Gita verses.\n"
57
  f"<|user|>\n{row['question']}\n"
58
  f"<|assistant|>\n{row['answer']}"
59
  }
60
 
61
- dataset = dataset.map(format_row)
62
- log.append(f"✅ Loaded {len(dataset)} examples from Bhagavad Gita CSV")
63
 
64
  # ------------------------------------------------------
65
- # 🤖 Load Qwen Model and Tokenizer
66
  # ------------------------------------------------------
67
- progress(0.3, desc="Loading model and tokenizer...")
68
- log.append(f"\n🔍 Loading model: {model_name}")
69
-
70
- model_name = "Qwen/Qwen2.5-0.5B" # safest base model for Zero GPU
71
 
72
  tokenizer = AutoTokenizer.from_pretrained(model_name)
73
  if tokenizer.pad_token is None:
@@ -77,14 +108,12 @@ def train_model(model_name, num_epochs, batch_size, learning_rate, progress=gr.P
77
  model_name,
78
  torch_dtype=torch.float16 if device == "cuda" else torch.float32,
79
  )
80
-
81
  if device == "cuda":
82
  model = model.to(device)
83
-
84
- log.append("✅ Model & tokenizer loaded successfully")
85
 
86
  # ------------------------------------------------------
87
- # ⚙️ Configure LoRA for lightweight fine-tuning
88
  # ------------------------------------------------------
89
  progress(0.4, desc="Configuring LoRA...")
90
  lora_config = LoraConfig(
@@ -96,31 +125,28 @@ def train_model(model_name, num_epochs, batch_size, learning_rate, progress=gr.P
96
  )
97
  model = get_peft_model(model, lora_config)
98
 
99
- trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
100
- total_params = sum(p.numel() for p in model.parameters())
101
- log.append(f"🧩 Trainable params: {trainable_params:,} / {total_params:,}")
102
-
103
  # ------------------------------------------------------
104
- # ✂️ Tokenize Dataset
105
  # ------------------------------------------------------
106
- progress(0.5, desc="Tokenizing dataset...")
107
 
108
- def tokenize_fn(examples):
109
  return tokenizer(
110
- examples["text"],
111
- padding="max_length",
112
  truncation=True,
 
113
  max_length=512,
114
  )
115
 
116
- tokenized = dataset.map(tokenize_fn, batched=True)
 
117
 
118
  # ------------------------------------------------------
119
- # 🎯 Setup Training Arguments
120
  # ------------------------------------------------------
121
  progress(0.6, desc="Configuring training...")
122
  training_args = TrainingArguments(
123
- output_dir="./qwen-gita-output",
124
  num_train_epochs=num_epochs,
125
  per_device_train_batch_size=batch_size,
126
  gradient_accumulation_steps=2,
@@ -129,15 +155,15 @@ def train_model(model_name, num_epochs, batch_size, learning_rate, progress=gr.P
129
  logging_steps=5,
130
  save_strategy="no",
131
  fp16=device == "cuda",
132
- max_steps=100, # short demo run
133
  report_to="none",
134
  )
135
 
136
  # ------------------------------------------------------
137
- # ๐Ÿ‹๏ธ Train
138
  # ------------------------------------------------------
139
- progress(0.7, desc="Training model...")
140
- log.append("\n🚀 Starting training on Bhagavad Gita dataset...")
141
 
142
  trainer = Trainer(
143
  model=model,
@@ -147,29 +173,28 @@ def train_model(model_name, num_epochs, batch_size, learning_rate, progress=gr.P
147
  )
148
  trainer.train()
149
 
150
- progress(0.9, desc="Finalizing and saving...")
151
-
152
  # ------------------------------------------------------
153
- # 💾 Save and Upload to HF Repo
154
  # ------------------------------------------------------
155
- output_dir = "./qwen-gita-lora"
 
156
  os.makedirs(output_dir, exist_ok=True)
157
  model.save_pretrained(output_dir)
158
  tokenizer.save_pretrained(output_dir)
159
 
160
- log.append("\n📤 Uploading model to 🤗 Hugging Face Hub...")
161
- api = HfApi()
162
  repo_id = "rahul7star/Qwen0.5-3B-Gita"
 
163
  api.upload_folder(folder_path=output_dir, repo_id=repo_id)
164
- log.append(f"✅ Uploaded LoRA fine-tuned model to {repo_id}")
165
 
166
  progress(1.0, desc="Complete!")
167
- log.append("\n🎉 Training finished successfully!")
168
 
169
  except Exception as e:
170
- log.append(f"\n❌ Error: {e}")
171
 
172
- return "\n".join(log)
173
 
174
 
175
  # ------------------------------------------------------
@@ -178,9 +203,9 @@ def train_model(model_name, num_epochs, batch_size, learning_rate, progress=gr.P
178
  def create_interface():
179
  with gr.Blocks(title="PromptWizard — Qwen Gita Trainer") as demo:
180
  gr.Markdown("""
181
- # 🧘 Qwen Gita Trainer — Fine-tune Qwen 0.5B
182
- Train Qwen2.5-0.5B on your **Bhagavad Gita dataset (CSV)**
183
- and auto-upload LoRA weights to your repo: **rahul7star/Qwen0.5-3B-Gita**
184
  """)
185
 
186
  with gr.Row():
@@ -188,11 +213,11 @@ def create_interface():
188
  gpu_status = gr.Textbox(
189
  label="GPU Status",
190
  value=check_gpu_status(),
191
- interactive=False
192
  )
193
  model_name = gr.Textbox(
194
  value="Qwen/Qwen2.5-0.5B",
195
- visible=False
196
  )
197
  num_epochs = gr.Slider(1, 3, 1, step=1, label="Epochs")
198
  batch_size = gr.Slider(1, 4, 2, step=1, label="Batch Size")
@@ -204,7 +229,7 @@ def create_interface():
204
  label="Training Logs",
205
  lines=25,
206
  max_lines=40,
207
- value="Press Start to fine-tune Qwen on your Gita dataset.\nZero GPU will allocate automatically.",
208
  )
209
 
210
  train_btn.click(
@@ -215,17 +240,17 @@ def create_interface():
215
 
216
  gr.Markdown("""
217
  ---
218
- 💡 **Notes**
219
- - Requires your CSV on HF dataset: `rahul7star/Gita/Gita.csv`
220
- - Columns: `question`, `answer`
221
- - Model uploads automatically to: `rahul7star/Qwen0.5-3B-Gita`
222
  """)
223
 
224
  return demo
225
 
226
 
227
  # ------------------------------------------------------
228
- # 🚪 Launch App
229
  # ------------------------------------------------------
230
  if __name__ == "__main__":
231
  demo = create_interface()
 
1
  """
2
  PromptWizard — Qwen2.5-0.5B Fine-tuning on Bhagavad Gita Dataset
3
+ Downloads CSV locally before training (for Hugging Face Spaces)
4
  """
5
 
6
  import gradio as gr
 
15
  TrainingArguments,
16
  )
17
  from peft import LoraConfig, get_peft_model, TaskType
18
+ from huggingface_hub import snapshot_download, HfApi
19
  import os
20
+ import shutil
21
 
22
 
23
  # ------------------------------------------------------
24
+ # 🧠 GPU check
25
  # ------------------------------------------------------
26
  def check_gpu_status():
27
+ return "🟢 Ready — GPU will be assigned at runtime (Zero GPU mode)"
28
 
29
 
30
  # ------------------------------------------------------
31
+ # 🧩 Download Dataset to /tmp/
32
  # ------------------------------------------------------
33
+ def download_gita_dataset():
34
+ repo_id = "rahul7star/Gita"
35
+ local_dir = "/tmp/gita_data"
36
+
37
+ if os.path.exists(local_dir):
38
+ shutil.rmtree(local_dir)
39
+ os.makedirs(local_dir, exist_ok=True)
40
+
41
+ print(f"📥 Downloading dataset from {repo_id} ...")
42
+ snapshot_download(repo_id=repo_id, local_dir=local_dir, repo_type="dataset")
43
+
44
+ # Try to locate the CSV file
45
+ csv_path = None
46
+ for root, _, files in os.walk(local_dir):
47
+ for f in files:
48
+ if f.lower().endswith(".csv"):
49
+ csv_path = os.path.join(root, f)
50
+ break
51
+ if not csv_path:
52
+ raise FileNotFoundError("No CSV file found in the Gita dataset repository.")
53
+
54
+ print(f"✅ Found CSV: {csv_path}")
55
+ return csv_path
56
+
57
+
58
+ # ------------------------------------------------------
59
+ # 🚀 Training function
60
+ # ------------------------------------------------------
61
+ @spaces.GPU(duration=300)
62
  def train_model(model_name, num_epochs, batch_size, learning_rate, progress=gr.Progress()):
63
+ logs = []
64
  try:
65
+ progress(0.05, desc="Initializing...")
66
  device = "cuda" if torch.cuda.is_available() else "cpu"
67
+ logs.append(f"🎮 Device: {device}")
68
 
69
  # ------------------------------------------------------
70
+ # 📂 Step 1: Download dataset
71
  # ------------------------------------------------------
72
+ progress(0.1, desc="Downloading dataset...")
73
+ logs.append("\n📥 Downloading Gita dataset from HF Hub...")
74
+ csv_path = download_gita_dataset()
75
 
76
+ # ------------------------------------------------------
77
+ # 🧾 Step 2: Load dataset from CSV
78
+ # ------------------------------------------------------
79
+ progress(0.2, desc="Loading dataset...")
80
+ df = pd.read_csv(csv_path)
81
+ if not all(c in df.columns for c in ["question", "answer"]):
82
+ raise ValueError("CSV must contain 'question' and 'answer' columns.")
83
 
84
+ hf_dataset = Dataset.from_pandas(df)
 
 
85
 
86
  def format_row(row):
87
  return {
88
+ "text": f"<|system|>\nYou are a spiritual guide explaining Gita concepts.\n"
89
  f"<|user|>\n{row['question']}\n"
90
  f"<|assistant|>\n{row['answer']}"
91
  }
92
 
93
+ hf_dataset = hf_dataset.map(format_row)
94
+ logs.append(f"✅ Loaded {len(hf_dataset)} examples from {csv_path}")
95
 
96
  # ------------------------------------------------------
97
+ # 🤖 Step 3: Load model + tokenizer
98
  # ------------------------------------------------------
99
+ progress(0.3, desc="Loading Qwen model...")
100
+ model_name = "Qwen/Qwen2.5-0.5B"
101
+ logs.append(f"\n🔍 Loading base model: {model_name}")
 
102
 
103
  tokenizer = AutoTokenizer.from_pretrained(model_name)
104
  if tokenizer.pad_token is None:
 
108
  model_name,
109
  torch_dtype=torch.float16 if device == "cuda" else torch.float32,
110
  )
 
111
  if device == "cuda":
112
  model = model.to(device)
113
+ logs.append("✅ Model and tokenizer ready")
 
114
 
115
  # ------------------------------------------------------
116
+ # ⚙️ Step 4: Apply LoRA config
117
  # ------------------------------------------------------
118
  progress(0.4, desc="Configuring LoRA...")
119
  lora_config = LoraConfig(
 
125
  )
126
  model = get_peft_model(model, lora_config)
127
 
 
 
 
 
128
  # ------------------------------------------------------
129
+ # ✂️ Step 5: Tokenize dataset
130
  # ------------------------------------------------------
131
+ progress(0.5, desc="Tokenizing data...")
132
 
133
+ def tokenize_fn(batch):
134
  return tokenizer(
135
+ batch["text"],
 
136
  truncation=True,
137
+ padding="max_length",
138
  max_length=512,
139
  )
140
 
141
+ tokenized = hf_dataset.map(tokenize_fn, batched=True)
142
+ logs.append("🧾 Dataset tokenized successfully")
143
 
144
  # ------------------------------------------------------
145
+ # 🎯 Step 6: Training arguments
146
  # ------------------------------------------------------
147
  progress(0.6, desc="Configuring training...")
148
  training_args = TrainingArguments(
149
+ output_dir="/tmp/qwen-gita-output",
150
  num_train_epochs=num_epochs,
151
  per_device_train_batch_size=batch_size,
152
  gradient_accumulation_steps=2,
 
155
  logging_steps=5,
156
  save_strategy="no",
157
  fp16=device == "cuda",
158
+ max_steps=100,
159
  report_to="none",
160
  )
161
 
162
  # ------------------------------------------------------
163
+ # ๐Ÿ‹๏ธ Step 7: Train model
164
  # ------------------------------------------------------
165
+ progress(0.7, desc="Training in progress...")
166
+ logs.append("\n🚀 Starting fine-tuning...")
167
 
168
  trainer = Trainer(
169
  model=model,
 
173
  )
174
  trainer.train()
175
 
 
 
176
  # ------------------------------------------------------
177
+ # 💾 Step 8: Save + Upload
178
  # ------------------------------------------------------
179
+ progress(0.9, desc="Saving and uploading...")
180
+ output_dir = "/tmp/qwen-gita-lora"
181
  os.makedirs(output_dir, exist_ok=True)
182
  model.save_pretrained(output_dir)
183
  tokenizer.save_pretrained(output_dir)
184
 
185
+ logs.append("\n📤 Uploading fine-tuned LoRA model to Hugging Face Hub...")
 
186
  repo_id = "rahul7star/Qwen0.5-3B-Gita"
187
+ api = HfApi()
188
  api.upload_folder(folder_path=output_dir, repo_id=repo_id)
189
+ logs.append(f"✅ Uploaded fine-tuned model to {repo_id}")
190
 
191
  progress(1.0, desc="Complete!")
192
+ logs.append("\n🎉 Training complete!")
193
 
194
  except Exception as e:
195
+ logs.append(f"\n❌ Error: {str(e)}")
196
 
197
+ return "\n".join(logs)
198
 
199
 
200
  # ------------------------------------------------------
 
203
  def create_interface():
204
  with gr.Blocks(title="PromptWizard — Qwen Gita Trainer") as demo:
205
  gr.Markdown("""
206
+ # 🧘 PromptWizard — Qwen2.5-0.5B Gita Trainer
207
+ Fine-tune Qwen 0.5B on your **Bhagavad Gita CSV dataset**
208
+ Automatically uploads LoRA weights to `rahul7star/Qwen0.5-3B-Gita`
209
  """)
210
 
211
  with gr.Row():
 
213
  gpu_status = gr.Textbox(
214
  label="GPU Status",
215
  value=check_gpu_status(),
216
+ interactive=False,
217
  )
218
  model_name = gr.Textbox(
219
  value="Qwen/Qwen2.5-0.5B",
220
+ visible=False,
221
  )
222
  num_epochs = gr.Slider(1, 3, 1, step=1, label="Epochs")
223
  batch_size = gr.Slider(1, 4, 2, step=1, label="Batch Size")
 
229
  label="Training Logs",
230
  lines=25,
231
  max_lines=40,
232
+ value="Click 'Start Fine-tuning' to train on Bhagavad Gita dataset...",
233
  )
234
 
235
  train_btn.click(
 
240
 
241
  gr.Markdown("""
242
  ---
243
+ **Notes:**
244
+ - Downloads dataset: `rahul7star/Gita` → `/tmp/gita_data/Gita.csv`
245
+ - Trains using LoRA for efficiency
246
+ - Uploads to `rahul7star/Qwen0.5-3B-Gita`
247
  """)
248
 
249
  return demo
250
 
251
 
252
  # ------------------------------------------------------
253
+ # 🚪 Launch app
254
  # ------------------------------------------------------
255
  if __name__ == "__main__":
256
  demo = create_interface()