rahul7star committed
Commit 4ba54ec · verified · 1 Parent(s): 8ce7a7d

Update app.py

Files changed (1)
  1. app.py +124 -221
app.py CHANGED
@@ -1,236 +1,150 @@
-"""
-PromptWizard — Qwen2.5-0.5B Fine-tuning on Bhagavad Gita Dataset
-Downloads CSV locally before training (for Hugging Face Spaces)
-"""
-
-import gradio as gr
-import spaces
+import os
 import torch
-import pandas as pd
-from datasets import Dataset
+import gradio as gr
+from datasets import load_dataset
 from transformers import (
     AutoTokenizer,
     AutoModelForCausalLM,
     Trainer,
     TrainingArguments,
+    DataCollatorForLanguageModeling,
 )
-from peft import LoraConfig, get_peft_model, TaskType
-from huggingface_hub import snapshot_download, HfApi
-import os
-import shutil
-
-
-# ------------------------------------------------------
-# 🧠 GPU check
-# ------------------------------------------------------
+from huggingface_hub import HfApi, HfFolder
+
+# ---------------------------------------------------------------------
+# GPU check
+# ---------------------------------------------------------------------
+device = "cuda" if torch.cuda.is_available() else "cpu"
+
 def check_gpu_status():
-    return "🟢 Ready GPU will be assigned at runtime (Zero GPU mode)"
-
-
-# ------------------------------------------------------
-# 🧩 Download Dataset to /tmp/
-# ------------------------------------------------------
-def download_gita_dataset():
-    repo_id = "rahul7star/Gita"
-    local_dir = "/tmp/gita_data"
-
-    if os.path.exists(local_dir):
-        shutil.rmtree(local_dir)
-    os.makedirs(local_dir, exist_ok=True)
-
-    print(f"📥 Downloading dataset from {repo_id} ...")
-    snapshot_download(repo_id=repo_id, local_dir=local_dir, repo_type="dataset")
-
-    # Try to locate the CSV file
-    csv_path = None
-    for root, _, files in os.walk(local_dir):
-        for f in files:
-            if f.lower().endswith(".csv"):
-                csv_path = os.path.join(root, f)
-                break
-    if not csv_path:
-        raise FileNotFoundError("No CSV file found in the Gita dataset repository.")
-
-    print(f"✅ Found CSV: {csv_path}")
-    return csv_path
-
-
-# ------------------------------------------------------
-# 🚀 Training function
-# ------------------------------------------------------
-@spaces.GPU(duration=300)
-def train_model(model_name, num_epochs, batch_size, learning_rate, progress=gr.Progress()):
-    logs = []
-    try:
-        progress(0.05, desc="Initializing...")
-        device = "cuda" if torch.cuda.is_available() else "cpu"
-        logs.append(f"🎮 Device: {device}")
-
-        # ------------------------------------------------------
-        # 📂 Step 1: Download dataset
-        # ------------------------------------------------------
-        progress(0.1, desc="Downloading dataset...")
-        logs.append("\n📥 Downloading Gita dataset from HF Hub...")
-        csv_path = download_gita_dataset()
-
-        # ------------------------------------------------------
-        # 🧾 Step 2: Load dataset from CSV
-        # ------------------------------------------------------
-        progress(0.2, desc="Loading dataset...")
-        df = pd.read_csv(csv_path)
-        if not all(c in df.columns for c in ["question", "answer"]):
-            raise ValueError("CSV must contain 'question' and 'answer' columns.")
-
-        hf_dataset = Dataset.from_pandas(df)
-
-        def format_row(row):
-            return {
-                "text": f"<|system|>\nYou are a spiritual guide explaining Gita concepts.\n"
-                        f"<|user|>\n{row['question']}\n"
-                        f"<|assistant|>\n{row['answer']}"
-            }
-
-        hf_dataset = hf_dataset.map(format_row)
-        logs.append(f"✅ Loaded {len(hf_dataset)} examples from {csv_path}")
-
-        # ------------------------------------------------------
-        # 🤖 Step 3: Load model + tokenizer
-        # ------------------------------------------------------
-        progress(0.3, desc="Loading Qwen model...")
-        model_name = "Qwen/Qwen2.5-0.5B"
-        logs.append(f"\n🔍 Loading base model: {model_name}")
-
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        if tokenizer.pad_token is None:
-            tokenizer.pad_token = tokenizer.eos_token
-
-        model = AutoModelForCausalLM.from_pretrained(
-            model_name,
-            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
-        )
-        if device == "cuda":
-            model = model.to(device)
-        logs.append("✅ Model and tokenizer ready")
-
-        # ------------------------------------------------------
-        # ⚙️ Step 4: Apply LoRA config
-        # ------------------------------------------------------
-        progress(0.4, desc="Configuring LoRA...")
-        lora_config = LoraConfig(
-            task_type=TaskType.CAUSAL_LM,
-            r=8,
-            lora_alpha=16,
-            lora_dropout=0.1,
-            target_modules=["q_proj", "v_proj"],
-        )
-        model = get_peft_model(model, lora_config)
-
-        # ------------------------------------------------------
-        # ✂️ Step 5: Tokenize dataset
-        # ------------------------------------------------------
-        progress(0.5, desc="Tokenizing data...")
-
-        def tokenize_fn(batch):
-            return tokenizer(
-                batch["text"],
-                truncation=True,
-                padding="max_length",
-                max_length=512,
-            )
-
-        tokenized = hf_dataset.map(tokenize_fn, batched=True)
-        logs.append("🧾 Dataset tokenized successfully")
-
-        # ------------------------------------------------------
-        # 🎯 Step 6: Training arguments
-        # ------------------------------------------------------
-        progress(0.6, desc="Configuring training...")
-        training_args = TrainingArguments(
-            output_dir="/tmp/qwen-gita-output",
-            num_train_epochs=num_epochs,
-            per_device_train_batch_size=batch_size,
-            gradient_accumulation_steps=2,
-            learning_rate=learning_rate,
-            warmup_steps=5,
-            logging_steps=5,
-            save_strategy="no",
-            fp16=device == "cuda",
-            max_steps=100,
-            report_to="none",
-        )
-
-        # ------------------------------------------------------
-        # 🏋️ Step 7: Train model
-        # ------------------------------------------------------
-        progress(0.7, desc="Training in progress...")
-        logs.append("\n🚀 Starting fine-tuning...")
-
-        trainer = Trainer(
-            model=model,
-            args=training_args,
-            train_dataset=tokenized,
-            tokenizer=tokenizer,
-        )
-        trainer.train()
-
-        # ------------------------------------------------------
-        # 💾 Step 8: Save + Upload
-        # ------------------------------------------------------
-        progress(0.9, desc="Saving and uploading...")
-        output_dir = "/tmp/qwen-gita-lora"
-        os.makedirs(output_dir, exist_ok=True)
-        model.save_pretrained(output_dir)
-        tokenizer.save_pretrained(output_dir)
-
-        logs.append("\n📤 Uploading fine-tuned LoRA model to Hugging Face Hub...")
-        repo_id = "rahul7star/Qwen0.5-3B-Gita"
-        api = HfApi()
-        api.upload_folder(folder_path=output_dir, repo_id=repo_id)
-        logs.append(f" Uploaded fine-tuned model to {repo_id}")
-
-        progress(1.0, desc="Complete!")
-        logs.append("\n🎉 Training complete!")
-
-    except Exception as e:
-        logs.append(f"\n❌ Error: {str(e)}")
-
-    return "\n".join(logs)
-
-
-# ------------------------------------------------------
-# 🎨 Gradio Interface
-# ------------------------------------------------------
+    return f" GPU: {torch.cuda.get_device_name(0)}" if device == "cuda" else "⚠️ Using CPU only"
+
+# ---------------------------------------------------------------------
+# Training Logic
+# ---------------------------------------------------------------------
+def train_model(model_name, num_epochs, batch_size, learning_rate, progress=gr.Progress(track_tqdm=True)):
+    output_log = []
+
+    # ==== Load dataset ====
+    progress(0.1, desc="Loading rahul7star/Gita dataset...")
+    output_log.append("\n📚 Loading dataset from rahul7star/Gita...")
+
+    dataset = load_dataset("rahul7star/Gita", split="train")
+    output_log.append(f" Loaded {len(dataset)} samples")
+    output_log.append(f" Columns: {dataset.column_names}")
+
+    # ==== Format dataset ====
+    def format_example(item):
+        text = (
+            item.get("text")
+            or item.get("content")
+            or item.get("verse")
+            or " ".join(str(v) for v in item.values())
+        )
+        prompt = f"""<|system|>
+You are a wise teacher interpreting Bhagavad Gita with deep insights.
+<|user|>
+{text}
+<|assistant|>
+"""
+        return {"text": prompt}
+
+    dataset = dataset.map(format_example)
+    output_log.append(f" ✅ Formatted {len(dataset)} examples")
+
+    # ==== Load tokenizer & model ====
+    progress(0.3, desc="Loading model and tokenizer...")
+    output_log.append("\n🤖 Loading Qwen model and tokenizer...")
+
+    base_model = "Qwen/Qwen2.5-0.5B"
+    tokenizer = AutoTokenizer.from_pretrained(base_model)
+
+    # Fix missing pad token
+    if tokenizer.pad_token is None:
+        tokenizer.pad_token = tokenizer.eos_token
+
+    model = AutoModelForCausalLM.from_pretrained(
+        base_model,
+        torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+    ).to(device)
+
+    # ==== Tokenize dataset ====
+    progress(0.4, desc="Tokenizing dataset...")
+    output_log.append("\n✏️ Tokenizing dataset...")
+
+    def tokenize_function(examples):
+        return tokenizer(
+            examples["text"],
+            truncation=True,
+            padding="max_length",
+            max_length=512,
+        )
+
+    tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)
+    output_log.append(f" ✅ Tokenized {len(tokenized_dataset)} samples")
+
+    # ==== Training setup ====
+    progress(0.5, desc="Starting training...")
+    output_log.append("\n⚙️ Preparing Trainer...")
+
+    output_dir = "./Qwen-Gita-Checkpoints"
+    training_args = TrainingArguments(
+        output_dir=output_dir,
+        overwrite_output_dir=True,
+        per_device_train_batch_size=batch_size,
+        num_train_epochs=num_epochs,
+        learning_rate=learning_rate,
+        fp16=device == "cuda",
+        save_steps=100,
+        logging_steps=10,
+        save_total_limit=1,
+    )
+
+    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        train_dataset=tokenized_dataset,
+        tokenizer=tokenizer,
+        data_collator=data_collator,
+    )
+
+    # ==== Train ====
+    output_log.append("\n🚀 Training started ...")
+    trainer.train()
+    output_log.append("✅ Training complete!")
+
+    # ==== Push to Hugging Face Hub ====
+    repo_id = "rahul7star/Qwen0.5-3B-Gita"
+    output_log.append(f"\n☁️ Uploading to Hugging Face Hub: {repo_id}")
+    api = HfApi()
+    token = HfFolder.get_token()
+
+    model.push_to_hub(repo_id, token=token)
+    tokenizer.push_to_hub(repo_id, token=token)
+    output_log.append(f"✅ Model uploaded successfully to {repo_id}")
+
+    return "\n".join(output_log)
+
+# ---------------------------------------------------------------------
+# Gradio Interface
+# ---------------------------------------------------------------------
 def create_interface():
-    with gr.Blocks(title="PromptWizard Qwen Gita Trainer") as demo:
+    with gr.Blocks(title="🧘 Qwen Gita Trainer") as demo:
         gr.Markdown("""
-        # 🧘 PromptWizard Qwen2.5-0.5B Gita Trainer
-        Fine-tune Qwen 0.5B on your **Bhagavad Gita CSV dataset**
-        Automatically uploads LoRA weights to `rahul7star/Qwen0.5-3B-Gita`
+        # 🧘 Fine-tune Qwen 0.5B on Bhagavad Gita
+        This app downloads `rahul7star/Gita`, trains the model to become a Gita teacher,
+        and uploads results to `rahul7star/Qwen0.5-3B-Gita`.
         """)
 
-        with gr.Row():
-            with gr.Column():
-                gpu_status = gr.Textbox(
-                    label="GPU Status",
-                    value=check_gpu_status(),
-                    interactive=False,
-                )
-                model_name = gr.Textbox(
-                    value="Qwen/Qwen2.5-0.5B",
-                    visible=False,
-                )
-                num_epochs = gr.Slider(1, 3, 1, step=1, label="Epochs")
-                batch_size = gr.Slider(1, 4, 2, step=1, label="Batch Size")
-                learning_rate = gr.Number(value=5e-5, label="Learning Rate")
-                train_btn = gr.Button("🚀 Start Fine-tuning", variant="primary")
-
-            with gr.Column():
-                output = gr.Textbox(
-                    label="Training Logs",
-                    lines=25,
-                    max_lines=40,
-                    value="Click 'Start Fine-tuning' to train on Bhagavad Gita dataset...",
-                )
+        gpu_status = gr.Textbox(value=check_gpu_status(), label="GPU Status", interactive=False)
+        model_name = gr.Textbox(value="Qwen/Qwen2.5-0.5B", label="Base Model", visible=False)
+        num_epochs = gr.Slider(1, 3, value=1, step=1, label="Epochs")
+        batch_size = gr.Slider(1, 4, value=2, step=1, label="Batch Size")
+        learning_rate = gr.Number(value=5e-5, label="Learning Rate")
+        train_btn = gr.Button("🚀 Start Fine-tuning", variant="primary")
+        output = gr.Textbox(label="Training Log", lines=30)
 
         train_btn.click(
             fn=train_model,
@@ -238,20 +152,9 @@ def create_interface():
             outputs=output,
         )
 
-        gr.Markdown("""
-        ---
-        **Notes:**
-        - Downloads dataset: `rahul7star/Gita` → `/tmp/gita_data/Gita.csv`
-        - Trains using LoRA for efficiency
-        - Uploads to `rahul7star/Qwen0.5-3B-Gita`
-        """)
-
     return demo
 
-
-# ------------------------------------------------------
-# 🚪 Launch app
-# ------------------------------------------------------
+demo = create_interface()
+
 if __name__ == "__main__":
-    demo = create_interface()
     demo.launch()
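
Because this commit drops the PEFT/LoRA adapter path and pushes the full fine-tuned weights, the uploaded checkpoint should load directly with plain transformers, with no adapter merging step. A minimal inference sketch against the uploaded repo (assuming the push above succeeded and the repo is readable with your credentials; the sample question and generation settings are illustrative, not part of the commit):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

repo_id = "rahul7star/Qwen0.5-3B-Gita"  # repo id taken from app.py above
device = "cuda" if torch.cuda.is_available() else "cpu"

tokenizer = AutoTokenizer.from_pretrained(repo_id)
model = AutoModelForCausalLM.from_pretrained(
    repo_id,
    torch_dtype=torch.float16 if device == "cuda" else torch.float32,
).to(device)

# Mirror the training-time prompt template from format_example();
# the question is a placeholder for illustration.
prompt = (
    "<|system|>\n"
    "You are a wise teacher interpreting Bhagavad Gita with deep insights.\n"
    "<|user|>\n"
    "What does the Gita teach about doing one's duty?\n"
    "<|assistant|>\n"
)
inputs = tokenizer(prompt, return_tensors="pt").to(device)
output_ids = model.generate(
    **inputs,
    max_new_tokens=200,
    do_sample=True,
    temperature=0.7,
    pad_token_id=tokenizer.eos_token_id,
)
# Decode only the newly generated continuation, not the prompt.
print(tokenizer.decode(output_ids[0][inputs["input_ids"].shape[1]:], skip_special_tokens=True))

Note that the <|system|>/<|user|>/<|assistant|> markers baked in by format_example() are plain text here, not registered special tokens for this tokenizer, so skip_special_tokens will not strip them if the model emits them in its continuation.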