rahul7star committed on
Commit 94f255b · verified · 1 Parent(s): ce14469

Update app.py

Files changed (1)
  1. app.py +178 -148
app.py CHANGED
@@ -1,183 +1,213 @@
  import spaces
- import os
  import torch
- import gradio as gr
- from datasets import load_dataset
  from transformers import (
-     AutoTokenizer,
      AutoModelForCausalLM,
      Trainer,
      TrainingArguments,
-     DataCollatorForLanguageModeling,
  )
- from huggingface_hub import HfApi, HfFolder

- # ---------------------------------------------------------------------
- # GPU check
- # ---------------------------------------------------------------------
- device = "cuda" if torch.cuda.is_available() else "cpu"

  def check_gpu_status():
-     return f" GPU: {torch.cuda.get_device_name(0)}" if device == "cuda" else "⚠️ Using CPU only"
- # ------------------------------------------------------
- # 🧩 Download Dataset to /tmp/
- # ------------------------------------------------------
- def download_gita_dataset():
-     repo_id = "rahul7star/Gita"
-     local_dir = "/tmp/gita_data"
-
-     if os.path.exists(local_dir):
-         shutil.rmtree(local_dir)
-     os.makedirs(local_dir, exist_ok=True)
-
-     print(f"📥 Downloading dataset from {repo_id} ...")
-     snapshot_download(repo_id=repo_id, local_dir=local_dir, repo_type="dataset")
-
-     # Try to locate the CSV file
-     csv_path = None
-     for root, _, files in os.walk(local_dir):
-         for f in files:
-             if f.lower().endswith(".csv"):
-                 csv_path = os.path.join(root, f)
-                 break
-     if not csv_path:
-         raise FileNotFoundError("No CSV file found in the Gita dataset repository.")
-
-     print(f"✅ Found CSV: {csv_path}")
-     return csv_path
-
-
- # ------------------------------------------------------
- # 🚀 Training function
- # ------------------------------------------------------
-
- # ---------------------------------------------------------------------
- # Training Logic
- # ---------------------------------------------------------------------
  @spaces.GPU(duration=300)
- def train_model(model_name, num_epochs, batch_size, learning_rate, progress=gr.Progress(track_tqdm=True)):
      output_log = []

-     # ==== Load dataset ====
-     progress(0.1, desc="Loading rahul7star/Gita dataset...")
-     output_log.append("\n📚 Loading dataset from rahul7star/Gita...")
-
-     dataset = load_dataset("rahul7star/Gita", split="train")
-     output_log.append(f" Loaded {len(dataset)} samples")
-     output_log.append(f" Columns: {dataset.column_names}")
-
-     # ==== Format dataset ====
-     def format_example(item):
-         text = (
-             item.get("text")
-             or item.get("content")
-             or item.get("verse")
-             or " ".join(str(v) for v in item.values())
-         )
-         prompt = f"""<|system|>
  You are a wise teacher interpreting Bhagavad Gita with deep insights.
  <|user|>
  {text}
  <|assistant|>
  """
-         return {"text": prompt}

-     dataset = dataset.map(format_example)
-     output_log.append(f" ✅ Formatted {len(dataset)} examples")

-     # ==== Load tokenizer & model ====
-     progress(0.3, desc="Loading model and tokenizer...")
-     output_log.append("\n🤖 Loading Qwen model and tokenizer...")

-     base_model = "Qwen/Qwen2.5-0.5B"
-     tokenizer = AutoTokenizer.from_pretrained(base_model)

-     # Fix missing pad token
-     if tokenizer.pad_token is None:
-         tokenizer.pad_token = tokenizer.eos_token

-     model = AutoModelForCausalLM.from_pretrained(
-         base_model,
-         torch_dtype=torch.float16 if device == "cuda" else torch.float32,
-     ).to(device)

-     # ==== Tokenize dataset ====
-     progress(0.4, desc="Tokenizing dataset...")
-     output_log.append("\n✏️ Tokenizing dataset...")

-     def tokenize_function(examples):
-         return tokenizer(
-             examples["text"],
-             truncation=True,
-             padding="max_length",
-             max_length=512,
-         )

-     tokenized_dataset = dataset.map(tokenize_function, batched=True, remove_columns=dataset.column_names)
-     output_log.append(f" ✅ Tokenized {len(tokenized_dataset)} samples")
-
-     # ==== Training setup ====
-     progress(0.5, desc="Starting training...")
-     output_log.append("\n⚙️ Preparing Trainer...")
-
-     output_dir = "./Qwen-Gita-Checkpoints"
-     training_args = TrainingArguments(
-         output_dir=output_dir,
-         overwrite_output_dir=True,
-         per_device_train_batch_size=batch_size,
-         num_train_epochs=num_epochs,
-         learning_rate=learning_rate,
-         fp16=device == "cuda",
-         save_steps=100,
-         logging_steps=10,
-         save_total_limit=1,
-     )
-
-     data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
-
-     trainer = Trainer(
-         model=model,
-         args=training_args,
-         train_dataset=tokenized_dataset,
-         tokenizer=tokenizer,
-         data_collator=data_collator,
-     )
-
-     # ==== Train ====
-     output_log.append("\n🚀 Training started ...")
-     trainer.train()
-     output_log.append("✅ Training complete!")
-
-     # ==== Push to Hugging Face Hub ====
-     repo_id = "rahul7star/Qwen0.5-3B-Gita"
-     output_log.append(f"\n☁️ Uploading to Hugging Face Hub: {repo_id}")
-     api = HfApi()
-     token = HfFolder.get_token()
-
-     model.push_to_hub(repo_id, token=token)
-     tokenizer.push_to_hub(repo_id, token=token)
-     output_log.append(f"✅ Model uploaded successfully to {repo_id}")

      return "\n".join(output_log)
- # ---------------------------------------------------------------------
- # Gradio Interface
- # ---------------------------------------------------------------------
  def create_interface():
-     with gr.Blocks(title="🧘 Qwen Gita Trainer") as demo:
          gr.Markdown("""
- # 🧘 Fine-tune Qwen 0.5B on Bhagavad Gita
- This app downloads `rahul7star/Gita`, trains the model to become a Gita teacher,
- and uploads results to `rahul7star/Qwen0.5-3B-Gita`.
  """)

-         gpu_status = gr.Textbox(value=check_gpu_status(), label="GPU Status", interactive=False)
-         model_name = gr.Textbox(value="Qwen/Qwen2.5-0.5B", label="Base Model", visible=False)
-         num_epochs = gr.Slider(1, 3, value=1, step=1, label="Epochs")
-         batch_size = gr.Slider(1, 4, value=2, step=1, label="Batch Size")
-         learning_rate = gr.Number(value=5e-5, label="Learning Rate")
-         train_btn = gr.Button("🚀 Start Fine-tuning", variant="primary")
-         output = gr.Textbox(label="Training Log", lines=30)

          train_btn.click(
              fn=train_model,
@@ -187,7 +217,7 @@ def create_interface():

      return demo

- demo = create_interface()

  if __name__ == "__main__":
      demo.launch()
 
+ """
+ PromptWizard Qwen Training — Gita Edition
+ Fine-tunes Qwen on the rahul7star/Gita dataset (.csv)
+ Uploads the trained model to rahul7star/Qwen0.5-3B-Gita on the Hugging Face Hub
+ """
+
+ import gradio as gr
  import spaces
  import torch
  from transformers import (
      AutoModelForCausalLM,
+     AutoTokenizer,
+     DataCollatorForLanguageModeling,  # needed to build labels for the causal-LM loss
      Trainer,
      TrainingArguments,
  )
+ from datasets import load_dataset, Dataset
+ from peft import LoraConfig, get_peft_model, TaskType
+ from huggingface_hub import HfApi, HfFolder, Repository
+ import os, tempfile, shutil
+
+ # === GPU check (Zero GPU compatible) ===
  def check_gpu_status():
+     return "🚀 Zero GPU Ready - GPU will be allocated when training starts"
+
+
+ # === Main Training ===
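+ # On ZeroGPU Spaces, @spaces.GPU allocates a GPU only while the decorated function
+ # runs; duration (in seconds) is the maximum time the allocation may be held.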
  @spaces.GPU(duration=300)
+ def train_model(model_name, num_epochs, batch_size, learning_rate, progress=gr.Progress()):
+     progress(0, desc="Initializing...")
      output_log = []

+     try:
+         # ==== Device ====
+         device = "cuda" if torch.cuda.is_available() else "cpu"
+         output_log.append(f"🎮 Using device: {device}")
+         if device == "cuda":
+             output_log.append(f" GPU: {torch.cuda.get_device_name(0)}")
+
+         # ==== Load dataset ====
+         progress(0.1, desc="Loading rahul7star/Gita dataset...")
+         output_log.append("\n📚 Loading dataset from rahul7star/Gita...")
+
+         dataset = load_dataset("rahul7star/Gita", split="train")
+         output_log.append(f" Loaded {len(dataset)} samples from CSV")
+         output_log.append(f" Columns: {dataset.column_names}")
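+         # load_dataset resolves a CSV-only dataset repo with the csv builder and
+         # exposes it as a single "train" split (assumption: the repo ships plain CSV).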
+
+         # ==== Format data ====
+         def format_example(item):
+             # Use "text" or "content" column if available
+             text = (
+                 item.get("text")
+                 or item.get("content")
+                 or " ".join(str(v) for v in item.values())
+             )
+
+             prompt = f"""<|system|>
  You are a wise teacher interpreting Bhagavad Gita with deep insights.
  <|user|>
  {text}
  <|assistant|>
  """
+             return {"text": prompt}
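+             # The <|system|>/<|user|>/<|assistant|> tags are hand-rolled markers for the
+             # base model; a chat checkpoint would instead use the tokenizer's native
+             # template, e.g. tokenizer.apply_chat_template(messages, tokenize=False).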
+         dataset = dataset.map(format_example)
+         output_log.append(f" ✅ Formatted {len(dataset)} examples")
+
+         # ==== Model ====
+         progress(0.3, desc="Loading model & tokenizer...")
+         model_name = "Qwen/Qwen2.5-0.5B"  # hardcoded: overrides the value passed in from the UI
+         output_log.append(f"\n🤖 Loading model: {model_name}")
+
+         tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
+         if tokenizer.pad_token is None:
+             tokenizer.pad_token = tokenizer.eos_token
+
+         model = AutoModelForCausalLM.from_pretrained(
+             model_name,
+             trust_remote_code=True,
+             torch_dtype=torch.float16 if device == "cuda" else torch.float32,
+             low_cpu_mem_usage=True,
+         )
+         if device == "cuda":
+             model = model.to(device)
+
+         output_log.append(" ✅ Model loaded successfully")
+
+         # ==== LoRA ====
+         progress(0.4, desc="Configuring LoRA...")
+         output_log.append("\n⚙️ Setting up LoRA for efficient fine-tuning...")
+
+         lora_config = LoraConfig(
+             task_type=TaskType.CAUSAL_LM,
+             r=8,                                  # rank of the low-rank update matrices
+             lora_alpha=16,                        # scaling factor (alpha / r scales the update)
+             lora_dropout=0.1,
+             target_modules=["q_proj", "v_proj"],  # attention query/value projections
+             bias="none",
+         )
+         model = get_peft_model(model, lora_config)
+
+         trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
+         output_log.append(f" Trainable params: {trainable_params:,}")
+
+         # ==== Tokenization ====
+         progress(0.5, desc="Tokenizing dataset...")
+         def tokenize_fn(examples):
+             return tokenizer(
+                 examples["text"],
+                 padding="max_length",
+                 truncation=True,
+                 max_length=256,
+             )
+
+         dataset = dataset.map(tokenize_fn, batched=True)
+         output_log.append(" ✅ Tokenization done")
+
+         # ==== Training arguments ====
+         progress(0.6, desc="Setting up training...")
+         output_dir = "./qwen-gita-lora"
+         training_args = TrainingArguments(
+             output_dir=output_dir,
+             num_train_epochs=num_epochs,
+             per_device_train_batch_size=batch_size,
+             gradient_accumulation_steps=2,
+             warmup_steps=10,
+             logging_steps=5,
+             save_strategy="epoch",
+             fp16=device == "cuda",
+             optim="adamw_torch",
+             learning_rate=learning_rate,
+             max_steps=100,  # caps the run at 100 optimizer steps, overriding num_train_epochs
+         )
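+         # With the default batch_size of 2 and gradient_accumulation_steps=2, one
+         # optimizer step consumes 4 examples, so max_steps=100 covers ~400 rows.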
+         # mlm=False makes the collator copy input_ids into labels, so the Trainer
+         # can compute a causal-LM loss on the padded batches
+         data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+
+         trainer = Trainer(
+             model=model,
+             args=training_args,
+             train_dataset=dataset,
+             tokenizer=tokenizer,
+             data_collator=data_collator,
+         )

+         # ==== Train ====
+         progress(0.7, desc="Training...")
+         output_log.append("\n🚀 Starting training...\n" + "=" * 50)
+         train_result = trainer.train()
+
+         progress(0.85, desc="Saving model...")
+         output_log.append("\n💾 Saving model locally...")
+         trainer.save_model(output_dir)
+         tokenizer.save_pretrained(output_dir)
+
+         # ==== Upload to HF Hub ====
+         progress(0.9, desc="Uploading to Hugging Face Hub...")
+         hf_repo = "rahul7star/Qwen0.5-3B-Gita"
+         output_log.append(f"\n☁️ Uploading fine-tuned model to: {hf_repo}")
+
+         api = HfApi()
+         token = HfFolder.get_token()
+
+         # Create repo if it does not exist
+         api.create_repo(repo_id=hf_repo, exist_ok=True)
+
+         # Clone & push (Repository is deprecated in recent huggingface_hub releases)
+         with tempfile.TemporaryDirectory() as tmpdir:
+             repo = Repository(local_dir=tmpdir, clone_from=hf_repo, use_auth_token=token)
+             shutil.copytree(output_dir, tmpdir, dirs_exist_ok=True)
+             repo.push_to_hub(commit_message="Upload fine-tuned Qwen-Gita LoRA model")
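+         # A simpler, non-git alternative (sketch) using the HTTP API:
+         #     api.upload_folder(folder_path=output_dir, repo_id=hf_repo, token=token)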
+
+         progress(1.0, desc="Complete!")
+         output_log.append("\n✅ Training complete & model uploaded successfully!")
+
+     except Exception as e:
+         output_log.append(f"\n❌ Error: {e}")
  return "\n".join(output_log)
+
+ # === Gradio Interface ===
  def create_interface():
+     with gr.Blocks(title="PromptWizard Qwen Gita Trainer") as demo:
          gr.Markdown("""
+ # 🧘 PromptWizard Qwen Fine-tuning Gita Edition
+ Fine-tune **Qwen 0.5B** on your dataset [rahul7star/Gita](https://huggingface.co/datasets/rahul7star/Gita)
+ and auto-upload to your model repo **rahul7star/Qwen0.5-3B-Gita**.
  """)

+         with gr.Row():
+             with gr.Column():
+                 gpu_status = gr.Textbox(
+                     label="GPU Status",
+                     value=check_gpu_status(),
+                     interactive=False,
+                 )
+                 model_name = gr.Textbox(
+                     label="Base Model",
+                     value="Qwen/Qwen2.5-0.5B",
+                     interactive=False,
+                 )
+                 num_epochs = gr.Slider(1, 3, value=1, step=1, label="Epochs")
+                 batch_size = gr.Slider(1, 4, value=2, step=1, label="Batch Size")
+                 learning_rate = gr.Number(value=5e-5, label="Learning Rate")
+                 train_btn = gr.Button("🚀 Start Fine-tuning", variant="primary")
+
+             with gr.Column():
+                 output = gr.Textbox(
+                     label="Training Log",
+                     lines=25,
+                     max_lines=40,
+                     value="Click 'Start Fine-tuning' to train on the Gita dataset and upload to your model repo.",
+                 )
 
          train_btn.click(
              fn=train_model,

      return demo


  if __name__ == "__main__":
+     demo = create_interface()
      demo.launch()