"""
PromptWizard - Qwen2.5-0.5B fine-tuning on the Bhagavad Gita dataset
Downloads the CSV locally before training (for Hugging Face Spaces)
"""

import gradio as gr
import spaces
import torch
import pandas as pd
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)
from peft import LoraConfig, get_peft_model, TaskType
from huggingface_hub import snapshot_download, HfApi
import os
import shutil
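
# NOTE (assumptions): the Space is expected to provide gradio, spaces, torch, pandas,
# datasets, transformers, peft and huggingface_hub at runtime, and the upload step
# needs a write token (typically an HF_TOKEN secret, which HfApi picks up from the
# environment).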


# ------------------------------------------------------
# 🧠 GPU check
# ------------------------------------------------------
def check_gpu_status():
    return "🟢 Ready - GPU will be assigned at runtime (Zero GPU mode)"


# ------------------------------------------------------
# 🧩 Download Dataset to /tmp/
# ------------------------------------------------------
def download_gita_dataset():
    repo_id = "rahul7star/Gita"
    local_dir = "/tmp/gita_data"

    if os.path.exists(local_dir):
        shutil.rmtree(local_dir)
    os.makedirs(local_dir, exist_ok=True)

    print(f"📥 Downloading dataset from {repo_id} ...")
    snapshot_download(repo_id=repo_id, local_dir=local_dir, repo_type="dataset")

    # Try to locate the CSV file
    csv_path = None
    for root, _, files in os.walk(local_dir):
        for f in files:
            if f.lower().endswith(".csv"):
                csv_path = os.path.join(root, f)
                break
    if not csv_path:
        raise FileNotFoundError("No CSV file found in the Gita dataset repository.")

    print(f"✅ Found CSV: {csv_path}")
    return csv_path


# ------------------------------------------------------
# 🚀 Training function
# ------------------------------------------------------
@spaces.GPU(duration=300)
def train_model(model_name, num_epochs, batch_size, learning_rate, progress=gr.Progress()):
    logs = []
    try:
        progress(0.05, desc="Initializing...")
        device = "cuda" if torch.cuda.is_available() else "cpu"
        logs.append(f"🎮 Device: {device}")

        # ------------------------------------------------------
        # 📂 Step 1: Download dataset
        # ------------------------------------------------------
        progress(0.1, desc="Downloading dataset...")
        logs.append("\n📥 Downloading Gita dataset from HF Hub...")
        csv_path = download_gita_dataset()

        # ------------------------------------------------------
        # 🧾 Step 2: Load dataset from CSV
        # ------------------------------------------------------
        progress(0.2, desc="Loading dataset...")
        df = pd.read_csv(csv_path)
        if not all(c in df.columns for c in ["question", "answer"]):
            raise ValueError("CSV must contain 'question' and 'answer' columns.")

        hf_dataset = Dataset.from_pandas(df)

        def format_row(row):
            return {
                "text": f"<|system|>\nYou are a spiritual guide explaining Gita concepts.\n"
                        f"<|user|>\n{row['question']}\n"
                        f"<|assistant|>\n{row['answer']}"
            }

        hf_dataset = hf_dataset.map(format_row)
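        # NOTE: format_row above builds a hand-rolled chat-style prompt (<|system|> / <|user|> /
        # <|assistant|>) rather than applying the tokenizer's built-in chat template.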
        logs.append(f"✅ Loaded {len(hf_dataset)} examples from {csv_path}")

        # ------------------------------------------------------
        # 🤖 Step 3: Load model + tokenizer
        # ------------------------------------------------------
        progress(0.3, desc="Loading Qwen model...")
        model_name = model_name or "Qwen/Qwen2.5-0.5B"  # use the UI value, fall back to the default base model
        logs.append(f"\n🔍 Loading base model: {model_name}")

        tokenizer = AutoTokenizer.from_pretrained(model_name)
        if tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.eos_token

        model = AutoModelForCausalLM.from_pretrained(
            model_name,
            torch_dtype=torch.float16 if device == "cuda" else torch.float32,
        )
        if device == "cuda":
            model = model.to(device)
        logs.append("✅ Model and tokenizer ready")

        # ------------------------------------------------------
        # ⚙️ Step 4: Apply LoRA config
        # ------------------------------------------------------
        progress(0.4, desc="Configuring LoRA...")
        lora_config = LoraConfig(
            task_type=TaskType.CAUSAL_LM,
            r=8,
            lora_alpha=16,
            lora_dropout=0.1,
            target_modules=["q_proj", "v_proj"],
        )
        model = get_peft_model(model, lora_config)

        # ------------------------------------------------------
        # ✂️ Step 5: Tokenize dataset
        # ------------------------------------------------------
        progress(0.5, desc="Tokenizing data...")

        def tokenize_fn(batch):
            return tokenizer(
                batch["text"],
                truncation=True,
                padding="max_length",
                max_length=512,
            )

        tokenized = hf_dataset.map(tokenize_fn, batched=True)
        logs.append("🧾 Dataset tokenized successfully")

        # ------------------------------------------------------
        # 🎯 Step 6: Training arguments
        # ------------------------------------------------------
        progress(0.6, desc="Configuring training...")
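        # NOTE: max_steps=100 below overrides num_train_epochs, so training is capped at
        # 100 optimizer steps regardless of the epoch slider (presumably to stay inside the
        # Zero GPU window requested by @spaces.GPU(duration=300)).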
        training_args = TrainingArguments(
            output_dir="/tmp/qwen-gita-output",
            num_train_epochs=num_epochs,
            per_device_train_batch_size=batch_size,
            gradient_accumulation_steps=2,
            learning_rate=learning_rate,
            warmup_steps=5,
            logging_steps=5,
            save_strategy="no",
            fp16=device == "cuda",
            max_steps=100,
            report_to="none",
        )

        # ------------------------------------------------------
        # 🏋️ Step 7: Train model
        # ------------------------------------------------------
        progress(0.7, desc="Training in progress...")
        logs.append("\n🚀 Starting fine-tuning...")

        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=tokenized,
            tokenizer=tokenizer,
            # Causal-LM collator: copies input_ids into labels so the Trainer can compute a loss
            data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
        )
        trainer.train()

        # ------------------------------------------------------
        # 💾 Step 8: Save + Upload
        # ------------------------------------------------------
        progress(0.9, desc="Saving and uploading...")
        output_dir = "/tmp/qwen-gita-lora"
        os.makedirs(output_dir, exist_ok=True)
        model.save_pretrained(output_dir)
        tokenizer.save_pretrained(output_dir)

        logs.append("\n📤 Uploading fine-tuned LoRA model to Hugging Face Hub...")
        repo_id = "rahul7star/Qwen0.5-3B-Gita"
        api = HfApi()
        api.create_repo(repo_id=repo_id, exist_ok=True)  # make sure the target repo exists before uploading
        api.upload_folder(folder_path=output_dir, repo_id=repo_id)
        logs.append(f"✅ Uploaded fine-tuned model to {repo_id}")

        progress(1.0, desc="Complete!")
        logs.append("\n🎉 Training complete!")

    except Exception as e:
        logs.append(f"\n❌ Error: {str(e)}")

    return "\n".join(logs)


# ------------------------------------------------------
# 🎨 Gradio Interface
# ------------------------------------------------------
def create_interface():
    with gr.Blocks(title="PromptWizard - Qwen Gita Trainer") as demo:
        gr.Markdown("""
        # 🧘 PromptWizard - Qwen2.5-0.5B Gita Trainer  
        Fine-tune Qwen 0.5B on your **Bhagavad Gita CSV dataset**  
        Automatically uploads LoRA weights to `rahul7star/Qwen0.5-3B-Gita`
        """)

        with gr.Row():
            with gr.Column():
                gpu_status = gr.Textbox(
                    label="GPU Status",
                    value=check_gpu_status(),
                    interactive=False,
                )
                model_name = gr.Textbox(
                    value="Qwen/Qwen2.5-0.5B",
                    visible=False,
                )
                num_epochs = gr.Slider(1, 3, 1, step=1, label="Epochs")
                batch_size = gr.Slider(1, 4, 2, step=1, label="Batch Size")
                learning_rate = gr.Number(value=5e-5, label="Learning Rate")
                train_btn = gr.Button("🚀 Start Fine-tuning", variant="primary")

            with gr.Column():
                output = gr.Textbox(
                    label="Training Logs",
                    lines=25,
                    max_lines=40,
                    value="Click 'Start Fine-tuning' to train on Bhagavad Gita dataset...",
                )

        train_btn.click(
            fn=train_model,
            inputs=[model_name, num_epochs, batch_size, learning_rate],
            outputs=output,
        )

        gr.Markdown("""
        ---
        **Notes:**
        - Downloads dataset: `rahul7star/Gita` → `/tmp/gita_data/Gita.csv`
        - Trains using LoRA for efficiency
        - Uploads to `rahul7star/Qwen0.5-3B-Gita`
        """)

    return demo
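

# ------------------------------------------------------
# 🔎 Example: using the uploaded adapter (sketch)
# ------------------------------------------------------
# A minimal, untested sketch of how the LoRA adapter uploaded in Step 8 could be
# loaded for inference with peft, assuming the upload succeeded and the prompt
# follows the format used in format_row above:
#
#   from peft import PeftModel
#   from transformers import AutoModelForCausalLM, AutoTokenizer
#
#   base = AutoModelForCausalLM.from_pretrained("Qwen/Qwen2.5-0.5B")
#   model = PeftModel.from_pretrained(base, "rahul7star/Qwen0.5-3B-Gita")
#   tok = AutoTokenizer.from_pretrained("Qwen/Qwen2.5-0.5B")
#
#   prompt = (
#       "<|system|>\nYou are a spiritual guide explaining Gita concepts.\n"
#       "<|user|>\nWhat does the Gita say about detachment?\n<|assistant|>\n"
#   )
#   inputs = tok(prompt, return_tensors="pt")
#   out = model.generate(**inputs, max_new_tokens=128)
#   print(tok.decode(out[0], skip_special_tokens=True))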


# ------------------------------------------------------
# 🚪 Launch app
# ------------------------------------------------------
if __name__ == "__main__":
    demo = create_interface()
    demo.launch()