File size: 9,134 Bytes
130e53d
 
e0ce993
2536b39
e0ce993
f52933f
130e53d
0e29f16
c5d24cb
c6ac4a0
c5d24cb
 
f52933f
ef7ad3a
d03d3f9
7719ac7
130e53d
e6380a7
919bf29
939e049
3da0193
d5dc5cf
 
 
5725e7b
d5dc5cf
 
 
0ad02a2
5725e7b
0ad02a2
5725e7b
0e29f16
d5dc5cf
 
5725e7b
0e29f16
 
d5dc5cf
5725e7b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c6ac4a0
5725e7b
4363542
5725e7b
 
 
4363542
5725e7b
0e29f16
f52933f
5725e7b
 
 
4363542
5725e7b
 
863688d
d81ff51
5725e7b
863688d
4363542
5725e7b
 
 
 
 
f52933f
 
5725e7b
f52933f
 
 
 
919bf29
 
4363542
 
5725e7b
919bf29
5725e7b
f52933f
919bf29
f52933f
 
 
5725e7b
f52933f
 
5725e7b
 
 
4363542
756e900
 
5725e7b
756e900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5725e7b
4363542
756e900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2536b39
756e900
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5725e7b
 
756e900
 
 
5725e7b
756e900
 
 
 
 
5725e7b
756e900
 
 
 
 
 
 
 
 
 
5725e7b
 
756e900
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
import os
import subprocess

# subprocess.run('pip install flash-attn==2.8.0 --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

import threading

# subprocess.check_call([os.sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])

import spaces
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from kernels import get_kernel

#vllm_flash_attn3 = get_kernel("kernels-community/vllm-flash-attn3")

#torch._dynamo.config.disable = True

MODEL_ID = "le-llm/lapa-v0.1-reasoning-only-32768"


def load_model():
    """Load the tokenizer and causal-LM weights for ``MODEL_ID``.

    The weights are requested in bfloat16 with ``device_map="auto"`` and the
    ``flash_attention_2`` attention implementation. The device label returned
    to the caller is the hard-coded string ``"cuda"``; it is reported on
    stdout but is not itself used to place the model.

    Returns:
        A ``(model, tokenizer, device)`` tuple.
    """
    target_device = "cuda"
    tok = AutoTokenizer.from_pretrained(MODEL_ID)
    load_options = {
        "dtype": torch.bfloat16,
        "device_map": "auto",
        "attn_implementation": "flash_attention_2",
    }
    lm = AutoModelForCausalLM.from_pretrained(MODEL_ID, **load_options)
    print("Selected device:", target_device)
    return lm, tok, target_device


# Load model/tokenizer once at import time; every request handled by bot() reuses these module-level objects.
model, tokenizer, device = load_model()


def user(user_message, history: list):
    """Record the submitted message as a new user turn.

    Returns a pair: an empty string (used to clear the input box) and a new
    history list consisting of ``history`` followed by the user turn. The
    list passed in is not modified.
    """
    new_turn = {"role": "user", "content": user_message}
    extended_history = history + [new_turn]
    return "", extended_history


def append_example_message(x: gr.SelectData, history):
    """Append the text of the selected example to ``history`` as a user turn.

    The event object, its ``value`` and the value's ``"text"`` entry are
    echoed to stdout. When the text is ``None`` the history is returned
    unchanged; otherwise ``history`` is extended in place and returned.
    """
    print(x)
    selected = x.value
    print(selected)
    example_text = selected["text"]
    print(example_text)

    if example_text is None:
        return history

    history.append({"role": "user", "content": example_text})
    return history


@spaces.GPU
def bot(
    history: list[dict[str, str]],
):
    """Stream the assistant's reply to the conversation in ``history``.

    The conversation is rendered with the tokenizer's chat template (with a
    generation prompt), tokenized, and passed to ``model.generate`` running in
    a background thread. Text chunks are read from a ``TextIteratorStreamer``
    and accumulated into a new assistant turn.

    Sampling settings are fixed inside the function: up to 4096 new tokens,
    temperature 0.7, top_p 0.95, top_k 64.

    Args:
        history: Chat messages as ``{"role": ..., "content": ...}`` dicts.
            The list is extended in place with the assistant turn.

    Yields:
        ``history`` after each streamed chunk has been appended to the last
        (assistant) message.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread is
            re-raised here once the stream has been drained.
    """
    max_tokens = 4096
    temperature = 0.7
    top_p = 0.95

    input_text: str = tokenizer.apply_chat_template(
        history,
        tokenize=False,
        add_generation_prompt=True,
    )

    # Drop the first BOS occurrence from the rendered prompt.
    # NOTE(review): presumably because the tokenizer call below adds its own
    # BOS and the template already emitted one - confirm.
    # Guarded because str.replace() raises TypeError if bos_token is None.
    if tokenizer.bos_token:
        input_text = input_text.replace(tokenizer.bos_token, "", 1)
    print(input_text)

    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    prompt_ids = inputs["input_ids"][0]
    print("Decoded input:", tokenizer.decode(prompt_ids))
    # Per-token debug dump; plain ints are used as keys instead of tensors.
    print([{token_id: tokenizer.decode([token_id])} for token_id in prompt_ids.tolist()])

    # skip_prompt=True: only newly generated text is streamed back.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)

    generation_kwargs = dict(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=64,
        do_sample=True,
        streamer=streamer,
    )

    generation_errors: list[Exception] = []

    def _generate() -> None:
        # If generate() fails, the streamer never receives its end-of-stream
        # signal and the consumer loop below would block forever. Record the
        # error and close the stream explicitly so the loop terminates.
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            generation_errors.append(exc)
            streamer.end()

    # generate() blocks until completion, so it runs in a background thread
    # while this generator forwards chunks to the UI.
    thread = threading.Thread(target=_generate)
    thread.start()

    history.append({"role": "assistant", "content": ""})
    for new_text in streamer:
        history[-1]["content"] += new_text
        yield history

    # Reached only when the stream ended by itself (not when the event was
    # cancelled), so the worker is finishing and the join returns promptly.
    thread.join()
    if generation_errors:
        raise generation_errors[0]


# --- drop-in UI compatible with older Gradio versions ---
import os, tempfile, time
import gradio as gr

# Ukrainian-inspired theme with deep, muted colors reflecting unbeatable spirit.
# Built on Gradio's Soft theme; only the three hue settings below are overridden.
THEME = gr.themes.Soft(
    primary_hue="blue",      # Deep blue representing Ukrainian sky and resolve
    secondary_hue="amber",   # Warm amber representing golden fields and determination
    neutral_hue="stone",     # Earthy stone representing strength and foundation
)

def load_css():
    """Return the contents of ``static/style.css`` (UTF-8).

    The path is relative to the current working directory. When the file
    does not exist, a warning is printed and an empty string is returned.
    """
    try:
        stylesheet = open("static/style.css", "r", encoding="utf-8")
    except FileNotFoundError:
        print("Warning: static/style.css not found")
        return ""
    with stylesheet:
        return stylesheet.read()

CSS = load_css()

def _clear_chat():
    """Return reset values for the message box and the chat history.

    The first element is an empty string, the second a fresh empty list.
    """
    blank_message = ""
    blank_history = []
    return blank_message, blank_history

# Build the chat UI. Components are created in layout order; event wiring
# follows once every component exists.
with gr.Blocks(theme=THEME, css=CSS, fill_height=True) as demo:
    # Header (no gr.Box to avoid version issues)
    gr.HTML(
        """
        <div id="app-header">
          <div class="app-title">✨ LAPA</div>
          <div class="app-subtitle">LLM for Ukrainian Language</div>
        </div>
        """
    )

    with gr.Row(equal_height=True):
        # Left side: Chat
        with gr.Column(scale=7, elem_id="left-pane"):
            with gr.Column(elem_id="chat-card"):
                chatbot = gr.Chatbot(
                    type="messages",
                    height=560,
                    render_markdown=True,
                    show_copy_button=True,
                    show_label=False,
                    # Let <think> tags emitted by the model pass through.
                    allow_tags=["think"],
                    examples=[
                        {"text": i}
                        for i in [
                            "хто тримає цей район?",
                            "Напиши історію про Івасика-Телесика",
                            "Яка найвища гора в Україні?",
                            "Як звали батька Тараса Григоровича Шевченка?",
                            "Яка з цих гір не знаходиться у Європі? Говерла, Монблан, Гран-Парадізо, Еверест",
                            "Дай відповідь на питання\nЧому у качки жовті ноги?",
                        ]
                    ],
                )

            # ChatGPT-style input box with stop button
            with gr.Row(elem_id="chat-input-row"):
                msg = gr.Textbox(
                    label=None,
                    placeholder="Message… (Press Enter to send)",
                    autofocus=True,
                    lines=1,
                    max_lines=6,
                    container=False,
                    show_label=False,
                    elem_id="chat-input",
                    elem_classes=["chat-input-box"],
                )
                # Starts invisible; shown while a reply is being streamed.
                stop_btn_visible = gr.Button(
                    "⏹️",
                    variant="secondary",
                    elem_id="stop-btn-visible",
                    elem_classes=["stop-btn-chat"],
                    visible=False,
                    size="sm",
                )

            # Functional buttons.
            # NOTE(review): this row is created with visible=True; presumably
            # it is hidden via CSS on #hidden-buttons - confirm in style.css.
            with gr.Row(visible=True, elem_id="hidden-buttons"):
                send_btn = gr.Button("Send", variant="primary", elem_id="send-btn")
                stop_btn = gr.Button("Stop", variant="secondary", elem_id="stop-btn")
                clear_btn = gr.Button("Clear", variant="secondary", elem_id="clear-btn")

            gr.HTML('<div class="footer-tip">Shortcuts: Enter to send • Shift+Enter for new line</div>')

    # Helper functions for managing UI state
    def show_stop_button():
        return gr.update(visible=True)

    def hide_stop_button():
        return gr.update(visible=False)

    def _stream_reply_after(trigger):
        """Chain onto ``trigger``: show Stop, stream the bot reply, hide Stop.

        Returns the last event of the chain so it can be listed in ``cancels``.
        """
        return trigger.then(
            fn=show_stop_button, inputs=None, outputs=stop_btn_visible
        ).then(
            fn=bot, inputs=chatbot, outputs=chatbot
        ).then(
            fn=hide_stop_button, inputs=None, outputs=stop_btn_visible
        )

    # Three ways to start a turn (Enter, Send button, example click); each
    # one first records the user message and then runs the same reply chain.
    e1 = _stream_reply_after(
        msg.submit(fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=True)
    )
    e2 = _stream_reply_after(
        send_btn.click(fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=True)
    )
    e3 = _stream_reply_after(
        chatbot.example_select(fn=append_example_message, inputs=[chatbot], outputs=[chatbot], queue=True)
    )

    # Stop cancels running events (both buttons work) and hides the visible
    # stop button again.
    stop_btn.click(fn=hide_stop_button, inputs=None, outputs=stop_btn_visible, cancels=[e1, e2, e3], queue=True)
    stop_btn_visible.click(fn=hide_stop_button, inputs=None, outputs=stop_btn_visible, cancels=[e1, e2, e3], queue=True)

    # Clear chat + input
    clear_btn.click(fn=_clear_chat, inputs=None, outputs=[msg, chatbot])

    # Load and inject external JavaScript
    def load_javascript():
        """Return static/script.js wrapped in a <script> tag, or "" if missing."""
        try:
            with open("static/script.js", "r", encoding="utf-8") as f:
                return f"<script>{f.read()}</script>"
        except FileNotFoundError:
            print("Warning: static/script.js not found")
            return ""

    # NOTE(review): <script> markup delivered through an HTML component may
    # not be executed by the browser; verify the script actually runs, and
    # otherwise consider passing it via the Blocks `head`/`js` arguments.
    gr.HTML(load_javascript())

# Start the app only when this file is executed as a script; the request
# queue is enabled on `demo` before launching.
if __name__ == "__main__":
    demo.queue().launch()