import os
import subprocess
# subprocess.run('pip install flash-attn==2.8.0 --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
import threading
# subprocess.check_call([os.sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])
import spaces
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from kernels import get_kernel

# vllm_flash_attn3 = get_kernel("kernels-community/vllm-flash-attn3")
# torch._dynamo.config.disable = True

MODEL_ID = "le-llm/lapa-v0.1-reasoning-only-32768"


def load_model():
    """Load model & tokenizer (kept in a function for zeroGPU)."""
    device = "cuda"  # if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        dtype=torch.bfloat16,  # if device == "cuda" else torch.float32,
        device_map="auto",  # if device == "cuda" else None,
        attn_implementation="flash_attention_2",  # or "kernels-community/vllm-flash-attn3"
    )
    print("Selected device:", device)
    return model, tokenizer, device


# Loaded once at startup; zeroGPU attaches a GPU only while a @spaces.GPU
# function runs and releases it afterwards.
model, tokenizer, device = load_model()


def user(user_message, history: list):
    """Append the user's message to the history and clear the textbox."""
    return "", history + [{"role": "user", "content": user_message}]


def append_example_message(x: gr.SelectData, history):
    """Append a clicked example prompt to the chat history."""
    print("Example selected:", x.value)
    if x.value["text"] is not None:
        history.append({"role": "user", "content": x.value["text"]})
    return history


@spaces.GPU
def bot(
    history: list[dict[str, str]],
    # max_tokens,
    # temperature,
    # top_p,
):
    # Build the conversation prompt.
    # [{"role": "system", "content": system_message}] +
    max_tokens = 4096
    temperature = 0.7
    top_p = 0.95
    input_text: str = tokenizer.apply_chat_template(
        history,
        tokenize=False,
        add_generation_prompt=True,
        # enable_thinking=True,
    )
    # Drop the leading BOS from the text; tokenizer() below adds its own.
    input_text = input_text.replace(tokenizer.bos_token, "", 1)
    print(input_text)
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Debug: inspect the exact prompt and its tokenization.
    print("Decoded input:", tokenizer.decode(inputs["input_ids"][0]))
    print([{tok_id: tokenizer.decode([tok_id])} for tok_id in inputs["input_ids"][0].tolist()])

    # Streamer setup
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        # skip_special_tokens=True,
    )

    # Run model.generate in a background thread so we can stream from here
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=64,
        do_sample=True,
        # eos_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    history.append({"role": "assistant", "content": ""})
    # Yield tokens as they come in
    for new_text in streamer:
        history[-1]["content"] += new_text
        yield history


# --- drop-in UI compatible with older Gradio versions ---
import tempfile
import time

# Ukrainian-inspired theme with deep, muted colors reflecting unbeatable spirit:
THEME = gr.themes.Soft(
    primary_hue="blue",  # deep blue representing Ukrainian sky and resolve
    secondary_hue="amber",  # warm amber representing golden fields and determination
    neutral_hue="stone",  # earthy stone representing strength and foundation
)


# Load CSS from external file
def load_css():
    try:
        with open("static/style.css", "r", encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        print("Warning: static/style.css not found")
        return ""


CSS = load_css()


def _clear_chat():
    return "", []
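
# The commented-out "Export chat (.md)" wiring further down references an
# `_export_markdown` helper that is not defined in this file. The sketch below
# is a hypothetical implementation (it assumes the same {"role", "content"}
# message dicts used throughout this app, with plain-string contents); review
# it before re-enabling the export button and its gr.File output.
def _export_markdown(history: list[dict[str, str]]):
    """Write the chat history to a temporary .md file and return its path."""
    lines = []
    for message in history or []:
        role = message.get("role", "user").capitalize()
        content = message.get("content", "")  # assumes plain-string content
        lines.append(f"**{role}:**\n\n{content}\n")
    path = os.path.join(tempfile.gettempdir(), f"chat-{int(time.time())}.md")
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines))
    return path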
with gr.Blocks(theme=THEME, css=CSS, fill_height=True) as demo:
    # Header (no gr.Box, to avoid version issues)
    gr.HTML(
        """
        <div class="header">
            <h1>✨ LAPA</h1>
            <p>LLM for Ukrainian Language</p>
        </div>
""" ) with gr.Row(equal_height=True): # Left side: Chat with gr.Column(scale=7, elem_id="left-pane"): with gr.Column(elem_id="chat-card"): chatbot = gr.Chatbot( type="messages", height=560, render_markdown=True, show_copy_button=True, show_label=False, # likeable=True, allow_tags=["think"], examples=[ {"text": i} for i in [ "хто тримає цей район?", "Напиши історію про Івасика-Телесика", "Яка найвища гора в Україні?", "Як звали батька Тараса Григоровича Шевченка?", "Яка з цих гір не знаходиться у Європі? Говерла, Монблан, Гран-Парадізо, Еверест", "Дай відповідь на питання\nЧому у качки жовті ноги?", ] ], ) # ChatGPT-style input box with stop button with gr.Row(elem_id="chat-input-row"): msg = gr.Textbox( label=None, placeholder="Message… (Press Enter to send)", autofocus=True, lines=1, max_lines=6, container=False, show_label=False, elem_id="chat-input", elem_classes=["chat-input-box"] ) stop_btn_visible = gr.Button( "⏹️", variant="secondary", elem_id="stop-btn-visible", elem_classes=["stop-btn-chat"], visible=False, size="sm" ) # Hidden buttons for functionality with gr.Row(visible=True, elem_id="hidden-buttons"): send_btn = gr.Button("Send", variant="primary", elem_id="send-btn") stop_btn = gr.Button("Stop", variant="secondary", elem_id="stop-btn") clear_btn = gr.Button("Clear", variant="secondary", elem_id="clear-btn") # export_btn = gr.Button("Export chat (.md)", variant="secondary", elem_classes=["rounded-btn","secondary-btn"]) # exported_file = gr.File(label="", interactive=False, visible=True) gr.HTML('') # Helper functions for managing UI state def show_stop_button(): return gr.update(visible=True) def hide_stop_button(): return gr.update(visible=False) # Events (preserve your original handlers) e1 = msg.submit(fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=True).then( fn=show_stop_button, inputs=None, outputs=stop_btn_visible ).then( fn=bot, inputs=chatbot, outputs=chatbot ).then( fn=hide_stop_button, inputs=None, outputs=stop_btn_visible ) e2 = send_btn.click(fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=True).then( fn=show_stop_button, inputs=None, outputs=stop_btn_visible ).then( fn=bot, inputs=chatbot, outputs=chatbot ).then( fn=hide_stop_button, inputs=None, outputs=stop_btn_visible ) e3 = chatbot.example_select(fn=append_example_message, inputs=[chatbot], outputs=[chatbot], queue=True).then( fn=show_stop_button, inputs=None, outputs=stop_btn_visible ).then( fn=bot, inputs=chatbot, outputs=chatbot ).then( fn=hide_stop_button, inputs=None, outputs=stop_btn_visible ) # Stop cancels running events (both buttons work) stop_btn.click(fn=hide_stop_button, inputs=None, outputs=stop_btn_visible, cancels=[e1, e2, e3], queue=True) stop_btn_visible.click(fn=hide_stop_button, inputs=None, outputs=stop_btn_visible, cancels=[e1, e2, e3], queue=True) # Clear chat + input clear_btn.click(fn=_clear_chat, inputs=None, outputs=[msg, chatbot]) # Export markdown # export_btn.click(fn=_export_markdown, inputs=chatbot, outputs=exported_file) # Load and inject external JavaScript def load_javascript(): try: with open("static/script.js", "r", encoding="utf-8") as f: return f"" except FileNotFoundError: print("Warning: static/script.js not found") return "" gr.HTML(load_javascript()) if __name__ == "__main__": demo.queue().launch()