import os
import subprocess
import threading

# Optional startup installs for Spaces, left disabled:
# subprocess.run('pip install flash-attn==2.8.0 --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
# subprocess.check_call([os.sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])

import spaces
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from kernels import get_kernel

# vllm_flash_attn3 = get_kernel("kernels-community/vllm-flash-attn3")
# torch._dynamo.config.disable = True
MODEL_ID = "le-llm/lapa-v0.1-reasoning-only-32768"


def load_model():
    """Load model & tokenizer (the GPU itself is attached lazily on ZeroGPU)."""
    device = "cuda"  # ZeroGPU always exposes a CUDA device
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        dtype=torch.bfloat16,
        device_map="auto",
        attn_implementation="flash_attention_2",
    )
    print("Selected device:", device)
    return model, tokenizer, device


# Load model/tokenizer once at import time; ZeroGPU attaches the GPU per
# request and releases it afterwards.
model, tokenizer, device = load_model()
def user(user_message, history: list):
    """Append the user's message to the history and clear the input box."""
    return "", history + [{"role": "user", "content": user_message}]


def append_example_message(x: gr.SelectData, history):
    """Append a clicked example prompt to the chat history."""
    if x.value["text"] is not None:
        history.append({"role": "user", "content": x.value["text"]})
    return history
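# Note: the original imports `spaces` but never applies it. On ZeroGPU,
# functions that need the GPU are normally wrapped in @spaces.GPU so a device
# is attached for the duration of the call; added here as the likely intent.
@spaces.GPU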
def bot(history: list[dict[str, str]]):
    """Generate the assistant's reply and stream it into the chat."""
    max_tokens = 4096
    temperature = 0.7
    top_p = 0.95

    # Build the prompt from the conversation history
    input_text: str = tokenizer.apply_chat_template(
        history,
        tokenize=False,
        add_generation_prompt=True,
    )
    # The chat template already emits BOS and the tokenizer would add another
    # one during encoding, so strip the first occurrence here.
    input_text = input_text.replace(tokenizer.bos_token, "", 1)
    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

    # Streamer yields decoded text fragments as generation progresses
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)

    # Run model.generate in a background thread so this generator can stream
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=64,
        do_sample=True,
        streamer=streamer,
    )
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    history.append({"role": "assistant", "content": ""})
    # Yield the growing history so Gradio re-renders the chat incrementally
    for new_text in streamer:
        history[-1]["content"] += new_text
        yield history
# --- drop-in UI compatible with older Gradio versions ---
import tempfile
import time  # tempfile/time kept for the commented-out chat export helper below

# Ukrainian-inspired theme with deep, muted colors reflecting an unbeatable spirit:
THEME = gr.themes.Soft(
    primary_hue="blue",     # deep blue for the Ukrainian sky and resolve
    secondary_hue="amber",  # warm amber for golden fields and determination
    neutral_hue="stone",    # earthy stone for strength and foundation
)
# Load CSS from an external file so styling can be edited separately
def load_css():
    try:
        with open("static/style.css", "r", encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        print("Warning: static/style.css not found")
        return ""


CSS = load_css()


def _clear_chat():
    """Reset the input box and the chat history."""
    return "", []
with gr.Blocks(theme=THEME, css=CSS, fill_height=True) as demo:
    # Header (plain HTML, no gr.Box, to avoid version issues)
    gr.HTML(
        """
        <div id="app-header">
            <div class="app-title">✨ LAPA</div>
            <div class="app-subtitle">LLM for Ukrainian Language</div>
        </div>
        """
    )

    with gr.Row(equal_height=True):
        # Left side: chat
        with gr.Column(scale=7, elem_id="left-pane"):
            with gr.Column(elem_id="chat-card"):
                chatbot = gr.Chatbot(
                    type="messages",
                    height=560,
                    render_markdown=True,
                    show_copy_button=True,
                    show_label=False,
                    allow_tags=["think"],
                    examples=[
                        {"text": i}
                        for i in [
                            "хто тримає цей район?",
                            "Напиши історію про Івасика-Телесика",
                            "Яка найвища гора в Україні?",
                            "Як звали батька Тараса Григоровича Шевченка?",
                            "Яка з цих гір не знаходиться у Європі? Говерла, Монблан, Гран-Парадізо, Еверест",
                            "Дай відповідь на питання\nЧому у качки жовті ноги?",
                        ]
                    ],
                )

                # ChatGPT-style input box with a stop button
                with gr.Row(elem_id="chat-input-row"):
                    msg = gr.Textbox(
                        label=None,
                        placeholder="Message… (Press Enter to send)",
                        autofocus=True,
                        lines=1,
                        max_lines=6,
                        container=False,
                        show_label=False,
                        elem_id="chat-input",
                        elem_classes=["chat-input-box"],
                    )
                    stop_btn_visible = gr.Button(
                        "⏹️",
                        variant="secondary",
                        elem_id="stop-btn-visible",
                        elem_classes=["stop-btn-chat"],
                        visible=False,
                        size="sm",
                    )

                # Hidden buttons that back the styled UI
                with gr.Row(visible=True, elem_id="hidden-buttons"):
                    send_btn = gr.Button("Send", variant="primary", elem_id="send-btn")
                    stop_btn = gr.Button("Stop", variant="secondary", elem_id="stop-btn")
                    clear_btn = gr.Button("Clear", variant="secondary", elem_id="clear-btn")
                    # export_btn = gr.Button("Export chat (.md)", variant="secondary", elem_classes=["rounded-btn", "secondary-btn"])
                    # exported_file = gr.File(label="", interactive=False, visible=True)

                gr.HTML('<div class="footer-tip">Shortcuts: Enter to send • Shift+Enter for new line</div>')
    # Helpers for toggling the stop button while a reply is generating
    def show_stop_button():
        return gr.update(visible=True)

    def hide_stop_button():
        return gr.update(visible=False)
    # Events (original handlers preserved)
    e1 = msg.submit(
        fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=True
    ).then(
        fn=show_stop_button, inputs=None, outputs=stop_btn_visible
    ).then(
        fn=bot, inputs=chatbot, outputs=chatbot
    ).then(
        fn=hide_stop_button, inputs=None, outputs=stop_btn_visible
    )
    e2 = send_btn.click(
        fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=True
    ).then(
        fn=show_stop_button, inputs=None, outputs=stop_btn_visible
    ).then(
        fn=bot, inputs=chatbot, outputs=chatbot
    ).then(
        fn=hide_stop_button, inputs=None, outputs=stop_btn_visible
    )
    e3 = chatbot.example_select(
        fn=append_example_message, inputs=[chatbot], outputs=[chatbot], queue=True
    ).then(
        fn=show_stop_button, inputs=None, outputs=stop_btn_visible
    ).then(
        fn=bot, inputs=chatbot, outputs=chatbot
    ).then(
        fn=hide_stop_button, inputs=None, outputs=stop_btn_visible
    )

    # Stop cancels the running events (both buttons work)
    stop_btn.click(fn=hide_stop_button, inputs=None, outputs=stop_btn_visible, cancels=[e1, e2, e3], queue=True)
    stop_btn_visible.click(fn=hide_stop_button, inputs=None, outputs=stop_btn_visible, cancels=[e1, e2, e3], queue=True)

    # Clear chat + input
    clear_btn.click(fn=_clear_chat, inputs=None, outputs=[msg, chatbot])

    # Export markdown (disabled)
    # export_btn.click(fn=_export_markdown, inputs=chatbot, outputs=exported_file)
    # Load and inject external JavaScript
    def load_javascript():
        try:
            with open("static/script.js", "r", encoding="utf-8") as f:
                return f"<script>{f.read()}</script>"
        except FileNotFoundError:
            print("Warning: static/script.js not found")
            return ""

    gr.HTML(load_javascript())
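# Note: queuing must stay enabled; streaming generator outputs and the
# `cancels=` wiring above both rely on Gradio's queue.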
if __name__ == "__main__":
    demo.queue().launch()