# lapa / app.py
# Last commit: "Make buttons invisible" (2536b39) by VladyslavH
# File size: 9.13 kB
import os
import subprocess
# subprocess.run('pip install flash-attn==2.8.0 --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
import threading
# subprocess.check_call([os.sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])
import spaces
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from kernels import get_kernel
#vllm_flash_attn3 = get_kernel("kernels-community/vllm-flash-attn3")
#torch._dynamo.config.disable = True
# Model identifier passed to `from_pretrained` for both the tokenizer and the model.
MODEL_ID = "le-llm/lapa-v0.1-reasoning-only-32768"
def load_model():
    """Load the tokenizer and causal-LM weights for ``MODEL_ID``.

    Returns:
        A ``(model, tokenizer, device)`` tuple. ``device`` is always the
        string ``"cuda"``; actual weight placement is delegated to
        ``device_map="auto"``.
    """
    device = "cuda"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    # Keyword options for the model load, gathered in one place.
    model_options = {
        "dtype": torch.bfloat16,
        "device_map": "auto",
        "attn_implementation": "flash_attention_2",
    }
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_options)

    print("Selected device:", device)
    return model, tokenizer, device
# Load the model and tokenizer once, at import time; the resulting module-level
# objects are reused by every request.
# NOTE(review): an earlier comment here described per-request loading (for
# zeroGPU cold start/release), which is not what this statement does — confirm
# which behavior is intended.
model, tokenizer, device = load_model()
def user(user_message, history: list):
    """Append the user's message to the chat history and clear the input box.

    Args:
        user_message: Text taken from the input textbox.
        history: Existing chat turns as ``{"role", "content"}`` dicts. ``None``
            is treated as an empty conversation. The argument is never mutated.

    Returns:
        A ``(textbox_value, new_history)`` tuple: an empty string that resets
        the textbox, and a new list holding the previous turns plus the new
        user turn.
    """
    previous_turns = history or []
    return "", [*previous_turns, {"role": "user", "content": user_message}]
def append_example_message(x: gr.SelectData, history):
    """Add the clicked example prompt to the chat history as a user turn.

    Args:
        x: Selection event; ``x.value`` is read as a dict whose ``"text"``
            entry (if present and not ``None``) is the example prompt.
        history: Existing chat turns. ``None`` is treated as empty. The
            argument is not mutated, matching how ``user`` builds its result.

    Returns:
        A new history list, with the example appended as a user message when
        the selection carries text; otherwise an unchanged copy.
    """
    # Single log line for the selected example.
    print("Example selected:", x.value)

    example_text = x.value.get("text")
    turns = list(history or [])
    if example_text is not None:
        turns.append({"role": "user", "content": example_text})
    return turns
@spaces.GPU
def bot(
    history: list[dict[str, str]],
    # max_tokens,
    # temperature,
    # top_p,
):
    """Stream the assistant's reply for the conversation in ``history``.

    The chat template is rendered to text, tokenized, and passed to
    ``model.generate`` on a background thread. Decoded text chunks are read
    from a ``TextIteratorStreamer`` and appended to a new assistant turn.

    Args:
        history: Chat turns as ``{"role", "content"}`` dicts. An assistant
            turn is appended to this list and filled in incrementally.

    Yields:
        The history list after each streamed chunk has been appended.

    Raises:
        Exception: Any error raised by ``model.generate`` on the worker thread
            is re-raised here once the stream has been drained.
    """
    if not history:
        # Nothing to answer; hand back an empty conversation unchanged.
        yield history or []
        return

    max_tokens = 4096
    temperature = 0.7
    top_p = 0.95

    # Build conversation
    input_text: str = tokenizer.apply_chat_template(
        history,
        tokenize=False,
        add_generation_prompt=True,
        # enable_thinking=True,
    )
    # Drop the first BOS occurrence from the rendered prompt — presumably
    # because the tokenizer call below adds its own (TODO confirm). Guarded so
    # a tokenizer without a BOS token does not crash str.replace.
    if tokenizer.bos_token:
        input_text = input_text.replace(tokenizer.bos_token, "", 1)
    print(input_text)

    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    input_ids = inputs["input_ids"][0]
    print("Decoded input:", tokenizer.decode(input_ids))
    # Per-token debug dump; plain ints keep the dict keys readable.
    print(
        [{token_id: tokenizer.decode([token_id])} for token_id in input_ids.tolist()]
    )

    # Streamer setup: skip_prompt=True so only newly generated text is emitted.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)

    generation_kwargs = dict(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=64,
        do_sample=True,
        # eos_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )

    worker_errors = []

    def _generate():
        # If generate() fails, the streamer never receives its end-of-stream
        # signal and the consumer loop below would block forever. Record the
        # error and close the stream so the caller can re-raise it.
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            worker_errors.append(exc)
            streamer.end()

    # Run model.generate in background thread
    thread = threading.Thread(target=_generate)
    thread.start()

    history.append({"role": "assistant", "content": ""})
    # Yield tokens as they come in
    for new_text in streamer:
        history[-1]["content"] += new_text
        yield history

    # The stream is finished, so the worker is done or about to be.
    thread.join()
    if worker_errors:
        raise worker_errors[0]
# --- drop-in UI compatible with older Gradio versions ---
import os, tempfile, time
import gradio as gr
# Ukrainian-inspired theme with deep, muted colors reflecting unbeatable spirit.
# Built on Gradio's Soft theme; only the three hue settings are overridden.
THEME = gr.themes.Soft(
    primary_hue="blue", # Deep blue representing Ukrainian sky and resolve
    secondary_hue="amber", # Warm amber representing golden fields and determination
    neutral_hue="stone", # Earthy stone representing strength and foundation
)
# Load CSS from external file
def load_css(path: str = "static/style.css") -> str:
    """Return the stylesheet text stored at ``path``.

    Args:
        path: Location of the CSS file. Defaults to ``static/style.css``,
            resolved against the current working directory.

    Returns:
        The file contents decoded as UTF-8, or an empty string when the file
        does not exist (a warning is printed in that case).
    """
    try:
        with open(path, "r", encoding="utf-8") as css_file:
            return css_file.read()
    except FileNotFoundError:
        # Styling is optional: warn and fall back to no custom CSS.
        print(f"Warning: {path} not found")
        return ""
# Stylesheet text read once at import time; passed to gr.Blocks(css=...) below.
CSS = load_css()
def _clear_chat():
    """Return reset values for the input textbox and the chat history.

    Returns:
        A ``(textbox_value, history)`` tuple: an empty string and an empty list.
    """
    blank_input = ""
    empty_history = []
    return blank_input, empty_history
# Build the chat UI and wire its events.
# NOTE(review): the container nesting below was reconstructed from a copy of
# this file whose indentation was lost — confirm the layout renders as intended.
with gr.Blocks(theme=THEME, css=CSS, fill_height=True) as demo:
    # Header (no gr.Box to avoid version issues)
    gr.HTML(
        """
<div id="app-header">
<div class="app-title">✨ LAPA</div>
<div class="app-subtitle">LLM for Ukrainian Language</div>
</div>
"""
    )

    with gr.Row(equal_height=True):
        # Left side: Chat
        with gr.Column(scale=7, elem_id="left-pane"):
            with gr.Column(elem_id="chat-card"):
                chatbot = gr.Chatbot(
                    type="messages",
                    height=560,
                    render_markdown=True,
                    show_copy_button=True,
                    show_label=False,
                    # likeable=True,
                    allow_tags=["think"],
                    examples=[
                        {"text": prompt}
                        for prompt in [
                            "хто тримає цей район?",
                            "Напиши історію про Івасика-Телесика",
                            "Яка найвища гора в Україні?",
                            "Як звали батька Тараса Григоровича Шевченка?",
                            "Яка з цих гір не знаходиться у Європі? Говерла, Монблан, Гран-Парадізо, Еверест",
                            "Дай відповідь на питання\nЧому у качки жовті ноги?",
                        ]
                    ],
                )

                # ChatGPT-style input box with stop button
                with gr.Row(elem_id="chat-input-row"):
                    msg = gr.Textbox(
                        label=None,
                        placeholder="Message… (Press Enter to send)",
                        autofocus=True,
                        lines=1,
                        max_lines=6,
                        container=False,
                        show_label=False,
                        elem_id="chat-input",
                        elem_classes=["chat-input-box"],
                    )
                    stop_btn_visible = gr.Button(
                        "⏹️",
                        variant="secondary",
                        elem_id="stop-btn-visible",
                        elem_classes=["stop-btn-chat"],
                        visible=False,
                        size="sm",
                    )

                # Hidden buttons for functionality.
                # NOTE(review): the row is created with visible=True, so any
                # hiding presumably happens in the stylesheet via
                # #hidden-buttons — confirm.
                with gr.Row(visible=True, elem_id="hidden-buttons"):
                    send_btn = gr.Button("Send", variant="primary", elem_id="send-btn")
                    stop_btn = gr.Button("Stop", variant="secondary", elem_id="stop-btn")
                    clear_btn = gr.Button("Clear", variant="secondary", elem_id="clear-btn")

            gr.HTML('<div class="footer-tip">Shortcuts: Enter to send • Shift+Enter for new line</div>')

    # Helper functions for managing UI state
    def show_stop_button():
        """Return an update that makes the in-chat stop button visible."""
        return gr.update(visible=True)

    def hide_stop_button():
        """Return an update that hides the in-chat stop button."""
        return gr.update(visible=False)

    def _stream_reply_after(trigger):
        """Chain show-stop -> bot -> hide-stop after ``trigger``.

        Returns:
            ``(bot_step, final_step)``: the listener running the streaming
            ``bot`` generator, and the last listener in the chain.
        """
        bot_step = trigger.then(
            fn=show_stop_button, inputs=None, outputs=stop_btn_visible
        ).then(fn=bot, inputs=chatbot, outputs=chatbot)
        final_step = bot_step.then(
            fn=hide_stop_button, inputs=None, outputs=stop_btn_visible
        )
        return bot_step, final_step

    # Events: each entry point records the user turn, then streams the reply.
    bot_from_submit, e1 = _stream_reply_after(
        msg.submit(fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=True)
    )
    bot_from_send, e2 = _stream_reply_after(
        send_btn.click(fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=True)
    )
    bot_from_example, e3 = _stream_reply_after(
        chatbot.example_select(
            fn=append_example_message, inputs=[chatbot], outputs=[chatbot], queue=True
        )
    )

    # `cancels` acts on the specific listeners it is given. e1/e2/e3 are the
    # final hide-button steps, so the streaming `bot` steps must be listed
    # explicitly for Stop to interrupt generation.
    cancellable_events = [bot_from_submit, bot_from_send, bot_from_example, e1, e2, e3]

    # Stop cancels running events (both buttons work)
    stop_btn.click(
        fn=hide_stop_button,
        inputs=None,
        outputs=stop_btn_visible,
        cancels=cancellable_events,
        queue=True,
    )
    stop_btn_visible.click(
        fn=hide_stop_button,
        inputs=None,
        outputs=stop_btn_visible,
        cancels=cancellable_events,
        queue=True,
    )

    # Clear chat + input. A reply that is still streaming is cancelled first;
    # otherwise its next chunk would write the old conversation back.
    clear_btn.click(
        fn=_clear_chat,
        inputs=None,
        outputs=[msg, chatbot],
        cancels=cancellable_events,
    ).then(fn=hide_stop_button, inputs=None, outputs=stop_btn_visible)

    # Load and inject external JavaScript
    def load_javascript():
        """Return ``static/script.js`` wrapped in a ``<script>`` tag, or ""."""
        try:
            with open("static/script.js", "r", encoding="utf-8") as f:
                return f"<script>{f.read()}</script>"
        except FileNotFoundError:
            print("Warning: static/script.js not found")
            return ""

    # NOTE(review): browsers generally do not execute <script> tags that are
    # inserted as HTML content after page load. If script.js is not running,
    # consider supplying it through gr.Blocks(head=...) or js=... — confirm.
    gr.HTML(load_javascript())
if __name__ == "__main__":
    # Script entry point: call queue() on the app, then launch() on the
    # object it returns.
    queued_demo = demo.queue()
    queued_demo.launch()