Spaces:

lapa-llm
/

lapa

Running on Zero

File size: 9,134 Bytes

import os
import subprocess

# subprocess.run('pip install flash-attn==2.8.0 --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)

import threading

# subprocess.check_call([os.sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])

import spaces
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from kernels import get_kernel

#vllm_flash_attn3 = get_kernel("kernels-community/vllm-flash-attn3")

#torch._dynamo.config.disable = True

# Model identifier handed to both AutoTokenizer.from_pretrained and
# AutoModelForCausalLM.from_pretrained in load_model().
MODEL_ID = "le-llm/lapa-v0.1-reasoning-only-32768"


def load_model():
    """Load the tokenizer and the causal LM identified by ``MODEL_ID``.

    The model is requested in bfloat16 with ``device_map="auto"`` and the
    ``flash_attention_2`` attention implementation.

    Returns:
        A ``(model, tokenizer, device)`` tuple. ``device`` is the fixed
        string ``"cuda"``; it is not read back from the loaded model.
    """
    target_device = "cuda"

    tok = AutoTokenizer.from_pretrained(MODEL_ID)

    model_options = {
        "dtype": torch.bfloat16,
        "device_map": "auto",
        "attn_implementation": "flash_attention_2",
    }
    lm = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_options)

    print("Selected device:", target_device)
    return lm, tok, target_device


# Load model/tokenizer once, when this module is imported; bot() reuses these
# module-level objects for every request.
# NOTE(review): presumably the @spaces.GPU decorator on bot() is what attaches
# the GPU per request on ZeroGPU — confirm against the `spaces` documentation.
model, tokenizer, device = load_model()


def user(user_message, history: list):
    """Add the user's message to the chat and blank the input box.

    Args:
        user_message: Text taken from the message box.
        history: Current chat messages; it is not modified.

    Returns:
        A ``("", new_history)`` pair: the empty string for the input box and
        a new list equal to ``history`` plus one ``{"role": "user", ...}``
        entry.
    """
    new_turn = {"role": "user", "content": user_message}
    extended_history = history + [new_turn]
    return "", extended_history


def append_example_message(x: gr.SelectData, history):
    """Append the selected example prompt to ``history`` as a user message.

    Args:
        x: Selection event data; ``x.value["text"]`` is the example's text.
        history: Chat messages; modified in place when the text is not None.

    Returns:
        The same ``history`` list that was passed in.
    """
    # Debug output of the raw selection event.
    print(x)
    selection = x.value
    print(selection)
    example_text = selection["text"]
    print(example_text)

    if example_text is None:
        return history

    history.append({"role": "user", "content": example_text})
    return history


@spaces.GPU
def bot(
    history: list[dict[str, str]],
    # max_tokens,
    # temperature,
    # top_p,
):
    """Stream the assistant's reply to the conversation in ``history``.

    The conversation is rendered with the tokenizer's chat template,
    ``model.generate`` runs in a background thread, and the decoded text is
    appended piece by piece to a new assistant message at the end of
    ``history``.

    Args:
        history: Chat messages as ``{"role": ..., "content": ...}`` dicts.
            An assistant message is appended to this list in place.

    Yields:
        ``history`` after each streamed piece of text has been added to the
        last (assistant) message.

    Raises:
        Exception: Whatever ``model.generate`` raised in the worker thread,
            re-raised here once streaming has ended.
    """
    # Function-scope import: these two names are needed only here and the
    # file-level transformers import does not bring them in.
    from transformers import StoppingCriteria, StoppingCriteriaList

    # Sampling settings are fixed; the commented-out parameters in the
    # signature are the former knobs for them.
    max_tokens = 4096
    temperature = 0.7
    top_p = 0.95

    input_text: str = tokenizer.apply_chat_template(
        history,
        tokenize=False,
        add_generation_prompt=True,
        # enable_thinking=True,
    )

    # Drop the first BOS occurrence from the rendered prompt (presumably
    # because the tokenizer call below adds its own — confirm). Tokenizers
    # without a BOS token report None, which str.replace() would reject.
    bos_token = tokenizer.bos_token
    if bos_token:
        input_text = input_text.replace(bos_token, "", 1)
    print(input_text)

    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    prompt_ids = inputs["input_ids"][0]
    print("Decoded input:", tokenizer.decode(prompt_ids))
    # Plain ints (not tensors) as keys, and no shadowing of the builtin `id`.
    print([{token_id: tokenizer.decode([token_id])} for token_id in prompt_ids.tolist()])

    # Streamer setup
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True  # skip_special_tokens=True
    )

    # Set when this generator finishes or is closed early (e.g. the request
    # is cancelled), so the worker stops producing tokens nobody will read.
    stop_requested = threading.Event()

    class _StopOnEvent(StoppingCriteria):
        """Stopping criterion that fires once ``stop_requested`` is set."""

        def __call__(self, input_ids, scores, **kwargs):
            return torch.full(
                (input_ids.shape[0],),
                stop_requested.is_set(),
                dtype=torch.bool,
                device=input_ids.device,
            )

    generation_kwargs = dict(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=64,
        do_sample=True,
        # eos_token_id=tokenizer.eos_token_id,
        streamer=streamer,
        stopping_criteria=StoppingCriteriaList([_StopOnEvent()]),
    )

    worker_errors = []

    def _generate():
        """Run generation; on failure, record the error and close the stream."""
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:  # re-raised in the consumer below
            worker_errors.append(exc)
            # Without this the consumer loop would wait on the streamer
            # forever, because generate() never reached its own end().
            streamer.end()

    # Run model.generate in a background thread so tokens can be yielded
    # while generation is still in progress.
    thread = threading.Thread(target=_generate)
    thread.start()

    history.append({"role": "assistant", "content": ""})
    try:
        # Yield the growing conversation as text comes in.
        for new_text in streamer:
            history[-1]["content"] += new_text
            yield history
    finally:
        # Runs on normal completion and when the generator is closed early.
        stop_requested.set()
        thread.join()

    if worker_errors:
        raise worker_errors[0]


# --- drop-in UI compatible with older Gradio versions ---
import os, tempfile, time
import gradio as gr

# Ukrainian-inspired theme with deep, muted colors reflecting unbeatable spirit.
# Passed to gr.Blocks(theme=...) when the UI is built.
THEME = gr.themes.Soft(
    primary_hue="blue",      # Deep blue representing Ukrainian sky and resolve
    secondary_hue="amber",   # Warm amber representing golden fields and determination  
    neutral_hue="stone",     # Earthy stone representing strength and foundation
)

# Load CSS from external file
def load_css(path: str = "static/style.css") -> str:
    """Return the contents of the stylesheet at ``path``.

    Args:
        path: Location of the CSS file, read as UTF-8. Defaults to the
            app's ``static/style.css``, so existing zero-argument calls
            behave as before.

    Returns:
        The file's text, or ``""`` (after printing a warning) when the file
        does not exist.
    """
    try:
        with open(path, "r", encoding="utf-8") as css_file:
            return css_file.read()
    except FileNotFoundError:
        # A missing stylesheet is not fatal: the app runs unstyled.
        print(f"Warning: {path} not found")
        return ""


CSS = load_css()

def _clear_chat():
    """Return reset values for the message box and the chat history.

    Returns:
        ``("", [])`` — an empty input string and a fresh empty message list.
    """
    blank_message = ""
    blank_history = []
    return blank_message, blank_history

# Build the UI: header, chat pane, input row, and control buttons.
# The event wiring that uses these components follows further down in this
# same `with` block.
with gr.Blocks(theme=THEME, css=CSS, fill_height=True) as demo:
    # Header (no gr.Box to avoid version issues)
    gr.HTML(
        """
        <div id="app-header">
          <div class="app-title">✨ LAPA</div>
          <div class="app-subtitle">LLM for Ukrainian Language</div>
        </div>
        """
    )

    with gr.Row(equal_height=True):
        # Left side: Chat
        with gr.Column(scale=7, elem_id="left-pane"):
            with gr.Column(elem_id="chat-card"):
                # Chat history in "messages" format, i.e. the
                # {"role": ..., "content": ...} dicts that user() and bot()
                # produce.
                chatbot = gr.Chatbot(
                    type="messages",
                    height=560,
                    render_markdown=True,
                    show_copy_button=True,
                    show_label=False,
                    # likeable=True,
                    # NOTE(review): presumably lets the model's <think>
                    # reasoning tags through to the rendered output — confirm.
                    allow_tags=["think"],
                    # Example prompts (in Ukrainian); selecting one is handled
                    # by append_example_message via chatbot.example_select.
                    examples=[
                        {"text": i}
                        for i in [
                            "хто тримає цей район?",
                            "Напиши історію про Івасика-Телесика",
                            "Яка найвища гора в Україні?",
                            "Як звали батька Тараса Григоровича Шевченка?",
                            "Яка з цих гір не знаходиться у Європі? Говерла, Монблан, Гран-Парадізо, Еверест",
                            "Дай відповідь на питання\nЧому у качки жовті ноги?",
                        ]
                    ],
                )

            # ChatGPT-style input box with stop button
            with gr.Row(elem_id="chat-input-row"):
                msg = gr.Textbox(
                    label=None,
                    placeholder="Message… (Press Enter to send)",
                    autofocus=True,
                    lines=1,
                    max_lines=6,
                    container=False,
                    show_label=False,
                    elem_id="chat-input",
                    elem_classes=["chat-input-box"]
                )
                # Starts hidden; show_stop_button / hide_stop_button toggle
                # its visibility around each bot() run.
                stop_btn_visible = gr.Button(
                    "⏹️", 
                    variant="secondary", 
                    elem_id="stop-btn-visible",
                    elem_classes=["stop-btn-chat"],
                    visible=False,
                    size="sm"
                )
            
            # Hidden buttons for functionality
            # NOTE(review): this row is created with visible=True, so any
            # hiding must come from CSS/JS targeting #hidden-buttons —
            # confirm against static/style.css and static/script.js.
            with gr.Row(visible=True, elem_id="hidden-buttons"):
                send_btn = gr.Button("Send", variant="primary", elem_id="send-btn")
                stop_btn = gr.Button("Stop", variant="secondary", elem_id="stop-btn")
                clear_btn = gr.Button("Clear", variant="secondary", elem_id="clear-btn")

            # export_btn = gr.Button("Export chat (.md)", variant="secondary", elem_classes=["rounded-btn","secondary-btn"])
            # exported_file = gr.File(label="", interactive=False, visible=True)
            gr.HTML('<div class="footer-tip">Shortcuts: Enter to send • Shift+Enter for new line</div>')

    # Helper functions for managing UI state
    def show_stop_button():
        """Return a component update that sets ``visible=True``.

        Wired to ``stop_btn_visible`` so the stop button appears while a
        reply is being generated.
        """
        reveal = gr.update(visible=True)
        return reveal
    
    def hide_stop_button():
        """Return a component update that sets ``visible=False``.

        Wired to ``stop_btn_visible`` so the stop button disappears once a
        reply has finished or been stopped.
        """
        conceal = gr.update(visible=False)
        return conceal

    # Events
    def _reply_after(trigger):
        """Chain the reply pipeline after ``trigger`` and return the bot step.

        The pipeline is: show the stop button, stream ``bot`` into the
        chatbot, then hide the stop button again.

        Args:
            trigger: The event returned by the listener that put the user's
                message into the chat (submit, click, or example select).

        Returns:
            The event for the ``bot`` step, not the trailing hide step.
            ``cancels=[...]`` acts on the specific listeners it is given, so
            the Stop buttons need this one to interrupt generation.
        """
        bot_step = trigger.then(
            fn=show_stop_button, inputs=None, outputs=stop_btn_visible
        ).then(
            fn=bot, inputs=chatbot, outputs=chatbot
        )
        bot_step.then(fn=hide_stop_button, inputs=None, outputs=stop_btn_visible)
        return bot_step

    # Enter in the textbox, the Send button, and picking an example all start
    # the same reply pipeline.
    e1 = _reply_after(
        msg.submit(fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=True)
    )
    e2 = _reply_after(
        send_btn.click(fn=user, inputs=[msg, chatbot], outputs=[msg, chatbot], queue=True)
    )
    e3 = _reply_after(
        chatbot.example_select(fn=append_example_message, inputs=[chatbot], outputs=[chatbot], queue=True)
    )

    # Stop cancels the running bot steps (both buttons work) and hides the
    # visible stop button itself, in case the trailing hide step does not run
    # after a cancellation.
    stop_btn.click(fn=hide_stop_button, inputs=None, outputs=stop_btn_visible, cancels=[e1, e2, e3], queue=True)
    stop_btn_visible.click(fn=hide_stop_button, inputs=None, outputs=stop_btn_visible, cancels=[e1, e2, e3], queue=True)

    # Clear chat + input
    clear_btn.click(fn=_clear_chat, inputs=None, outputs=[msg, chatbot])

    # Export markdown
    # export_btn.click(fn=_export_markdown, inputs=chatbot, outputs=exported_file)

    # Load and inject external JavaScript
    def load_javascript():
        """Return ``static/script.js`` wrapped in a ``<script>`` element.

        Returns:
            ``"<script>…</script>"`` containing the file's text (read as
            UTF-8), or ``""`` after printing a warning when the file does
            not exist.
        """
        try:
            with open("static/script.js", "r", encoding="utf-8") as script_file:
                script_source = script_file.read()
        except FileNotFoundError:
            print("Warning: static/script.js not found")
            return ""
        return "<script>" + script_source + "</script>"

    # NOTE(review): <script> tags placed in a gr.HTML component may not be
    # executed by the browser — confirm, and consider gr.Blocks(head=...) or
    # gr.Blocks(js=...) if the script turns out not to run.
    script_markup = load_javascript()
    gr.HTML(script_markup)

# Script entry point: when run directly (not imported), call queue() on the
# app and then launch it.
if __name__ == "__main__":
    demo.queue().launch()