import os
import subprocess
# subprocess.run('pip install flash-attn==2.8.0 --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
import threading
# subprocess.check_call([os.sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])

import spaces
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
from kernels import get_kernel

# vllm_flash_attn3 = get_kernel("kernels-community/vllm-flash-attn3")
# torch._dynamo.config.disable = True

MODEL_ID = "le-llm/lapa-v0.1-reasoning-only-32768"


def load_model():
    """Load model & tokenizer once (ZeroGPU-friendly setup)."""
    device = "cuda"  # if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        dtype=torch.bfloat16,  # if device == "cuda" else torch.float32,
        device_map="auto",  # if device == "cuda" else None,
        attn_implementation="flash_attention_2",  # "kernels-community/vllm-flash-attn3",
    )  # .cuda()
    print("Selected device:", device)
    return model, tokenizer, device


# Load model/tokenizer once at import time; the @spaces.GPU decorator below
# handles ZeroGPU allocation and release per request.
model, tokenizer, device = load_model()


def user(user_message, history: list):
    """Append the user's message to the history and clear the textbox."""
    return "", history + [{"role": "user", "content": user_message}]


def append_example_message(x: gr.SelectData, history):
    """Append a clicked example prompt to the chat history."""
    print(x)
    print(x.value)
    print(x.value["text"])
    if x.value["text"] is not None:
        history.append({"role": "user", "content": x.value["text"]})
    return history


@spaces.GPU
def bot(
    history: list[dict[str, str]],
    # max_tokens,
    # temperature,
    # top_p,
):
    """Generate the assistant reply and stream it back token by token."""
    # [{"role": "system", "content": system_message}] +
    # Build conversation
    max_tokens = 4096
    temperature = 0.7
    top_p = 0.95
    input_text: str = tokenizer.apply_chat_template(
        history,
        tokenize=False,
        add_generation_prompt=True,
        # enable_thinking=True,
    )
    # Strip the BOS token the chat template already added; the tokenizer call
    # below re-adds it, so this avoids a doubled BOS.
    if tokenizer.bos_token:
        input_text = input_text.replace(tokenizer.bos_token, "", 1)
    print(input_text)

    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)  # .to(device)
    print("Decoded input:", tokenizer.decode(inputs["input_ids"][0]))
    print([{token_id: tokenizer.decode([token_id])} for token_id in inputs["input_ids"][0].tolist()])

    # Streamer setup
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        # skip_special_tokens=True,
    )

    # Run model.generate in a background thread so tokens can be streamed from here
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=64,
        do_sample=True,
        # eos_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    history.append({"role": "assistant", "content": ""})

    # Yield tokens as they come in
    for new_text in streamer:
        history[-1]["content"] += new_text
        yield history


# --- drop-in UI compatible with older Gradio versions ---
import tempfile, time

# Ukrainian-inspired theme with deep, muted colors reflecting unbeatable spirit:
THEME = gr.themes.Soft(
    primary_hue="blue",     # Deep blue representing Ukrainian sky and resolve
    secondary_hue="amber",  # Warm amber representing golden fields and determination
    neutral_hue="stone",    # Earthy stone representing strength and foundation
)


# Load CSS from external file
def load_css():
    try:
        with open("static/style.css", "r", encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        print("Warning: static/style.css not found")
        return ""


CSS = load_css()


def _clear_chat():
    return "", []


with gr.Blocks(theme=THEME, css=CSS, fill_height=True) as demo:
    # Header (no gr.Box to avoid version issues)
    gr.HTML(
        """