import os
import subprocess

# Install a prebuilt flash-attn wheel and skip the CUDA build (no GPU is
# attached while the Space starts up). Extend os.environ rather than replacing
# it so the subprocess keeps PATH and friends.
subprocess.run(
    "pip install flash-attn==2.7.0.post2 --no-build-isolation",
    env={**os.environ, "FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
    shell=True,
)

import threading

# subprocess.check_call([os.sys.executable, "-m", "pip", "install", "-r", "requirements.txt"])
import spaces
import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Dynamo/torch.compile tracing is disabled, presumably to avoid compile warmup
# overhead on short-lived ZeroGPU calls.
torch._dynamo.config.disable = True

MODEL_ID = "le-llm/lapa-v0.1-reasoning-only-eos"
def load_model():
    """Lazy-load model & tokenizer (for ZeroGPU)."""
    device = "cuda"  # if torch.cuda.is_available() else "cpu"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_ID,
        torch_dtype=torch.bfloat16,  # if device == "cuda" else torch.float32,
        device_map="auto",  # if device == "cuda" else None,
        attn_implementation="flash_attention_2",
    )  # .cuda()
    print("Selected device:", device)
    return model, tokenizer, device
# Load the model/tokenizer once at import time (not per request).
model, tokenizer, device = load_model()
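# Note on ZeroGPU semantics: no GPU is attached at startup; `import spaces`
# (which comes before `import torch` above) shims CUDA initialization so this
# module-level load still works, and a real GPU is only bound while a
# @spaces.GPU-decorated function is executing.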
def user(user_message, history: list):
    """Append the user's message to the history and clear the textbox."""
    return "", history + [{"role": "user", "content": user_message}]


def append_example_message(x: gr.SelectData, history):
    """Append a clicked example prompt to the history as a user turn."""
    print("Selected example:", x.value)
    if x.value["text"] is not None:
        history.append({"role": "user", "content": x.value["text"]})
    return history
@spaces.GPU  # required on ZeroGPU: attaches a GPU for the duration of this call
def bot(
    history: list[dict[str, str]],
    # max_tokens,
    # temperature,
    # top_p,
):
    # Fixed sampling settings (slider stubs for these sit at the bottom of the file)
    max_tokens = 4096
    temperature = 0.7
    top_p = 0.95

    # Build the conversation prompt from the chat history
    # (a system turn could be prepended: [{"role": "system", "content": system_message}] + history)
    input_text: str = tokenizer.apply_chat_template(
        history,
        tokenize=False,
        add_generation_prompt=True,
        # enable_thinking=True,
    )
    # Drop the BOS token the chat template inserted: the tokenizer call below
    # adds its own, and we want to avoid a doubled BOS.
    input_text = input_text.replace(tokenizer.bos_token, "", 1)
    print(input_text)

    inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
    # Debug: show exactly how the prompt was tokenized
    print("Decoded input:", tokenizer.decode(inputs["input_ids"][0]))
    print([{tok_id: tokenizer.decode([tok_id])} for tok_id in inputs["input_ids"][0]])
    # Stream decoded tokens as they are generated, skipping the prompt itself
    streamer = TextIteratorStreamer(
        tokenizer, skip_prompt=True  # skip_special_tokens=True
    )

    # Run model.generate in a background thread so we can yield as tokens arrive
    generation_kwargs = dict(
        **inputs,
        max_new_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
        top_k=64,
        do_sample=True,
        # eos_token_id=tokenizer.eos_token_id,
        streamer=streamer,
    )
    thread = threading.Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    history.append({"role": "assistant", "content": ""})
    # Yield the growing history so Gradio re-renders the partial reply
    for new_text in streamer:
        history[-1]["content"] += new_text
        yield history
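    # The streamer is exhausted only once generation has finished, so joining
    # here just reaps the worker thread before the GPU is released.
    thread.join()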
with gr.Blocks() as demo:
    chatbot = gr.Chatbot(
        type="messages",
        allow_tags=["think"],  # let <think> tags through so reasoning traces render
        examples=[
            {"text": i}
            for i in [
                # Ukrainian example prompts (English glosses in comments):
                "хто тримає цей район?",  # "who runs this neighborhood?"
                "Напиши історію про Івасика-Телесика",  # "Write a story about Ivasyk-Telesyk"
                "Яка найвища гора в Україні?",  # "What is the highest mountain in Ukraine?"
                "Як звали батька Тараса Григоровича Шевченка?",  # "What was the name of Taras Shevchenko's father?"
                # "Як можна заробити нелегально швидко гроші?",  # "How can one make money fast illegally?"
                "Яка з цих гір не знаходиться у Європі? Говерла, Монблан, Гран-Парадізо, Еверест",  # "Which of these mountains is not in Europe? Hoverla, Mont Blanc, Gran Paradiso, Everest"
                "Дай відповідь на питання\nЧому у качки жовті ноги?",  # "Answer the question:\nWhy do ducks have yellow feet?"
            ]
        ],
    )
    msg = gr.Textbox(label="Message", autofocus=True)
    send_btn = gr.Button("Send")
    # clear = gr.Button("Clear")

    msg.submit(user, [msg, chatbot], [msg, chatbot], queue=True).then(
        bot, chatbot, chatbot
    )
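    # Pattern: each trigger first runs `user` (instantly echoes the message into
    # the chat), then `bot` (streaming generation) via .then(); example clicks
    # and the Send button below reuse the same two-step flow.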
    chatbot.example_select(
        append_example_message, [chatbot], [chatbot], queue=True
    ).then(bot, chatbot, chatbot)
    send_btn.click(user, [msg, chatbot], [msg, chatbot], queue=True).then(
        bot, chatbot, chatbot
    )
    # clear.click(lambda: None, None, chatbot, queue=True)
if __name__ == "__main__":
    demo.launch()
| """gr.Slider(minimum=1, maximum=4096, value=512, step=1, label="Max new tokens"), | |
| gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"), | |
| gr.Slider( | |
| minimum=0.1, | |
| maximum=1.0, | |
| value=0.95, | |
| step=0.05, | |
| label="Top-p (nucleus sampling)", | |
| ),""" | |