Spaces:
Running
on
Zero
Running
on
Zero
| import os | |
| import subprocess | |
| import tempfile | |
| # subprocess.run('pip install flash-attn==2.8.0 --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True) | |
| import threading | |
| # subprocess.check_call([os.sys.executable, "-m", "pip", "install", "-r", "requirements.txt"]) | |
| import spaces | |
| import gradio as gr | |
| import torch | |
| from PIL.Image import Image | |
| from transformers import AutoModelForCausalLM, AutoProcessor, AutoTokenizer, TextIteratorStreamer | |
| from analytics import AnalyticsLogger | |
| from kernels import get_kernel | |
| from typing import Any, Optional, Dict | |
| #vllm_flash_attn3 = get_kernel("kernels-community/vllm-flash-attn3") | |
| #torch._dynamo.config.disable = True | |
# Log in to the Hugging Face Hub to get access to the model weights.
HF_LE_LLM_READ_TOKEN = os.environ.get('HF_LE_LLM_READ_TOKEN')
from huggingface_hub import login

# login(token=None) falls back to an interactive prompt, which cannot be
# answered in a headless deployment, so only log in when a token is present.
if HF_LE_LLM_READ_TOKEN:
    login(token=HF_LE_LLM_READ_TOKEN)
else:
    print("Warning: HF_LE_LLM_READ_TOKEN is not set; skipping Hugging Face login.")

# Constants
MODEL_ID = "lapa-llm/lapa-v0.1.2-instruct"
MAX_TOKENS = 4096  # upper bound on newly generated tokens per reply
TEMPERATURE = 0.7
TOP_P = 0.95
IMAGE_MAX_SIZE = 1024  # uploaded images are shrunk to fit this square (pixels)

logger = AnalyticsLogger()
def _begin_analytics_session():
    """Start an analytics session for ``MODEL_ID``.

    Intended to run once per client when the app loads. Whatever
    ``logger.start_session`` returns is discarded.
    """
    logger.start_session(MODEL_ID)
def load_model():
    """Load the tokenizer, the optional processor and the causal LM for ``MODEL_ID``.

    Returns:
        A ``(model, tokenizer, processor, device)`` tuple. ``processor`` is
        ``None`` when ``AutoProcessor`` cannot be loaded, in which case only
        the tokenizer is available. ``device`` is the fixed string ``"cuda"``;
        actual weight placement is left to ``device_map="auto"``.
    """
    device = "cuda"
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

    try:
        processor = AutoProcessor.from_pretrained(MODEL_ID)
    except Exception as err:  # pragma: no cover - informative fallback
        print(f"Warning: AutoProcessor not available ({err}). Falling back to tokenizer.")
        processor = None

    model_kwargs = {
        "dtype": torch.bfloat16,
        "device_map": "auto",
        "attn_implementation": "flash_attention_2",
    }
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, **model_kwargs)

    print(f"Selected device:", device)
    return model, tokenizer, processor, device


# Runs once, at import time: the loaded objects are module-level globals
# shared by every request handled by this process.
model, tokenizer, processor, device = load_model()
def user(user_message, image_data: "Image | None", history: list):
    """Append the user's text and/or image to the chat history.

    Args:
        user_message: Raw textbox value; ``None`` is treated as an empty string.
        image_data: Optional PIL image from the image input. It is shrunk in
            place to fit ``IMAGE_MAX_SIZE`` and written to a temporary JPEG so
            the chatbot can display it.
        history: Current chat messages; never mutated. ``None`` is treated as
            an empty history.

    Returns:
        ``(textbox_value, image_value, history)``. When there is neither text
        nor an image the inputs are returned unchanged; otherwise the textbox
        and image input are cleared and a new history list is returned.
    """
    user_message = user_message or ""
    stripped_message = user_message.strip()

    if image_data is None and not stripped_message:
        # Nothing to submit yet; keep inputs unchanged
        return user_message, image_data, history

    updated_history = list(history or [])

    if image_data is None:
        updated_history.append({"role": "user", "content": stripped_message})
        return "", None, updated_history

    image_data.thumbnail((IMAGE_MAX_SIZE, IMAGE_MAX_SIZE))
    # JPEG cannot store alpha or palette modes (e.g. RGBA/P/LA from PNG
    # uploads); without this conversion ``save`` raises an error.
    if image_data.mode not in ("RGB", "L"):
        image_data = image_data.convert("RGB")

    # Save to a temp file for Gradio display. The file is not removed here
    # because the chat history keeps referring to its path.
    fd, tmp_path = tempfile.mkstemp(suffix=".jpg")
    os.close(fd)
    image_data.save(tmp_path, format="JPEG")

    # TODO do we leave that message?
    text_content = stripped_message if stripped_message else "Опиши це зображення"

    # The text and the image are stored as two consecutive user messages.
    updated_history.append({"role": "user", "content": text_content})
    updated_history.append({
        "role": "user",
        "content": {
            "path": tmp_path,
            "alt_text": "User uploaded image",
        },
    })
    return "", None, updated_history
def append_example_message(x: gr.SelectData, history):
    """Add the text of the selected example to the chat as a user message.

    ``history`` is extended in place and returned; it is returned untouched
    when the selected example carries no text.
    """
    example_text = x.value["text"]
    if example_text is None:
        return history
    history.append({"role": "user", "content": example_text})
    return history
| def _extract_text_from_content(content: Any) -> str | tuple[str, str]: | |
| """Extract text from message content for logging.""" | |
| if isinstance(content, str): | |
| return content | |
| if isinstance(content, tuple) and len(content) == 2: | |
| return content # (image_path, user_text) | |
| raise ValueError(f"Unsupported content type for text extraction: {content}") | |
def _clean_history_for_display(history: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Build a display copy of the history without private (``_``-prefixed) item keys.

    Each returned message has exactly two keys: ``role`` (defaulting to
    ``"user"``) and ``content``. List content is copied, with underscore-prefixed
    keys dropped from its dict items; every other content value is passed
    through as is.
    """

    def _strip_private_keys(item):
        if isinstance(item, dict):
            return {key: value for key, value in item.items() if not key.startswith("_")}
        return item

    display_history = []
    for message in history:
        content = message.get("content")
        if isinstance(content, list):
            content = [_strip_private_keys(item) for item in content]
        display_history.append({"role": message.get("role", "user"), "content": content})
    return display_history
def _build_processor_history(history: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Convert chat messages into the typed-part format passed to the processor's chat template."""
    processor_history: list[dict[str, Any]] = []
    for msg in history:
        role = msg.get("role", "user")
        content = msg.get("content")
        if isinstance(content, str):
            processor_history.append({"role": role, "content": [{"type": "text", "text": content}]})
        elif isinstance(content, tuple) and content:
            # File messages arrive as a tuple whose first element is the path.
            image_input = {"type": "image", "url": f"{content[0]}"}
            previous = processor_history[-1] if processor_history else None
            if previous is not None and previous.get("role") == "user":
                # Attach the image to the user turn that precedes it.
                previous["content"].append(image_input)
            else:
                processor_history.append({"role": role, "content": [image_input]})
    return processor_history


def _build_text_history(history: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Keep only non-empty string messages, for the tokenizer's chat template."""
    text_history = []
    for msg in history:
        text_content = _extract_text_from_content(msg.get("content"))
        # Image tuples cannot be rendered by the text-only template; skip them.
        if isinstance(text_content, str) and text_content:
            text_history.append({"role": msg.get("role", "user"), "content": text_content})
    return text_history


def _stop_criteria_for(stop_event: threading.Event):
    """Build stopping criteria that end generation once ``stop_event`` is set."""
    from transformers import StoppingCriteria, StoppingCriteriaList

    class _StopOnEvent(StoppingCriteria):
        def __call__(self, input_ids, scores, **kwargs):
            # One flag per sequence in the batch.
            return torch.full(
                (input_ids.shape[0],),
                stop_event.is_set(),
                dtype=torch.bool,
                device=input_ids.device,
            )

    return StoppingCriteriaList([_StopOnEvent()])


def bot(
    history: list[dict[str, Any]]
):
    """Stream the assistant's reply to the last user message.

    This is a generator: an empty assistant message is appended to ``history``
    (in place) and, for every chunk of generated text, a cleaned copy of the
    history is yielded for display. Nothing is yielded when there is no
    pending user message or no usable model input.

    Args:
        history: Chat messages; string content is text, tuple content is an
            uploaded file whose first element is its path.

    Raises:
        ValueError: if a message has content that
            ``_extract_text_from_content`` does not support.
        Exception: any error raised by ``model.generate`` in the worker
            thread is re-raised here once streaming has stopped.

    NOTE(review): ``spaces`` is imported by this file but this function is not
    decorated with ``@spaces.GPU``; confirm that is intended for a ZeroGPU
    deployment.
    """
    # Only answer when the last turn is the user's. An empty submit reaches
    # this function with an unchanged history and must not trigger a second
    # assistant reply in a row.
    if not history or history[-1].get("role") != "user":
        return

    # The last message is the user's; keep its content for logging.
    user_message_text = _extract_text_from_content(history[-1].get("content"))
    print('User message:', user_message_text)

    # Check if any message contains images
    has_images = any(isinstance(msg.get("content"), tuple) for msg in history)

    model_inputs = None

    # Use processor if images are present
    if processor is not None and has_images:
        processor_history = _build_processor_history(history)
        print(f"{processor_history = }")
        model_inputs = processor.apply_chat_template(
            processor_history,
            tokenize=True,
            return_dict=True,
            return_tensors="pt",
            add_generation_prompt=True,
        ).to(model.device)
        print("Using processor for vision input")

    # Fallback to tokenizer for text-only
    if model_inputs is None:
        text_history = _build_text_history(history)
        if text_history:
            input_text = tokenizer.apply_chat_template(
                text_history,
                tokenize=False,
                add_generation_prompt=True,
            )
            # Drop the first BOS occurrence from the rendered prompt before
            # it is tokenized below.
            if input_text and tokenizer.bos_token:
                input_text = input_text.replace(tokenizer.bos_token, "", 1)
            model_inputs = tokenizer(input_text, return_tensors="pt").to(model.device)
            print("Using tokenizer for text-only input")

    if model_inputs is None:
        return

    # Streamer setup
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
    stop_event = threading.Event()

    generation_kwargs = dict(
        **model_inputs,
        max_new_tokens=MAX_TOKENS,
        temperature=TEMPERATURE,
        top_p=TOP_P,
        top_k=64,
        do_sample=True,
        streamer=streamer,
        stopping_criteria=_stop_criteria_for(stop_event),
    )
    generation_errors: list[BaseException] = []

    def _generate():
        try:
            model.generate(**generation_kwargs)
        except Exception as exc:
            generation_errors.append(exc)
            # Unblock the consumer loop; otherwise it would wait forever for
            # an end-of-stream signal that never arrives.
            streamer.end()

    # Run model.generate in background thread
    thread = threading.Thread(target=_generate)
    thread.start()

    history.append({"role": "assistant", "content": ""})
    try:
        # Yield tokens as they come in
        for new_text in streamer:
            history[-1]["content"] += new_text
            yield _clean_history_for_display(history)
    finally:
        # Also reached when the generator is closed early (e.g. the request is
        # cancelled): ask generation to stop instead of running to MAX_TOKENS.
        stop_event.set()
        thread.join()

    if generation_errors:
        raise generation_errors[0]

    assistant_message = history[-1]["content"]
    logger.log_interaction(user=user_message_text, answer=assistant_message)
| # --- drop-in UI compatible with older Gradio versions --- | |
| import os, tempfile, time | |
| import gradio as gr | |
# Ukrainian-inspired Soft theme built from deep, muted hues:
#   primary   "blue"  - Ukrainian sky and resolve
#   secondary "amber" - golden fields and determination
#   neutral   "stone" - strength and foundation
THEME = gr.themes.Soft(primary_hue="blue", secondary_hue="amber", neutral_hue="stone")
def load_css():
    """Return the contents of ``static/style.css``.

    When the file does not exist a warning is printed and an empty string is
    returned.
    """
    try:
        stylesheet = open("static/style.css", "r", encoding="utf-8")
    except FileNotFoundError:
        print("Warning: static/style.css not found")
        return ""
    with stylesheet:
        return stylesheet.read()


# Stylesheet text handed to gr.Blocks(css=...).
CSS = load_css()
def _clear_chat():
    """Return reset values for the message box, the image input and the chatbot."""
    cleared_message, cleared_image, cleared_history = "", None, []
    return cleared_message, cleared_image, cleared_history
# Application UI: chat pane, input row, control buttons, event wiring and the
# static "about" section.
# NOTE(review): the nesting of the layout containers below was reconstructed
# from a copy of this file that had lost its indentation; confirm it against
# the deployed layout.
with gr.Blocks(theme=THEME, css=CSS, fill_height=True, js="() => {document.body.classList.remove('dark');}") as demo:
    demo.load(fn=_begin_analytics_session, inputs=None, outputs=None)

    # Header (no gr.Box to avoid version issues)
    gr.HTML(
        """
<div id="app-header">
<div class="app-title">✨ LAPA</div>
<div class="app-subtitle">LLM for Ukrainian Language</div>
</div>
"""
    )

    with gr.Row(equal_height=True):
        # Left side: Chat
        with gr.Column(scale=7, elem_id="left-pane"):
            with gr.Column(elem_id="chat-card"):
                chatbot = gr.Chatbot(
                    type="messages",
                    height=560,
                    render_markdown=True,
                    show_copy_button=True,
                    show_label=False,
                    allow_tags=["think"],
                    elem_id="chatbot",
                    examples=[
                        {"text": prompt}
                        for prompt in [
                            "хто тримає цей район?",
                            "Напиши історію про Івасика-Телесика",
                            "Яка найвища гора в Україні?",
                            "Як звали батька Тараса Григоровича Шевченка?",
                            "Яка з цих гір не знаходиться у Європі? Говерла, Монблан, Гран-Парадізо, Еверест",
                            "Дай відповідь на питання\nЧому у качки жовті ноги?",
                        ]
                    ],
                )
                image_input = gr.Image(
                    label="Attach image (optional)",
                    type="pil",
                    sources=["upload", "clipboard"],
                    height=200,
                    interactive=True,
                    elem_id="image-input",
                )

                # ChatGPT-style input box with stop button
                with gr.Row(elem_id="chat-input-row"):
                    msg = gr.Textbox(
                        label=None,
                        placeholder="Message… (Press Enter to send)",
                        autofocus=True,
                        lines=1,
                        max_lines=6,
                        container=False,
                        show_label=False,
                        elem_id="chat-input",
                        elem_classes=["chat-input-box"],
                    )
                    send_btn_visible = gr.Button(
                        "➤",
                        variant="primary",
                        elem_id="send-btn-visible",
                        elem_classes=["send-btn-chat"],
                        size="sm",
                    )
                    stop_btn_visible = gr.Button(
                        "⏹️",
                        variant="secondary",
                        elem_id="stop-btn-visible",
                        elem_classes=["stop-btn-chat"],
                        visible=False,
                        size="sm",
                    )

                # Hidden buttons for functionality.
                # NOTE(review): the row is created with visible=True, so it is
                # presumably hidden by the stylesheet via its elem_id - confirm.
                with gr.Row(visible=True, elem_id="hidden-buttons"):
                    send_btn = gr.Button("Send", variant="primary", elem_id="send-btn")
                    stop_btn = gr.Button("Stop", variant="secondary", elem_id="stop-btn")
                    clear_btn = gr.Button("Clear", variant="secondary", elem_id="clear-btn")

                gr.HTML('<div class="footer-tip">Shortcuts: Enter to send • Shift+Enter for new line</div>')

    # Helper functions for managing UI state
    def show_stop_hide_send():
        """Show the stop button and hide the send button."""
        return gr.update(visible=True), gr.update(visible=False)

    def hide_stop_show_send():
        """Hide the stop button and show the send button again."""
        return gr.update(visible=False), gr.update(visible=True)

    def _chain_generation(trigger_event):
        """Attach the shared follow-up steps to an input event.

        The steps are: swap send->stop, stream the reply from ``bot``, swap
        stop->send. Returns ``(generation_step, final_step)`` so that the
        stop buttons can cancel the step that actually runs the model.
        """
        generation_step = trigger_event.then(
            fn=show_stop_hide_send, inputs=None, outputs=[stop_btn_visible, send_btn_visible]
        ).then(
            fn=bot, inputs=chatbot, outputs=chatbot
        )
        final_step = generation_step.then(
            fn=hide_stop_show_send, inputs=None, outputs=[stop_btn_visible, send_btn_visible]
        )
        return generation_step, final_step

    # Events: Enter in the textbox, the visible send button, and example selection.
    g1, e1 = _chain_generation(
        msg.submit(fn=user, inputs=[msg, image_input, chatbot], outputs=[msg, image_input, chatbot], queue=True)
    )
    g2, e2 = _chain_generation(
        send_btn_visible.click(fn=user, inputs=[msg, image_input, chatbot], outputs=[msg, image_input, chatbot], queue=True)
    )
    g3, e3 = _chain_generation(
        chatbot.example_select(fn=append_example_message, inputs=[chatbot], outputs=[chatbot], queue=True)
    )

    # Stop cancels running events (both buttons work). ``cancels`` applies to
    # the listed steps only, so the ``bot`` steps must be listed explicitly;
    # the final UI-reset steps alone would leave generation running.
    cancellable_events = [g1, g2, g3, e1, e2, e3]
    stop_btn.click(fn=hide_stop_show_send, inputs=None, outputs=[stop_btn_visible, send_btn_visible], cancels=cancellable_events, queue=True)
    stop_btn_visible.click(fn=hide_stop_show_send, inputs=None, outputs=[stop_btn_visible, send_btn_visible], cancels=cancellable_events, queue=True)

    # Clear chat + input
    clear_btn.click(fn=_clear_chat, inputs=None, outputs=[msg, image_input, chatbot])

    # Static project description shown below the chat.
    gr.HTML(
        """<h1>Lapa LLM</h1>
<h2>Introducing Lapa LLM v0.1.2 — the most efficient Ukrainian open-source language model</h2>
<div class="links-section">
<h2>Links:</h2>
<a href="https://huggingface.co/collections/lapa-llm/lapa-v012-release" target="_blank">Release Collection: model checkpoints, datasets, demo pages</a><br>
<a href="https://github.com/lapa-llm/lapa-llm" target="_blank">Code on GitHub</a><br>
<a href="https://t.me/pehade_blog" target="_blank">Subscribe to the Telegram channel for further updates</a><br>
</div>
<br>
<p>Today, we proudly present Lapa LLM — a cutting-edge open large language model based on Gemma-3-12B with a focus on Ukrainian language processing. The project is the result of many months of work by a team of Ukrainian researchers in artificial intelligence from the Ukrainian Catholic University, AGH University of Krakow, Igor Sikorsky Kyiv Polytechnic Institute, and Lviv Polytechnic, who united to create the best model for Ukrainian language processing.</p>
<p>The model is named in honor of <a href="https://de.wikipedia.org/wiki/Walentyn_Lapa" target="_blank">Valentyn Lapa</a>, who together with <a href="https://uk.wikipedia.org/wiki/%D0%86%D0%B2%D0%B0%D1%85%D0%BD%D0%B5%D0%BD%D0%BA%D0%BE_%D0%9E%D0%BB%D0%B5%D0%BA%D1%81%D1%96%D0%B9_%D0%93%D1%80%D0%B8%D0%B3%D0%BE%D1%80%D0%BE%D0%B2%D0%B8%D1%87" target="_blank">Oleksiy Ivakhnenko</a> created the Group Method of Data Handling, which is a predecessor to Deep Learning <a href="https://people.idsia.ch/~juergen/DeepLearning2July2014.pdf" target="_blank">(source)</a>.</p>
<p>The project's goal is to create the best model for Ukrainian language processing with open datasets for pretraining and instruction tuning.</p>
<h3>Key Achievements</h3>
<p><strong>Best tokenizer for the Ukrainian language</strong></p>
<p>Thanks to a SOTA method for tokenizer adaptation developed by <a href="https://www.linkedin.com/in/mykola-haltiuk/" target="_blank">Mykola Haltiuk</a> as part of this project, it was possible to replace 80,000 tokens out of 250,000 with Ukrainian ones without loss of model quality, thus making Lapa LLM the fastest model for working with the Ukrainian language. Compared to the original Gemma 3, for working with Ukrainian, the model requires 1.5 times fewer tokens, thus performing three times fewer computations to achieve better results.</p>
<p><strong>Most efficient instruction-tuned model on the market</strong></p>
<p>Our instruction version of the model in some benchmark categories is only slightly behind the current leader — <a href="https://huggingface.co/spaces/INSAIT-Institute/mamaylm-v1-blog" target="_blank">MamayLM</a>. The team is actively working on new datasets to further improve benchmark scores, which we plan to surpass in the v1.0 model.</p>
<h3>Benchmark Results</h3>
<ul>
<li>Best English-to-Ukrainian translator with a result of 33 BLEU on FLORES and vice versa, which allows for natural and cost-effective translation of new NLP datasets into Ukrainian</li>
<li>One of the best models for image processing in Ukrainian in its size class, as measured on the MMZNO benchmark</li>
<li>One of the best models for Summarization and Q&A, which means excellent performance for RAG</li>
<li>Tests on propaganda and disinformation questions show the effectiveness of the filtering approach at the pretraining stage and during instruction fine-tuning</li>
</ul>
<p>Model measurements and comparisons will be published as part of the Ukrainian LLM Leaderboard project; subscribe to the Telegram channel for further news.</p>
<p><strong>Leader in pretraining results</strong></p>
<p>Lapa LLM demonstrates the best performance in pretraining benchmarks for Ukrainian language processing, which opens opportunities for use by other researchers to adapt for their own tasks.</p>
<p>The model was trained on data evaluated by various quality assessment models - evaluation of propaganda and disinformation presence, readability, grammar assessment, etc. In the final stages of training, the model was trained on high-quality materials provided for commercial use by the Open Data division of Harvard Library.</p>
<p><strong>Maximum openness and transparency</strong></p>
<p>Unlike most available models, Lapa LLM is a maximally open project:</p>
<ul>
<li>The model is available for commercial use</li>
<li>Approximately 25 datasets for model training have been published</li>
<li>Methods for filtering and processing data are disclosed, including for detecting disinformation and propaganda</li>
<li>Open source code for the model</li>
<li>Documentation of the training process is available</li>
</ul>
<p>This openness allows for the development of the Ukrainian NLP community and helps businesses obtain a tool for the most efficient Ukrainian language processing in terms of both computation and results.</p>
<h3>Application Possibilities</h3>
<p>Lapa LLM opens wide possibilities for:</p>
<ul>
<li>Processing sensitive documents without transferring data to external servers</li>
<li>Working with Ukrainian texts taking into account cultural and historical context without code-switching to Russian or other languages</li>
<li>Building RAG systems and chatbots that write in proper Ukrainian</li>
<li>Developing specialized solutions through the ability to fine-tune for specific tasks</li>
<li>Machine translation with the best translation quality from English to Ukrainian and vice versa among all models, including API providers</li>
</ul>
<h3>Next Steps</h3>
<ul>
<li>Complete development of the reasoning model</li>
<li>We are collecting community feedback on the model's performance, so we look forward to receiving it on GitHub or HuggingFace!</li>
<li>Collecting additional datasets for image processing in Ukrainian</li>
<li>Collecting additional datasets for instruction following and programming</li>
</ul>
<h3>Acknowledgment to Sponsors</h3>
<p>The creation of Lapa LLM was made possible thanks to the support of our partners and sponsors, primarily the startup <strong>Comand.AI</strong>, which provided computational resources for training the model. We also want to thank the company <strong>ELEKS</strong>, which supported this project through a grant dedicated to the memory of Oleksiy Skrypnyk, and the startup <strong>HuggingFace</strong>, which provided a free corporate subscription to the team for storing models and datasets.</p>
<h3>Team</h3>"""
    )

    # Load and inject external JavaScript
    def load_javascript():
        """Return ``static/script.js`` wrapped in a ``<script>`` tag, or "" if the file is missing."""
        try:
            with open("static/script.js", "r", encoding="utf-8") as f:
                return f"<script>{f.read()}</script>"
        except FileNotFoundError:
            print("Warning: static/script.js not found")
            return ""

    # NOTE(review): confirm that a <script> tag injected through gr.HTML is
    # actually executed by the Gradio version in use.
    gr.HTML(load_javascript())
if __name__ == "__main__":
    # Turn on request queuing, then start the Gradio server.
    app = demo.queue()
    app.launch()