import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# Download the GGUF model from the HF Hub (cached locally after the first run)
model_path = hf_hub_download(
    repo_id="astegaras/merged_kaggle",
    filename="llama-3.2-3b-instruct.Q2_K.gguf",
)
# Load the GGUF model with settings that are safe on a CPU-only Space
llm = Llama(
    model_path=model_path,
    n_ctx=4096,        # context window, in tokens
    n_threads=4,       # match the Space's available CPU cores
    n_batch=64,        # small batch size to limit memory use
    n_gpu_layers=0,    # CPU-only inference; no layers offloaded to a GPU
    use_mmap=False,    # read the weights fully into RAM instead of mmap'ing
    use_mlock=False,   # don't pin model pages in RAM
    low_vram=True,     # low-VRAM mode; newer llama-cpp-python may ignore it
    verbose=False,
)
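
# Optional smoke test (a minimal sketch, not part of the original app):
# uncomment to confirm the model loads and responds before wiring up the UI.
# print(llm.create_chat_completion(
#     messages=[{"role": "user", "content": "Say hello in one sentence."}],
#     max_tokens=32,
# )["choices"][0]["message"]["content"])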
def chat_fn(message, history):
    # Reformat Gradio's history into the chat-template messages llama.cpp
    # expects. The unpacking assumes tuple-style history (pairs of
    # user/assistant turns), the pre-Gradio-5 default.
    messages = []
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    output = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
    )
    return output["choices"][0]["message"]["content"]
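
# Optional streaming variant, sketched under the assumption that this Gradio
# version accepts a generator fn in ChatInterface (it renders each yielded
# partial reply) and that llama-cpp-python emits OpenAI-style stream chunks
# with choices[0]["delta"]. Pass fn=chat_fn_stream below to try it.
def chat_fn_stream(message, history):
    messages = []
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    partial = ""
    for chunk in llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        stream=True,
    ):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            partial += delta["content"]
            yield partial  # Gradio replaces the shown reply on each yield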
# Gradio UI
chatbot = gr.ChatInterface(
    fn=chat_fn,
    title="Merged Kaggle Model (GGUF)",
    description="Running llama.cpp inference on a GGUF model",
)
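
# Queue requests so concurrent users are served one at a time: the single
# Llama instance is not safe for parallel calls. Recent Gradio versions
# queue by default, so this is belt-and-braces; max_size=8 is an assumed,
# tunable value.
chatbot.queue(max_size=8)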
chatbot.launch()