import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Download the GGUF model from the HF Hub
model_path = hf_hub_download(
    repo_id="astegaras/merged_kaggle",
    filename="llama-3.2-3b-instruct.Q2_K.gguf",
)

# Load the GGUF model with llama.cpp
llm = Llama(
    model_path=model_path,
    n_ctx=4096,    # Context window for inference
    n_threads=8,   # Adjust to the HF hardware's CPU count
    n_batch=512,
    verbose=False,
)

def chat_fn(message, history):
    # Rebuild the conversation in chat-completion message format
    # (assumes Gradio's default tuple-style history)
    messages = []
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    output = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
    )
    reply = output["choices"][0]["message"]["content"]
    return reply

# Gradio UI
chatbot = gr.ChatInterface(
    fn=chat_fn,
    title="Merged Kaggle Model (GGUF)",
    description="Running llama.cpp inference on a GGUF model",
)

chatbot.launch()
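
# Optional: token streaming. This is a sketch, not part of the original
# app: it assumes llama-cpp-python's stream=True chat-completion mode
# and Gradio's support for generator chat functions. To try it, define
# this above the ChatInterface and pass fn=chat_fn_stream instead of
# fn=chat_fn.
def chat_fn_stream(message, history):
    messages = []
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    stream = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        stream=True,  # yields OpenAI-style delta chunks instead of one response
    )
    partial = ""
    for chunk in stream:
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            partial += delta["content"]
            yield partial  # Gradio re-renders the growing reply on each yield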