import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Download the GGUF model from the HF Hub
model_path = hf_hub_download(
    repo_id="astegaras/merged_kaggle",
    filename="llama-3.2-3b-instruct.Q2_K.gguf",
)

# Load the GGUF model with llama.cpp
llm = Llama(
    model_path=model_path,
    n_ctx=4096,    # Context window for inference
    n_threads=8,   # Adjust to the HF hardware's CPU count
    n_batch=512,
    verbose=False,
)

def chat_fn(message, history):
    # Rebuild the conversation in chat-completion message format
    # (assumes Gradio's default tuple-style history)
    messages = []
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    output = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
    )
    reply = output["choices"][0]["message"]["content"]
    return reply

# Gradio UI
chatbot = gr.ChatInterface(
    fn=chat_fn,
    title="Merged Kaggle Model (GGUF)",
    description="Running llama.cpp inference on a GGUF model",
)

chatbot.launch()
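
# Optional: token streaming. This is a sketch, not part of the original
# app: it assumes llama-cpp-python's stream=True chat-completion mode
# and Gradio's support for generator chat functions. To try it, define
# this above the ChatInterface and pass fn=chat_fn_stream instead of
# fn=chat_fn.
def chat_fn_stream(message, history):
    messages = []
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    stream = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        stream=True,  # yields OpenAI-style delta chunks instead of one response
    )
    partial = ""
    for chunk in stream:
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            partial += delta["content"]
            yield partial  # Gradio re-renders the growing reply on each yield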