# Requires: pip install gradio huggingface_hub llama-cpp-python
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Replace with your actual HF model repo and filename
model_repo = "AravindKumarRajendran/WhiZ-gemma-3n-4b"
model_filename = "gemma-3n-4b-it-finetune.Q8_0.gguf"  # Exact GGUF file name in the repo

# Download the GGUF model from the HF Hub (cached locally after the first run)
model_path = hf_hub_download(repo_id=model_repo, filename=model_filename)

# Load the model with llama-cpp (CPU inference)
llm = Llama(
    model_path=model_path,
    n_ctx=2048,    # context window in tokens
    n_threads=4,   # tune to your CPU core count
    n_batch=64,
    verbose=False,
)

# Chat handler: append the user turn, generate a reply, update the history
def chat_with_model(history, user_input):
    history.append(("🧑‍💻: " + user_input, ""))
    # Instruct the model to answer in Tamil ("தமிழில் பதிலளி" = "reply in Tamil")
    prompt = f"{user_input} தமிழில் பதிலளி:"
    output = llm(
        prompt,
        max_tokens=128,
        temperature=0.7,
        stop=["<end_of_turn>"],  # Gemma's turn delimiter; adjust for your fine-tune
    )
    reply = output["choices"][0]["text"].strip()
    history[-1] = (history[-1][0], "🤖: " + reply)
    return history, ""

# Gradio UI
with gr.Blocks() as demo:
    # Title: "Conversation in Tamil"
    gr.Markdown("## 🗣️ தமிழில் உரையாடல் (Tamil Chatbot - GGUF on CPU)")
    chatbot = gr.Chatbot()
    # Label: "Your message"
    msg = gr.Textbox(label="உங்கள் செய்தி", placeholder="Type your message...")
    clear = gr.Button("🧹 Clear Chat")
    state = gr.State([])  # chat history; mutated in place by chat_with_model

    msg.submit(chat_with_model, [state, msg], [chatbot, msg])
    # Reset all three outputs: chat display, textbox, and history state
    clear.click(lambda: ([], "", []), None, [chatbot, msg, state])

demo.launch()