# Requires: pip install gradio huggingface_hub llama-cpp-python
import gradio as gr
from huggingface_hub import hf_hub_download
from llama_cpp import Llama

# Replace with your actual HF model repo and filename
model_repo = "AravindKumarRajendran/WhiZ-gemma-3n-4b"
model_filename = "gemma-3n-4b-it-finetune.Q8_0.gguf"  # Exact GGUF file name in the repo

# Download the GGUF model from the HF Hub (cached locally after the first run)
model_path = hf_hub_download(repo_id=model_repo, filename=model_filename)

# Load the model with llama-cpp (CPU inference)
llm = Llama(
    model_path=model_path,
    n_ctx=2048,    # context window in tokens
    n_threads=4,   # tune to your CPU core count
    n_batch=64,
    verbose=False,
)

# Chat handler: append the user turn, generate a reply, update the history
def chat_with_model(history, user_input):
    history.append(("🧑‍💻: " + user_input, ""))
    # Instruct the model to answer in Tamil ("தமிழில் பதிலளி" = "reply in Tamil")
    prompt = f"{user_input} தமிழில் பதிலளி:"
    output = llm(
        prompt,
        max_tokens=128,
        temperature=0.7,
        stop=["<end_of_turn>"],  # Gemma's turn delimiter; adjust for your fine-tune
    )
    reply = output["choices"][0]["text"].strip()
    history[-1] = (history[-1][0], "🤖: " + reply)
    return history, ""

# Gradio UI
with gr.Blocks() as demo:
    # Title: "Conversation in Tamil"
    gr.Markdown("## 🗣️ தமிழில் உரையாடல் (Tamil Chatbot - GGUF on CPU)")
    chatbot = gr.Chatbot()
    # Label: "Your message"
    msg = gr.Textbox(label="உங்கள் செய்தி", placeholder="Type your message...")
    clear = gr.Button("🧹 Clear Chat")
    state = gr.State([])  # chat history; mutated in place by chat_with_model

    msg.submit(chat_with_model, [state, msg], [chatbot, msg])
    # Reset all three outputs: chat display, textbox, and history state
    clear.click(lambda: ([], "", []), None, [chatbot, msg, state])

demo.launch()