import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Download the quantized GGUF checkpoint from the Hugging Face Hub.
# hf_hub_download caches the file locally and returns its path, so repeated
# runs reuse the cached copy instead of downloading again.
model_path = hf_hub_download(
    repo_id="astegaras/merged_kaggle",
    filename="llama-3.2-3b-instruct.Q2_K.gguf",
)

# Load the model for CPU-only inference: n_gpu_layers=0 keeps every layer on
# the CPU, and with mmap disabled the weights are read fully into RAM.
llm = Llama(
    model_path=model_path,
    n_ctx=4096,      # context window, in tokens
    n_threads=4,     # CPU threads used for generation
    n_batch=64,      # prompt-processing batch size
    n_gpu_layers=0,  # 0 = pure CPU inference
    use_mmap=False,
    use_mlock=False,
    verbose=False,
)
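
# Optional smoke test (uncomment to verify the model loads and generates
# before starting the UI):
# print(llm.create_chat_completion(
#     messages=[{"role": "user", "content": "Say hello in one sentence."}],
#     max_tokens=32,
# )["choices"][0]["message"]["content"])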


def chat_fn(message, history):
    # Rebuild the conversation in the OpenAI-style message format that
    # create_chat_completion expects. With ChatInterface's default (tuple)
    # history format, each entry is a (user, assistant) pair.
    messages = []
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    output = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
    )
    reply = output["choices"][0]["message"]["content"]
    return reply
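

# A streaming variant, sketched here as an optional alternative:
# llama-cpp-python can stream completion chunks with stream=True, and
# gr.ChatInterface also accepts a generator that yields progressively longer
# replies (pass fn=chat_stream_fn below to use it instead of chat_fn).
def chat_stream_fn(message, history):
    messages = []
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    partial = ""
    for chunk in llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        stream=True,
    ):
        # Streamed chunks follow the OpenAI delta format: the first chunk
        # carries the role, later ones carry content fragments.
        partial += chunk["choices"][0]["delta"].get("content", "")
        yield partial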


# Wire the chat function into a simple Gradio chat UI.
chatbot = gr.ChatInterface(
    fn=chat_fn,
    title="Merged Kaggle Model (GGUF)",
    description="Running llama.cpp inference on a GGUF model",
)
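
# To serve the app beyond localhost (e.g. from a container), launch() accepts
# host and port overrides: chatbot.launch(server_name="0.0.0.0", server_port=7860).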
chatbot.launch()