import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Download the quantized GGUF model from the Hugging Face Hub
model_path = hf_hub_download(
    repo_id="astegaras/merged_kaggle",
    filename="llama-3.2-3b-instruct.Q2_K.gguf"
)
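
# Note (an assumption, not part of the original app): hf_hub_download caches
# the file locally and returns its path, so restarts reuse the download when
# the cache persists. Passing revision="<commit-sha>" pins an exact model
# version for reproducible deploys.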

# Load the GGUF model with conservative CPU-only settings, sized for a
# small Hugging Face Space
llm = Llama(
    model_path=model_path,
    n_ctx=4096,         # context window, in tokens
    n_threads=4,        # CPU threads used for generation
    n_batch=64,         # prompt-processing batch size
    n_gpu_layers=0,     # CPU-only: offload no layers to a GPU
    use_mmap=False,     # read the model fully into RAM instead of mmap-ing it
    use_mlock=False,    # don't lock pages in RAM (often restricted in containers)
    verbose=False
)
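
# Optional smoke test (a sketch, not part of the original app): the
# SMOKE_TEST env var is hypothetical — set it to "1" to confirm the model
# can generate a token before the UI starts.
import os

if os.environ.get("SMOKE_TEST") == "1":
    out = llm.create_chat_completion(
        messages=[{"role": "user", "content": "Reply with one word."}],
        max_tokens=8,
    )
    print(out["choices"][0]["message"]["content"])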

def chat_fn(message, history):
    # Convert Gradio's tuple-style history into the OpenAI-style
    # messages list that create_chat_completion expects
    messages = []
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": assistant})

    messages.append({"role": "user", "content": message})

    output = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9
    )

    reply = output["choices"][0]["message"]["content"]
    return reply
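
# A streaming variant (sketch): create_chat_completion supports stream=True,
# yielding OpenAI-style delta chunks, and gr.ChatInterface streams the reply
# when its fn is a generator. Swap fn=chat_fn for fn=chat_fn_stream below
# to use it.
def chat_fn_stream(message, history):
    messages = []
    for user, assistant in history:
        messages.append({"role": "user", "content": user})
        messages.append({"role": "assistant", "content": assistant})
    messages.append({"role": "user", "content": message})

    partial = ""
    for chunk in llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        stream=True,
    ):
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            partial += delta["content"]
            yield partial  # Gradio re-renders the reply with each yield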


# Gradio UI
chatbot = gr.ChatInterface(
    fn=chat_fn,
    title="Merged Kaggle Model (GGUF)",
    description="Running llama.cpp inference on a GGUF model",
)

chatbot.launch()