import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Download GGUF from your HF repo
model_path = hf_hub_download(
    repo_id="astegaras/Llama3.2_3B",
    filename="model-Q2_K.gguf"
)

# Load model (llama.cpp)
llm = Llama(
    model_path=model_path,
    n_ctx=4096,          # context window
    chat_format=None,    # raw completion API; the prompt is built manually below
    n_gpu_layers=0,      # CPU-only inference
    # The prompt already begins with <|begin_of_text|>, so don't insert extra
    # BOS/EOS tokens (some llama-cpp-python versions may ignore these kwargs)
    add_bos_token=False,
    add_eos_token=False,
)

# Build the inference prompt to match the chat format used in the fine-tuning dataset
def format_prompt(user_message):
    return (
        "<|begin_of_text|>"
        "<|start_header_id|>system<|end_header_id|>\n"
        "You are a helpful assistant.\n"
        "<|start_header_id|>user<|end_header_id|>\n"
        f"{user_message}\n"
        "<|start_header_id|>assistant<|end_header_id|>\n"
    )


def respond(user_input):
    prompt = format_prompt(user_input)

    output = llm(
        prompt,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        # Stop at the Llama 3 end-of-turn / header tokens used in the prompt
        # above, so the model doesn't loop into generating extra turns
        stop=["<|eot_id|>", "<|start_header_id|>"],
    )

    return output["choices"][0]["text"]

# Gradio UI
gr.Interface(
    fn=respond,
    inputs=gr.Textbox(label="Ask"),
    outputs=gr.Textbox(label="Answer"),
    title="Llama3.2-3B Fine-tuned Assistant"
).launch()