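"""Gradio demo: chat with a fine-tuned Llama 3.2 3B model served via llama.cpp.

The GGUF weights are downloaded from the Hugging Face Hub at startup and run
on CPU through llama-cpp-python; a minimal Gradio Interface exposes a single
question/answer textbox pair.
"""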
import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download
# Download the GGUF weights from the Hugging Face Hub
model_path = hf_hub_download(
    repo_id="astegaras/Llama3.2_3B",
    filename="model-Q2_K.gguf",
)
# Load model (llama.cpp)
llm = Llama(
    model_path=model_path,
    n_ctx=4096,           # context window size
    chat_format=None,     # raw completion mode; the prompt is built manually below
    n_gpu_layers=0,       # CPU-only inference
    add_bos_token=False,  # the prompt already includes <|begin_of_text|> explicitly
    add_eos_token=False,
)
# Build the inference prompt to match the fine-tuning dataset format
def format_prompt(user_message):
    return (
        "<|begin_of_text|>"
        "<|start_header_id|>system<|end_header_id|>\n"
        "You are a helpful assistant.\n"
        "<|start_header_id|>user<|end_header_id|>\n"
        f"{user_message}\n"
        "<|start_header_id|>assistant<|end_header_id|>\n"
    )
def respond(user_input):
    prompt = format_prompt(user_input)
    output = llm(
        prompt,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        # Stop when the model starts a new turn: <|start_header_id|> matches the
        # header tokens used in format_prompt above, <|eot_id|> is Llama 3's
        # end-of-turn token. This keeps the answer from looping into new turns.
        stop=["<|start_header_id|>", "<|eot_id|>"],
    )
    return output["choices"][0]["text"]
# Gradio UI
gr.Interface(
    fn=respond,
    inputs=gr.components.Textbox(label="Ask"),
    outputs=gr.components.Textbox(label="Answer"),
    title="Llama3.2-3B Fine-tuned Assistant",
).launch()