import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Pull the quantized GGUF weights from the Hugging Face Hub (cached locally
# after the first download).
model_path = hf_hub_download(
    repo_id="astegaras/Llama3.2_3B",
    filename="model-Q2_K.gguf",
)

llm = Llama(
    model_path=model_path,
    n_ctx=4096,        # context window, in tokens
    chat_format=None,  # the prompt is formatted by hand in format_prompt()
    n_gpu_layers=0,    # CPU-only inference
    # Note: add_bos_token/add_eos_token (passed in the original) are Hugging
    # Face tokenizer arguments, not llama_cpp.Llama parameters, so they are
    # dropped here; llama-cpp-python prepends BOS itself when it tokenizes a
    # completion prompt, which is why format_prompt() omits <|begin_of_text|>.
)
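
# Hedged alternative, assuming the fine-tune kept the stock Llama 3 special
# tokens: llama-cpp-python ships a registered "llama-3" chat format, so the
# hand-rolled template below could instead be replaced by constructing
# Llama(..., chat_format="llama-3") and calling
#   llm.create_chat_completion(messages=[{"role": "user", "content": msg}])
# which applies the template and stop tokens for you.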


def format_prompt(user_message):
    # Llama 3 instruct template: each message sits between header tokens and
    # ends with <|eot_id|>; the headers are followed by a blank line, not a
    # single newline. <|begin_of_text|> (BOS) is omitted because
    # llama-cpp-python adds it when tokenizing the prompt.
    return (
        "<|start_header_id|>system<|end_header_id|>\n\n"
        "You are a helpful assistant.<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"{user_message}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )
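

# Hedged sketch of a multi-turn variant (hypothetical helper, not used by the
# app): the same template extends to conversation history by emitting one
# header/<|eot_id|> pair per earlier message before the final user turn.
def format_chat_prompt(history, user_message):
    """history: list of (user_text, assistant_text) tuples from earlier turns."""
    parts = [
        "<|start_header_id|>system<|end_header_id|>\n\n"
        "You are a helpful assistant.<|eot_id|>"
    ]
    for user_turn, assistant_turn in history:
        parts.append(
            f"<|start_header_id|>user<|end_header_id|>\n\n{user_turn}<|eot_id|>"
            f"<|start_header_id|>assistant<|end_header_id|>\n\n{assistant_turn}<|eot_id|>"
        )
    parts.append(
        f"<|start_header_id|>user<|end_header_id|>\n\n{user_message}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )
    return "".join(parts)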


def respond(user_input):
    prompt = format_prompt(user_input)

    output = llm(
        prompt,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        # Llama 3 ends each message with <|eot_id|>; the original stops
        # (<|user|>, <|system|>) belong to other chat templates and would
        # never fire here.
        stop=["<|eot_id|>", "<|end_of_text|>"],
    )

    return output["choices"][0]["text"]
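

# Hedged sketch of a streaming variant (not wired into the UI): stream=True
# makes the completion call yield chunks instead of returning one dict, and
# Gradio re-renders the output on each value yielded by a generator fn.
def respond_stream(user_input):
    text = ""
    for chunk in llm(
        format_prompt(user_input),
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        stop=["<|eot_id|>", "<|end_of_text|>"],
        stream=True,
    ):
        text += chunk["choices"][0]["text"]
        yield text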


gr.Interface(
    fn=respond,
    inputs=gr.Textbox(label="Ask"),
    outputs=gr.Textbox(label="Answer"),
    title="Llama3.2-3B Fine-tuned Assistant",
).launch()