import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Pull the quantized GGUF weights from the Hugging Face Hub (cached locally
# after the first download).
model_path = hf_hub_download(
    repo_id="astegaras/Llama3.2_3B",
    filename="model-Q2_K.gguf",
)

llm = Llama(
    model_path=model_path,
    n_ctx=4096,        # context window, in tokens
    chat_format=None,  # the prompt is formatted by hand in format_prompt()
    n_gpu_layers=0,    # CPU-only inference
    # Note: add_bos_token/add_eos_token (passed in the original) are Hugging
    # Face tokenizer arguments, not llama_cpp.Llama parameters, so they are
    # dropped here; llama-cpp-python prepends BOS itself when it tokenizes a
    # completion prompt, which is why format_prompt() omits <|begin_of_text|>.
)
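
# Hedged alternative, assuming the fine-tune kept the stock Llama 3 special
# tokens: llama-cpp-python ships a registered "llama-3" chat format, so the
# hand-rolled template below could instead be replaced by constructing
# Llama(..., chat_format="llama-3") and calling
#   llm.create_chat_completion(messages=[{"role": "user", "content": msg}])
# which applies the template and stop tokens for you.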


def format_prompt(user_message):
    # Llama 3 instruct template: each message sits between header tokens and
    # ends with <|eot_id|>; the headers are followed by a blank line, not a
    # single newline. <|begin_of_text|> (BOS) is omitted because
    # llama-cpp-python adds it when tokenizing the prompt.
    return (
        "<|start_header_id|>system<|end_header_id|>\n\n"
        "You are a helpful assistant.<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"{user_message}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )
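

# Hedged sketch of a multi-turn variant (hypothetical helper, not used by the
# app): the same template extends to conversation history by emitting one
# header/<|eot_id|> pair per earlier message before the final user turn.
def format_chat_prompt(history, user_message):
    """history: list of (user_text, assistant_text) tuples from earlier turns."""
    parts = [
        "<|start_header_id|>system<|end_header_id|>\n\n"
        "You are a helpful assistant.<|eot_id|>"
    ]
    for user_turn, assistant_turn in history:
        parts.append(
            f"<|start_header_id|>user<|end_header_id|>\n\n{user_turn}<|eot_id|>"
            f"<|start_header_id|>assistant<|end_header_id|>\n\n{assistant_turn}<|eot_id|>"
        )
    parts.append(
        f"<|start_header_id|>user<|end_header_id|>\n\n{user_message}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )
    return "".join(parts)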


def respond(user_input):
    prompt = format_prompt(user_input)

    output = llm(
        prompt,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        # Llama 3 ends each message with <|eot_id|>; the original stops
        # (<|user|>, <|system|>) belong to other chat templates and would
        # never fire here.
        stop=["<|eot_id|>", "<|end_of_text|>"],
    )

    return output["choices"][0]["text"]
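

# Hedged sketch of a streaming variant (not wired into the UI): stream=True
# makes the completion call yield chunks instead of returning one dict, and
# Gradio re-renders the output on each value yielded by a generator fn.
def respond_stream(user_input):
    text = ""
    for chunk in llm(
        format_prompt(user_input),
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        stop=["<|eot_id|>", "<|end_of_text|>"],
        stream=True,
    ):
        text += chunk["choices"][0]["text"]
        yield text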


gr.Interface(
    fn=respond,
    inputs=gr.Textbox(label="Ask"),
    outputs=gr.Textbox(label="Answer"),
    title="Llama3.2-3B Fine-tuned Assistant",
).launch()