astegaras committed
Commit d4ef7b3 · verified · 1 Parent(s): 62f86f8

Update app.py

Files changed (1)
  1. app.py +36 -12
app.py CHANGED
@@ -2,38 +2,62 @@ import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 
+# ---------------------------------------------------------
+# Download GGUF file from your Hugging Face repo
+# ---------------------------------------------------------
 model_path = hf_hub_download(
     repo_id="astegaras/Llama3.2_3B",
     filename="model-Q2_K.gguf"
 )
 
+# ---------------------------------------------------------
+# Load model with llama.cpp
+# IMPORTANT: chat_format=None because your SFT uses Q/A style
+# IMPORTANT: add_bos_token=True is REQUIRED
+# ---------------------------------------------------------
 llm = Llama(
     model_path=model_path,
-    n_ctx=2048,
-    n_gpu_layers=0,
-    chat_format=None,       # <-- ABSOLUTELY REQUIRED
-    add_bos_token=False,    # <-- REQUIRED
-    add_eos_token=False,    # <-- REQUIRED
+    n_ctx=4096,
+    n_gpu_layers=-1,        # use GPU layers if available
+    chat_format=None,       # DO NOT use llama-3 chat template
+    add_bos_token=True,     # REQUIRED so prompt starts correctly
+    verbose=False,
 )
 
+# ---------------------------------------------------------
+# Function to generate answer
+# ---------------------------------------------------------
 def respond(user_question):
 
-    # sanitize input for your tokenizer
+    # Basic sanitization (not strictly required)
     user_question = user_question.replace("\r", " ").replace("\n", " ")
     user_question = user_question.encode("ascii", "ignore").decode()
 
-    # match your fine-tuning format exactly
+    # MATCH YOUR FINETUNE FORMAT EXACTLY
     prompt = f"Q: {user_question}\nA:"
 
-    out = llm.create_completion(
-        prompt=prompt,
+    # Raw completion call (works best for GGUF)
+    output = llm(
+        prompt,
         max_tokens=256,
         temperature=0.7,
         top_p=0.9,
-        stop=["Q:"],  # safety
+        stop=["Q:"],    # VERY IMPORTANT
     )
-    return out["choices"][0]["text"]
 
-gr.Interface(fn=respond, inputs="text", outputs="text").launch()
+    reply = output["choices"][0]["text"].strip()
+    return reply
+
+# ---------------------------------------------------------
+# Gradio UI
+# ---------------------------------------------------------
+gr.Interface(
+    fn=respond,
+    inputs="text",
+    outputs="text",
+    title="Llama3.2-3B Finetuned (Q/A format)",
+    description="Ask any question. Model trained with Finetome100k-style SFT."
+).launch()
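
For reference, once this Space is running, the updated respond endpoint can be exercised from Python with the gradio_client package. A minimal sketch, assuming a placeholder Space id (substitute the Space that actually hosts this app.py; the id below is not part of this commit):

from gradio_client import Client

# NOTE: "astegaras/SPACE_NAME" is a hypothetical placeholder, not part of
# this commit. Point it at the Space that actually runs app.py.
client = Client("astegaras/SPACE_NAME")

# A single text-in/text-out gr.Interface exposes a "/predict" endpoint;
# the positional argument maps to user_question in respond().
answer = client.predict("What is the capital of France?", api_name="/predict")
print(answer)

Because respond() returns the raw completion cut at the stop sequence "Q:", the value printed is just the model's answer text, with no chat-template wrapping.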