astegaras committed
Commit 389f1ec · verified · 1 Parent(s): d4ef7b3

Update app.py

Files changed (1)
  1. app.py +23 -33
app.py CHANGED
@@ -2,62 +2,52 @@ import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 
-# ---------------------------------------------------------
-# Download GGUF file from your Hugging Face repo
-# ---------------------------------------------------------
+# Download GGUF from your HF repo
 model_path = hf_hub_download(
     repo_id="astegaras/Llama3.2_3B",
     filename="model-Q2_K.gguf"
 )
 
-# ---------------------------------------------------------
-# Load model with llama.cpp
-# IMPORTANT: chat_format=None because your SFT uses Q/A style
-# IMPORTANT: add_bos_token=True is REQUIRED
-# ---------------------------------------------------------
+# Load model (llama.cpp)
 llm = Llama(
     model_path=model_path,
     n_ctx=4096,
-    n_gpu_layers=-1,      # use GPU layers if available
-    chat_format=None,     # DO NOT use llama-3 chat template
-    add_bos_token=True,   # REQUIRED so prompt starts correctly
-    verbose=False,
+    chat_format=None,
+    n_gpu_layers=0,
+    add_bos_token=False,
+    add_eos_token=False,
 )
 
-# ---------------------------------------------------------
-# Function to generate answer
-# ---------------------------------------------------------
-def respond(user_question):
-    # Basic sanitization (not strictly required)
-    user_question = user_question.replace("\r", " ").replace("\n", " ")
-    user_question = user_question.encode("ascii", "ignore").decode()
-
-    # MATCH YOUR FINETUNE FORMAT EXACTLY
-    prompt = f"Q: {user_question}\nA:"
-
-    # Raw completion call (works best for GGUF)
+# Build inference prompt according to your dataset format
+def format_prompt(user_message):
+    return f"""<|system|>
+You are a helpful assistant.
+<|user|>
+{user_message}
+<|assistant|>
+"""
+
+def respond(user_input):
+    prompt = format_prompt(user_input)
     output = llm(
         prompt,
-        max_tokens=256,
+        max_tokens=512,
         temperature=0.7,
         top_p=0.9,
-        stop=["Q:"],  # VERY IMPORTANT
+        stop=["<|user|>", "<|system|>"],  # avoid looping
     )
 
-    reply = output["choices"][0]["text"].strip()
-    return reply
+    return output["choices"][0]["text"]
 
-# ---------------------------------------------------------
 # Gradio UI
-# ---------------------------------------------------------
 gr.Interface(
     fn=respond,
-    inputs="text",
-    outputs="text",
-    title="Llama3.2-3B Finetuned (Q/A format)",
-    description="Ask any question. Model trained with Finetome100k-style SFT."
+    inputs=gr.components.Textbox(label="Ask"),
+    outputs=gr.components.Textbox(label="Answer"),
+    title="Llama3.2-3B Fine-tuned Assistant"
 ).launch()
+
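For reference, a minimal standalone sketch of the new inference path, runnable outside the Space. It assumes the repo id and GGUF filename exactly as they appear in the diff; the test question is made up for illustration. One hedge: add_bos_token and add_eos_token are not Llama() constructor parameters I can find in llama-cpp-python (recent versions silently swallow unknown keyword arguments, older ones may raise TypeError), so they are omitted below.

# Standalone smoke test mirroring the committed app.py (a sketch, not the
# definitive implementation).
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Same checkpoint as in the diff.
model_path = hf_hub_download(
    repo_id="astegaras/Llama3.2_3B",
    filename="model-Q2_K.gguf",
)

# chat_format=None: the prompt is built by hand, so llama.cpp must not
# apply its own chat template. n_gpu_layers=0 keeps inference CPU-only,
# matching the new code.
llm = Llama(model_path=model_path, n_ctx=4096, chat_format=None, n_gpu_layers=0)

def format_prompt(user_message: str) -> str:
    # Same turn markers as the commit; the prompt ends right after
    # <|assistant|> so the model produces the answer as a raw completion.
    return (
        "<|system|>\nYou are a helpful assistant.\n"
        f"<|user|>\n{user_message}\n"
        "<|assistant|>\n"
    )

output = llm(
    format_prompt("What is a GGUF file?"),  # hypothetical test question
    max_tokens=512,
    temperature=0.7,
    top_p=0.9,
    # Without these stops, a raw completion can run on and invent new
    # <|user|>/<|system|> turns; truncate at the first such marker.
    stop=["<|user|>", "<|system|>"],
)
print(output["choices"][0]["text"].strip())

The stop list does the work a chat template would normally do: with chat_format=None nothing tells the model where a turn ends, so generation is cut at the first marker of a new turn, which is what the "# avoid looping" comment in the diff refers to.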