astegaras committed
Commit d4ef7b3 · verified · 1 Parent(s): 62f86f8

Update app.py

Files changed (1)
  1. app.py +36 -12
app.py CHANGED
@@ -2,38 +2,62 @@ import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 
+# ---------------------------------------------------------
+# Download GGUF file from your Hugging Face repo
+# ---------------------------------------------------------
 model_path = hf_hub_download(
     repo_id="astegaras/Llama3.2_3B",
     filename="model-Q2_K.gguf"
 )
 
+# ---------------------------------------------------------
+# Load model with llama.cpp
+# IMPORTANT: chat_format=None because your SFT uses Q/A style
+# IMPORTANT: add_bos_token=True is REQUIRED
+# ---------------------------------------------------------
 llm = Llama(
     model_path=model_path,
-    n_ctx=2048,
-    n_gpu_layers=0,
-    chat_format=None,       # <-- ABSOLUTELY REQUIRED
-    add_bos_token=False,    # <-- REQUIRED
-    add_eos_token=False,    # <-- REQUIRED
+    n_ctx=4096,
+    n_gpu_layers=-1,        # use GPU layers if available
+    chat_format=None,       # DO NOT use llama-3 chat template
+    add_bos_token=True,     # REQUIRED so prompt starts correctly
+    verbose=False,
 )
 
+# ---------------------------------------------------------
+# Function to generate answer
+# ---------------------------------------------------------
 def respond(user_question):
 
-    # sanitize input for your tokenizer
+    # Basic sanitization (not strictly required)
     user_question = user_question.replace("\r", " ").replace("\n", " ")
     user_question = user_question.encode("ascii", "ignore").decode()
 
-    # match your fine-tuning format exactly
+    # MATCH YOUR FINETUNE FORMAT EXACTLY
     prompt = f"Q: {user_question}\nA:"
 
-    out = llm.create_completion(
-        prompt=prompt,
+    # Raw completion call (works best for GGUF)
+    output = llm(
+        prompt,
         max_tokens=256,
         temperature=0.7,
         top_p=0.9,
-        stop=["Q:"],  # safety
+        stop=["Q:"],    # VERY IMPORTANT
     )
-    return out["choices"][0]["text"]
 
-gr.Interface(fn=respond, inputs="text", outputs="text").launch()
+    reply = output["choices"][0]["text"].strip()
+    return reply
+
+# ---------------------------------------------------------
+# Gradio UI
+# ---------------------------------------------------------
+gr.Interface(
+    fn=respond,
+    inputs="text",
+    outputs="text",
+    title="Llama3.2-3B Finetuned (Q/A format)",
+    description="Ask any question. Model trained with Finetome100k-style SFT."
+).launch()
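
For reference, once this Space is running, the updated respond endpoint can be exercised from Python with the gradio_client package. A minimal sketch, assuming a placeholder Space id (substitute the Space that actually hosts this app.py; the id below is not part of this commit):

from gradio_client import Client

# NOTE: "astegaras/SPACE_NAME" is a hypothetical placeholder, not part of
# this commit. Point it at the Space that actually runs app.py.
client = Client("astegaras/SPACE_NAME")

# A single text-in/text-out gr.Interface exposes a "/predict" endpoint;
# the positional argument maps to user_question in respond().
answer = client.predict("What is the capital of France?", api_name="/predict")
print(answer)

Because respond() returns the raw completion cut at the stop sequence "Q:", the value printed is just the model's answer text, with no chat-template wrapping.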