astegaras committed
Commit 389f1ec · verified · 1 Parent(s): d4ef7b3

Update app.py

Files changed (1)
  1. app.py +23 -33
app.py CHANGED
@@ -2,62 +2,52 @@ import gradio as gr
 from llama_cpp import Llama
 from huggingface_hub import hf_hub_download
 
-# ---------------------------------------------------------
-# Download GGUF file from your Hugging Face repo
-# ---------------------------------------------------------
+# Download GGUF from your HF repo
 model_path = hf_hub_download(
     repo_id="astegaras/Llama3.2_3B",
     filename="model-Q2_K.gguf"
 )
 
-# ---------------------------------------------------------
-# Load model with llama.cpp
-# IMPORTANT: chat_format=None because your SFT uses Q/A style
-# IMPORTANT: add_bos_token=True is REQUIRED
-# ---------------------------------------------------------
+# Load model (llama.cpp)
 llm = Llama(
     model_path=model_path,
     n_ctx=4096,
-    n_gpu_layers=-1,      # use GPU layers if available
-    chat_format=None,     # DO NOT use llama-3 chat template
-    add_bos_token=True,   # REQUIRED so prompt starts correctly
-    verbose=False,
+    chat_format=None,
+    n_gpu_layers=0,
+    add_bos_token=False,
+    add_eos_token=False,
 )
 
-# ---------------------------------------------------------
-# Function to generate answer
-# ---------------------------------------------------------
-def respond(user_question):
-    # Basic sanitization (not strictly required)
-    user_question = user_question.replace("\r", " ").replace("\n", " ")
-    user_question = user_question.encode("ascii", "ignore").decode()
-
-    # MATCH YOUR FINETUNE FORMAT EXACTLY
-    prompt = f"Q: {user_question}\nA:"
-
-    # Raw completion call (works best for GGUF)
+# Build inference prompt according to your dataset format
+def format_prompt(user_message):
+    return f"""<|system|>
+You are a helpful assistant.
+<|user|>
+{user_message}
+<|assistant|>
+"""
+
+def respond(user_input):
+    prompt = format_prompt(user_input)
     output = llm(
         prompt,
-        max_tokens=256,
+        max_tokens=512,
         temperature=0.7,
         top_p=0.9,
-        stop=["Q:"],  # VERY IMPORTANT
+        stop=["<|user|>", "<|system|>"],  # avoid looping
     )
 
-    reply = output["choices"][0]["text"].strip()
-    return reply
+    return output["choices"][0]["text"]
 
-# ---------------------------------------------------------
 # Gradio UI
-# ---------------------------------------------------------
 gr.Interface(
     fn=respond,
-    inputs="text",
-    outputs="text",
-    title="Llama3.2-3B Finetuned (Q/A format)",
-    description="Ask any question. Model trained with Finetome100k-style SFT."
+    inputs=gr.components.Textbox(label="Ask"),
+    outputs=gr.components.Textbox(label="Answer"),
+    title="Llama3.2-3B Fine-tuned Assistant"
 ).launch()
+
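For reference, a minimal standalone sketch of the new inference path, runnable outside the Space. It assumes the repo id and GGUF filename exactly as they appear in the diff; the test question is made up for illustration. One hedge: add_bos_token and add_eos_token are not Llama() constructor parameters I can find in llama-cpp-python (recent versions silently swallow unknown keyword arguments, older ones may raise TypeError), so they are omitted below.

# Standalone smoke test mirroring the committed app.py (a sketch, not the
# definitive implementation).
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Same checkpoint as in the diff.
model_path = hf_hub_download(
    repo_id="astegaras/Llama3.2_3B",
    filename="model-Q2_K.gguf",
)

# chat_format=None: the prompt is built by hand, so llama.cpp must not
# apply its own chat template. n_gpu_layers=0 keeps inference CPU-only,
# matching the new code.
llm = Llama(model_path=model_path, n_ctx=4096, chat_format=None, n_gpu_layers=0)

def format_prompt(user_message: str) -> str:
    # Same turn markers as the commit; the prompt ends right after
    # <|assistant|> so the model produces the answer as a raw completion.
    return (
        "<|system|>\nYou are a helpful assistant.\n"
        f"<|user|>\n{user_message}\n"
        "<|assistant|>\n"
    )

output = llm(
    format_prompt("What is a GGUF file?"),  # hypothetical test question
    max_tokens=512,
    temperature=0.7,
    top_p=0.9,
    # Without these stops, a raw completion can run on and invent new
    # <|user|>/<|system|> turns; truncate at the first such marker.
    stop=["<|user|>", "<|system|>"],
)
print(output["choices"][0]["text"].strip())

The stop list does the work a chat template would normally do: with chat_format=None nothing tells the model where a turn ends, so generation is cut at the first marker of a new turn, which is what the "# avoid looping" comment in the diff refers to.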