astegaras committed · Commit e104971 · verified · 1 Parent(s): 5587610

Update app.py

Files changed (1): app.py (+30 -24)
app.py CHANGED
@@ -1,43 +1,49 @@
 import gradio as gr
 from llama_cpp import Llama
+from huggingface_hub import hf_hub_download
 
-# ----------------------------------------------------
-# Load GGUF model
-# ----------------------------------------------------
-repo = "astegaras/merged_kaggle"
-
-# llama_cpp automatically downloads from HF Hub if you provide the repo
-llm = Llama.from_pretrained(
-    repo_id="astegaras/merged_kaggle",
-    filename="llama-3.2-3b-instruct.Q4_K_M.gguf",
+# Download your GGUF model from HF Hub
+model_path = hf_hub_download(
+    repo_id="astegaras/merged_kaggle",
+    filename="llama-3.2-3b-instruct.Q4_K_M.gguf"
 )
 
-# ----------------------------------------------------
-# Chat function
-# ----------------------------------------------------
+# Load the GGUF model with llama.cpp
+llm = Llama(
+    model_path=model_path,
+    n_ctx=4096,      # Context window for inference
+    n_threads=8,     # Adjust to HF hardware
+    n_batch=512,
+    verbose=False
+)
 
-def respond(message, history):
+def chat_fn(message, history):
+    # Reformat history for llama.cpp chat template
     messages = []
-
     for user, assistant in history:
         messages.append({"role": "user", "content": user})
         messages.append({"role": "assistant", "content": assistant})
 
     messages.append({"role": "user", "content": message})
 
-    output = llm.create_chat_completion(messages=messages)
-    reply = output["choices"][0]["message"]["content"]
+    output = llm.create_chat_completion(
+        messages=messages,
+        max_tokens=512,
+        temperature=0.7,
+        top_p=0.9
+    )
 
+    reply = output["choices"][0]["message"]["content"]
     return reply
 
-# ----------------------------------------------------
-# Launch Gradio app
-# ----------------------------------------------------
-
-gr.ChatInterface(
-    respond,
-    title="My Llama 3.2 GGUF Chatbot",
-    description="Running GGUF with llama.cpp inside a HuggingFace Space",
-).launch()
+# Gradio UI
+chatbot = gr.ChatInterface(
+    fn=chat_fn,
+    title="Merged Kaggle Model (GGUF)",
+    description="Running llama.cpp inference on GGUF model",
+)
+
+chatbot.launch()
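The new loader hard-codes n_threads=8 with a note to "Adjust to HF hardware". A minimal sketch that derives the count at runtime instead, assuming os.cpu_count() reflects the Space's vCPU allotment (model_path and the other parameters as in app.py):

# Derive thread count from the machine's actual CPU allotment instead
# of hard-coding 8; a sketch, assuming os.cpu_count() reports the
# vCPUs the Space is granted.
import os

llm = Llama(
    model_path=model_path,
    n_ctx=4096,
    n_threads=os.cpu_count() or 2,  # fall back to 2 if undetectable
    n_batch=512,
    verbose=False,
)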
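chat_fn rebuilds history from (user, assistant) pairs, which is Gradio's legacy tuple format; recent Gradio versions instead pass OpenAI-style role/content dicts when ChatInterface is created with type="messages". A sketch of the handler under that assumption, since those dicts can be forwarded to llama.cpp almost as-is:

# chat_fn variant for gr.ChatInterface(..., type="messages"); assumes
# a Gradio version where history arrives as role/content dicts.
def chat_fn(message, history):
    # Copy only the keys llama.cpp needs, in case Gradio attaches
    # extras such as "metadata" to each entry.
    messages = [{"role": m["role"], "content": m["content"]} for m in history]
    messages.append({"role": "user", "content": message})
    output = llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
    )
    return output["choices"][0]["message"]["content"]

The gr.ChatInterface call would then need type="messages" alongside fn=chat_fn.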
 
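create_chat_completion also accepts stream=True, in which case llama-cpp-python yields OpenAI-style chunks, and gr.ChatInterface renders a generator handler incrementally. A sketch combining the two, reusing the messages-format assumption above (chat_fn_stream is a hypothetical name, not part of the commit):

# Streaming variant: yield the growing reply so Gradio can update the
# chat bubble as tokens arrive.
def chat_fn_stream(message, history):
    messages = [{"role": m["role"], "content": m["content"]} for m in history]
    messages.append({"role": "user", "content": message})
    partial = ""
    for chunk in llm.create_chat_completion(
        messages=messages,
        max_tokens=512,
        temperature=0.7,
        top_p=0.9,
        stream=True,
    ):
        # The first chunk's delta carries only the role, so guard on
        # "content" before appending.
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            partial += delta["content"]
            yield partial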