Update app.py
app.py CHANGED
@@ -1,7 +1,7 @@
 import os
 from typing import Generator, Optional
 import gradio as gr
-from llama_cpp import Llama
+from llama_cpp import Llama, LlamaGrammar
 from huggingface_hub import hf_hub_download
 
 DESCRIPTION = '''
@@ -21,7 +21,7 @@ LICENSE = """
 template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
 
 class OptimizedLLMInterface:
-    _model_instance = None
+    _model_instance = None  # Singleton pattern
 
     def __init__(
         self,
@@ -29,19 +29,24 @@ class OptimizedLLMInterface:
         model_filename: str = "llama-o1-supervised-1129-q4_k_m.gguf",
     ):
         if OptimizedLLMInterface._model_instance is None:
+            model_path = hf_hub_download(repo_id=model_repo_id, filename=model_filename)
             OptimizedLLMInterface._model_instance = Llama(
-                model_path=…
-                n_ctx=…
-                n_threads=4,
-                n_batch=…
-                …
-                …
-                …
-                …
-                …
+                model_path=model_path,
+                n_ctx=256,              # Minimal context for speed
+                n_threads=4,            # Fixed thread count
+                n_batch=1,              # Single batch for low latency
+                verbose=False,          # Disable logging
+                seed=-1,                # Disable random seed
+                logits_all=False,       # Disable logits
+                embedding=False,        # Disable embeddings
+                tensor_split=None,      # No tensor splitting
+                rope_freq_base=10000,   # Default RoPE settings
+                rope_freq_scale=1.0,
+                main_gpu=0,
             )
         self.model = OptimizedLLMInterface._model_instance
 
+        # Pre-tokenize template parts
         template_parts = template.split("{content}")
         self._prefix_tokens = self.model.tokenize(template_parts[0].encode())
         self._suffix_tokens = self.model.tokenize(template_parts[1].encode())
@@ -50,28 +55,33 @@
         self,
         message: str,
         history: Optional[list] = None,
-        max_tokens: int = …
+        max_tokens: int = 128,  # Reduced max tokens
         temperature: float = 0.7,
         top_p: float = 0.95,
     ) -> Generator[str, None, None]:
-        message_tokens = self.model.tokenize(message.encode())
-        input_tokens = []
-        input_tokens.extend(self._prefix_tokens)
-        input_tokens.extend(message_tokens)
-        input_tokens.extend(self._suffix_tokens)
-
-        output = ""
-        batch = []
-        batch_size = 8
-
         try:
+            # Fast token preparation
+            message_tokens = self.model.tokenize(message.encode())
+            input_tokens = []
+            input_tokens.extend(self._prefix_tokens)
+            input_tokens.extend(message_tokens)
+            input_tokens.extend(self._suffix_tokens)
+
+            output = ""
+            batch = []
+            batch_size = 4  # Small batch size for faster responses
+
             for token in self.model.generate(
                 input_tokens,
                 top_p=top_p,
                 temp=temperature,
-                top_k=1,
-                repeat_penalty=1.0,
-                …
+                top_k=1,              # Minimal top_k
+                repeat_penalty=1.0,   # No repeat penalty
+                mirostat_mode=0,      # Disable mirostat
+                min_p=0.05,           # Allow more diversity
+                typical_p=1.0,        # Disable typical sampling
+                presence_penalty=0,
+                frequency_penalty=0,
             ):
                 batch.append(token)
                 if len(batch) >= batch_size:
@@ -101,12 +111,12 @@ def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
             ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
             ['Find the least odd prime factor of $2019^8+1$.'],
         ],
-        cache_examples=False,
+        cache_examples=False,
         fill_height=True
     )
 
     with gr.Accordion("Adjust Parameters", open=False):
-        gr.Slider(minimum=…
+        gr.Slider(minimum=64, maximum=512, value=128, step=64, label="Max Tokens")
         gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature")
         gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p")
 
@@ -118,8 +128,8 @@ def main():
     llm = OptimizedLLMInterface()
     demo = create_demo(llm)
 
-    # Simplified launch configuration
     demo.launch(
+        share=False,
        quiet=True
     )
 
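For reference, a minimal sketch of the pattern this commit moves app.py to: load the GGUF model once, tokenize the fixed halves of the reasoning template a single time, and stream output by detokenizing tokens in small batches. It assumes a GGUF file already on disk rather than the hf_hub_download call used above; MODEL_PATH, stream(), and the hardcoded sampling values are illustrative placeholders, not part of the Space's code.

from llama_cpp import Llama

MODEL_PATH = "llama-o1-supervised-1129-q4_k_m.gguf"  # assumed local path, not downloaded here

# Same template as app.py: everything around {content} is fixed.
template = ("<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id>"
            "<start_of_thought><problem>{content}<end_of_thought>"
            "<start_of_rating><positive_rating><end_of_rating>\n"
            "<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id>"
            "<start_of_thought><expansion>")

llm = Llama(model_path=MODEL_PATH, n_ctx=256, n_threads=4, n_batch=1, verbose=False)

# Tokenize the fixed template halves once, as __init__ now does.
prefix_text, suffix_text = template.split("{content}")
prefix_tokens = llm.tokenize(prefix_text.encode())
suffix_tokens = llm.tokenize(suffix_text.encode())

def stream(message: str, batch_size: int = 4):
    # Splice the user message between the pre-tokenized halves.
    tokens = prefix_tokens + llm.tokenize(message.encode()) + suffix_tokens
    batch = []
    for tok in llm.generate(tokens, top_k=1, top_p=0.95, temp=0.7, repeat_penalty=1.0):
        if tok == llm.token_eos():
            break
        batch.append(tok)
        # Detokenize in small batches so partial text reaches the UI quickly.
        if len(batch) >= batch_size:
            yield llm.detokenize(batch).decode("utf-8", errors="ignore")
            batch = []
    if batch:
        yield llm.detokenize(batch).decode("utf-8", errors="ignore")

if __name__ == "__main__":
    for piece in stream("Find the least odd prime factor of $2019^8+1$."):
        print(piece, end="", flush=True)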