Update app.py
app.py CHANGED
@@ -1,7 +1,7 @@
 import os
 from typing import Generator, Optional
 import gradio as gr
-from llama_cpp import Llama
+from llama_cpp import Llama, LlamaGrammar
 from huggingface_hub import hf_hub_download
 
 DESCRIPTION = '''
@@ -21,7 +21,7 @@ LICENSE = """
 template = "<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id><start_of_thought><problem>{content}<end_of_thought><start_of_rating><positive_rating><end_of_rating>\n<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id><start_of_thought><expansion>"
 
 class OptimizedLLMInterface:
-    _model_instance = None
+    _model_instance = None  # Singleton pattern
 
     def __init__(
         self,
@@ -29,19 +29,24 @@ class OptimizedLLMInterface:
         model_filename: str = "llama-o1-supervised-1129-q4_k_m.gguf",
     ):
         if OptimizedLLMInterface._model_instance is None:
+            model_path = hf_hub_download(repo_id=model_repo_id, filename=model_filename)
             OptimizedLLMInterface._model_instance = Llama(
-                model_path=…
-                n_ctx=…
-                n_threads=4,
-                n_batch=…
-                …
-                …
-                …
-                …
-                …
+                model_path=model_path,
+                n_ctx=256,              # Minimal context for speed
+                n_threads=4,            # Fixed thread count
+                n_batch=1,              # Single batch for low latency
+                verbose=False,          # Disable logging
+                seed=-1,                # Disable random seed
+                logits_all=False,       # Disable logits
+                embedding=False,        # Disable embeddings
+                tensor_split=None,      # No tensor splitting
+                rope_freq_base=10000,   # Default RoPE settings
+                rope_freq_scale=1.0,
+                main_gpu=0,
             )
         self.model = OptimizedLLMInterface._model_instance
 
+        # Pre-tokenize template parts
         template_parts = template.split("{content}")
         self._prefix_tokens = self.model.tokenize(template_parts[0].encode())
         self._suffix_tokens = self.model.tokenize(template_parts[1].encode())
@@ -50,28 +55,33 @@
         self,
         message: str,
         history: Optional[list] = None,
-        max_tokens: int = …
+        max_tokens: int = 128,  # Reduced max tokens
         temperature: float = 0.7,
         top_p: float = 0.95,
     ) -> Generator[str, None, None]:
-        message_tokens = self.model.tokenize(message.encode())
-        input_tokens = []
-        input_tokens.extend(self._prefix_tokens)
-        input_tokens.extend(message_tokens)
-        input_tokens.extend(self._suffix_tokens)
-
-        output = ""
-        batch = []
-        batch_size = 8
-
         try:
+            # Fast token preparation
+            message_tokens = self.model.tokenize(message.encode())
+            input_tokens = []
+            input_tokens.extend(self._prefix_tokens)
+            input_tokens.extend(message_tokens)
+            input_tokens.extend(self._suffix_tokens)
+
+            output = ""
+            batch = []
+            batch_size = 4  # Small batch size for faster responses
+
             for token in self.model.generate(
                 input_tokens,
                 top_p=top_p,
                 temp=temperature,
-                top_k=1,
-                repeat_penalty=1.0,
-                …
+                top_k=1,              # Minimal top_k
+                repeat_penalty=1.0,   # No repeat penalty
+                mirostat_mode=0,      # Disable mirostat
+                min_p=0.05,           # Allow more diversity
+                typical_p=1.0,        # Disable typical sampling
+                presence_penalty=0,
+                frequency_penalty=0,
             ):
                 batch.append(token)
                 if len(batch) >= batch_size:
@@ -101,12 +111,12 @@ def create_demo(llm_interface: OptimizedLLMInterface) -> gr.Blocks:
             ['If Diana needs to bike 10 miles to reach home and she can bike at a speed of 3 mph for two hours before getting tired, and then at a speed of 1 mph until she reaches home, how long will it take her to get home?'],
             ['Find the least odd prime factor of $2019^8+1$.'],
         ],
-        cache_examples=False,
+        cache_examples=False,
         fill_height=True
     )
 
     with gr.Accordion("Adjust Parameters", open=False):
-        gr.Slider(minimum=…
+        gr.Slider(minimum=64, maximum=512, value=128, step=64, label="Max Tokens")
         gr.Slider(minimum=0.1, maximum=1.0, value=0.7, step=0.1, label="Temperature")
         gr.Slider(minimum=0.05, maximum=1.0, value=0.95, step=0.05, label="Top-p")
 
@@ -118,8 +128,8 @@ def main():
     llm = OptimizedLLMInterface()
     demo = create_demo(llm)
 
-    # Simplified launch configuration
     demo.launch(
+        share=False,
        quiet=True
     )
 
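For reference, a minimal sketch of the pattern this commit moves app.py to: load the GGUF model once, tokenize the fixed halves of the reasoning template a single time, and stream output by detokenizing tokens in small batches. It assumes a GGUF file already on disk rather than the hf_hub_download call used above; MODEL_PATH, stream(), and the hardcoded sampling values are illustrative placeholders, not part of the Space's code.

from llama_cpp import Llama

MODEL_PATH = "llama-o1-supervised-1129-q4_k_m.gguf"  # assumed local path, not downloaded here

# Same template as app.py: everything around {content} is fixed.
template = ("<start_of_father_id>-1<end_of_father_id><start_of_local_id>0<end_of_local_id>"
            "<start_of_thought><problem>{content}<end_of_thought>"
            "<start_of_rating><positive_rating><end_of_rating>\n"
            "<start_of_father_id>0<end_of_father_id><start_of_local_id>1<end_of_local_id>"
            "<start_of_thought><expansion>")

llm = Llama(model_path=MODEL_PATH, n_ctx=256, n_threads=4, n_batch=1, verbose=False)

# Tokenize the fixed template halves once, as __init__ now does.
prefix_text, suffix_text = template.split("{content}")
prefix_tokens = llm.tokenize(prefix_text.encode())
suffix_tokens = llm.tokenize(suffix_text.encode())

def stream(message: str, batch_size: int = 4):
    # Splice the user message between the pre-tokenized halves.
    tokens = prefix_tokens + llm.tokenize(message.encode()) + suffix_tokens
    batch = []
    for tok in llm.generate(tokens, top_k=1, top_p=0.95, temp=0.7, repeat_penalty=1.0):
        if tok == llm.token_eos():
            break
        batch.append(tok)
        # Detokenize in small batches so partial text reaches the UI quickly.
        if len(batch) >= batch_size:
            yield llm.detokenize(batch).decode("utf-8", errors="ignore")
            batch = []
    if batch:
        yield llm.detokenize(batch).decode("utf-8", errors="ignore")

if __name__ == "__main__":
    for piece in stream("Find the least odd prime factor of $2019^8+1$."):
        print(piece, end="", flush=True)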