ming committed
Commit f2cff39 · 1 Parent(s): 64e4f7a

Optimize latency and performance for Hugging Face Spaces


Performance Optimizations:
- Reduce base timeout from 60s to 30s for faster failure detection
- Optimize timeout calculation: 3s per 1000 chars (was 5s) with a 60s cap (was 120s); see the worked example below
- Reduce default max_tokens from 256 to 100 for faster responses
- Add text preprocessing: truncate texts >4000 chars for faster processing
- Optimize model parameters:
* Lower temperature (0.1) for faster, focused output
* Add top_p, top_k, repeat_penalty for efficiency
* Limit context window to 2048 tokens for speed
- Shorter, more direct prompt for faster processing

Expected Results:
- 2-3x faster response times
- Better handling of large texts
- Reduced timeout errors
- More reliable performance on free tier
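
As a quick sanity check of the new timeout arithmetic, the helper below is a hypothetical standalone version of the expression used in summarize_text, assuming the new 30s base default from config.py:

# Hypothetical standalone version of the dynamic-timeout expression from
# summarize_text, assuming the new 30s base default from config.py.
def dynamic_timeout(text_length: int, base_timeout: int = 30) -> int:
    # 3s per 1000 chars beyond the first 1000, capped at 60s
    return min(base_timeout + max(0, (text_length - 1000) // 1000 * 3), 60)

assert dynamic_timeout(500) == 30     # short text: base timeout only
assert dynamic_timeout(5_000) == 42   # 30 + 4 * 3
assert dynamic_timeout(50_000) == 60  # hits the 60s cap (was 120s)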

Files changed (2)
  1. app/core/config.py +1 -1
  2. app/services/summarizer.py +16 -5
app/core/config.py CHANGED
@@ -13,7 +13,7 @@ class Settings(BaseSettings):
     # Ollama Configuration
     ollama_model: str = Field(default="llama3.2:1b", env="OLLAMA_MODEL")
     ollama_host: str = Field(default="http://0.0.0.0:11434", env="OLLAMA_HOST")
-    ollama_timeout: int = Field(default=60, env="OLLAMA_TIMEOUT", ge=1)
+    ollama_timeout: int = Field(default=30, env="OLLAMA_TIMEOUT", ge=1)
 
     # Server Configuration
     server_host: str = Field(default="127.0.0.1", env="SERVER_HOST")
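
Deployments that need more headroom than the lowered 30s default can still raise it per environment. A minimal sketch, assuming the pydantic v1-style BaseSettings with Field(..., env=...) shown above:

# Sketch: overriding the lowered default via the OLLAMA_TIMEOUT env var,
# assuming pydantic v1-style BaseSettings as used in config.py.
import os

os.environ["OLLAMA_TIMEOUT"] = "45"

from app.core.config import Settings

settings = Settings()
assert settings.ollama_timeout == 45  # falls back to 30 when the var is unset
# ge=1 still rejects zero or negative values at settings load time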
app/services/summarizer.py CHANGED
@@ -46,8 +46,8 @@ class OllamaService:
     async def summarize_text(
         self,
         text: str,
-        max_tokens: int = 256,
-        prompt: str = "Summarize the following text concisely:",
+        max_tokens: int = 100,
+        prompt: str = "Summarize concisely:",
     ) -> Dict[str, Any]:
         """
         Summarize text using Ollama.
@@ -55,9 +55,16 @@ class OllamaService:
         """
         start_time = time.time()
 
-        # Dynamic timeout: base + 5s per extra 1000 chars (cap 120s)
+        # Optimized timeout: base + 3s per extra 1000 chars (cap 60s)
         text_length = len(text)
-        dynamic_timeout = min(self.timeout + max(0, (text_length - 1000) // 1000 * 5), 120)
+        dynamic_timeout = min(self.timeout + max(0, (text_length - 1000) // 1000 * 3), 60)
+
+        # Preprocess text to reduce input size for faster processing
+        if text_length > 4000:
+            # Truncate very long texts and add note
+            text = text[:4000] + "\n\n[Text truncated for faster processing]"
+            logger.info(f"Text truncated from {text_length} to {len(text)} chars for faster processing")
+            text_length = len(text)
 
         logger.info(f"Processing text of {text_length} chars with timeout {dynamic_timeout}s")
 
@@ -69,7 +76,11 @@ class OllamaService:
             "stream": False,
             "options": {
                 "num_predict": max_tokens,
-                "temperature": 0.3,
+                "temperature": 0.1,  # Lower temperature for faster, more focused output
+                "top_p": 0.9,  # Nucleus sampling for efficiency
+                "top_k": 40,  # Limit vocabulary for speed
+                "repeat_penalty": 1.1,  # Prevent repetition
+                "num_ctx": 2048,  # Limit context window for speed
             },
         }
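
For reference, a standalone sketch of the request the optimized code now sends. The endpoint and response shape follow Ollama's standard /api/generate REST API; how summarize_text joins prompt and text is not visible in the hunks, so the concatenation here is an assumption:

# Sketch: the payload summarize_text now builds, sent directly with httpx.
import httpx

payload = {
    "model": "llama3.2:1b",  # default from config.py
    "prompt": "Summarize concisely:\n\n" + "long input text ...",  # assumed join
    "stream": False,
    "options": {
        "num_predict": 100,      # new, lower max_tokens default
        "temperature": 0.1,      # focused, low-variance output
        "top_p": 0.9,
        "top_k": 40,
        "repeat_penalty": 1.1,
        "num_ctx": 2048,         # smaller context window for speed
    },
}

resp = httpx.post("http://0.0.0.0:11434/api/generate", json=payload, timeout=30.0)
resp.raise_for_status()
print(resp.json()["response"])   # the generated summary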