ming committed · f2cff39
Parent: 64e4f7a
Optimize latency and performance for Hugging Face Spaces
Performance Optimizations:
- Reduce base timeout from 60s to 30s for faster failure detection
- Optimize timeout calculation: 3s per 1000 chars (was 5s) with 60s cap (was 120s); see the worked example below the commit message
- Reduce default max_tokens from 256 to 100 for faster responses
- Add text preprocessing: truncate texts >4000 chars for faster processing
- Optimize model parameters:
* Lower temperature (0.1) for faster, focused output
* Add top_p, top_k, repeat_penalty for efficiency
* Limit context window to 2048 tokens for speed
- Shorter, more direct prompt for faster processing
Expected Results:
- 2-3x faster response times
- Better handling of large texts
- Reduced timeout errors
- More reliable performance on free tier
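
A worked example of the new timeout and truncation rules (a minimal standalone sketch; the helper names are mine, not the service's API):

    def dynamic_timeout(text: str, base: int = 30) -> int:
        # 3s per 1000 chars beyond the first 1000, capped at 60s (was 5s / 120s)
        return min(base + max(0, (len(text) - 1000) // 1000 * 3), 60)

    def preprocess(text: str) -> str:
        # Inputs over 4000 chars are cut down and flagged with a note
        if len(text) > 4000:
            text = text[:4000] + "\n\n[Text truncated for faster processing]"
        return text

    # Sample values:
    #   1,000 chars -> 30s (base only)
    #   5,000 chars -> 42s (30 + 4 * 3); the text is also cut to 4,000 chars
    #  20,000 chars -> 60s cap (uncapped would be 30 + 19 * 3 = 87s)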
- app/core/config.py +1 -1
- app/services/summarizer.py +16 -5
app/core/config.py
CHANGED

@@ -13,7 +13,7 @@ class Settings(BaseSettings):
     # Ollama Configuration
     ollama_model: str = Field(default="llama3.2:1b", env="OLLAMA_MODEL")
     ollama_host: str = Field(default="http://0.0.0.0:11434", env="OLLAMA_HOST")
-    ollama_timeout: int = Field(default=60, env="OLLAMA_TIMEOUT", ge=1)
+    ollama_timeout: int = Field(default=30, env="OLLAMA_TIMEOUT", ge=1)

     # Server Configuration
     server_host: str = Field(default="127.0.0.1", env="SERVER_HOST")
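
The env binding means deployments can still raise the limit without a code change. A minimal sketch, assuming the pydantic v1-style BaseSettings that the Field(env=...) form implies:

    import os
    from pydantic import BaseSettings, Field

    class Settings(BaseSettings):
        # Abbreviated from app/core/config.py; ge=1 rejects non-positive values
        ollama_timeout: int = Field(default=30, env="OLLAMA_TIMEOUT", ge=1)

    os.environ["OLLAMA_TIMEOUT"] = "45"     # e.g. a Space variable
    assert Settings().ollama_timeout == 45  # env overrides the 30s default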
app/services/summarizer.py
CHANGED

@@ -46,8 +46,8 @@ class OllamaService:
     async def summarize_text(
         self,
         text: str,
-        max_tokens: int = 256,
-        prompt: str = "Summarize …
+        max_tokens: int = 100,
+        prompt: str = "Summarize concisely:",
     ) -> Dict[str, Any]:
         """
         Summarize text using Ollama.

@@ -55,9 +55,16 @@ class OllamaService:
         """
         start_time = time.time()

-        # …
+        # Optimized timeout: base + 3s per extra 1000 chars (cap 60s)
         text_length = len(text)
-        dynamic_timeout = min(self.timeout + max(0, (text_length - 1000) // 1000 * 5), 120)
+        dynamic_timeout = min(self.timeout + max(0, (text_length - 1000) // 1000 * 3), 60)
+
+        # Preprocess text to reduce input size for faster processing
+        if text_length > 4000:
+            # Truncate very long texts and add note
+            logger.info(f"Text truncated from {text_length} to 4000 chars for faster processing")
+            text = text[:4000] + "\n\n[Text truncated for faster processing]"
+            text_length = len(text)

         logger.info(f"Processing text of {text_length} chars with timeout {dynamic_timeout}s")

@@ -69,7 +76,11 @@ class OllamaService:
             "stream": False,
             "options": {
                 "num_predict": max_tokens,
-                "temperature": 0.…
+                "temperature": 0.1,  # Lower temperature for faster, more focused output
+                "top_p": 0.9,  # Nucleus sampling for efficiency
+                "top_k": 40,  # Limit vocabulary for speed
+                "repeat_penalty": 1.1,  # Prevent repetition
+                "num_ctx": 2048,  # Limit context window for speed
             },
         }
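
The options block matches Ollama's /api/generate parameters. A standalone sketch of the request the service now sends (the diff doesn't show how summarize_text joins prompt and text, so the prompt assembly below is an assumption):

    import httpx

    text = "..."  # input already truncated and timed as above
    payload = {
        "model": "llama3.2:1b",  # default from app/core/config.py
        "prompt": f"Summarize concisely:\n\n{text}",  # assumed assembly
        "stream": False,
        "options": {
            "num_predict": 100,   # hard cap on generated tokens
            "temperature": 0.1,   # low randomness, short focused output
            "top_p": 0.9,
            "top_k": 40,
            "repeat_penalty": 1.1,
            "num_ctx": 2048,      # smaller context window, faster prefill
        },
    }
    r = httpx.post("http://0.0.0.0:11434/api/generate", json=payload, timeout=60)
    summary = r.json()["response"]  # non-streaming replies carry the text in "response"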