ming committed
Commit f2cff39 · 1 Parent(s): 64e4f7a

Optimize latency and performance for Hugging Face Spaces


Performance Optimizations:
- Reduce base timeout from 60s to 30s for faster failure detection
- Optimize timeout calculation: 3s per 1000 chars (was 5s) with a 60s cap (was 120s); see the worked example below
- Reduce default max_tokens from 256 to 100 for faster responses
- Add text preprocessing: truncate texts >4000 chars for faster processing
- Optimize model parameters:
* Lower temperature (0.1) for faster, focused output
* Add top_p, top_k, repeat_penalty for efficiency
* Limit context window to 2048 tokens for speed
- Shorter, more direct prompt for faster processing

Expected Results:
- 2-3x faster response times
- Better handling of large texts
- Reduced timeout errors
- More reliable performance on free tier
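
As a quick sanity check of the new timeout arithmetic, the helper below is a hypothetical standalone version of the expression used in summarize_text, assuming the new 30s base default from config.py:

# Hypothetical standalone version of the dynamic-timeout expression from
# summarize_text, assuming the new 30s base default from config.py.
def dynamic_timeout(text_length: int, base_timeout: int = 30) -> int:
    # 3s per 1000 chars beyond the first 1000, capped at 60s
    return min(base_timeout + max(0, (text_length - 1000) // 1000 * 3), 60)

assert dynamic_timeout(500) == 30     # short text: base timeout only
assert dynamic_timeout(5_000) == 42   # 30 + 4 * 3
assert dynamic_timeout(50_000) == 60  # hits the 60s cap (was 120s)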

Files changed (2)
  1. app/core/config.py +1 -1
  2. app/services/summarizer.py +16 -5
app/core/config.py CHANGED
@@ -13,7 +13,7 @@ class Settings(BaseSettings):
     # Ollama Configuration
     ollama_model: str = Field(default="llama3.2:1b", env="OLLAMA_MODEL")
     ollama_host: str = Field(default="http://0.0.0.0:11434", env="OLLAMA_HOST")
-    ollama_timeout: int = Field(default=60, env="OLLAMA_TIMEOUT", ge=1)
+    ollama_timeout: int = Field(default=30, env="OLLAMA_TIMEOUT", ge=1)
 
     # Server Configuration
     server_host: str = Field(default="127.0.0.1", env="SERVER_HOST")
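
Deployments that need more headroom than the lowered 30s default can still raise it per environment. A minimal sketch, assuming the pydantic v1-style BaseSettings with Field(..., env=...) shown above:

# Sketch: overriding the lowered default via the OLLAMA_TIMEOUT env var,
# assuming pydantic v1-style BaseSettings as used in config.py.
import os

os.environ["OLLAMA_TIMEOUT"] = "45"

from app.core.config import Settings

settings = Settings()
assert settings.ollama_timeout == 45  # falls back to 30 when the var is unset
# ge=1 still rejects zero or negative values at settings load time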
app/services/summarizer.py CHANGED
@@ -46,8 +46,8 @@ class OllamaService:
     async def summarize_text(
         self,
         text: str,
-        max_tokens: int = 256,
-        prompt: str = "Summarize the following text concisely:",
+        max_tokens: int = 100,
+        prompt: str = "Summarize concisely:",
     ) -> Dict[str, Any]:
         """
         Summarize text using Ollama.
@@ -55,9 +55,16 @@ class OllamaService:
         """
         start_time = time.time()
 
-        # Dynamic timeout: base + 5s per extra 1000 chars (cap 120s)
+        # Optimized timeout: base + 3s per extra 1000 chars (cap 60s)
         text_length = len(text)
-        dynamic_timeout = min(self.timeout + max(0, (text_length - 1000) // 1000 * 5), 120)
+        dynamic_timeout = min(self.timeout + max(0, (text_length - 1000) // 1000 * 3), 60)
+
+        # Preprocess text to reduce input size for faster processing
+        if text_length > 4000:
+            # Truncate very long texts and add note
+            text = text[:4000] + "\n\n[Text truncated for faster processing]"
+            logger.info(f"Text truncated from {text_length} to {len(text)} chars for faster processing")
+            text_length = len(text)
 
         logger.info(f"Processing text of {text_length} chars with timeout {dynamic_timeout}s")
 
@@ -69,7 +76,11 @@ class OllamaService:
             "stream": False,
             "options": {
                 "num_predict": max_tokens,
-                "temperature": 0.3,
+                "temperature": 0.1,  # Lower temperature for faster, more focused output
+                "top_p": 0.9,  # Nucleus sampling for efficiency
+                "top_k": 40,  # Limit vocabulary for speed
+                "repeat_penalty": 1.1,  # Prevent repetition
+                "num_ctx": 2048,  # Limit context window for speed
             },
         }
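
For reference, a standalone sketch of the request the optimized code now sends. The endpoint and response shape follow Ollama's standard /api/generate REST API; how summarize_text joins prompt and text is not visible in the hunks, so the concatenation here is an assumption:

# Sketch: the payload summarize_text now builds, sent directly with httpx.
import httpx

payload = {
    "model": "llama3.2:1b",  # default from config.py
    "prompt": "Summarize concisely:\n\n" + "long input text ...",  # assumed join
    "stream": False,
    "options": {
        "num_predict": 100,      # new, lower max_tokens default
        "temperature": 0.1,      # focused, low-variance output
        "top_p": 0.9,
        "top_k": 40,
        "repeat_penalty": 1.1,
        "num_ctx": 2048,         # smaller context window for speed
    },
}

resp = httpx.post("http://0.0.0.0:11434/api/generate", json=payload, timeout=30.0)
resp.raise_for_status()
print(resp.json()["response"])   # the generated summary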