ming Claude committed
Commit 6a1e8a3 · 1 Parent(s): 80ea70f

Revert adaptive token logic, restore client-controlled max_tokens

- Restore client max_tokens control in V3 API (was being ignored)
- Remove min_length parameter from HF streaming service
- Re-enable temperature and top_p sampling parameters
- Simplify token generation: min_new_tokens = max(20, min(50, max_tokens//4))
- Set neutral length_penalty = 1.0 to avoid length bias
- Reduce recursive chunk tokens from 200 to 80 for tighter summaries
- Remove adaptive token calculation tests and parameter validation tests
- Update default max_tokens back to 256 in schema

This simplifies the summarization logic by removing server-side token overrides
and allowing the client (Android app) to control summary length directly.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>
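
For reference, the simplified floor noted in the bullets above keeps min_new_tokens in a narrow 20-50 band no matter what max_tokens the client sends; a minimal illustrative sketch (the helper name is mine, not part of the diff):

    # Restored floor: the client's max_tokens caps the summary, the floor only
    # guards against degenerate one-line outputs.
    def min_new_tokens_floor(max_tokens: int) -> int:
        return max(20, min(50, max_tokens // 4))

    for requested in (64, 256, 512, 2048):
        print(requested, min_new_tokens_floor(requested))  # -> 20, 50, 50, 50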

app/api/v3/schemas.py CHANGED
@@ -22,10 +22,7 @@ class ScrapeAndSummarizeRequest(BaseModel):
         example="Your article text here...",
     )
     max_tokens: Optional[int] = Field(
-        default=512,
-        ge=1,
-        le=2048,
-        description="Maximum tokens in summary. Higher values allow more complete summaries for long articles.",
+        default=256, ge=1, le=2048, description="Maximum tokens in summary"
     )
     temperature: Optional[float] = Field(
         default=0.3,
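
With max_tokens restored as a client-controlled field (default 256, bounds 1-2048), the caller sets summary length per request. A minimal client sketch against the streaming endpoint; the base URL is an assumption, while the path and field names match those used elsewhere in this repo:

    import requests

    # Stream the SSE response; each "data:" line carries a JSON event
    resp = requests.post(
        "http://localhost:8000/api/v3/scrape-and-summarize/stream",  # host/port assumed
        json={"url": "https://example.com/article", "max_tokens": 256},
        stream=True,
    )
    for line in resp.iter_lines(decode_unicode=True):
        if line and line.startswith("data: "):
            print(line[6:])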
app/api/v3/scrape_summarize.py CHANGED
@@ -114,25 +114,6 @@ async def _stream_generator(text: str, payload, metadata: dict, request_id: str)
     metadata_event = {"type": "metadata", "data": metadata}
     yield f"data: {json.dumps(metadata_event)}\n\n"

-    # Calculate adaptive token limits based on text length
-    # Formula: scale tokens with input length, but enforce min/max bounds
-    # Note: Ignores client's max_tokens to ensure quality (client often sends too-low values)
-    text_length = len(text)
-    adaptive_max_tokens = min(
-        max(text_length // 3, 300),  # At least 300 tokens, scale ~33% of input chars
-        1024,  # Cap at 1024 to avoid excessive generation
-    )
-    # Calculate minimum length (60% of max) to encourage complete thoughts
-    adaptive_min_length = int(adaptive_max_tokens * 0.6)
-
-    logger.info(
-        f"[{request_id}] Adaptive token calculation: "
-        f"text_length={text_length}, "
-        f"requested_max={payload.max_tokens}, "
-        f"adaptive_max={adaptive_max_tokens}, "
-        f"adaptive_min={adaptive_min_length}"
-    )
-
     # Stream summarization chunks
     summarization_start = time.time()
     tokens_used = 0
@@ -140,8 +121,7 @@ async def _stream_generator(text: str, payload, metadata: dict, request_id: str)
     try:
         async for chunk in hf_streaming_service.summarize_text_stream(
             text=text,
-            max_new_tokens=adaptive_max_tokens,
-            min_length=adaptive_min_length,
+            max_new_tokens=payload.max_tokens,
             temperature=payload.temperature,
             top_p=payload.top_p,
             prompt=payload.prompt,
app/services/hf_streaming_summarizer.py CHANGED
@@ -167,7 +167,6 @@ class HFStreamingSummarizer:
         self,
         text: str,
         max_new_tokens: int = None,
-        min_length: int = None,
         temperature: float = None,
         top_p: float = None,
         prompt: str = "Summarize the key points concisely:",
@@ -178,7 +177,6 @@ class HFStreamingSummarizer:
         Args:
             text: Input text to summarize
             max_new_tokens: Maximum new tokens to generate
-            min_length: Minimum length of generated summary (encourages complete thoughts)
             temperature: Sampling temperature
             top_p: Nucleus sampling parameter
             prompt: System prompt for summarization
@@ -211,7 +209,7 @@ class HFStreamingSummarizer:
                 f"Text is long ({text_length} chars), using recursive summarization"
             )
             async for chunk in self._recursive_summarize(
-                text, max_new_tokens, min_length, temperature, top_p, prompt
+                text, max_new_tokens, temperature, top_p, prompt
             ):
                 yield chunk
             return
@@ -372,7 +370,8 @@ class HFStreamingSummarizer:
             "streamer": streamer,
             "max_new_tokens": max_new_tokens,
             "do_sample": False,
-            # Note: temperature, top_p removed - incompatible with greedy decoding
+            "temperature": temperature,
+            "top_p": top_p,
             "pad_token_id": pad_id,
             "eos_token_id": eos_id,
         }
@@ -380,23 +379,15 @@ class HFStreamingSummarizer:
         gen_kwargs["num_return_sequences"] = 1
         gen_kwargs["num_beams"] = 1
         gen_kwargs["num_beam_groups"] = 1
-        # Set min_new_tokens: use provided min_length if available, else calculate
-        if min_length is not None:
-            gen_kwargs["min_new_tokens"] = min_length
-        else:
-            # Ensure minimum quality: at least 50 tokens, up to half of max (capped at 200)
-            gen_kwargs["min_new_tokens"] = max(
-                50, min(max_new_tokens // 2, 200)
-            )
-        # Note: length_penalty removed - only works with beam search (num_beams > 1)
-        # Using greedy decoding (num_beams=1) for speed
+        # Set conservative min_new_tokens to prevent rambling
+        gen_kwargs["min_new_tokens"] = max(
+            20, min(50, max_new_tokens // 4)
+        )  # floor ~20-50
+        # Use neutral length_penalty to avoid encouraging longer outputs
+        gen_kwargs["length_penalty"] = 1.0
         # Reduce premature EOS in some checkpoints (optional)
         gen_kwargs["no_repeat_ngram_size"] = 3
         gen_kwargs["repetition_penalty"] = 1.05
-        # CRITICAL: Override model config defaults that cause early stopping
-        gen_kwargs["forced_eos_token_id"] = None  # Disable forced EOS from model config
-        gen_kwargs["forced_bos_token_id"] = None  # Disable forced BOS for consistency
-        gen_kwargs["early_stopping"] = False  # Disable early stopping to respect min_new_tokens
         # Extra safety: remove any stray args that imply multiple sequences
         for k in ("num_beam_groups", "num_beams", "num_return_sequences"):
             # Reassert values in case something upstream re-injected them
@@ -406,14 +397,6 @@ class HFStreamingSummarizer:
         gen_kwargs.pop("diversity_penalty", None)
         gen_kwargs.pop("num_return_sequences_per_prompt", None)

-        # Log generation parameters for debugging
-        logger.info(
-            f"Generation params: max_new_tokens={gen_kwargs['max_new_tokens']}, "
-            f"min_new_tokens={gen_kwargs['min_new_tokens']}, "
-            f"early_stopping={gen_kwargs['early_stopping']}, "
-            f"forced_eos_token_id={gen_kwargs['forced_eos_token_id']}"
-        )
-
         generation_thread = threading.Thread(
             target=self.model.generate, kwargs=gen_kwargs, daemon=True
         )
@@ -463,7 +446,6 @@ class HFStreamingSummarizer:
         self,
         text: str,
         max_new_tokens: int,
-        min_length: int,
         temperature: float,
         top_p: float,
         prompt: str,
@@ -471,8 +453,6 @@ class HFStreamingSummarizer:
         """
         Recursively summarize long text by chunking and summarizing each chunk,
         then summarizing the summaries if there are multiple chunks.
-
-        Note: min_length is used for the final summary only, not for individual chunks.
         """
         try:
             # Split text into chunks of ~800-1000 tokens
@@ -487,9 +467,8 @@ class HFStreamingSummarizer:
             for i, chunk in enumerate(chunks):
                 logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")

-                # Use reasonable max_new_tokens for individual chunks
-                # Allow at least half of max, up to 200 tokens per chunk
-                chunk_max_tokens = min(max_new_tokens // 2, 200)
+                # Use smaller max_new_tokens for individual chunks
+                chunk_max_tokens = min(max_new_tokens, 80)

                 chunk_summary = ""
                 async for chunk_result in self._single_chunk_summarize(
@@ -506,14 +485,13 @@ class HFStreamingSummarizer:
                 logger.info("Creating final summary of summaries")
                 combined_summaries = "\n\n".join(chunk_summaries)

-                # Use original max_new_tokens and min_length for final summary
+                # Use original max_new_tokens for final summary
                 async for final_result in self._single_chunk_summarize(
                     combined_summaries,
                     max_new_tokens,
                     temperature,
                     top_p,
                     "Summarize the key points from these summaries:",
-                    min_length=min_length,
                 ):
                     yield final_result
             else:
@@ -539,14 +517,10 @@ class HFStreamingSummarizer:
         temperature: float,
         top_p: float,
         prompt: str,
-        min_length: int = None,
     ) -> AsyncGenerator[Dict[str, Any], None]:
         """
         Summarize a single chunk of text using the same logic as the main method
         but without the recursive check.
-
-        Args:
-            min_length: Optional minimum length for generation
         """
         if not self.model or not self.tokenizer:
             error_msg = (
@@ -655,40 +629,24 @@ class HFStreamingSummarizer:
             self.tokenizer, skip_prompt=True, skip_special_tokens=True
         )

-        # Set min_new_tokens: use provided min_length if available, else calculate
-        if min_length is not None:
-            calculated_min_tokens = min_length
-        else:
-            # Ensure minimum quality: at least 50 tokens, up to half of max (capped at 200)
-            calculated_min_tokens = max(50, min(max_new_tokens // 2, 200))
-
         gen_kwargs = {
             **inputs,
             "streamer": streamer,
             "max_new_tokens": max_new_tokens,
             "do_sample": False,
-            # Note: temperature, top_p, length_penalty removed - incompatible with greedy decoding
+            "temperature": temperature,
+            "top_p": top_p,
             "pad_token_id": pad_id,
             "eos_token_id": eos_id,
             "num_return_sequences": 1,
             "num_beams": 1,
             "num_beam_groups": 1,
-            "min_new_tokens": calculated_min_tokens,
+            "min_new_tokens": max(20, min(50, max_new_tokens // 4)),
+            "length_penalty": 1.0,
             "no_repeat_ngram_size": 3,
             "repetition_penalty": 1.05,
-            # CRITICAL: Override model config defaults that cause early stopping
-            "forced_eos_token_id": None,  # Disable forced EOS from model config
-            "forced_bos_token_id": None,  # Disable forced BOS for consistency
-            "early_stopping": False,  # Disable early stopping to respect min_new_tokens
         }

-        # Log generation parameters for debugging
-        logger.info(
-            f"Chunk generation params: max_new_tokens={gen_kwargs['max_new_tokens']}, "
-            f"min_new_tokens={gen_kwargs['min_new_tokens']}, "
-            f"early_stopping={gen_kwargs['early_stopping']}"
-        )
-
         generation_thread = threading.Thread(
             target=self.model.generate, kwargs=gen_kwargs, daemon=True
         )
tests/test_hf_streaming.py CHANGED
@@ -175,47 +175,3 @@ class TestHFStreamingServiceIntegration:
         result = await hf_streaming_service.check_health()
         # Should return False when transformers not available
         assert result is False
-
-
-class TestHFGenerationParameters:
-    """Test HF service generation parameters (min_length, length_penalty).
-
-    Note: These tests verify the method signature and parameter acceptance.
-    Full integration testing is done in test_v3_api.py.
-    """
-
-    def test_summarize_text_stream_accepts_min_length_parameter(self):
-        """Test that summarize_text_stream accepts min_length parameter."""
-        import inspect
-
-        service = HFStreamingSummarizer()
-        sig = inspect.signature(service.summarize_text_stream)
-
-        # Verify min_length parameter exists
-        assert "min_length" in sig.parameters
-        # Verify it has default None
-        assert sig.parameters["min_length"].default is None
-
-    def test_single_chunk_summarize_accepts_min_length_parameter(self):
-        """Test that _single_chunk_summarize accepts min_length parameter."""
-        import inspect
-
-        service = HFStreamingSummarizer()
-        sig = inspect.signature(service._single_chunk_summarize)
-
-        # Verify min_length parameter exists
-        assert "min_length" in sig.parameters
-        # Verify it has default None
-        assert sig.parameters["min_length"].default is None
-
-    def test_recursive_summarize_accepts_min_length_parameter(self):
-        """Test that _recursive_summarize accepts min_length parameter."""
-        import inspect
-
-        service = HFStreamingSummarizer()
-        sig = inspect.signature(service._recursive_summarize)
-
-        # Verify min_length parameter exists
-        assert "min_length" in sig.parameters
-        # Verify it's a required parameter (no default)
-        assert sig.parameters["min_length"].default == inspect.Parameter.empty
tests/test_v3_api.py CHANGED
@@ -269,312 +269,3 @@ def test_request_validation():
         json={"url": "https://example.com/test", "top_p": 1.5},  # Too high
     )
     assert response.status_code == 422
-
-
-def test_adaptive_tokens_short_article(client: TestClient):
-    """Test adaptive token calculation for short articles (~500 chars)."""
-    with patch(
-        "app.services.article_scraper.article_scraper_service.scrape_article"
-    ) as mock_scrape:
-        # Short article: 500 chars
-        mock_scrape.return_value = {
-            "text": "Short article content. " * 20,  # ~500 chars
-            "title": "Short Article",
-            "url": "https://example.com/short",
-            "method": "static",
-            "scrape_time_ms": 100.0,
-        }
-
-        captured_kwargs = {}
-
-        async def mock_stream(*args, **kwargs):
-            # Capture the kwargs to verify adaptive tokens
-            captured_kwargs.update(kwargs)
-            yield {"content": "Summary", "done": False, "tokens_used": 1}
-            yield {"content": "", "done": True, "tokens_used": 1}
-
-        with patch(
-            "app.services.hf_streaming_summarizer.hf_streaming_service.summarize_text_stream",
-            side_effect=mock_stream,
-        ):
-            response = client.post(
-                "/api/v3/scrape-and-summarize/stream",
-                json={"url": "https://example.com/short"},
-            )
-
-            assert response.status_code == 200
-            # For 500 chars, adaptive tokens should be at least 300 (the minimum)
-            assert captured_kwargs.get("max_new_tokens", 0) >= 300
-            # min_length should be 60% of max_new_tokens
-            expected_min = int(captured_kwargs["max_new_tokens"] * 0.6)
-            assert captured_kwargs.get("min_length", 0) == expected_min
-
-
-def test_adaptive_tokens_medium_article(client: TestClient):
-    """Test adaptive token calculation for medium articles (~2000 chars)."""
-    with patch(
-        "app.services.article_scraper.article_scraper_service.scrape_article"
-    ) as mock_scrape:
-        # Medium article: ~2000 chars -> should get 666 tokens (2000 // 3)
-        mock_scrape.return_value = {
-            "text": "Medium article content. " * 80,  # ~2000 chars
-            "title": "Medium Article",
-            "url": "https://example.com/medium",
-            "method": "static",
-            "scrape_time_ms": 200.0,
-        }
-
-        captured_kwargs = {}
-
-        async def mock_stream(*args, **kwargs):
-            captured_kwargs.update(kwargs)
-            yield {"content": "Summary", "done": False, "tokens_used": 1}
-            yield {"content": "", "done": True, "tokens_used": 1}
-
-        with patch(
-            "app.services.hf_streaming_summarizer.hf_streaming_service.summarize_text_stream",
-            side_effect=mock_stream,
-        ):
-            response = client.post(
-                "/api/v3/scrape-and-summarize/stream",
-                json={"url": "https://example.com/medium", "max_tokens": 512},
-            )
-
-            assert response.status_code == 200
-            # Now ignores client's max_tokens, uses adaptive calculation
-            # For 2000 chars: 2000 // 3 = 666 tokens (client's 512 is ignored)
-            assert 600 <= captured_kwargs.get("max_new_tokens", 0) <= 700
-            # min_length should be 60% of max_new_tokens
-            expected_min = int(captured_kwargs["max_new_tokens"] * 0.6)
-            assert captured_kwargs.get("min_length", 0) == expected_min
-
-
-def test_adaptive_tokens_long_article(client: TestClient):
-    """Test adaptive token calculation for long articles (~4000 chars)."""
-    with patch(
-        "app.services.article_scraper.article_scraper_service.scrape_article"
-    ) as mock_scrape:
-        # Long article: 4000 chars -> should be capped at 1024 tokens
-        mock_scrape.return_value = {
-            "text": "Long article content. " * 180,  # ~4000 chars
-            "title": "Long Article",
-            "url": "https://example.com/long",
-            "method": "static",
-            "scrape_time_ms": 300.0,
-        }
-
-        captured_kwargs = {}
-
-        async def mock_stream(*args, **kwargs):
-            captured_kwargs.update(kwargs)
-            yield {"content": "Summary", "done": False, "tokens_used": 1}
-            yield {"content": "", "done": True, "tokens_used": 1}
-
-        with patch(
-            "app.services.hf_streaming_summarizer.hf_streaming_service.summarize_text_stream",
-            side_effect=mock_stream,
-        ):
-            response = client.post(
-                "/api/v3/scrape-and-summarize/stream",
-                json={"url": "https://example.com/long"},
-            )
-
-            assert response.status_code == 200
-            # Should be capped at 1024
-            assert captured_kwargs.get("max_new_tokens", 0) <= 1024
-            # min_length should be 60% of max_new_tokens
-            expected_min = int(captured_kwargs["max_new_tokens"] * 0.6)
-            assert captured_kwargs.get("min_length", 0) == expected_min
-
-
-def test_user_max_tokens_ignored_for_quality(client: TestClient):
-    """Test that user-specified max_tokens is IGNORED to ensure quality summaries."""
-    with patch(
-        "app.services.article_scraper.article_scraper_service.scrape_article"
-    ) as mock_scrape:
-        # Long article that would normally get 1000 tokens
-        mock_scrape.return_value = {
-            "text": "Long article content. " * 180,  # ~4000 chars
-            "title": "Long Article",
-            "url": "https://example.com/long",
-            "method": "static",
-            "scrape_time_ms": 300.0,
-        }
-
-        captured_kwargs = {}
-
-        async def mock_stream(*args, **kwargs):
-            captured_kwargs.update(kwargs)
-            yield {"content": "Summary", "done": False, "tokens_used": 1}
-            yield {"content": "", "done": True, "tokens_used": 1}
-
-        with patch(
-            "app.services.hf_streaming_summarizer.hf_streaming_service.summarize_text_stream",
-            side_effect=mock_stream,
-        ):
-            # User requests only 400 tokens, but backend will ignore and use adaptive
-            response = client.post(
-                "/api/v3/scrape-and-summarize/stream",
-                json={"url": "https://example.com/long", "max_tokens": 400},
-            )
-
-            assert response.status_code == 200
-            # Ignores user's 400, uses adaptive (4000 // 3 = 1333, capped at 1024)
-            assert captured_kwargs.get("max_new_tokens", 0) == 1024
-            # min_length should still be 60% of the actual max used
-            expected_min = int(captured_kwargs["max_new_tokens"] * 0.6)
-            assert captured_kwargs.get("min_length", 0) == expected_min
-
-
-def test_default_max_tokens_updated():
-    """Test that default max_tokens is now 512 instead of 256."""
-    from app.api.v3.schemas import ScrapeAndSummarizeRequest
-
-    # Create request without specifying max_tokens
-    request = ScrapeAndSummarizeRequest(url="https://example.com/test")
-
-    # Default should be 512
-    assert request.max_tokens == 512
-
-
-def test_summary_completeness_no_cutoff(client: TestClient):
-    """Integration test: Verify summaries end properly without mid-sentence cutoffs."""
-    with patch(
-        "app.services.article_scraper.article_scraper_service.scrape_article"
-    ) as mock_scrape:
-        # Long realistic article
-        article_text = """
-        Artificial intelligence has revolutionized the technology industry in recent years.
-        Machine learning models are now capable of understanding complex patterns in data.
-        Deep learning techniques have enabled breakthrough achievements in computer vision.
-        Natural language processing has made significant strides in understanding human language.
-        Researchers continue to push the boundaries of what AI can accomplish.
-        The integration of AI into everyday applications has become increasingly common.
-        From virtual assistants to recommendation systems, AI is everywhere.
-        Companies are investing billions of dollars in AI research and development.
-        Ethical considerations around AI deployment are gaining more attention.
-        The future of AI holds both promise and challenges for society.
-        """ * 5  # Make it longer to test token limits
-
-        mock_scrape.return_value = {
-            "text": article_text,
-            "title": "AI Revolution Article",
-            "author": "Tech Writer",
-            "url": "https://example.com/ai-article",
-            "method": "static",
-            "scrape_time_ms": 250.0,
-        }
-
-        # Mock streaming that returns complete sentences
-        async def mock_stream(*args, **kwargs):
-            # Simulate a complete summary with proper ending
-            summary_parts = [
-                "Artificial",
-                " intelligence",
-                " has",
-                " transformed",
-                " technology",
-                ",",
-                " with",
-                " machine",
-                " learning",
-                " and",
-                " deep",
-                " learning",
-                " enabling",
-                " breakthroughs",
-                " in",
-                " computer",
-                " vision",
-                " and",
-                " natural",
-                " language",
-                " processing",
-                ".",  # Complete sentence
-            ]
-            for i, part in enumerate(summary_parts):
-                yield {"content": part, "done": False, "tokens_used": i + 1}
-            yield {"content": "", "done": True, "tokens_used": len(summary_parts)}
-
-        with patch(
-            "app.services.hf_streaming_summarizer.hf_streaming_service.summarize_text_stream",
-            side_effect=mock_stream,
-        ):
-            response = client.post(
-                "/api/v3/scrape-and-summarize/stream",
-                json={"url": "https://example.com/ai-article", "include_metadata": False},
-            )
-
-            assert response.status_code == 200
-
-            # Collect all content chunks
-            summary_text = ""
-            for line in response.text.split("\n"):
-                if line.startswith("data: "):
-                    try:
-                        event = json.loads(line[6:])
-                        if "content" in event and not event.get("done", False):
-                            summary_text += event["content"]
-                    except json.JSONDecodeError:
-                        pass
-
-            # Verify summary ends with proper punctuation
-            assert summary_text.strip(), "Summary should not be empty"
-            assert summary_text.strip()[-1] in [
-                ".",
-                "!",
-                "?",
-            ], f"Summary should end with punctuation, got: '{summary_text.strip()[-20:]}'"
-
-            # Verify summary doesn't end mid-word (no trailing incomplete words)
-            last_word = summary_text.strip().split()[-1] if summary_text.strip() else ""
-            # Last word should end with punctuation (complete sentence)
-            if last_word:
-                assert last_word[-1] in [
-                    ".",
-                    "!",
-                    "?",
-                    ",",
-                ], f"Last word should have punctuation: '{last_word}'"
-
-
-def test_text_mode_adaptive_tokens(client: TestClient):
-    """Test V3 text mode (no URL) with adaptive token calculation."""
-    # Long text input
-    long_text = "This is a test article. " * 100  # ~2500 chars
-
-    captured_kwargs = {}
-
-    async def mock_stream(*args, **kwargs):
-        captured_kwargs.update(kwargs)
-        yield {"content": "Summary of the test.", "done": False, "tokens_used": 5}
-        yield {"content": "", "done": True, "tokens_used": 5}
-
-    with patch(
-        "app.services.hf_streaming_summarizer.hf_streaming_service.summarize_text_stream",
-        side_effect=mock_stream,
-    ):
-        response = client.post(
-            "/api/v3/scrape-and-summarize/stream",
-            json={"text": long_text, "include_metadata": True},
-        )
-
-        assert response.status_code == 200
-
-        # Verify adaptive tokens were calculated for text mode too
-        assert captured_kwargs.get("max_new_tokens", 0) >= 300
-        assert captured_kwargs.get("min_length") is not None
-
-        # Parse events to verify metadata has text mode indicator
-        events = []
-        for line in response.text.split("\n"):
-            if line.startswith("data: "):
-                try:
-                    events.append(json.loads(line[6:]))
-                except json.JSONDecodeError:
-                    pass
-
-        metadata_events = [e for e in events if e.get("type") == "metadata"]
-        assert len(metadata_events) == 1
-        assert metadata_events[0]["data"]["input_type"] == "text"
-        assert metadata_events[0]["data"]["text_length"] == len(long_text)