Commit 80ea70f · 1 Parent(s): 6c96c54

fix: Backend ignores client max_tokens to verify Android app hypothesis


This is a diagnostic commit to verify that the Android app's max_tokens=256
is causing early stopping, NOT a backend issue.

Changes Made:

1. **Remove Client max_tokens Constraint** (app/api/v3/scrape_summarize.py:121-123)
- BEFORE: min(max(text_length // 3, 300), payload.max_tokens, 1024)
- AFTER: min(max(text_length // 3, 300), 1024)
- Backend now IGNORES the client's max_tokens value
- Always uses the adaptive calculation for quality (see the sketch after this list)

2. **Remove Incompatible Generation Parameters** (app/services/hf_streaming_summarizer.py)
- Removed temperature, top_p from gen_kwargs (lines ~375, ~670)
- Removed length_penalty from gen_kwargs (line ~391)
- temperature/top_p are ignored with do_sample=False (greedy decoding), and length_penalty only applies to beam search (num_beams > 1); see the sketch after this list
- Eliminates the transformers warning:
"generation flags are not valid and may be ignored: ['temperature', 'top_p', 'length_penalty']"

3. **Update Tests** (tests/test_v3_api.py)
- test_adaptive_tokens_medium_article: Now expects 600-700 tokens (not 450-512)
- test_user_max_tokens_ignored_for_quality: renamed from test_user_max_tokens_respected; now expects the client's max_tokens to be ignored
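
For reference, a minimal sketch of what the backend does after both changes (the helper names below are illustrative, not the actual function names in the repo; the real gen_kwargs in hf_streaming_summarizer.py carries additional overrides shown in the diffs further down):

```python
def adaptive_token_budget(text: str) -> tuple[int, int]:
    """Adaptive max/min token budget; the client's max_tokens no longer participates."""
    adaptive_max = min(
        max(len(text) // 3, 300),  # at least 300 tokens, scale ~33% of input chars
        1024,                      # hard cap to avoid excessive generation
    )
    adaptive_min = int(adaptive_max * 0.6)  # 60% of max, to encourage complete thoughts
    return adaptive_max, adaptive_min


def greedy_gen_kwargs(max_new_tokens: int, min_new_tokens: int) -> dict:
    """Generation kwargs for greedy decoding. temperature, top_p and length_penalty
    are omitted because transformers ignores (and warns about) them when
    do_sample=False and num_beams=1."""
    return {
        "max_new_tokens": max_new_tokens,
        "min_new_tokens": min_new_tokens,
        "do_sample": False,
        "num_beams": 1,
        "no_repeat_ngram_size": 3,
        "repetition_penalty": 1.05,
    }
```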

Expected Impact for 4237-char Article (from logs):
- BEFORE: adaptive_max=256 (constrained by client)
- AFTER: adaptive_max=1024 (4237 // 3 = 1412, capped at 1024)
- Improvement: 4x more tokens!

Logs Will Show:
```
text_length=4237, requested_max=256, adaptive_max=1024, adaptive_min=614
```
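
Those values follow directly from the adaptive formula (a quick check, assuming the 60% minimum rule shown in the diff and tests below; requested_max is the client's value, which is logged but no longer used in the calculation):

```python
text_length = 4237
adaptive_max = min(max(text_length // 3, 300), 1024)  # 4237 // 3 = 1412 -> capped at 1024
adaptive_min = int(adaptive_max * 0.6)                # int(1024 * 0.6) = 614
print(f"text_length={text_length}, adaptive_max={adaptive_max}, adaptive_min={adaptive_min}")
```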

No More Warnings:
The transformers warning about invalid generation flags will disappear.

Test Results:
- All V3 tests passing (16/16) ✅
- Tests updated to reflect new behavior

Purpose of This Commit:
This is a DIAGNOSTIC commit to test the hypothesis that the Android app's
hardcoded max_tokens=256 is the root cause of early stopping. If summaries
improve after this deployment, it confirms the Android app needs updating.

Next Steps:
1. Deploy to HuggingFace Spaces
2. Test with Android app (same URL)
3. If summaries are longer and more complete → Android app is the issue
4. Then update Android app to send max_tokens=1024 or omit it entirely
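
For step 4, the simplest client-side change is to drop max_tokens from the request body entirely. A sketch in Python (the Space hostname and streaming handling are placeholders, not part of this commit; the Android app would do the equivalent in its own HTTP client):

```python
import requests

resp = requests.post(
    "https://<your-space>.hf.space/api/v3/scrape-and-summarize/stream",  # placeholder host
    json={"url": "https://example.com/article"},  # no max_tokens: backend chooses adaptively
    stream=True,
    timeout=120,
)
for chunk in resp.iter_content(chunk_size=None):
    print(chunk.decode("utf-8", errors="replace"), end="")
```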

Related commits:
- 5e83010: Initial adaptive calculation
- 6b2de93: Enhanced token allocation
- 6c96c54: Model config overrides

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>

app/api/v3/scrape_summarize.py CHANGED

```diff
@@ -116,10 +116,10 @@ async def _stream_generator(text: str, payload, metadata: dict, request_id: str)
 
     # Calculate adaptive token limits based on text length
     # Formula: scale tokens with input length, but enforce min/max bounds
+    # Note: Ignores client's max_tokens to ensure quality (client often sends too-low values)
     text_length = len(text)
     adaptive_max_tokens = min(
         max(text_length // 3, 300),  # At least 300 tokens, scale ~33% of input chars
-        payload.max_tokens,  # Respect user's max if specified
         1024,  # Cap at 1024 to avoid excessive generation
     )
     # Calculate minimum length (60% of max) to encourage complete thoughts
```
app/services/hf_streaming_summarizer.py CHANGED

```diff
@@ -372,8 +372,7 @@ class HFStreamingSummarizer:
             "streamer": streamer,
             "max_new_tokens": max_new_tokens,
             "do_sample": False,
-            "temperature": temperature,
-            "top_p": top_p,
+            # Note: temperature, top_p removed - incompatible with greedy decoding
             "pad_token_id": pad_id,
             "eos_token_id": eos_id,
         }
@@ -389,8 +388,8 @@ class HFStreamingSummarizer:
         gen_kwargs["min_new_tokens"] = max(
             50, min(max_new_tokens // 2, 200)
         )
-        # Use slightly positive length_penalty to favor complete sentences
-        gen_kwargs["length_penalty"] = 1.2
+        # Note: length_penalty removed - only works with beam search (num_beams > 1)
+        # Using greedy decoding (num_beams=1) for speed
         # Reduce premature EOS in some checkpoints (optional)
         gen_kwargs["no_repeat_ngram_size"] = 3
         gen_kwargs["repetition_penalty"] = 1.05
@@ -668,15 +667,13 @@ class HFStreamingSummarizer:
             "streamer": streamer,
             "max_new_tokens": max_new_tokens,
             "do_sample": False,
-            "temperature": temperature,
-            "top_p": top_p,
+            # Note: temperature, top_p, length_penalty removed - incompatible with greedy decoding
             "pad_token_id": pad_id,
             "eos_token_id": eos_id,
             "num_return_sequences": 1,
             "num_beams": 1,
             "num_beam_groups": 1,
             "min_new_tokens": calculated_min_tokens,
-            "length_penalty": 1.2,
             "no_repeat_ngram_size": 3,
             "repetition_penalty": 1.05,
             # CRITICAL: Override model config defaults that cause early stopping
```
tests/test_v3_api.py CHANGED

```diff
@@ -315,7 +315,7 @@ def test_adaptive_tokens_medium_article(client: TestClient):
     with patch(
         "app.services.article_scraper.article_scraper_service.scrape_article"
     ) as mock_scrape:
-        # Medium article: ~2000 chars -> should get 500 tokens (2000 // 4)
+        # Medium article: ~2000 chars -> should get 666 tokens (2000 // 3)
         mock_scrape.return_value = {
             "text": "Medium article content. " * 80,  # ~2000 chars
             "title": "Medium Article",
@@ -341,8 +341,9 @@ def test_adaptive_tokens_medium_article(client: TestClient):
         )
 
         assert response.status_code == 200
-        # For 2000 chars with default max_tokens=512, should get ~500 tokens
-        assert 450 <= captured_kwargs.get("max_new_tokens", 0) <= 512
+        # Now ignores client's max_tokens, uses adaptive calculation
+        # For 2000 chars: 2000 // 3 = 666 tokens (client's 512 is ignored)
+        assert 600 <= captured_kwargs.get("max_new_tokens", 0) <= 700
         # min_length should be 60% of max_new_tokens
         expected_min = int(captured_kwargs["max_new_tokens"] * 0.6)
         assert captured_kwargs.get("min_length", 0) == expected_min
@@ -386,8 +387,8 @@ def test_adaptive_tokens_long_article(client: TestClient):
         assert captured_kwargs.get("min_length", 0) == expected_min
 
 
-def test_user_max_tokens_respected(client: TestClient):
-    """Test that user-specified max_tokens is respected when lower than adaptive."""
+def test_user_max_tokens_ignored_for_quality(client: TestClient):
+    """Test that user-specified max_tokens is IGNORED to ensure quality summaries."""
     with patch(
         "app.services.article_scraper.article_scraper_service.scrape_article"
     ) as mock_scrape:
@@ -411,15 +412,15 @@ def test_user_max_tokens_respected(client: TestClient):
         "app.services.hf_streaming_summarizer.hf_streaming_service.summarize_text_stream",
         side_effect=mock_stream,
     ):
-        # User requests only 400 tokens
+        # User requests only 400 tokens, but backend will ignore and use adaptive
         response = client.post(
             "/api/v3/scrape-and-summarize/stream",
             json={"url": "https://example.com/long", "max_tokens": 400},
         )
 
         assert response.status_code == 200
-        # Should respect user's limit of 400
-        assert captured_kwargs.get("max_new_tokens", 0) <= 400
+        # Ignores user's 400, uses adaptive (4000 // 3 = 1333, capped at 1024)
+        assert captured_kwargs.get("max_new_tokens", 0) == 1024
         # min_length should still be 60% of the actual max used
         expected_min = int(captured_kwargs["max_new_tokens"] * 0.6)
         assert captured_kwargs.get("min_length", 0) == expected_min
```