ming Claude committed
Commit 6a1e8a3 · 1 Parent(s): 80ea70f

Revert adaptive token logic, restore client-controlled max_tokens

- Restore client max_tokens control in V3 API (was being ignored)
- Remove min_length parameter from HF streaming service
- Re-enable temperature and top_p sampling parameters
- Simplify token generation: min_new_tokens = max(20, min(50, max_tokens//4))
- Set neutral length_penalty = 1.0 to avoid length bias
- Reduce recursive chunk tokens from 200 to 80 for tighter summaries
- Remove adaptive token calculation tests and parameter validation tests
- Update default max_tokens back to 256 in schema

This simplifies the summarization logic by removing server-side token overrides
and allowing the client (Android app) to control summary length directly.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>
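
For reference, the simplified floor noted in the bullets above keeps min_new_tokens in a narrow 20-50 band no matter what max_tokens the client sends; a minimal illustrative sketch (the helper name is mine, not part of the diff):

    # Restored floor: the client's max_tokens caps the summary, the floor only
    # guards against degenerate one-line outputs.
    def min_new_tokens_floor(max_tokens: int) -> int:
        return max(20, min(50, max_tokens // 4))

    for requested in (64, 256, 512, 2048):
        print(requested, min_new_tokens_floor(requested))  # -> 20, 50, 50, 50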

app/api/v3/schemas.py CHANGED
@@ -22,10 +22,7 @@ class ScrapeAndSummarizeRequest(BaseModel):
         example="Your article text here...",
     )
     max_tokens: Optional[int] = Field(
-        default=512,
-        ge=1,
-        le=2048,
-        description="Maximum tokens in summary. Higher values allow more complete summaries for long articles.",
+        default=256, ge=1, le=2048, description="Maximum tokens in summary"
     )
     temperature: Optional[float] = Field(
         default=0.3,
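
With max_tokens restored as a client-controlled field (default 256, bounds 1-2048), the caller sets summary length per request. A minimal client sketch against the streaming endpoint; the base URL is an assumption, while the path and field names match those used elsewhere in this repo:

    import requests

    # Stream the SSE response; each "data:" line carries a JSON event
    resp = requests.post(
        "http://localhost:8000/api/v3/scrape-and-summarize/stream",  # host/port assumed
        json={"url": "https://example.com/article", "max_tokens": 256},
        stream=True,
    )
    for line in resp.iter_lines(decode_unicode=True):
        if line and line.startswith("data: "):
            print(line[6:])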
app/api/v3/scrape_summarize.py CHANGED
@@ -114,25 +114,6 @@ async def _stream_generator(text: str, payload, metadata: dict, request_id: str)
     metadata_event = {"type": "metadata", "data": metadata}
     yield f"data: {json.dumps(metadata_event)}\n\n"

-    # Calculate adaptive token limits based on text length
-    # Formula: scale tokens with input length, but enforce min/max bounds
-    # Note: Ignores client's max_tokens to ensure quality (client often sends too-low values)
-    text_length = len(text)
-    adaptive_max_tokens = min(
-        max(text_length // 3, 300),  # At least 300 tokens, scale ~33% of input chars
-        1024,  # Cap at 1024 to avoid excessive generation
-    )
-    # Calculate minimum length (60% of max) to encourage complete thoughts
-    adaptive_min_length = int(adaptive_max_tokens * 0.6)
-
-    logger.info(
-        f"[{request_id}] Adaptive token calculation: "
-        f"text_length={text_length}, "
-        f"requested_max={payload.max_tokens}, "
-        f"adaptive_max={adaptive_max_tokens}, "
-        f"adaptive_min={adaptive_min_length}"
-    )
-
     # Stream summarization chunks
     summarization_start = time.time()
     tokens_used = 0
@@ -140,8 +121,7 @@ async def _stream_generator(text: str, payload, metadata: dict, request_id: str)
     try:
         async for chunk in hf_streaming_service.summarize_text_stream(
             text=text,
-            max_new_tokens=adaptive_max_tokens,
-            min_length=adaptive_min_length,
+            max_new_tokens=payload.max_tokens,
             temperature=payload.temperature,
             top_p=payload.top_p,
             prompt=payload.prompt,
app/services/hf_streaming_summarizer.py CHANGED
@@ -167,7 +167,6 @@ class HFStreamingSummarizer:
         self,
         text: str,
         max_new_tokens: int = None,
-        min_length: int = None,
         temperature: float = None,
         top_p: float = None,
         prompt: str = "Summarize the key points concisely:",
@@ -178,7 +177,6 @@ class HFStreamingSummarizer:
         Args:
             text: Input text to summarize
             max_new_tokens: Maximum new tokens to generate
-            min_length: Minimum length of generated summary (encourages complete thoughts)
             temperature: Sampling temperature
             top_p: Nucleus sampling parameter
             prompt: System prompt for summarization
@@ -211,7 +209,7 @@ class HFStreamingSummarizer:
                 f"Text is long ({text_length} chars), using recursive summarization"
             )
             async for chunk in self._recursive_summarize(
-                text, max_new_tokens, min_length, temperature, top_p, prompt
+                text, max_new_tokens, temperature, top_p, prompt
             ):
                 yield chunk
             return
@@ -372,7 +370,8 @@ class HFStreamingSummarizer:
             "streamer": streamer,
             "max_new_tokens": max_new_tokens,
             "do_sample": False,
-            # Note: temperature, top_p removed - incompatible with greedy decoding
+            "temperature": temperature,
+            "top_p": top_p,
             "pad_token_id": pad_id,
             "eos_token_id": eos_id,
         }
@@ -380,23 +379,15 @@ class HFStreamingSummarizer:
         gen_kwargs["num_return_sequences"] = 1
         gen_kwargs["num_beams"] = 1
         gen_kwargs["num_beam_groups"] = 1
-        # Set min_new_tokens: use provided min_length if available, else calculate
-        if min_length is not None:
-            gen_kwargs["min_new_tokens"] = min_length
-        else:
-            # Ensure minimum quality: at least 50 tokens, up to half of max (capped at 200)
-            gen_kwargs["min_new_tokens"] = max(
-                50, min(max_new_tokens // 2, 200)
-            )
-        # Note: length_penalty removed - only works with beam search (num_beams > 1)
-        # Using greedy decoding (num_beams=1) for speed
+        # Set conservative min_new_tokens to prevent rambling
+        gen_kwargs["min_new_tokens"] = max(
+            20, min(50, max_new_tokens // 4)
+        )  # floor ~20-50
+        # Use neutral length_penalty to avoid encouraging longer outputs
+        gen_kwargs["length_penalty"] = 1.0
         # Reduce premature EOS in some checkpoints (optional)
         gen_kwargs["no_repeat_ngram_size"] = 3
         gen_kwargs["repetition_penalty"] = 1.05
-        # CRITICAL: Override model config defaults that cause early stopping
-        gen_kwargs["forced_eos_token_id"] = None  # Disable forced EOS from model config
-        gen_kwargs["forced_bos_token_id"] = None  # Disable forced BOS for consistency
-        gen_kwargs["early_stopping"] = False  # Disable early stopping to respect min_new_tokens
         # Extra safety: remove any stray args that imply multiple sequences
         for k in ("num_beam_groups", "num_beams", "num_return_sequences"):
             # Reassert values in case something upstream re-injected them
@@ -406,14 +397,6 @@ class HFStreamingSummarizer:
         gen_kwargs.pop("diversity_penalty", None)
         gen_kwargs.pop("num_return_sequences_per_prompt", None)

-        # Log generation parameters for debugging
-        logger.info(
-            f"Generation params: max_new_tokens={gen_kwargs['max_new_tokens']}, "
-            f"min_new_tokens={gen_kwargs['min_new_tokens']}, "
-            f"early_stopping={gen_kwargs['early_stopping']}, "
-            f"forced_eos_token_id={gen_kwargs['forced_eos_token_id']}"
-        )
-
         generation_thread = threading.Thread(
             target=self.model.generate, kwargs=gen_kwargs, daemon=True
         )
@@ -463,7 +446,6 @@ class HFStreamingSummarizer:
         self,
         text: str,
         max_new_tokens: int,
-        min_length: int,
         temperature: float,
         top_p: float,
         prompt: str,
@@ -471,8 +453,6 @@ class HFStreamingSummarizer:
         """
         Recursively summarize long text by chunking and summarizing each chunk,
         then summarizing the summaries if there are multiple chunks.
-
-        Note: min_length is used for the final summary only, not for individual chunks.
         """
         try:
             # Split text into chunks of ~800-1000 tokens
@@ -487,9 +467,8 @@ class HFStreamingSummarizer:
             for i, chunk in enumerate(chunks):
                 logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")

-                # Use reasonable max_new_tokens for individual chunks
-                # Allow at least half of max, up to 200 tokens per chunk
-                chunk_max_tokens = min(max_new_tokens // 2, 200)
+                # Use smaller max_new_tokens for individual chunks
+                chunk_max_tokens = min(max_new_tokens, 80)

                 chunk_summary = ""
                 async for chunk_result in self._single_chunk_summarize(
@@ -506,14 +485,13 @@ class HFStreamingSummarizer:
                 logger.info("Creating final summary of summaries")
                 combined_summaries = "\n\n".join(chunk_summaries)

-                # Use original max_new_tokens and min_length for final summary
+                # Use original max_new_tokens for final summary
                 async for final_result in self._single_chunk_summarize(
                     combined_summaries,
                     max_new_tokens,
                     temperature,
                     top_p,
                     "Summarize the key points from these summaries:",
-                    min_length=min_length,
                 ):
                     yield final_result
             else:
@@ -539,14 +517,10 @@ class HFStreamingSummarizer:
         temperature: float,
         top_p: float,
         prompt: str,
-        min_length: int = None,
     ) -> AsyncGenerator[Dict[str, Any], None]:
         """
         Summarize a single chunk of text using the same logic as the main method
         but without the recursive check.
-
-        Args:
-            min_length: Optional minimum length for generation
         """
         if not self.model or not self.tokenizer:
             error_msg = (
@@ -655,40 +629,24 @@ class HFStreamingSummarizer:
             self.tokenizer, skip_prompt=True, skip_special_tokens=True
         )

-        # Set min_new_tokens: use provided min_length if available, else calculate
-        if min_length is not None:
-            calculated_min_tokens = min_length
-        else:
-            # Ensure minimum quality: at least 50 tokens, up to half of max (capped at 200)
-            calculated_min_tokens = max(50, min(max_new_tokens // 2, 200))
-
         gen_kwargs = {
             **inputs,
             "streamer": streamer,
             "max_new_tokens": max_new_tokens,
             "do_sample": False,
-            # Note: temperature, top_p, length_penalty removed - incompatible with greedy decoding
+            "temperature": temperature,
+            "top_p": top_p,
             "pad_token_id": pad_id,
             "eos_token_id": eos_id,
             "num_return_sequences": 1,
             "num_beams": 1,
             "num_beam_groups": 1,
-            "min_new_tokens": calculated_min_tokens,
+            "min_new_tokens": max(20, min(50, max_new_tokens // 4)),
+            "length_penalty": 1.0,
             "no_repeat_ngram_size": 3,
             "repetition_penalty": 1.05,
-            # CRITICAL: Override model config defaults that cause early stopping
-            "forced_eos_token_id": None,  # Disable forced EOS from model config
-            "forced_bos_token_id": None,  # Disable forced BOS for consistency
-            "early_stopping": False,  # Disable early stopping to respect min_new_tokens
         }

-        # Log generation parameters for debugging
-        logger.info(
-            f"Chunk generation params: max_new_tokens={gen_kwargs['max_new_tokens']}, "
-            f"min_new_tokens={gen_kwargs['min_new_tokens']}, "
-            f"early_stopping={gen_kwargs['early_stopping']}"
-        )
-
         generation_thread = threading.Thread(
             target=self.model.generate, kwargs=gen_kwargs, daemon=True
         )
tests/test_hf_streaming.py CHANGED
@@ -175,47 +175,3 @@ class TestHFStreamingServiceIntegration:
         result = await hf_streaming_service.check_health()
         # Should return False when transformers not available
         assert result is False
-
-
-class TestHFGenerationParameters:
-    """Test HF service generation parameters (min_length, length_penalty).
-
-    Note: These tests verify the method signature and parameter acceptance.
-    Full integration testing is done in test_v3_api.py.
-    """
-
-    def test_summarize_text_stream_accepts_min_length_parameter(self):
-        """Test that summarize_text_stream accepts min_length parameter."""
-        import inspect
-
-        service = HFStreamingSummarizer()
-        sig = inspect.signature(service.summarize_text_stream)
-
-        # Verify min_length parameter exists
-        assert "min_length" in sig.parameters
-        # Verify it has default None
-        assert sig.parameters["min_length"].default is None
-
-    def test_single_chunk_summarize_accepts_min_length_parameter(self):
-        """Test that _single_chunk_summarize accepts min_length parameter."""
-        import inspect
-
-        service = HFStreamingSummarizer()
-        sig = inspect.signature(service._single_chunk_summarize)
-
-        # Verify min_length parameter exists
-        assert "min_length" in sig.parameters
-        # Verify it has default None
-        assert sig.parameters["min_length"].default is None
-
-    def test_recursive_summarize_accepts_min_length_parameter(self):
-        """Test that _recursive_summarize accepts min_length parameter."""
-        import inspect
-
-        service = HFStreamingSummarizer()
-        sig = inspect.signature(service._recursive_summarize)
-
-        # Verify min_length parameter exists
-        assert "min_length" in sig.parameters
-        # Verify it's a required parameter (no default)
-        assert sig.parameters["min_length"].default == inspect.Parameter.empty
tests/test_v3_api.py CHANGED
@@ -269,312 +269,3 @@ def test_request_validation():
         json={"url": "https://example.com/test", "top_p": 1.5},  # Too high
     )
     assert response.status_code == 422
-
-
-def test_adaptive_tokens_short_article(client: TestClient):
-    """Test adaptive token calculation for short articles (~500 chars)."""
-    with patch(
-        "app.services.article_scraper.article_scraper_service.scrape_article"
-    ) as mock_scrape:
-        # Short article: 500 chars
-        mock_scrape.return_value = {
-            "text": "Short article content. " * 20,  # ~500 chars
-            "title": "Short Article",
-            "url": "https://example.com/short",
-            "method": "static",
-            "scrape_time_ms": 100.0,
-        }
-
-        captured_kwargs = {}
-
-        async def mock_stream(*args, **kwargs):
-            # Capture the kwargs to verify adaptive tokens
-            captured_kwargs.update(kwargs)
-            yield {"content": "Summary", "done": False, "tokens_used": 1}
-            yield {"content": "", "done": True, "tokens_used": 1}
-
-        with patch(
-            "app.services.hf_streaming_summarizer.hf_streaming_service.summarize_text_stream",
-            side_effect=mock_stream,
-        ):
-            response = client.post(
-                "/api/v3/scrape-and-summarize/stream",
-                json={"url": "https://example.com/short"},
-            )
-
-            assert response.status_code == 200
-            # For 500 chars, adaptive tokens should be at least 300 (the minimum)
-            assert captured_kwargs.get("max_new_tokens", 0) >= 300
-            # min_length should be 60% of max_new_tokens
-            expected_min = int(captured_kwargs["max_new_tokens"] * 0.6)
-            assert captured_kwargs.get("min_length", 0) == expected_min
-
-
-def test_adaptive_tokens_medium_article(client: TestClient):
-    """Test adaptive token calculation for medium articles (~2000 chars)."""
-    with patch(
-        "app.services.article_scraper.article_scraper_service.scrape_article"
-    ) as mock_scrape:
-        # Medium article: ~2000 chars -> should get 666 tokens (2000 // 3)
-        mock_scrape.return_value = {
-            "text": "Medium article content. " * 80,  # ~2000 chars
-            "title": "Medium Article",
-            "url": "https://example.com/medium",
-            "method": "static",
-            "scrape_time_ms": 200.0,
-        }
-
-        captured_kwargs = {}
-
-        async def mock_stream(*args, **kwargs):
-            captured_kwargs.update(kwargs)
-            yield {"content": "Summary", "done": False, "tokens_used": 1}
-            yield {"content": "", "done": True, "tokens_used": 1}
-
-        with patch(
-            "app.services.hf_streaming_summarizer.hf_streaming_service.summarize_text_stream",
-            side_effect=mock_stream,
-        ):
-            response = client.post(
-                "/api/v3/scrape-and-summarize/stream",
-                json={"url": "https://example.com/medium", "max_tokens": 512},
-            )
-
-            assert response.status_code == 200
-            # Now ignores client's max_tokens, uses adaptive calculation
-            # For 2000 chars: 2000 // 3 = 666 tokens (client's 512 is ignored)
-            assert 600 <= captured_kwargs.get("max_new_tokens", 0) <= 700
-            # min_length should be 60% of max_new_tokens
-            expected_min = int(captured_kwargs["max_new_tokens"] * 0.6)
-            assert captured_kwargs.get("min_length", 0) == expected_min
-
-
-def test_adaptive_tokens_long_article(client: TestClient):
-    """Test adaptive token calculation for long articles (~4000 chars)."""
-    with patch(
-        "app.services.article_scraper.article_scraper_service.scrape_article"
-    ) as mock_scrape:
-        # Long article: 4000 chars -> should be capped at 1024 tokens
-        mock_scrape.return_value = {
-            "text": "Long article content. " * 180,  # ~4000 chars
-            "title": "Long Article",
-            "url": "https://example.com/long",
-            "method": "static",
-            "scrape_time_ms": 300.0,
-        }
-
-        captured_kwargs = {}
-
-        async def mock_stream(*args, **kwargs):
-            captured_kwargs.update(kwargs)
-            yield {"content": "Summary", "done": False, "tokens_used": 1}
-            yield {"content": "", "done": True, "tokens_used": 1}
-
-        with patch(
-            "app.services.hf_streaming_summarizer.hf_streaming_service.summarize_text_stream",
-            side_effect=mock_stream,
-        ):
-            response = client.post(
-                "/api/v3/scrape-and-summarize/stream",
-                json={"url": "https://example.com/long"},
-            )
-
-            assert response.status_code == 200
-            # Should be capped at 1024
-            assert captured_kwargs.get("max_new_tokens", 0) <= 1024
-            # min_length should be 60% of max_new_tokens
-            expected_min = int(captured_kwargs["max_new_tokens"] * 0.6)
-            assert captured_kwargs.get("min_length", 0) == expected_min
-
-
-def test_user_max_tokens_ignored_for_quality(client: TestClient):
-    """Test that user-specified max_tokens is IGNORED to ensure quality summaries."""
-    with patch(
-        "app.services.article_scraper.article_scraper_service.scrape_article"
-    ) as mock_scrape:
-        # Long article that would normally get 1000 tokens
-        mock_scrape.return_value = {
-            "text": "Long article content. " * 180,  # ~4000 chars
-            "title": "Long Article",
-            "url": "https://example.com/long",
-            "method": "static",
-            "scrape_time_ms": 300.0,
-        }
-
-        captured_kwargs = {}
-
-        async def mock_stream(*args, **kwargs):
-            captured_kwargs.update(kwargs)
-            yield {"content": "Summary", "done": False, "tokens_used": 1}
-            yield {"content": "", "done": True, "tokens_used": 1}
-
-        with patch(
-            "app.services.hf_streaming_summarizer.hf_streaming_service.summarize_text_stream",
-            side_effect=mock_stream,
-        ):
-            # User requests only 400 tokens, but backend will ignore and use adaptive
-            response = client.post(
-                "/api/v3/scrape-and-summarize/stream",
-                json={"url": "https://example.com/long", "max_tokens": 400},
-            )
-
-            assert response.status_code == 200
-            # Ignores user's 400, uses adaptive (4000 // 3 = 1333, capped at 1024)
-            assert captured_kwargs.get("max_new_tokens", 0) == 1024
-            # min_length should still be 60% of the actual max used
-            expected_min = int(captured_kwargs["max_new_tokens"] * 0.6)
-            assert captured_kwargs.get("min_length", 0) == expected_min
-
-
-def test_default_max_tokens_updated():
-    """Test that default max_tokens is now 512 instead of 256."""
-    from app.api.v3.schemas import ScrapeAndSummarizeRequest
-
-    # Create request without specifying max_tokens
-    request = ScrapeAndSummarizeRequest(url="https://example.com/test")
-
-    # Default should be 512
-    assert request.max_tokens == 512
-
-
-def test_summary_completeness_no_cutoff(client: TestClient):
-    """Integration test: Verify summaries end properly without mid-sentence cutoffs."""
-    with patch(
-        "app.services.article_scraper.article_scraper_service.scrape_article"
-    ) as mock_scrape:
-        # Long realistic article
-        article_text = """
-        Artificial intelligence has revolutionized the technology industry in recent years.
-        Machine learning models are now capable of understanding complex patterns in data.
-        Deep learning techniques have enabled breakthrough achievements in computer vision.
-        Natural language processing has made significant strides in understanding human language.
-        Researchers continue to push the boundaries of what AI can accomplish.
-        The integration of AI into everyday applications has become increasingly common.
-        From virtual assistants to recommendation systems, AI is everywhere.
-        Companies are investing billions of dollars in AI research and development.
-        Ethical considerations around AI deployment are gaining more attention.
-        The future of AI holds both promise and challenges for society.
-        """ * 5  # Make it longer to test token limits
-
-        mock_scrape.return_value = {
-            "text": article_text,
-            "title": "AI Revolution Article",
-            "author": "Tech Writer",
-            "url": "https://example.com/ai-article",
-            "method": "static",
-            "scrape_time_ms": 250.0,
-        }
-
-        # Mock streaming that returns complete sentences
-        async def mock_stream(*args, **kwargs):
-            # Simulate a complete summary with proper ending
-            summary_parts = [
-                "Artificial",
-                " intelligence",
-                " has",
-                " transformed",
-                " technology",
-                ",",
-                " with",
-                " machine",
-                " learning",
-                " and",
-                " deep",
-                " learning",
-                " enabling",
-                " breakthroughs",
-                " in",
-                " computer",
-                " vision",
-                " and",
-                " natural",
-                " language",
-                " processing",
-                ".",  # Complete sentence
-            ]
-            for i, part in enumerate(summary_parts):
-                yield {"content": part, "done": False, "tokens_used": i + 1}
-            yield {"content": "", "done": True, "tokens_used": len(summary_parts)}
-
-        with patch(
-            "app.services.hf_streaming_summarizer.hf_streaming_service.summarize_text_stream",
-            side_effect=mock_stream,
-        ):
-            response = client.post(
-                "/api/v3/scrape-and-summarize/stream",
-                json={"url": "https://example.com/ai-article", "include_metadata": False},
-            )
-
-            assert response.status_code == 200
-
-            # Collect all content chunks
-            summary_text = ""
-            for line in response.text.split("\n"):
-                if line.startswith("data: "):
-                    try:
-                        event = json.loads(line[6:])
-                        if "content" in event and not event.get("done", False):
-                            summary_text += event["content"]
-                    except json.JSONDecodeError:
-                        pass
-
-            # Verify summary ends with proper punctuation
-            assert summary_text.strip(), "Summary should not be empty"
-            assert summary_text.strip()[-1] in [
-                ".",
-                "!",
-                "?",
-            ], f"Summary should end with punctuation, got: '{summary_text.strip()[-20:]}'"
-
-            # Verify summary doesn't end mid-word (no trailing incomplete words)
-            last_word = summary_text.strip().split()[-1] if summary_text.strip() else ""
-            # Last word should end with punctuation (complete sentence)
-            if last_word:
-                assert last_word[-1] in [
-                    ".",
-                    "!",
-                    "?",
-                    ",",
-                ], f"Last word should have punctuation: '{last_word}'"
-
-
-def test_text_mode_adaptive_tokens(client: TestClient):
-    """Test V3 text mode (no URL) with adaptive token calculation."""
-    # Long text input
-    long_text = "This is a test article. " * 100  # ~2500 chars
-
-    captured_kwargs = {}
-
-    async def mock_stream(*args, **kwargs):
-        captured_kwargs.update(kwargs)
-        yield {"content": "Summary of the test.", "done": False, "tokens_used": 5}
-        yield {"content": "", "done": True, "tokens_used": 5}
-
-    with patch(
-        "app.services.hf_streaming_summarizer.hf_streaming_service.summarize_text_stream",
-        side_effect=mock_stream,
-    ):
-        response = client.post(
-            "/api/v3/scrape-and-summarize/stream",
-            json={"text": long_text, "include_metadata": True},
-        )
-
-        assert response.status_code == 200
-
-        # Verify adaptive tokens were calculated for text mode too
-        assert captured_kwargs.get("max_new_tokens", 0) >= 300
-        assert captured_kwargs.get("min_length") is not None
-
-        # Parse events to verify metadata has text mode indicator
-        events = []
-        for line in response.text.split("\n"):
-            if line.startswith("data: "):
-                try:
-                    events.append(json.loads(line[6:]))
-                except json.JSONDecodeError:
-                    pass
-
-        metadata_events = [e for e in events if e.get("type") == "metadata"]
-        assert len(metadata_events) == 1
-        assert metadata_events[0]["data"]["input_type"] == "text"
-        assert metadata_events[0]["data"]["text_length"] == len(long_text)