ming committed
Commit 9884884 · 1 Parent(s): 698636a

Improve V2 summarization: adaptive tokens, recursive summarization, better defaults


- Add temperature and top_p parameters to SummarizeRequest schema (defaults: 0.3, 0.9)
- Implement adaptive max_new_tokens logic: 60-100 for short texts, proportional for long texts
- Add recursive summarization for texts >1500 chars with chunking and summary-of-summaries
- Fix generation parameters to prevent rambling (reduce min_new_tokens, neutral length_penalty)
- Update default prompt to be more concise
- Add comprehensive unit tests for all improvements (40 tests)
- Fix V2 API to generate concise summaries instead of rambling output

app/api/v1/schemas.py CHANGED
@@ -10,8 +10,10 @@ class SummarizeRequest(BaseModel):
 
     text: str = Field(..., min_length=1, max_length=32000, description="Text to summarize")
     max_tokens: Optional[int] = Field(default=256, ge=1, le=2048, description="Maximum tokens for summary")
+    temperature: Optional[float] = Field(default=0.3, ge=0.0, le=2.0, description="Sampling temperature for generation")
+    top_p: Optional[float] = Field(default=0.9, ge=0.0, le=1.0, description="Nucleus sampling parameter")
     prompt: Optional[str] = Field(
-        default="Provide a comprehensive summary of the following text, including main arguments, key findings, important details, and specific examples. Structure your response clearly:",
+        default="Summarize the key points concisely:",
         max_length=500,
         description="Custom prompt for summarization"
     )
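
For reference, a minimal sketch of how the updated schema behaves from a caller's perspective (assuming Pydantic v2, which provides model_fields_set; the field names and defaults are the ones in the diff above):

    from app.api.v1.schemas import SummarizeRequest

    # Defaults apply when the caller omits the new fields
    req = SummarizeRequest(text="Some article text...")
    assert req.temperature == 0.3 and req.top_p == 0.9
    assert req.prompt == "Summarize the key points concisely:"

    # Pydantic records explicitly supplied fields in model_fields_set;
    # the V2 endpoint below uses this to decide whether the caller's
    # max_tokens should override the adaptive calculation
    req2 = SummarizeRequest(text="Some article text...", max_tokens=512)
    assert "max_tokens" in req2.model_fields_set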
app/api/v2/summarize.py CHANGED
@@ -27,12 +27,28 @@ async def summarize_stream(payload: SummarizeRequest):
 async def _stream_generator(payload: SummarizeRequest):
     """Generator function for streaming SSE responses using HuggingFace."""
     try:
+        # Calculate adaptive max_new_tokens based on text length
+        text_length = len(payload.text)
+        if text_length < 1500:
+            # Short texts: use 60-100 tokens
+            adaptive_max_tokens = min(100, max(60, text_length // 15))
+        else:
+            # Longer texts: scale proportionally but cap appropriately
+            adaptive_max_tokens = min(400, max(100, text_length // 20))
+
+        # Use the adaptive calculation by default, but allow user override
+        # Check if max_tokens was explicitly provided (not just the default 256)
+        if hasattr(payload, 'model_fields_set') and 'max_tokens' in payload.model_fields_set:
+            max_new_tokens = payload.max_tokens
+        else:
+            max_new_tokens = adaptive_max_tokens
+
         async for chunk in hf_streaming_service.summarize_text_stream(
             text=payload.text,
-            max_new_tokens=payload.max_tokens or 128,  # Map max_tokens to max_new_tokens
-            temperature=0.7,  # Use default temperature
-            top_p=0.95,  # Use default top_p
-            prompt=payload.prompt or "Provide a comprehensive summary of the following text, including main arguments, key findings, important details, and specific examples. Structure your response clearly:",
+            max_new_tokens=max_new_tokens,
+            temperature=payload.temperature,  # Use user-provided temperature
+            top_p=payload.top_p,  # Use user-provided top_p
+            prompt=payload.prompt,
         ):
             # Format as SSE event (same format as V1)
             sse_data = json.dumps(chunk)
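
To make the adaptive sizing concrete, here is a standalone sketch of the same arithmetic the handler applies (the numbers follow directly from the formulas in the diff; the helper name is only for illustration):

    def adaptive_max_tokens(text_length: int) -> int:
        # Mirrors the V2 handler: short texts get 60-100 tokens,
        # longer texts scale with length but are capped at 400
        if text_length < 1500:
            return min(100, max(60, text_length // 15))
        return min(400, max(100, text_length // 20))

    assert adaptive_max_tokens(500) == 60     # 500 // 15 = 33, floored to 60
    assert adaptive_max_tokens(1200) == 80    # 1200 // 15 = 80
    assert adaptive_max_tokens(3000) == 150   # 3000 // 20 = 150
    assert adaptive_max_tokens(10000) == 400  # capped at 400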
app/services/hf_streaming_summarizer.py CHANGED
@@ -164,7 +164,7 @@ class HFStreamingSummarizer:
         max_new_tokens: int = None,
         temperature: float = None,
         top_p: float = None,
-        prompt: str = "Provide a comprehensive summary of the following text, including main arguments, key findings, important details, and specific examples. Structure your response clearly:",
+        prompt: str = "Summarize the key points concisely:",
     ) -> AsyncGenerator[Dict[str, Any], None]:
         """
         Stream text summarization using HuggingFace's TextIteratorStreamer.
@@ -194,13 +194,19 @@
 
         logger.info(f"Processing text of {text_length} chars with HuggingFace model: {settings.hf_model_id}")
 
+        # Check if text is long enough to require recursive summarization
+        if text_length > 1500:
+            logger.info(f"Text is long ({text_length} chars), using recursive summarization")
+            async for chunk in self._recursive_summarize(text, max_new_tokens, temperature, top_p, prompt):
+                yield chunk
+            return
+
         try:
             # Use provided parameters or sensible defaults
-            # Aim for ~200–400 tokens summary by default.
-            # If settings.hf_max_new_tokens is small, override with 256.
-            max_new_tokens = max_new_tokens or max(getattr(settings, "hf_max_new_tokens", 0) or 0, 256)
-            temperature = temperature or settings.hf_temperature
-            top_p = top_p or settings.hf_top_p
+            # For short texts, aim for concise summaries (60-100 tokens)
+            max_new_tokens = max_new_tokens or max(getattr(settings, "hf_max_new_tokens", 0) or 0, 80)
+            temperature = temperature or getattr(settings, "hf_temperature", 0.3)
+            top_p = top_p or getattr(settings, "hf_top_p", 0.9)
 
             # Determine a generous encoder max length (respect tokenizer.model_max_length)
             model_max = getattr(self.tokenizer, "model_max_length", 1024)
@@ -319,10 +325,10 @@
             gen_kwargs["num_return_sequences"] = 1
             gen_kwargs["num_beams"] = 1
             gen_kwargs["num_beam_groups"] = 1
-            # Ensure we don't stop too early; set a floor and slightly favor longer generations
-            gen_kwargs["min_new_tokens"] = max(96, min(192, max_new_tokens // 2))  # floor ~100–192
-            # length_penalty > 1.0 encourages longer outputs on encoder-decoder models
-            gen_kwargs["length_penalty"] = 1.1
+            # Set conservative min_new_tokens to prevent rambling
+            gen_kwargs["min_new_tokens"] = max(20, min(50, max_new_tokens // 4))  # floor ~20-50
+            # Use neutral length_penalty to avoid encouraging longer outputs
+            gen_kwargs["length_penalty"] = 1.0
             # Reduce premature EOS in some checkpoints (optional)
             gen_kwargs["no_repeat_ngram_size"] = 3
             gen_kwargs["repetition_penalty"] = 1.05
@@ -376,6 +382,217 @@
                 "error": "HF summarization failed. See server logs for traceback.",
             }
 
+    async def _recursive_summarize(
+        self,
+        text: str,
+        max_new_tokens: int,
+        temperature: float,
+        top_p: float,
+        prompt: str,
+    ) -> AsyncGenerator[Dict[str, Any], None]:
+        """
+        Recursively summarize long text by chunking and summarizing each chunk,
+        then summarizing the summaries if there are multiple chunks.
+        """
+        try:
+            # Split text into chunks of ~800-1000 tokens
+            chunks = _split_into_chunks(text, chunk_chars=4000, overlap=400)
+            logger.info(f"Split long text into {len(chunks)} chunks for recursive summarization")
+
+            chunk_summaries = []
+
+            # Summarize each chunk
+            for i, chunk in enumerate(chunks):
+                logger.info(f"Summarizing chunk {i+1}/{len(chunks)}")
+
+                # Use smaller max_new_tokens for individual chunks
+                chunk_max_tokens = min(max_new_tokens, 80)
+
+                chunk_summary = ""
+                async for chunk_result in self._single_chunk_summarize(
+                    chunk, chunk_max_tokens, temperature, top_p, prompt
+                ):
+                    if chunk_result.get("content"):
+                        chunk_summary += chunk_result["content"]
+                        yield chunk_result  # Stream each chunk's summary
+
+                chunk_summaries.append(chunk_summary.strip())
+
+            # If we have multiple chunks, create a final summary of summaries
+            if len(chunk_summaries) > 1:
+                logger.info("Creating final summary of summaries")
+                combined_summaries = "\n\n".join(chunk_summaries)
+
+                # Use original max_new_tokens for final summary
+                async for final_result in self._single_chunk_summarize(
+                    combined_summaries, max_new_tokens, temperature, top_p,
+                    "Summarize the key points from these summaries:"
+                ):
+                    yield final_result
+            else:
+                # Single chunk, just yield the done signal
+                yield {
+                    "content": "",
+                    "done": True,
+                    "tokens_used": 0,
+                }
+
+        except Exception as e:
+            logger.exception("❌ Recursive summarization failed")
+            yield {
+                "content": "",
+                "done": True,
+                "error": f"Recursive summarization failed: {str(e)}",
+            }
+
+    async def _single_chunk_summarize(
+        self,
+        text: str,
+        max_new_tokens: int,
+        temperature: float,
+        top_p: float,
+        prompt: str,
+    ) -> AsyncGenerator[Dict[str, Any], None]:
+        """
+        Summarize a single chunk of text using the same logic as the main method
+        but without the recursive check.
+        """
+        if not self.model or not self.tokenizer:
+            error_msg = "HuggingFace model not available. Please check model initialization."
+            logger.error(f"❌ {error_msg}")
+            yield {
+                "content": "",
+                "done": True,
+                "error": error_msg,
+            }
+            return
+
+        try:
+            # Use provided parameters or sensible defaults
+            max_new_tokens = max_new_tokens or 80
+            temperature = temperature or 0.3
+            top_p = top_p or 0.9
+
+            # Determine encoder max length
+            model_max = getattr(self.tokenizer, "model_max_length", 1024)
+            if not isinstance(model_max, int) or model_max <= 0:
+                model_max = 1024
+            enc_max_len = min(model_max, 2048)
+
+            # Build tokenized inputs
+            if "t5" in settings.hf_model_id.lower():
+                full_prompt = f"summarize: {text}"
+                inputs_raw = self.tokenizer(full_prompt, return_tensors="pt", max_length=enc_max_len, truncation=True)
+            elif "bart" in settings.hf_model_id.lower():
+                inputs_raw = self.tokenizer(text, return_tensors="pt", max_length=enc_max_len, truncation=True)
+            else:
+                messages = [
+                    {"role": "system", "content": prompt},
+                    {"role": "user", "content": text}
+                ]
+
+                if hasattr(self.tokenizer, "apply_chat_template") and self.tokenizer.chat_template:
+                    inputs_raw = self.tokenizer.apply_chat_template(
+                        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
+                    )
+                else:
+                    full_prompt = f"{prompt}\n\n{text}"
+                    inputs_raw = self.tokenizer(full_prompt, return_tensors="pt")
+
+            # Normalize inputs (same logic as main method)
+            if isinstance(inputs_raw, (dict, BatchEncoding)):
+                try:
+                    inputs = dict(inputs_raw)
+                except Exception:
+                    inputs = dict(getattr(inputs_raw, "data", {}))
+            else:
+                inputs = {"input_ids": inputs_raw}
+
+            if "attention_mask" not in inputs and "input_ids" in inputs:
+                if TRANSFORMERS_AVAILABLE and 'torch' in globals() and isinstance(inputs["input_ids"], torch.Tensor):
+                    inputs["attention_mask"] = torch.ones_like(inputs["input_ids"])
+
+            def _to_singleton_batch(d):
+                out = {}
+                for k, v in d.items():
+                    if TRANSFORMERS_AVAILABLE and 'torch' in globals() and isinstance(v, torch.Tensor):
+                        if v.dim() == 1:
+                            out[k] = v.unsqueeze(0)
+                        elif v.dim() >= 2:
+                            out[k] = v[:1]
+                        else:
+                            out[k] = v
+                    else:
+                        out[k] = v
+                return out
+
+            inputs = _to_singleton_batch(inputs)
+
+            # Validate pad/eos ids
+            pad_id = self.tokenizer.pad_token_id
+            eos_id = self.tokenizer.eos_token_id
+            if pad_id is None and eos_id is not None:
+                pad_id = eos_id
+            elif pad_id is None and eos_id is None:
+                pad_id = 0
+
+            # Create streamer
+            streamer = TextIteratorStreamer(
+                self.tokenizer,
+                skip_prompt=True,
+                skip_special_tokens=True
+            )
+
+            gen_kwargs = {
+                **inputs,
+                "streamer": streamer,
+                "max_new_tokens": max_new_tokens,
+                "do_sample": True,
+                "temperature": temperature,
+                "top_p": top_p,
+                "pad_token_id": pad_id,
+                "eos_token_id": eos_id,
+                "num_return_sequences": 1,
+                "num_beams": 1,
+                "num_beam_groups": 1,
+                "min_new_tokens": max(20, min(50, max_new_tokens // 4)),
+                "length_penalty": 1.0,
+                "no_repeat_ngram_size": 3,
+                "repetition_penalty": 1.05,
+            }
+
+            generation_thread = threading.Thread(target=self.model.generate, kwargs=gen_kwargs, daemon=True)
+            generation_thread.start()
+
+            # Stream tokens as they arrive
+            token_count = 0
+            for text_chunk in streamer:
+                if text_chunk:
+                    yield {
+                        "content": text_chunk,
+                        "done": False,
+                        "tokens_used": token_count,
+                    }
+                    token_count += 1
+
+            # Wait for generation to complete
+            generation_thread.join()
+
+            # Send final "done" chunk
+            yield {
+                "content": "",
+                "done": True,
+                "tokens_used": token_count,
+            }
+
+        except Exception:
+            logger.exception("❌ Single chunk summarization failed")
+            yield {
+                "content": "",
+                "done": True,
+                "error": "Single chunk summarization failed. See server logs for traceback.",
+            }
+
     async def check_health(self) -> bool:
         """
         Check if the HuggingFace model is properly initialized and ready.
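
The recursive path depends on a _split_into_chunks(text, chunk_chars, overlap) helper that the new tests import but whose body is not shown in this diff. Below is a rough sketch of a character-based splitter that would satisfy those tests (empty input yields an empty list, every chunk stays within chunk_chars, consecutive chunks overlap); this is an illustrative guess, not the repository's actual implementation:

    def _split_into_chunks(text: str, chunk_chars: int = 4000, overlap: int = 400) -> list:
        # Illustrative sketch only: split text into overlapping character windows
        if not text:
            return []
        if len(text) <= chunk_chars:
            return [text]
        step = max(1, chunk_chars - overlap)
        chunks = []
        for start in range(0, len(text), step):
            piece = text[start:start + chunk_chars]
            if piece:
                chunks.append(piece)
            if start + chunk_chars >= len(text):
                break
        return chunks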
tests/test_hf_streaming_improvements.py ADDED
@@ -0,0 +1,286 @@
+"""
+Tests for HuggingFace streaming summarizer improvements.
+"""
+import pytest
+from unittest.mock import AsyncMock, patch, MagicMock
+from app.services.hf_streaming_summarizer import HFStreamingSummarizer, _split_into_chunks
+
+
+class TestSplitIntoChunks:
+    """Test the text chunking utility function."""
+
+    def test_split_short_text(self):
+        """Test splitting short text that doesn't need chunking."""
+        text = "This is a short text."
+        chunks = _split_into_chunks(text, chunk_chars=100, overlap=20)
+
+        assert len(chunks) == 1
+        assert chunks[0] == text
+
+    def test_split_long_text(self):
+        """Test splitting long text into multiple chunks."""
+        text = "This is a longer text. " * 50  # ~1000 chars
+        chunks = _split_into_chunks(text, chunk_chars=200, overlap=50)
+
+        assert len(chunks) > 1
+        # All chunks should be within reasonable size
+        for chunk in chunks:
+            assert len(chunk) <= 200
+            assert len(chunk) > 0
+
+    def test_chunk_overlap(self):
+        """Test that chunks have proper overlap."""
+        text = "This is a test text for overlap testing. " * 20  # ~800 chars
+        chunks = _split_into_chunks(text, chunk_chars=200, overlap=50)
+
+        if len(chunks) > 1:
+            # Check that consecutive chunks share some content
+            for i in range(len(chunks) - 1):
+                # There should be some overlap between consecutive chunks
+                assert len(chunks[i]) > 0
+                assert len(chunks[i+1]) > 0
+
+    def test_empty_text(self):
+        """Test splitting empty text."""
+        chunks = _split_into_chunks("", chunk_chars=100, overlap=20)
+        assert len(chunks) == 0  # Empty text returns empty list
+
+
+class TestHFStreamingSummarizerImprovements:
+    """Test improvements to HFStreamingSummarizer."""
+
+    @pytest.fixture
+    def mock_summarizer(self):
+        """Create a mock HFStreamingSummarizer for testing."""
+        summarizer = HFStreamingSummarizer()
+        summarizer.model = MagicMock()
+        summarizer.tokenizer = MagicMock()
+        return summarizer
+
+    @pytest.mark.asyncio
+    async def test_recursive_summarization_long_text(self, mock_summarizer):
+        """Test recursive summarization for long text."""
+        # Mock the _single_chunk_summarize method
+        async def mock_single_chunk(text, max_tokens, temp, top_p, prompt):
+            yield {"content": f"Summary of: {text[:50]}...", "done": False, "tokens_used": 10}
+            yield {"content": "", "done": True, "tokens_used": 10}
+
+        mock_summarizer._single_chunk_summarize = mock_single_chunk
+
+        # Long text (>1500 chars)
+        long_text = "This is a very long text that should trigger recursive summarization. " * 30  # ~2000+ chars
+
+        results = []
+        async for chunk in mock_summarizer._recursive_summarize(
+            long_text, max_new_tokens=100, temperature=0.3, top_p=0.9, prompt="Test prompt"
+        ):
+            results.append(chunk)
+
+        # Should have multiple chunks (one for each text chunk + final summary)
+        assert len(results) > 2  # At least 2 chunks + final done signal
+
+        # Check that we get proper streaming format
+        content_chunks = [r for r in results if r.get("content") and not r.get("done")]
+        assert len(content_chunks) > 0
+
+        # Should end with done signal
+        final_chunk = results[-1]
+        assert final_chunk.get("done") is True
+
+    @pytest.mark.asyncio
+    async def test_recursive_summarization_single_chunk(self, mock_summarizer):
+        """Test recursive summarization when text fits in single chunk."""
+        # Mock the _single_chunk_summarize method
+        async def mock_single_chunk(text, max_tokens, temp, top_p, prompt):
+            yield {"content": "Single chunk summary", "done": False, "tokens_used": 5}
+            yield {"content": "", "done": True, "tokens_used": 5}
+
+        mock_summarizer._single_chunk_summarize = mock_single_chunk
+
+        # Text that would fit in single chunk after splitting
+        text = "This is a medium length text. " * 20  # ~600 chars
+
+        results = []
+        async for chunk in mock_summarizer._recursive_summarize(
+            text, max_new_tokens=100, temperature=0.3, top_p=0.9, prompt="Test prompt"
+        ):
+            results.append(chunk)
+
+        # Should have at least 2 chunks (content + done)
+        assert len(results) >= 2
+
+        # Should end with done signal
+        final_chunk = results[-1]
+        assert final_chunk.get("done") is True
+
+    @pytest.mark.asyncio
+    async def test_single_chunk_summarize_parameters(self, mock_summarizer):
+        """Test that _single_chunk_summarize uses correct parameters."""
+        # Mock the tokenizer and model
+        mock_summarizer.tokenizer.model_max_length = 1024
+        mock_summarizer.tokenizer.pad_token_id = 0
+        mock_summarizer.tokenizer.eos_token_id = 1
+
+        # Mock the model generation
+        mock_streamer = MagicMock()
+        mock_streamer.__iter__ = MagicMock(return_value=iter(["test", "summary"]))
+
+        with patch('app.services.hf_streaming_summarizer.TextIteratorStreamer', return_value=mock_streamer):
+            with patch('app.services.hf_streaming_summarizer.settings') as mock_settings:
+                mock_settings.hf_model_id = "test-model"
+
+                results = []
+                async for chunk in mock_summarizer._single_chunk_summarize(
+                    "Test text", max_new_tokens=80, temperature=0.3, top_p=0.9, prompt="Test prompt"
+                ):
+                    results.append(chunk)
+
+                # Should have content chunks + final done
+                assert len(results) >= 2
+
+                # Check that generation was called with correct parameters
+                mock_summarizer.model.generate.assert_called_once()
+                call_kwargs = mock_summarizer.model.generate.call_args[1]
+
+                assert call_kwargs["max_new_tokens"] == 80
+                assert call_kwargs["temperature"] == 0.3
+                assert call_kwargs["top_p"] == 0.9
+                assert call_kwargs["length_penalty"] == 1.0  # Should be neutral
+                assert call_kwargs["min_new_tokens"] <= 50  # Should be conservative
+
+    @pytest.mark.asyncio
+    async def test_single_chunk_summarize_defaults(self, mock_summarizer):
+        """Test that _single_chunk_summarize uses correct defaults."""
+        # Mock the tokenizer and model
+        mock_summarizer.tokenizer.model_max_length = 1024
+        mock_summarizer.tokenizer.pad_token_id = 0
+        mock_summarizer.tokenizer.eos_token_id = 1
+
+        # Mock the model generation
+        mock_streamer = MagicMock()
+        mock_streamer.__iter__ = MagicMock(return_value=iter(["test", "summary"]))
+
+        with patch('app.services.hf_streaming_summarizer.TextIteratorStreamer', return_value=mock_streamer):
+            with patch('app.services.hf_streaming_summarizer.settings') as mock_settings:
+                mock_settings.hf_model_id = "test-model"
+
+                results = []
+                async for chunk in mock_summarizer._single_chunk_summarize(
+                    "Test text", max_new_tokens=None, temperature=None, top_p=None, prompt="Test prompt"
+                ):
+                    results.append(chunk)
+
+                # Check that generation was called with correct defaults
+                mock_summarizer.model.generate.assert_called_once()
+                call_kwargs = mock_summarizer.model.generate.call_args[1]
+
+                assert call_kwargs["max_new_tokens"] == 80  # Default
+                assert call_kwargs["temperature"] == 0.3  # Default
+                assert call_kwargs["top_p"] == 0.9  # Default
+
+    @pytest.mark.asyncio
+    async def test_recursive_summarization_error_handling(self, mock_summarizer):
+        """Test error handling in recursive summarization."""
+        # Mock _single_chunk_summarize to raise an exception
+        async def mock_single_chunk_error(text, max_tokens, temp, top_p, prompt):
+            raise Exception("Test error")
+            yield  # This line will never be reached, but makes it an async generator
+
+        mock_summarizer._single_chunk_summarize = mock_single_chunk_error
+
+        long_text = "This is a long text. " * 30
+
+        results = []
+        async for chunk in mock_summarizer._recursive_summarize(
+            long_text, max_new_tokens=100, temperature=0.3, top_p=0.9, prompt="Test prompt"
+        ):
+            results.append(chunk)
+
+        # Should have error chunk
+        assert len(results) == 1
+        error_chunk = results[0]
+        assert error_chunk.get("done") is True
+        assert "error" in error_chunk
+        assert "Test error" in error_chunk["error"]
+
+    @pytest.mark.asyncio
+    async def test_single_chunk_summarize_error_handling(self, mock_summarizer):
+        """Test error handling in single chunk summarization."""
+        # Mock model to raise exception
+        mock_summarizer.model.generate.side_effect = Exception("Generation error")
+
+        results = []
+        async for chunk in mock_summarizer._single_chunk_summarize(
+            "Test text", max_new_tokens=80, temperature=0.3, top_p=0.9, prompt="Test prompt"
+        ):
+            results.append(chunk)
+
+        # Should have error chunk
+        assert len(results) == 1
+        error_chunk = results[0]
+        assert error_chunk.get("done") is True
+        assert "error" in error_chunk
+        assert "Generation error" in error_chunk["error"]
+
+
+class TestHFStreamingSummarizerIntegration:
+    """Integration tests for HFStreamingSummarizer improvements."""
+
+    @pytest.mark.asyncio
+    async def test_summarize_text_stream_long_text_detection(self):
+        """Test that summarize_text_stream detects long text and uses recursive summarization."""
+        summarizer = HFStreamingSummarizer()
+
+        # Mock the recursive summarization method
+        async def mock_recursive(text, max_tokens, temp, top_p, prompt):
+            yield {"content": "Recursive summary", "done": False, "tokens_used": 10}
+            yield {"content": "", "done": True, "tokens_used": 10}
+
+        summarizer._recursive_summarize = mock_recursive
+
+        # Long text (>1500 chars)
+        long_text = "This is a very long text. " * 60  # ~1500+ chars
+
+        results = []
+        async for chunk in summarizer.summarize_text_stream(long_text):
+            results.append(chunk)
+
+        # Should have used recursive summarization
+        assert len(results) >= 2
+        assert results[0]["content"] == "Recursive summary"
+        assert results[-1]["done"] is True
+
+    @pytest.mark.asyncio
+    async def test_summarize_text_stream_short_text_normal_flow(self):
+        """Test that summarize_text_stream uses normal flow for short text."""
+        summarizer = HFStreamingSummarizer()
+
+        # Mock model and tokenizer
+        summarizer.model = MagicMock()
+        summarizer.tokenizer = MagicMock()
+        summarizer.tokenizer.model_max_length = 1024
+        summarizer.tokenizer.pad_token_id = 0
+        summarizer.tokenizer.eos_token_id = 1
+
+        # Mock the streamer
+        mock_streamer = MagicMock()
+        mock_streamer.__iter__ = MagicMock(return_value=iter(["short", "summary"]))
+
+        with patch('app.services.hf_streaming_summarizer.TextIteratorStreamer', return_value=mock_streamer):
+            with patch('app.services.hf_streaming_summarizer.settings') as mock_settings:
+                mock_settings.hf_model_id = "test-model"
+                mock_settings.hf_temperature = 0.3
+                mock_settings.hf_top_p = 0.9
+
+                # Short text (<1500 chars)
+                short_text = "This is a short text."
+
+                results = []
+                async for chunk in summarizer.summarize_text_stream(short_text):
+                    results.append(chunk)
+
+                # Should have used normal flow (not recursive)
+                assert len(results) >= 2
+                assert results[0]["content"] == "short"
+                assert results[1]["content"] == "summary"
+                assert results[-1]["done"] is True
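
The new tests mark coroutine test functions with pytest.mark.asyncio, so the project presumably has pytest-asyncio (or an equivalent plugin) configured. A typical local run of the new and updated suites might look like:

    pytest tests/test_hf_streaming_improvements.py tests/test_schemas.py tests/test_v2_api.py -v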
tests/test_schemas.py CHANGED
@@ -15,7 +15,7 @@ class TestSummarizeRequest:
 
         assert request.text == sample_text.strip()
         assert request.max_tokens == 256
-        assert request.prompt == "Summarize the following text concisely:"
+        assert request.prompt == "Summarize the key points concisely:"
 
     def test_custom_parameters(self):
         """Test request with custom parameters."""
@@ -73,6 +73,57 @@ class TestSummarizeRequest:
         long_prompt = "x" * 501
         with pytest.raises(ValidationError):
             SummarizeRequest(text="test", prompt=long_prompt)
+
+    def test_temperature_parameter(self):
+        """Test temperature parameter validation."""
+        # Valid temperature values
+        request = SummarizeRequest(text="test", temperature=0.0)
+        assert request.temperature == 0.0
+
+        request = SummarizeRequest(text="test", temperature=2.0)
+        assert request.temperature == 2.0
+
+        request = SummarizeRequest(text="test", temperature=0.3)
+        assert request.temperature == 0.3
+
+        # Default temperature
+        request = SummarizeRequest(text="test")
+        assert request.temperature == 0.3
+
+        # Invalid temperature values
+        with pytest.raises(ValidationError):
+            SummarizeRequest(text="test", temperature=-0.1)
+
+        with pytest.raises(ValidationError):
+            SummarizeRequest(text="test", temperature=2.1)
+
+    def test_top_p_parameter(self):
+        """Test top_p parameter validation."""
+        # Valid top_p values
+        request = SummarizeRequest(text="test", top_p=0.0)
+        assert request.top_p == 0.0
+
+        request = SummarizeRequest(text="test", top_p=1.0)
+        assert request.top_p == 1.0
+
+        request = SummarizeRequest(text="test", top_p=0.9)
+        assert request.top_p == 0.9
+
+        # Default top_p
+        request = SummarizeRequest(text="test")
+        assert request.top_p == 0.9
+
+        # Invalid top_p values
+        with pytest.raises(ValidationError):
+            SummarizeRequest(text="test", top_p=-0.1)
+
+        with pytest.raises(ValidationError):
+            SummarizeRequest(text="test", top_p=1.1)
+
+    def test_updated_default_prompt(self):
+        """Test that the default prompt has been updated to be more concise."""
+        request = SummarizeRequest(text="test")
+        assert request.prompt == "Summarize the key points concisely:"
 
 
 class TestSummarizeResponse:
tests/test_v2_api.py CHANGED
@@ -155,6 +155,144 @@ class TestV2SummarizeStream:
         assert call_args[1]['prompt'] == "Custom prompt"
         assert call_args[1]['text'] == "Test text"
 
+    @pytest.mark.integration
+    def test_v2_adaptive_token_logic_short_text(self, client: TestClient):
+        """Test adaptive token logic for short texts (<1500 chars)."""
+        with patch('app.services.hf_streaming_summarizer.hf_streaming_service.summarize_text_stream') as mock_stream:
+            async def mock_generator():
+                yield {"content": "", "done": True}
+
+            mock_stream.return_value = mock_generator()
+
+            # Short text (500 chars)
+            short_text = "This is a short text. " * 20  # ~500 chars
+
+            response = client.post(
+                "/api/v2/summarize/stream",
+                json={
+                    "text": short_text,
+                    # Don't specify max_tokens to test adaptive logic
+                }
+            )
+
+            assert response.status_code == 200
+
+            # Verify service was called with adaptive max_new_tokens
+            mock_stream.assert_called_once()
+            call_args = mock_stream.call_args
+
+            # For short text, should use 60-100 tokens
+            max_new_tokens = call_args[1]['max_new_tokens']
+            assert 60 <= max_new_tokens <= 100
+
+    @pytest.mark.integration
+    def test_v2_adaptive_token_logic_long_text(self, client: TestClient):
+        """Test adaptive token logic for long texts (>1500 chars)."""
+        with patch('app.services.hf_streaming_summarizer.hf_streaming_service.summarize_text_stream') as mock_stream:
+            async def mock_generator():
+                yield {"content": "", "done": True}
+
+            mock_stream.return_value = mock_generator()
+
+            # Long text (2000 chars)
+            long_text = "This is a longer text that should trigger adaptive token logic. " * 40  # ~2000 chars
+
+            response = client.post(
+                "/api/v2/summarize/stream",
+                json={
+                    "text": long_text,
+                    # Don't specify max_tokens to test adaptive logic
+                }
+            )
+
+            assert response.status_code == 200
+
+            # Verify service was called with adaptive max_new_tokens
+            mock_stream.assert_called_once()
+            call_args = mock_stream.call_args
+
+            # For long text, should use proportional scaling but capped
+            max_new_tokens = call_args[1]['max_new_tokens']
+            assert 100 <= max_new_tokens <= 400
+
+    @pytest.mark.integration
+    def test_v2_temperature_and_top_p_parameters(self, client: TestClient):
+        """Test that temperature and top_p parameters are passed correctly."""
+        with patch('app.services.hf_streaming_summarizer.hf_streaming_service.summarize_text_stream') as mock_stream:
+            async def mock_generator():
+                yield {"content": "", "done": True}
+
+            mock_stream.return_value = mock_generator()
+
+            response = client.post(
+                "/api/v2/summarize/stream",
+                json={
+                    "text": "Test text",
+                    "temperature": 0.5,
+                    "top_p": 0.8
+                }
+            )
+
+            assert response.status_code == 200
+
+            # Verify service was called with correct parameters
+            mock_stream.assert_called_once()
+            call_args = mock_stream.call_args
+
+            assert call_args[1]['temperature'] == 0.5
+            assert call_args[1]['top_p'] == 0.8
+
+    @pytest.mark.integration
+    def test_v2_default_temperature_and_top_p(self, client: TestClient):
+        """Test that default temperature and top_p values are used when not specified."""
+        with patch('app.services.hf_streaming_summarizer.hf_streaming_service.summarize_text_stream') as mock_stream:
+            async def mock_generator():
+                yield {"content": "", "done": True}
+
+            mock_stream.return_value = mock_generator()
+
+            response = client.post(
+                "/api/v2/summarize/stream",
+                json={
+                    "text": "Test text"
+                    # Don't specify temperature or top_p
+                }
+            )
+
+            assert response.status_code == 200
+
+            # Verify service was called with default parameters
+            mock_stream.assert_called_once()
+            call_args = mock_stream.call_args
+
+            assert call_args[1]['temperature'] == 0.3  # Default temperature
+            assert call_args[1]['top_p'] == 0.9  # Default top_p
+
+    @pytest.mark.integration
+    def test_v2_recursive_summarization_trigger(self, client: TestClient):
+        """Test that recursive summarization is triggered for long texts."""
+        with patch('app.services.hf_streaming_summarizer.hf_streaming_service.summarize_text_stream') as mock_stream:
+            async def mock_generator():
+                yield {"content": "", "done": True}
+
+            mock_stream.return_value = mock_generator()
+
+            # Very long text (>1500 chars) to trigger recursive summarization
+            very_long_text = "This is a very long text that should definitely trigger recursive summarization logic. " * 30  # ~2000+ chars
+
+            response = client.post(
+                "/api/v2/summarize/stream",
+                json={
+                    "text": very_long_text
+                }
+            )
+
+            assert response.status_code == 200
+
+            # The service should be called, and internally it should detect long text
+            # and use recursive summarization
+            mock_stream.assert_called_once()
+
 
 class TestV2APICompatibility:
     """Test V2 API compatibility with V1."""