Spaces:

colin730
/

SummarizerApp

Running

ming committed 14 days ago

Commit

3570bfd

1 Parent(s): 87d9e3a

Fix HF streaming crash: enforce batch size = 1 for TextIteratorStreamer

- Normalize tokenizer outputs to dict format (handles tensor returns from apply_chat_template)
- Enforce batch size == 1 for all input tensors (add batch dim if 1D, trim if > 1)
- Add num_return_sequences=1 to gen_kwargs for streamer safety
- Add regression test to verify batch size enforcement

Fixes crash: 'TextStreamer only supports batch size 1'

Files changed (2) hide show

app/services/hf_streaming_summarizer.py +25 -40
tests/test_hf_streaming.py +24 -0

app/services/hf_streaming_summarizer.py CHANGED Viewed

@@ -172,57 +172,40 @@ class HFStreamingSummarizer:
             temperature = temperature or settings.hf_temperature
             top_p = top_p or settings.hf_top_p
-            # Check if model is t5 (doesn't use chat templates)
             if "t5" in settings.hf_model_id.lower():
-                # t5 models use simple prompt format for summarization
                 full_prompt = f"summarize: {text}"
-                inputs = self.tokenizer(
-                    full_prompt,
-                    return_tensors="pt",
-                    max_length=512,
-                    truncation=True
-                )
             elif "bart" in settings.hf_model_id.lower():
-                # BART models (including DistilBART) expect direct text input
-                # No prefixes or chat templates needed
-                inputs = self.tokenizer(
-                    text,
-                    return_tensors="pt",
-                    max_length=1024,
-                    truncation=True
-                )
             else:
-                # Other models use chat template
                 messages = [
                     {"role": "system", "content": prompt},
-                    {"role": "user", "content": text}
                 ]
-                # Apply chat template if available, otherwise use simple prompt
                 if hasattr(self.tokenizer, "apply_chat_template") and self.tokenizer.chat_template:
-                    inputs = self.tokenizer.apply_chat_template(
-                        messages,
-                        tokenize=True,
-                        add_generation_prompt=True,
-                        return_tensors="pt"
                     )
                 else:
-                    # Fallback to simple prompt format
                     full_prompt = f"{prompt}\n\n{text}"
-                    inputs = self.tokenizer(full_prompt, return_tensors="pt")
-            inputs = inputs.to(self.model.device)
-            # CRITICAL FIX: Ensure batch size is 1 for TextIteratorStreamer
-            # The streamer only works with batch size 1, so we need to ensure
-            # that all input tensors have batch dimension of 1
-            for key, tensor in inputs.items():
-                if tensor.dim() > 1 and tensor.size(0) > 1:
-                    # If batch size > 1, take only the first sample
-                    inputs[key] = tensor[:1]
-                elif tensor.dim() == 1:
-                    # If tensor is 1D, add batch dimension
-                    inputs[key] = tensor.unsqueeze(0)
             # Create streamer for token-by-token output
             streamer = TextIteratorStreamer(
@@ -241,6 +224,8 @@ class HFStreamingSummarizer:
                 "top_p": top_p,
                 "pad_token_id": self.tokenizer.pad_token_id or self.tokenizer.eos_token_id,
             }
             # Run generation in background thread
             generation_thread = threading.Thread(

             temperature = temperature or settings.hf_temperature
             top_p = top_p or settings.hf_top_p
+            # --- Build tokenized inputs robustly ---
             if "t5" in settings.hf_model_id.lower():
                 full_prompt = f"summarize: {text}"
+                inputs_raw = self.tokenizer(full_prompt, return_tensors="pt", max_length=512, truncation=True)
             elif "bart" in settings.hf_model_id.lower():
+                inputs_raw = self.tokenizer(text, return_tensors="pt", max_length=1024, truncation=True)
             else:
                 messages = [
                     {"role": "system", "content": prompt},
+                    {"role": "user", "content": text},
                 ]
                 if hasattr(self.tokenizer, "apply_chat_template") and self.tokenizer.chat_template:
+                    inputs_raw = self.tokenizer.apply_chat_template(
+                        messages, tokenize=True, add_generation_prompt=True, return_tensors="pt"
                     )
                 else:
                     full_prompt = f"{prompt}\n\n{text}"
+                    inputs_raw = self.tokenizer(full_prompt, return_tensors="pt")
+            # Normalize to dict regardless of tokenizer return type
+            if isinstance(inputs_raw, dict):
+                inputs = inputs_raw
+            else:
+                inputs = {"input_ids": inputs_raw}
+            # Move to model device
+            inputs = {k: v.to(self.model.device) for k, v in inputs.items()}
+            # Enforce batch size == 1 for streamer safety
+            for k, v in list(inputs.items()):
+                if v.dim() == 1:
+                    inputs[k] = v.unsqueeze(0)      # [seq] -> [1, seq]
+                elif v.dim() >= 2 and v.size(0) > 1:
+                    inputs[k] = v[:1]               # [B, ...] -> [1, ...]
             # Create streamer for token-by-token output
             streamer = TextIteratorStreamer(
                 "top_p": top_p,
                 "pad_token_id": self.tokenizer.pad_token_id or self.tokenizer.eos_token_id,
             }
+            # Streamer only supports a single sequence
+            gen_kwargs["num_return_sequences"] = 1
             # Run generation in background thread
             generation_thread = threading.Thread(

tests/test_hf_streaming.py CHANGED Viewed

@@ -119,6 +119,30 @@ class TestHFStreamingSummarizer:
             # Expected when torch is not available
             pass
 class TestHFStreamingServiceIntegration:
     """Test the global HF streaming service instance."""

             # Expected when torch is not available
             pass
+    @pytest.mark.asyncio
+    async def test_streaming_single_batch(self):
+        """Test that streaming enforces batch size = 1 and completes successfully."""
+        service = HFStreamingSummarizer()
+        # Skip if model not initialized (transformers not available)
+        if not service.model or not service.tokenizer:
+            pytest.skip("Transformers not available")
+        chunks = []
+        async for chunk in service.summarize_text_stream(
+            text="This is a short test article about New Zealand tech news.",
+            max_new_tokens=32,
+            temperature=0.7,
+            top_p=0.9,
+            prompt="Summarize:"
+        ):
+            chunks.append(chunk)
+        # Should complete without ValueError and have a final done=True
+        assert len(chunks) > 0
+        assert any(c.get("done") for c in chunks)
+        assert all("error" not in c or c.get("error") is None for c in chunks if not c.get("done"))
 class TestHFStreamingServiceIntegration:
     """Test the global HF streaming service instance."""