ming Claude committed on
Commit f724bab · 1 Parent(s): b88a215

Fix V3 API to support both URL and text input


The V3 endpoint was causing 422 errors when receiving plain text from
the Android app because it only accepted URLs. This update adds
intelligent input detection to handle both modes:

- URL mode: Scrapes article from URL then summarizes
- Text mode: Summarizes provided text directly (no scraping)

Changes:
- Updated schema to make url and text optional (exactly one required)
- Added model validator to ensure mutual exclusivity
- Enhanced endpoint to detect and route based on input type
- Updated metadata to include input_type field
- Added comprehensive documentation and examples

This fixes the 422 Unprocessable Entity errors and provides a single
unified endpoint for the Android app.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <[email protected]>

README.md CHANGED
@@ -236,6 +236,10 @@ for line in response.iter_lines():
236
  ```
237
 
238
  ### V3 API (Web Scraping + Summarization) - Android App Primary Use Case
239
  ```python
240
  import requests
241
  import json
@@ -255,13 +259,14 @@ response = requests.post(
255
  for line in response.iter_lines():
256
  if line.startswith(b'data: '):
257
  data = json.loads(line[6:])
258
-
259
  # First event: metadata
260
  if data.get("type") == "metadata":
 
261
  print(f"Title: {data['data']['title']}")
262
  print(f"Author: {data['data']['author']}")
263
  print(f"Scrape time: {data['data']['scrape_latency_ms']}ms\n")
264
-
265
  # Content events
266
  elif "content" in data:
267
  print(data["content"], end="")
@@ -270,6 +275,43 @@ for line in response.iter_lines():
270
  break
271
  ```
272
273
  ### Android Client (SSE)
274
  ```kotlin
275
  // Android SSE client example
@@ -315,10 +357,15 @@ curl -X POST "https://colin730-SummarizerApp.hf.space/api/v2/summarize/stream" \
315
  -H "Content-Type: application/json" \
316
  -d '{"text": "Your text...", "max_tokens": 128}'
317
 
318
- # V3 API (Web scraping + summarization)
319
  curl -X POST "https://colin730-SummarizerApp.hf.space/api/v3/scrape-and-summarize/stream" \
320
  -H "Content-Type: application/json" \
321
  -d '{"url": "https://example.com/article", "max_tokens": 256, "include_metadata": true}'
322
  ```
323
 
324
  ### Test Script
 
236
  ```
237
 
238
  ### V3 API (Web Scraping + Summarization) - Android App Primary Use Case
239
+
240
+ **V3 supports two modes: URL scraping or direct text summarization**
241
+
242
+ #### Mode 1: URL Scraping (recommended for articles)
243
  ```python
244
  import requests
245
  import json
 
259
  for line in response.iter_lines():
260
  if line.startswith(b'data: '):
261
  data = json.loads(line[6:])
262
+
263
  # First event: metadata
264
  if data.get("type") == "metadata":
265
+ print(f"Input type: {data['data']['input_type']}") # 'url'
266
  print(f"Title: {data['data']['title']}")
267
  print(f"Author: {data['data']['author']}")
268
  print(f"Scrape time: {data['data']['scrape_latency_ms']}ms\n")
269
+
270
  # Content events
271
  elif "content" in data:
272
  print(data["content"], end="")
 
275
  break
276
  ```
277
 
278
+ #### Mode 2: Direct Text Summarization (fallback when scraping fails)
279
+ ```python
280
+ import requests
281
+ import json
282
+
283
+ # V3 direct text summarization (no scraping)
284
+ response = requests.post(
285
+ "https://colin730-SummarizerApp.hf.space/api/v3/scrape-and-summarize/stream",
286
+ json={
287
+ "text": "Your article text here... (minimum 50 characters)",
288
+ "max_tokens": 256,
289
+ "include_metadata": True
290
+ },
291
+ stream=True
292
+ )
293
+
294
+ for line in response.iter_lines():
295
+ if line.startswith(b'data: '):
296
+ data = json.loads(line[6:])
297
+
298
+ # First event: metadata
299
+ if data.get("type") == "metadata":
300
+ print(f"Input type: {data['data']['input_type']}") # 'text'
301
+ print(f"Text length: {data['data']['text_length']} chars\n")
302
+
303
+ # Content events
304
+ elif "content" in data:
305
+ print(data["content"], end="")
306
+ if data["done"]:
307
+ break
308
+ ```
309
+
310
+ **Note:** Provide either `url` OR `text`, not both. Text mode is useful as a fallback when:
311
+ - Article is behind a paywall
312
+ - Website blocks scrapers
313
+ - User has already extracted the text manually
314
+
315
  ### Android Client (SSE)
316
  ```kotlin
317
  // Android SSE client example
 
357
  -H "Content-Type: application/json" \
358
  -d '{"text": "Your text...", "max_tokens": 128}'
359
 
360
+ # V3 API - URL mode (web scraping + summarization)
361
  curl -X POST "https://colin730-SummarizerApp.hf.space/api/v3/scrape-and-summarize/stream" \
362
  -H "Content-Type: application/json" \
363
  -d '{"url": "https://example.com/article", "max_tokens": 256, "include_metadata": true}'
364
+
365
+ # V3 API - Text mode (direct summarization, no scraping)
366
+ curl -X POST "https://colin730-SummarizerApp.hf.space/api/v3/scrape-and-summarize/stream" \
367
+ -H "Content-Type: application/json" \
368
+ -d '{"text": "Your article text here (minimum 50 characters)...", "max_tokens": 256}'
369
  ```
370
 
371
  ### Test Script
V3_FIX_URL_AND_TEXT_INPUT.md ADDED
@@ -0,0 +1,844 @@
1
+ # V3 API Fix: Support Both URL and Text Input
2
+
3
+ ## Problem Statement
4
+
5
+ The V3 endpoint `/api/v3/scrape-and-summarize/stream` currently only accepts URLs in the request body. When the Android app sends plain text instead of a URL, the request fails with **422 Unprocessable Entity** due to URL validation failure.
6
+
7
+ ### Error Symptoms
8
+ ```
9
+ INFO: 10.16.17.219:29372 - "POST /api/v3/scrape-and-summarize/stream HTTP/1.1" 422 Unprocessable Entity
10
+ 2025-11-11 05:39:49,140 - app.core.middleware - INFO - Request lXqCov: POST /api/v3/scrape-and-summarize/stream
11
+ 2025-11-11 05:39:49,143 - app.core.middleware - INFO - Response lXqCov: 422 (2.64ms)
12
+ ```
13
+
14
+ **Key Indicator:** Response time < 3ms means the request is failing at **schema validation** before any scraping logic runs.
15
+
16
+ ### Root Cause
17
+
18
+ **Current Schema** (`app/api/v3/schemas.py`):
19
+ ```python
20
+ class ScrapeAndSummarizeRequest(BaseModel):
21
+ url: str = Field(..., description="URL of article to scrape and summarize")
22
+ # ... other fields
23
+
24
+ @validator('url')
25
+ def validate_url(cls, v):
26
+ # URL validation regex that rejects plain text
27
+ if not url_pattern.match(v):
28
+ raise ValueError('Invalid URL format')
29
+ return v
30
+ ```
31
+
32
+ **Problem:** The `url` field is **required** and must match the URL pattern. When the Android app sends plain text (not a URL), validation fails and the request is rejected with a 422 error.
33
+
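+ For reference, the failure is easy to reproduce against the current schema. A minimal sketch using the deployed Space URL from the README (the payload shape mirrors what the Android app sends):
+
+ ```python
+ import requests
+
+ # Plain-text payload against the current URL-only schema -> 422
+ response = requests.post(
+     "https://colin730-SummarizerApp.hf.space/api/v3/scrape-and-summarize/stream",
+     json={"text": "Some article text the Android app extracted...", "max_tokens": 128},
+ )
+ print(response.status_code)                  # 422
+ print(response.json()["detail"][0]["msg"])   # e.g. a "Field required" error for 'url'
+ ```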
34
+ ---
35
+
36
+ ## Solution Overview
37
+
38
+ Make the V3 endpoint **intelligent** - it should handle both:
39
+
40
+ 1. **URL Input** → Scrape article from URL + Summarize
41
+ 2. **Text Input** → Skip scraping + Summarize directly
42
+
43
+ This provides a single, unified endpoint for the Android app without needing to choose between multiple endpoints.
44
+
45
+ ---
46
+
47
+ ## Design Approach
48
+
49
+ ### Option 1: Flexible Input Field (Recommended)
50
+
51
+ **Schema Design:**
52
+ ```python
53
+ class ScrapeAndSummarizeRequest(BaseModel):
54
+ url: Optional[str] = None
55
+ text: Optional[str] = None
56
+ # ... other fields (max_tokens, temperature, etc.)
57
+
58
+ @model_validator(mode='after')
59
+ def check_url_or_text(self):
60
+ """Ensure exactly one of url or text is provided."""
61
+ if not self.url and not self.text:
62
+ raise ValueError('Either url or text must be provided')
63
+ if self.url and self.text:
64
+ raise ValueError('Provide either url OR text, not both')
65
+ return self
66
+
67
+ @field_validator('url')
68
+ def validate_url(cls, v):
69
+ """Validate URL format if provided."""
70
+ if v is None:
71
+ return v
72
+ # URL validation logic
73
+ return v
74
+
75
+ @field_validator('text')
76
+ def validate_text(cls, v):
77
+ """Validate text if provided."""
78
+ if v is None:
79
+ return v
80
+ if len(v) < 50:
81
+ raise ValueError('Text too short (minimum 50 characters)')
82
+ if len(v) > 50000:
83
+ raise ValueError('Text too long (maximum 50,000 characters)')
84
+ return v
85
+ ```
86
+
87
+ **Request Examples:**
88
+ ```json
89
+ // URL-based request (scraping enabled)
90
+ {
91
+ "url": "https://example.com/article",
92
+ "max_tokens": 256,
93
+ "temperature": 0.3
94
+ }
95
+
96
+ // Text-based request (direct summarization)
97
+ {
98
+ "text": "Your article text here...",
99
+ "max_tokens": 256,
100
+ "temperature": 0.3
101
+ }
102
+ ```
103
+
104
+ **Endpoint Logic:**
105
+ ```python
106
+ @router.post("/scrape-and-summarize/stream")
107
+ async def scrape_and_summarize_stream(
108
+ request: Request,
109
+ payload: ScrapeAndSummarizeRequest
110
+ ):
111
+ """Handle both URL scraping and direct text summarization."""
112
+
113
+ # Determine input type
114
+ if payload.url:
115
+ # URL input → Scrape + Summarize
116
+ article_data = await article_scraper_service.scrape_article(payload.url)
117
+ text_to_summarize = article_data['text']
118
+ metadata = {
119
+ 'title': article_data.get('title'),
120
+ 'author': article_data.get('author'),
121
+ 'source': 'scraped',
122
+ 'scrape_latency_ms': article_data.get('scrape_time_ms')
123
+ }
124
+ else:
125
+ # Text input → Direct Summarization
126
+ text_to_summarize = payload.text
127
+ metadata = {
128
+ 'source': 'direct_text',
129
+ 'text_length': len(payload.text)
130
+ }
131
+
132
+ # Stream summarization (same for both paths)
133
+ return StreamingResponse(
134
+ _stream_generator(text_to_summarize, payload, metadata, request_id),
135
+ media_type="text/event-stream",
136
+ headers={"Cache-Control": "no-cache", ...}
137
+ )
138
+ ```
139
+
140
+ ---
141
+
142
+ ### Option 2: Auto-Detection (Alternative)
143
+
144
+ **Schema Design:**
145
+ ```python
146
+ class ScrapeAndSummarizeRequest(BaseModel):
147
+ input: str = Field(..., description="URL to scrape OR text to summarize")
148
+ # ... other fields
149
+ ```
150
+
151
+ **Endpoint Logic:**
152
+ ```python
153
+ # Auto-detect if input is URL or text
154
+ if _is_valid_url(payload.input):
155
+ # URL detected → Scrape + Summarize
156
+ article_data = await article_scraper_service.scrape_article(payload.input)
157
+ text_to_summarize = article_data['text']
158
+ else:
159
+ # Plain text detected → Direct Summarization
160
+ text_to_summarize = payload.input
161
+ ```
162
+
163
+ **Pros:**
164
+ - Single input field (simpler API)
165
+ - No client-side decision needed about which field to send
166
+
167
+ **Cons:**
168
+ - Ambiguous: What if text looks like a URL?
169
+ - Harder to debug issues
170
+ - Less explicit intent
171
+
172
+ **Verdict:** Option 1 is clearer and more explicit.
173
+
174
+ ---
175
+
176
+ ## Implementation Plan
177
+
178
+ ### Step 1: Update Request Schema
179
+
180
+ **File:** `app/api/v3/schemas.py`
181
+
182
+ **Changes:**
183
+ 1. Make `url` field Optional (change from required to `Optional[str] = None`)
184
+ 2. Add `text` field as Optional (`Optional[str] = None`)
185
+ 3. Add `@model_validator` to ensure exactly one is provided
186
+ 4. Update `url` validator to handle None
187
+ 5. Add `text` validator for length constraints
188
+
189
+ **Code:**
190
+ ```python
191
+ from pydantic import BaseModel, Field, field_validator, model_validator
192
+ from typing import Optional
193
+ import re
194
+
195
+ class ScrapeAndSummarizeRequest(BaseModel):
196
+ """Request schema supporting both URL scraping and direct text summarization."""
197
+
198
+ url: Optional[str] = Field(
199
+ None,
200
+ description="URL of article to scrape and summarize",
201
+ example="https://example.com/article"
202
+ )
203
+
204
+ text: Optional[str] = Field(
205
+ None,
206
+ description="Direct text to summarize (alternative to URL)",
207
+ example="Your article text here..."
208
+ )
209
+
210
+ max_tokens: Optional[int] = Field(
211
+ default=256,
212
+ ge=1,
213
+ le=2048,
214
+ description="Maximum tokens in summary"
215
+ )
216
+
217
+ temperature: Optional[float] = Field(
218
+ default=0.3,
219
+ ge=0.0,
220
+ le=2.0,
221
+ description="Sampling temperature"
222
+ )
223
+
224
+ top_p: Optional[float] = Field(
225
+ default=0.9,
226
+ ge=0.0,
227
+ le=1.0,
228
+ description="Nucleus sampling"
229
+ )
230
+
231
+ prompt: Optional[str] = Field(
232
+ default="Summarize this article concisely:",
233
+ description="Custom summarization prompt"
234
+ )
235
+
236
+ include_metadata: Optional[bool] = Field(
237
+ default=True,
238
+ description="Include article metadata in response"
239
+ )
240
+
241
+ use_cache: Optional[bool] = Field(
242
+ default=True,
243
+ description="Use cached content if available (URL mode only)"
244
+ )
245
+
246
+ @model_validator(mode='after')
247
+ def check_url_or_text(self):
248
+ """Ensure exactly one of url or text is provided."""
249
+ if not self.url and not self.text:
250
+ raise ValueError('Either "url" or "text" must be provided')
251
+ if self.url and self.text:
252
+ raise ValueError('Provide either "url" OR "text", not both')
253
+ return self
254
+
255
+ @field_validator('url')
256
+ @classmethod
257
+ def validate_url(cls, v: Optional[str]) -> Optional[str]:
258
+ """Validate URL format if provided."""
259
+ if v is None:
260
+ return v
261
+
262
+ # URL validation regex
263
+ url_pattern = re.compile(
264
+ r'^https?://' # http:// or https://
265
+ r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+[A-Z]{2,6}\.?|' # domain
266
+ r'localhost|' # localhost
267
+ r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})' # or IP
268
+ r'(?::\d+)?' # optional port
269
+ r'(?:/?|[/?]\S+)$', re.IGNORECASE
270
+ )
271
+
272
+ if not url_pattern.match(v):
273
+ raise ValueError('Invalid URL format. Must start with http:// or https://')
274
+
275
+ # SSRF protection
276
+ v_lower = v.lower()
277
+ if 'localhost' in v_lower or '127.0.0.1' in v:
278
+ raise ValueError('Cannot scrape localhost URLs')
279
+
280
+ if any(private in v for private in ['192.168.', '10.', '172.16.', '172.17.', '172.18.']):
281
+ raise ValueError('Cannot scrape private IP addresses')
282
+
283
+ if len(v) > 2000:
284
+ raise ValueError('URL too long (maximum 2000 characters)')
285
+
286
+ return v
287
+
288
+ @field_validator('text')
289
+ @classmethod
290
+ def validate_text(cls, v: Optional[str]) -> Optional[str]:
291
+ """Validate text content if provided."""
292
+ if v is None:
293
+ return v
294
+
295
+ if len(v) < 50:
296
+ raise ValueError('Text too short (minimum 50 characters)')
297
+
298
+ if len(v) > 50000:
299
+ raise ValueError('Text too long (maximum 50,000 characters)')
300
+
301
+ # Check for mostly whitespace
302
+ non_whitespace = len(v.replace(' ', '').replace('\n', '').replace('\t', ''))
303
+ if non_whitespace < 30:
304
+ raise ValueError('Text contains mostly whitespace')
305
+
306
+ return v
307
+ ```
308
+
309
+ ---
310
+
311
+ ### Step 2: Update Endpoint Logic
312
+
313
+ **File:** `app/api/v3/scrape_summarize.py`
314
+
315
+ **Changes:**
316
+ 1. Detect input type (URL vs text)
317
+ 2. Branch logic accordingly
318
+ 3. Adjust metadata based on input type
319
+ 4. Keep streaming logic the same
320
+
321
+ **Code:**
322
+ ```python
323
+ @router.post("/scrape-and-summarize/stream")
324
+ async def scrape_and_summarize_stream(
325
+ request: Request,
326
+ payload: ScrapeAndSummarizeRequest
327
+ ):
328
+ """
329
+ Scrape article from URL OR summarize provided text.
330
+
331
+ Supports two modes:
332
+ 1. URL mode: Scrape article from URL then summarize
333
+ 2. Text mode: Summarize provided text directly
334
+
335
+ Returns:
336
+ Server-Sent Events stream with metadata and content chunks
337
+ """
338
+ request_id = getattr(request.state, 'request_id', 'unknown')
339
+
340
+ # Determine input mode
341
+ if payload.url:
342
+ # URL Mode: Scrape + Summarize
343
+ logger.info(f"[{request_id}] V3 URL mode: {payload.url}")
344
+
345
+ scrape_start = time.time()
346
+ try:
347
+ article_data = await article_scraper_service.scrape_article(
348
+ url=payload.url,
349
+ use_cache=payload.use_cache
350
+ )
351
+ except Exception as e:
352
+ logger.error(f"[{request_id}] Scraping failed: {e}")
353
+ raise HTTPException(
354
+ status_code=502,
355
+ detail=f"Failed to scrape article: {str(e)}"
356
+ )
357
+
358
+ scrape_latency_ms = (time.time() - scrape_start) * 1000
359
+ logger.info(f"[{request_id}] Scraped in {scrape_latency_ms:.2f}ms, "
360
+ f"extracted {len(article_data['text'])} chars")
361
+
362
+ # Validate scraped content
363
+ if len(article_data['text']) < 100:
364
+ raise HTTPException(
365
+ status_code=422,
366
+ detail="Insufficient content extracted from URL. "
367
+ "Article may be behind paywall or site may block scrapers."
368
+ )
369
+
370
+ text_to_summarize = article_data['text']
371
+ metadata = {
372
+ 'input_type': 'url',
373
+ 'url': payload.url,
374
+ 'title': article_data.get('title'),
375
+ 'author': article_data.get('author'),
376
+ 'date': article_data.get('date'),
377
+ 'site_name': article_data.get('site_name'),
378
+ 'scrape_method': article_data.get('method', 'static'),
379
+ 'scrape_latency_ms': scrape_latency_ms,
380
+ 'extracted_text_length': len(article_data['text']),
381
+ }
382
+
383
+ else:
384
+ # Text Mode: Direct Summarization
385
+ logger.info(f"[{request_id}] V3 text mode: {len(payload.text)} chars")
386
+
387
+ text_to_summarize = payload.text
388
+ metadata = {
389
+ 'input_type': 'text',
390
+ 'text_length': len(payload.text),
391
+ }
392
+
393
+ # Stream summarization (same for both modes)
394
+ return StreamingResponse(
395
+ _stream_generator(text_to_summarize, payload, metadata, request_id),
396
+ media_type="text/event-stream",
397
+ headers={
398
+ "Cache-Control": "no-cache",
399
+ "Connection": "keep-alive",
400
+ "X-Accel-Buffering": "no",
401
+ "X-Request-ID": request_id,
402
+ }
403
+ )
404
+
405
+
406
+ async def _stream_generator(text: str, payload, metadata: dict, request_id: str):
407
+ """Generate SSE stream for summarization."""
408
+
409
+ # Send metadata event first
410
+ if payload.include_metadata:
411
+ metadata_event = {
412
+ "type": "metadata",
413
+ "data": metadata
414
+ }
415
+ yield f"data: {json.dumps(metadata_event)}\n\n"
416
+
417
+ # Stream summarization chunks
418
+ summarization_start = time.time()
419
+ tokens_used = 0
420
+
421
+ try:
422
+ async for chunk in hf_streaming_service.summarize_text_stream(
423
+ text=text,
424
+ max_new_tokens=payload.max_tokens,
425
+ temperature=payload.temperature,
426
+ top_p=payload.top_p,
427
+ prompt=payload.prompt,
428
+ ):
429
+ if not chunk.get('done', False):
430
+ tokens_used = chunk.get('tokens_used', tokens_used)
431
+
432
+ yield f"data: {json.dumps(chunk)}\n\n"
433
+
434
+ except Exception as e:
435
+ logger.error(f"[{request_id}] Summarization failed: {e}")
436
+ error_event = {
437
+ "type": "error",
438
+ "error": str(e),
439
+ "done": True
440
+ }
441
+ yield f"data: {json.dumps(error_event)}\n\n"
442
+ return
443
+
444
+ summarization_latency_ms = (time.time() - summarization_start) * 1000
445
+
446
+ # Calculate total latency
447
+ total_latency_ms = summarization_latency_ms
448
+ if metadata.get('input_type') == 'url':
449
+ total_latency_ms += metadata.get('scrape_latency_ms', 0)
450
+
451
+ logger.info(f"[{request_id}] V3 request completed in {total_latency_ms:.2f}ms")
452
+ ```
453
+
454
+ ---
455
+
456
+ ### Step 3: Update Tests
457
+
458
+ **File:** `tests/test_v3_api.py`
459
+
460
+ **New Test Cases:**
461
+
462
+ ```python
463
+ @pytest.mark.asyncio
464
+ async def test_v3_text_mode_success(client):
465
+ """Test V3 endpoint with text input (no scraping)."""
466
+ response = await client.post(
467
+ "/api/v3/scrape-and-summarize/stream",
468
+ json={
469
+ "text": "This is a test article with enough content to summarize properly. "
470
+ "It has multiple sentences and provides meaningful information.",
471
+ "max_tokens": 128,
472
+ "include_metadata": True
473
+ }
474
+ )
475
+
476
+ assert response.status_code == 200
477
+ assert response.headers['content-type'].startswith('text/event-stream')
478
+
479
+ # Parse SSE stream
480
+ events = []
481
+ for line in response.text.split('\n'):
482
+ if line.startswith('data: '):
483
+ events.append(json.loads(line[6:]))
484
+
485
+ # Check metadata event
486
+ metadata_event = next(e for e in events if e.get('type') == 'metadata')
487
+ assert metadata_event['data']['input_type'] == 'text'
488
+ assert metadata_event['data']['text_length'] > 0
489
+ assert 'scrape_latency_ms' not in metadata_event['data'] # No scraping in text mode
490
+
491
+ # Check content events exist
492
+ content_events = [e for e in events if 'content' in e]
493
+ assert len(content_events) > 0
494
+
495
+
496
+ @pytest.mark.asyncio
497
+ async def test_v3_url_mode_success(client):
498
+ """Test V3 endpoint with URL input (with scraping)."""
499
+ with patch('app.services.article_scraper.article_scraper_service.scrape_article') as mock_scrape:
500
+ mock_scrape.return_value = {
501
+ 'text': 'Scraped article content here...',
502
+ 'title': 'Test Article',
503
+ 'url': 'https://example.com/test',
504
+ 'method': 'static'
505
+ }
506
+
507
+ response = await client.post(
508
+ "/api/v3/scrape-and-summarize/stream",
509
+ json={
510
+ "url": "https://example.com/test",
511
+ "max_tokens": 128
512
+ }
513
+ )
514
+
515
+ assert response.status_code == 200
516
+
517
+ # Parse events
518
+ events = []
519
+ for line in response.text.split('\n'):
520
+ if line.startswith('data: '):
521
+ events.append(json.loads(line[6:]))
522
+
523
+ # Check metadata shows URL mode
524
+ metadata_event = next(e for e in events if e.get('type') == 'metadata')
525
+ assert metadata_event['data']['input_type'] == 'url'
526
+ assert 'scrape_latency_ms' in metadata_event['data']
527
+
528
+
529
+ @pytest.mark.asyncio
530
+ async def test_v3_missing_both_url_and_text(client):
531
+ """Test validation error when neither url nor text provided."""
532
+ response = await client.post(
533
+ "/api/v3/scrape-and-summarize/stream",
534
+ json={
535
+ "max_tokens": 128
536
+ }
537
+ )
538
+
539
+ assert response.status_code == 422
540
+ error_detail = response.json()['detail']
541
+ assert 'must be provided' in error_detail[0]['msg']  # model-level error, loc is the body
542
+
543
+
544
+ @pytest.mark.asyncio
545
+ async def test_v3_both_url_and_text_provided(client):
546
+ """Test validation error when both url and text provided."""
547
+ response = await client.post(
548
+ "/api/v3/scrape-and-summarize/stream",
549
+ json={
550
+ "url": "https://example.com/test",
551
+ "text": "Some text here",
552
+ "max_tokens": 128
553
+ }
554
+ )
555
+
556
+ assert response.status_code == 422
557
+
558
+
559
+ @pytest.mark.asyncio
560
+ async def test_v3_text_too_short(client):
561
+ """Test validation error for text that's too short."""
562
+ response = await client.post(
563
+ "/api/v3/scrape-and-summarize/stream",
564
+ json={
565
+ "text": "Too short", # Less than 50 chars
566
+ "max_tokens": 128
567
+ }
568
+ )
569
+
570
+ assert response.status_code == 422
571
+ assert 'too short' in response.json()['detail'][0]['msg'].lower()
572
+ ```
573
+
574
+ ---
575
+
576
+ ### Step 4: Update Documentation
577
+
578
+ **File:** `CLAUDE.md`
579
+
580
+ **Update V3 API section:**
581
+
582
+ ```markdown
583
+ ### V3 API (/api/v3/*): Web Scraping + Summarization
584
+
585
+ **Endpoint:** POST `/api/v3/scrape-and-summarize/stream`
586
+
587
+ **Supports two modes:**
588
+
589
+ 1. **URL Mode** (scraping enabled):
590
+ ```json
591
+ {
592
+ "url": "https://example.com/article",
593
+ "max_tokens": 256
594
+ }
595
+ ```
596
+ - Scrapes article from URL
597
+ - Caches result for 1 hour
598
+ - Streams summarization
599
+
600
+ 2. **Text Mode** (direct summarization):
601
+ ```json
602
+ {
603
+ "text": "Your article text here...",
604
+ "max_tokens": 256
605
+ }
606
+ ```
607
+ - Skips scraping
608
+ - Summarizes text directly
609
+ - Useful when scraping fails or text already extracted
610
+
611
+ **Features:**
612
+ - Intelligent input detection (URL vs text)
613
+ - Backend web scraping with trafilatura
614
+ - In-memory caching (URL mode only)
615
+ - User-agent rotation
616
+ - Metadata extraction (URL mode: title, author, date)
617
+ - SSRF protection
618
+ - Rate limiting
619
+
620
+ **Response Format:**
621
+ Same Server-Sent Events format for both modes:
622
+ ```
623
+ data: {"type":"metadata","data":{"input_type":"url|text",...}}
624
+ data: {"content":"token","done":false,"tokens_used":N}
625
+ data: {"content":"","done":true,"latency_ms":MS}
626
+ ```
627
+ ```
628
+
629
+ **File:** `README.md`
630
+
631
+ **Add usage examples:**
632
+
633
+ ```markdown
634
+ ### V3 API Examples
635
+
636
+ **Scrape and Summarize from URL:**
637
+ ```bash
638
+ curl -X POST "https://your-space.hf.space/api/v3/scrape-and-summarize/stream" \
639
+ -H "Content-Type: application/json" \
640
+ -d '{
641
+ "url": "https://example.com/article",
642
+ "max_tokens": 256,
643
+ "temperature": 0.3
644
+ }'
645
+ ```
646
+
647
+ **Summarize Direct Text:**
648
+ ```bash
649
+ curl -X POST "https://your-space.hf.space/api/v3/scrape-and-summarize/stream" \
650
+ -H "Content-Type: application/json" \
651
+ -d '{
652
+ "text": "Your article text here...",
653
+ "max_tokens": 256,
654
+ "temperature": 0.3
655
+ }'
656
+ ```
657
+
658
+ **Python Example:**
659
+ ```python
660
+ import requests
661
+
662
+ # URL mode
663
+ response = requests.post(
664
+ "https://your-space.hf.space/api/v3/scrape-and-summarize/stream",
665
+ json={"url": "https://example.com/article", "max_tokens": 256},
666
+ stream=True
667
+ )
668
+
669
+ # Text mode
670
+ response = requests.post(
671
+ "https://your-space.hf.space/api/v3/scrape-and-summarize/stream",
672
+ json={"text": "Article content here...", "max_tokens": 256},
673
+ stream=True
674
+ )
675
+
676
+ for line in response.iter_lines():
677
+ if line.startswith(b'data: '):
678
+ data = json.loads(line[6:])
679
+ if data.get('content'):
680
+ print(data['content'], end='')
681
+ ```
682
+ ```
683
+
684
+ ---
685
+
686
+ ## Benefits of This Approach
687
+
688
+ ### 1. Single Unified Endpoint
689
+ - Android app uses one endpoint for everything
690
+ - No need to choose between `/api/v2/` and `/api/v3/`
691
+ - Simpler client-side logic
692
+
693
+ ### 2. Graceful Fallback
694
+ - If scraping fails (paywall, blocked), user can paste text manually
695
+ - App can catch 502 errors and prompt user to provide text directly
696
+
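+ A client-side sketch of this fallback flow (not part of this change; the endpoint URL comes from the README, the pasted text is a placeholder):
+
+ ```python
+ import json
+
+ import requests
+
+ API = "https://colin730-SummarizerApp.hf.space/api/v3/scrape-and-summarize/stream"
+
+
+ def summarize(payload: dict) -> requests.Response:
+     return requests.post(API, json=payload, stream=True)
+
+
+ # First attempt: URL mode
+ response = summarize({"url": "https://example.com/article", "max_tokens": 256})
+
+ if response.status_code in (422, 502):
+     # Scraping failed (paywall, blocked, or thin content): ask the user to paste
+     # the article text and retry the same endpoint in text mode.
+     user_text = "...text pasted by the user (at least 50 characters)..."
+     response = summarize({"text": user_text, "max_tokens": 256})
+
+ for line in response.iter_lines():
+     if line.startswith(b"data: "):
+         data = json.loads(line[6:])
+         if data.get("content"):
+             print(data["content"], end="")
+ ```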
697
+ ### 3. Backward Compatible
698
+ - Existing URL-based requests still work
699
+ - No breaking changes for current users
700
+
701
+ ### 4. Better Error Messages
702
+ ```json
703
+ // Missing both
704
+ {
705
+ "detail": [
706
+ {
707
+ "type": "value_error",
708
+ "msg": "Either 'url' or 'text' must be provided"
709
+ }
710
+ ]
711
+ }
712
+
713
+ // Both provided
714
+ {
715
+ "detail": [
716
+ {
717
+ "type": "value_error",
718
+ "msg": "Provide either 'url' OR 'text', not both"
719
+ }
720
+ ]
721
+ }
722
+
723
+ // Text too short
724
+ {
725
+ "detail": [
726
+ {
727
+ "loc": ["body", "text"],
728
+ "msg": "Text too short (minimum 50 characters)"
729
+ }
730
+ ]
731
+ }
732
+ ```
733
+
734
+ ### 5. Clear Metadata
735
+ ```json
736
+ // URL mode metadata
737
+ {
738
+ "type": "metadata",
739
+ "data": {
740
+ "input_type": "url",
741
+ "url": "https://...",
742
+ "title": "Article Title",
743
+ "scrape_latency_ms": 450.2
744
+ }
745
+ }
746
+
747
+ // Text mode metadata
748
+ {
749
+ "type": "metadata",
750
+ "data": {
751
+ "input_type": "text",
752
+ "text_length": 1234
753
+ }
754
+ }
755
+ ```
756
+
757
+ ---
758
+
759
+ ## Testing Checklist
760
+
761
+ - [ ] Test URL mode with valid URL
762
+ - [ ] Test text mode with valid text
763
+ - [ ] Test validation: missing both url and text (expect 422)
764
+ - [ ] Test validation: both url and text provided (expect 422)
765
+ - [ ] Test validation: text too short (< 50 chars, expect 422)
766
+ - [ ] Test validation: text too long (> 50k chars, expect 422)
767
+ - [ ] Test validation: invalid URL format (expect 422)
768
+ - [ ] Test SSRF protection: localhost URL (expect 422)
769
+ - [ ] Test SSRF protection: private IP (expect 422)
770
+ - [ ] Test metadata event in URL mode (includes scrape_latency_ms)
771
+ - [ ] Test metadata event in text mode (no scrape_latency_ms)
772
+ - [ ] Test streaming format same for both modes
773
+ - [ ] Test cache works in URL mode
774
+ - [ ] Test cache not used in text mode
775
+
776
+ ---
777
+
778
+ ## Deployment Steps
779
+
780
+ 1. **Update Schema** (`app/api/v3/schemas.py`)
781
+ - Make url Optional
782
+ - Add text Optional
783
+ - Add model_validator for mutual exclusivity
784
+ - Update validators
785
+
786
+ 2. **Update Endpoint** (`app/api/v3/scrape_summarize.py`)
787
+ - Add input type detection
788
+ - Branch logic for URL vs text mode
789
+ - Adjust metadata
790
+
791
+ 3. **Update Tests** (`tests/test_v3_api.py`)
792
+ - Add text mode tests
793
+ - Add validation tests
794
+ - Ensure 90% coverage
795
+
796
+ 4. **Update Docs** (`CLAUDE.md`, `README.md`)
797
+ - Document both modes
798
+ - Add examples
799
+
800
+ 5. **Test Locally**
801
+ ```bash
802
+ pytest tests/test_v3_api.py -v
803
+ ```
804
+
805
+ 6. **Deploy to HF Spaces**
806
+ - Push changes
807
+ - Monitor logs
808
+ - Test both modes on live deployment
809
+
810
+ 7. **Update Android App**
811
+ - App can now send either URL or text to same endpoint
812
+ - Graceful fallback: if scraping fails, prompt user for text
813
+
814
+ ---
815
+
816
+ ## Success Criteria
817
+
818
+ ✅ URL mode works (scraping + summarization)
819
+ ✅ Text mode works (direct summarization)
820
+ ✅ Validation errors are clear and helpful
821
+ ✅ No 422 errors when text is sent
822
+ ✅ Metadata correctly indicates input type
823
+ ✅ Tests pass with 90%+ coverage
824
+ ✅ Documentation updated
825
+ ✅ Android app can use single endpoint for both scenarios
826
+
827
+ ---
828
+
829
+ ## Estimated Impact
830
+
831
+ - **Code Changes:** ~100 lines modified
832
+ - **New Tests:** ~8 test cases
833
+ - **Breaking Changes:** None (backward compatible)
834
+ - **Performance:** No impact (same logic, just more flexible input)
835
+ - **Memory:** No impact
836
+ - **Deployment Time:** ~30 minutes
837
+
838
+ ---
839
+
840
+ ## Conclusion
841
+
842
+ This fix transforms the V3 API from a URL-only endpoint to a **smart, dual-mode endpoint** that gracefully handles both URLs and plain text. The Android app gains flexibility without added complexity, and users get better error messages when validation fails.
843
+
844
+ **Ready to implement!** 🚀
app/api/v3/schemas.py CHANGED
@@ -5,17 +5,22 @@ Request and response schemas for V3 API.
5
  import re
6
  from typing import Optional
7
 
8
- from pydantic import BaseModel, Field, validator
9
 
10
 
11
  class ScrapeAndSummarizeRequest(BaseModel):
12
- """Request schema for scrape-and-summarize endpoint."""
13
 
14
- url: str = Field(
15
- ...,
16
  description="URL of article to scrape and summarize",
17
  example="https://example.com/article",
18
  )
19
  max_tokens: Optional[int] = Field(
20
  default=256, ge=1, le=2048, description="Maximum tokens in summary"
21
  )
@@ -36,12 +41,25 @@ class ScrapeAndSummarizeRequest(BaseModel):
36
  default=True, description="Include article metadata in response"
37
  )
38
  use_cache: Optional[bool] = Field(
39
- default=True, description="Use cached content if available"
40
  )
41
 
42
- @validator("url")
43
- def validate_url(cls, v):
44
  """Validate URL format and security."""
45
  # Basic URL pattern validation
46
  url_pattern = re.compile(
47
  r"^https?://" # http:// or https://
@@ -53,12 +71,12 @@ class ScrapeAndSummarizeRequest(BaseModel):
53
  re.IGNORECASE,
54
  )
55
  if not url_pattern.match(v):
56
- raise ValueError("Invalid URL format")
57
 
58
  # SSRF protection - block localhost and private IPs
59
  v_lower = v.lower()
60
  if "localhost" in v_lower or "127.0.0.1" in v_lower:
61
- raise ValueError("Cannot scrape localhost")
62
 
63
  # Block common private IP ranges
64
  from urllib.parse import urlparse
@@ -95,7 +113,27 @@ class ScrapeAndSummarizeRequest(BaseModel):
95
 
96
  # Limit URL length
97
  if len(v) > 2000:
98
- raise ValueError("URL too long (max 2000 characters)")
99
 
100
  return v
101
 
 
5
  import re
6
  from typing import Optional
7
 
8
+ from pydantic import BaseModel, Field, field_validator, model_validator
9
 
10
 
11
  class ScrapeAndSummarizeRequest(BaseModel):
12
+ """Request schema supporting both URL scraping and direct text summarization."""
13
 
14
+ url: Optional[str] = Field(
15
+ None,
16
  description="URL of article to scrape and summarize",
17
  example="https://example.com/article",
18
  )
19
+ text: Optional[str] = Field(
20
+ None,
21
+ description="Direct text to summarize (alternative to URL)",
22
+ example="Your article text here...",
23
+ )
24
  max_tokens: Optional[int] = Field(
25
  default=256, ge=1, le=2048, description="Maximum tokens in summary"
26
  )
 
41
  default=True, description="Include article metadata in response"
42
  )
43
  use_cache: Optional[bool] = Field(
44
+ default=True, description="Use cached content if available (URL mode only)"
45
  )
46
 
47
+ @model_validator(mode="after")
48
+ def check_url_or_text(self):
49
+ """Ensure exactly one of url or text is provided."""
50
+ if not self.url and not self.text:
51
+ raise ValueError('Either "url" or "text" must be provided')
52
+ if self.url and self.text:
53
+ raise ValueError('Provide either "url" OR "text", not both')
54
+ return self
55
+
56
+ @field_validator("url")
57
+ @classmethod
58
+ def validate_url(cls, v: Optional[str]) -> Optional[str]:
59
  """Validate URL format and security."""
60
+ if v is None:
61
+ return v
62
+
63
  # Basic URL pattern validation
64
  url_pattern = re.compile(
65
  r"^https?://" # http:// or https://
 
71
  re.IGNORECASE,
72
  )
73
  if not url_pattern.match(v):
74
+ raise ValueError("Invalid URL format. Must start with http:// or https://")
75
 
76
  # SSRF protection - block localhost and private IPs
77
  v_lower = v.lower()
78
  if "localhost" in v_lower or "127.0.0.1" in v_lower:
79
+ raise ValueError("Cannot scrape localhost URLs")
80
 
81
  # Block common private IP ranges
82
  from urllib.parse import urlparse
 
113
 
114
  # Limit URL length
115
  if len(v) > 2000:
116
+ raise ValueError("URL too long (maximum 2000 characters)")
117
+
118
+ return v
119
+
120
+ @field_validator("text")
121
+ @classmethod
122
+ def validate_text(cls, v: Optional[str]) -> Optional[str]:
123
+ """Validate text content if provided."""
124
+ if v is None:
125
+ return v
126
+
127
+ if len(v) < 50:
128
+ raise ValueError("Text too short (minimum 50 characters)")
129
+
130
+ if len(v) > 50000:
131
+ raise ValueError("Text too long (maximum 50,000 characters)")
132
+
133
+ # Check for mostly whitespace
134
+ non_whitespace = len(v.replace(" ", "").replace("\n", "").replace("\t", ""))
135
+ if non_whitespace < 30:
136
+ raise ValueError("Text contains mostly whitespace")
137
 
138
  return v
139
 
app/api/v3/scrape_summarize.py CHANGED
@@ -22,53 +22,80 @@ async def scrape_and_summarize_stream(
22
  request: Request, payload: ScrapeAndSummarizeRequest
23
  ):
24
  """
25
- Scrape article from URL and stream summarization.
26
 
27
  Process:
28
- 1. Scrape article content from URL (with caching)
29
- 2. Validate content quality
30
- 3. Stream summarization using V2 HF engine
31
 
32
  Returns:
33
  Server-Sent Events stream with:
34
- - Metadata event (title, author, scrape latency)
35
  - Content chunks (streaming summary tokens)
36
  - Done event (final latency)
37
  """
38
  request_id = getattr(request.state, "request_id", "unknown")
39
- logger.info(
40
- f"[{request_id}] V3 scrape-and-summarize request for: {payload.url[:80]}..."
41
- )
42
 
43
- # Step 1: Scrape article
44
- scrape_start = time.time()
45
- try:
46
- article_data = await article_scraper_service.scrape_article(
47
- url=payload.url, use_cache=payload.use_cache
48
- )
49
- except Exception as e:
50
- logger.error(f"[{request_id}] Scraping failed: {e}")
51
- raise HTTPException(
52
- status_code=502, detail=f"Failed to scrape article: {str(e)}"
53
  )
54
 
55
- scrape_latency_ms = (time.time() - scrape_start) * 1000
56
- logger.info(
57
- f"[{request_id}] Scraped in {scrape_latency_ms:.2f}ms, "
58
- f"extracted {len(article_data['text'])} chars"
59
- )
60
 
61
- # Step 2: Validate content
62
- if len(article_data["text"]) < 100:
63
- raise HTTPException(
64
- status_code=422,
65
- detail="Insufficient content extracted from URL. "
66
- "Article may be behind paywall or site may block scrapers.",
67
- )
68
 
69
- # Step 3: Stream summarization
70
  return StreamingResponse(
71
- _stream_generator(article_data, payload, scrape_latency_ms, request_id),
72
  media_type="text/event-stream",
73
  headers={
74
  "Cache-Control": "no-cache",
@@ -79,33 +106,21 @@ async def scrape_and_summarize_stream(
79
  )
80
 
81
 
82
- async def _stream_generator(article_data, payload, scrape_latency_ms, request_id):
83
- """Generate SSE stream for scraping + summarization."""
84
 
85
  # Send metadata event first
86
  if payload.include_metadata:
87
- metadata_event = {
88
- "type": "metadata",
89
- "data": {
90
- "title": article_data.get("title"),
91
- "author": article_data.get("author"),
92
- "date": article_data.get("date"),
93
- "site_name": article_data.get("site_name"),
94
- "url": article_data.get("url"),
95
- "scrape_method": article_data.get("method", "static"),
96
- "scrape_latency_ms": scrape_latency_ms,
97
- "extracted_text_length": len(article_data["text"]),
98
- },
99
- }
100
  yield f"data: {json.dumps(metadata_event)}\n\n"
101
 
102
- # Stream summarization chunks (reuse V2 HF service)
103
  summarization_start = time.time()
104
  tokens_used = 0
105
 
106
  try:
107
  async for chunk in hf_streaming_service.summarize_text_stream(
108
- text=article_data["text"],
109
  max_new_tokens=payload.max_tokens,
110
  temperature=payload.temperature,
111
  top_p=payload.top_p,
@@ -123,9 +138,17 @@ async def _stream_generator(article_data, payload, scrape_latency_ms, request_id
123
  return
124
 
125
  summarization_latency_ms = (time.time() - summarization_start) * 1000
126
- total_latency_ms = scrape_latency_ms + summarization_latency_ms
127
 
128
- logger.info(
129
- f"[{request_id}] V3 request completed in {total_latency_ms:.2f}ms "
130
- f"(scrape: {scrape_latency_ms:.2f}ms, summary: {summarization_latency_ms:.2f}ms)"
131
- )
22
  request: Request, payload: ScrapeAndSummarizeRequest
23
  ):
24
  """
25
+ Scrape article from URL OR summarize provided text.
26
+
27
+ Supports two modes:
28
+ 1. URL mode: Scrape article from URL then summarize
29
+ 2. Text mode: Summarize provided text directly
30
 
31
  Process:
32
+ - URL mode: Scrape article (with caching) -> Validate -> Stream summarization
33
+ - Text mode: Validate text -> Stream summarization
 
34
 
35
  Returns:
36
  Server-Sent Events stream with:
37
+ - Metadata event (input_type, title/author for URL mode, text_length for text mode)
38
  - Content chunks (streaming summary tokens)
39
  - Done event (final latency)
40
  """
41
  request_id = getattr(request.state, "request_id", "unknown")
42
 
43
+ # Determine input mode and prepare data
44
+ if payload.url:
45
+ # URL Mode: Scrape + Summarize
46
+ logger.info(f"[{request_id}] V3 URL mode: {payload.url[:80]}...")
47
+
48
+ scrape_start = time.time()
49
+ try:
50
+ article_data = await article_scraper_service.scrape_article(
51
+ url=payload.url, use_cache=payload.use_cache
52
+ )
53
+ except Exception as e:
54
+ logger.error(f"[{request_id}] Scraping failed: {e}")
55
+ raise HTTPException(
56
+ status_code=502, detail=f"Failed to scrape article: {str(e)}"
57
+ )
58
+
59
+ scrape_latency_ms = (time.time() - scrape_start) * 1000
60
+ logger.info(
61
+ f"[{request_id}] Scraped in {scrape_latency_ms:.2f}ms, "
62
+ f"extracted {len(article_data['text'])} chars"
63
  )
64
 
65
+ # Validate scraped content
66
+ if len(article_data["text"]) < 100:
67
+ raise HTTPException(
68
+ status_code=422,
69
+ detail="Insufficient content extracted from URL. "
70
+ "Article may be behind paywall or site may block scrapers.",
71
+ )
72
+
73
+ text_to_summarize = article_data["text"]
74
+ metadata = {
75
+ "input_type": "url",
76
+ "url": payload.url,
77
+ "title": article_data.get("title"),
78
+ "author": article_data.get("author"),
79
+ "date": article_data.get("date"),
80
+ "site_name": article_data.get("site_name"),
81
+ "scrape_method": article_data.get("method", "static"),
82
+ "scrape_latency_ms": scrape_latency_ms,
83
+ "extracted_text_length": len(article_data["text"]),
84
+ }
85
 
86
+ else:
87
+ # Text Mode: Direct Summarization
88
+ logger.info(f"[{request_id}] V3 text mode: {len(payload.text)} chars")
89
+
90
+ text_to_summarize = payload.text
91
+ metadata = {
92
+ "input_type": "text",
93
+ "text_length": len(payload.text),
94
+ }
95
 
96
+ # Stream summarization (same for both modes)
97
  return StreamingResponse(
98
+ _stream_generator(text_to_summarize, payload, metadata, request_id),
99
  media_type="text/event-stream",
100
  headers={
101
  "Cache-Control": "no-cache",
 
106
  )
107
 
108
 
109
+ async def _stream_generator(text: str, payload, metadata: dict, request_id: str):
110
+ """Generate SSE stream for summarization (works for both URL and text modes)."""
111
 
112
  # Send metadata event first
113
  if payload.include_metadata:
114
+ metadata_event = {"type": "metadata", "data": metadata}
115
  yield f"data: {json.dumps(metadata_event)}\n\n"
116
 
117
+ # Stream summarization chunks
118
  summarization_start = time.time()
119
  tokens_used = 0
120
 
121
  try:
122
  async for chunk in hf_streaming_service.summarize_text_stream(
123
+ text=text,
124
  max_new_tokens=payload.max_tokens,
125
  temperature=payload.temperature,
126
  top_p=payload.top_p,
 
138
  return
139
 
140
  summarization_latency_ms = (time.time() - summarization_start) * 1000
 
141
 
142
+ # Calculate total latency (include scrape time for URL mode)
143
+ total_latency_ms = summarization_latency_ms
144
+ if metadata.get("input_type") == "url":
145
+ total_latency_ms += metadata.get("scrape_latency_ms", 0)
146
+ logger.info(
147
+ f"[{request_id}] V3 request completed in {total_latency_ms:.2f}ms "
148
+ f"(scrape: {metadata.get('scrape_latency_ms', 0):.2f}ms, "
149
+ f"summary: {summarization_latency_ms:.2f}ms)"
150
+ )
151
+ else:
152
+ logger.info(
153
+ f"[{request_id}] V3 text mode completed in {total_latency_ms:.2f}ms"
154
+ )