""" Tests for V3 API endpoints. """ import json from unittest.mock import patch import pytest from fastapi.testclient import TestClient from app.main import app def test_scrape_and_summarize_stream_success(client: TestClient): """Test successful scrape-and-summarize flow.""" # Mock article scraping with patch( "app.services.article_scraper.article_scraper_service.scrape_article" ) as mock_scrape: mock_scrape.return_value = { "text": "This is a test article with enough content to summarize properly. " * 20, "title": "Test Article", "author": "Test Author", "date": "2024-01-15", "site_name": "Test Site", "url": "https://example.com/test", "method": "static", "scrape_time_ms": 450.2, } # Mock HF summarization streaming async def mock_stream(*args, **kwargs): yield {"content": "The", "done": False, "tokens_used": 1} yield {"content": " article", "done": False, "tokens_used": 3} yield {"content": " discusses", "done": False, "tokens_used": 5} yield {"content": "", "done": True, "tokens_used": 5, "latency_ms": 2000.0} with patch( "app.services.hf_streaming_summarizer.hf_streaming_service.summarize_text_stream", side_effect=mock_stream, ): response = client.post( "/api/v3/scrape-and-summarize/stream", json={ "url": "https://example.com/test", "max_tokens": 128, "include_metadata": True, }, ) assert response.status_code == 200 assert ( response.headers["content-type"] == "text/event-stream; charset=utf-8" ) # Parse SSE stream events = [] for line in response.text.split("\n"): if line.startswith("data: "): try: events.append(json.loads(line[6:])) except json.JSONDecodeError: pass assert len(events) > 0 # Check metadata event metadata_events = [e for e in events if e.get("type") == "metadata"] assert len(metadata_events) == 1 metadata = metadata_events[0]["data"] assert metadata["title"] == "Test Article" assert metadata["author"] == "Test Author" assert "scrape_latency_ms" in metadata # Check content events content_events = [ e for e in events if "content" in e and not e.get("done", False) ] assert len(content_events) >= 3 # Check done event done_events = [e for e in events if e.get("done") == True] assert len(done_events) == 1 def test_scrape_invalid_url(client: TestClient): """Test error handling for invalid URL.""" response = client.post( "/api/v3/scrape-and-summarize/stream", json={"url": "not-a-valid-url", "max_tokens": 128}, ) assert response.status_code == 422 # Validation error def test_scrape_localhost_blocked(client: TestClient): """Test SSRF protection - localhost blocked.""" response = client.post( "/api/v3/scrape-and-summarize/stream", json={"url": "http://localhost:8000/secret", "max_tokens": 128}, ) assert response.status_code == 422 assert "localhost" in response.text.lower() def test_scrape_private_ip_blocked(client: TestClient): """Test SSRF protection - private IPs blocked.""" response = client.post( "/api/v3/scrape-and-summarize/stream", json={"url": "http://192.168.1.1/secret", "max_tokens": 128}, ) assert response.status_code == 422 assert "private" in response.text.lower() def test_scrape_insufficient_content(client: TestClient): """Test error when extracted content is insufficient.""" with patch( "app.services.article_scraper.article_scraper_service.scrape_article" ) as mock_scrape: mock_scrape.return_value = { "text": "Too short", # Less than 100 chars "title": "Test", "url": "https://example.com/short", "method": "static", "scrape_time_ms": 100.0, } response = client.post( "/api/v3/scrape-and-summarize/stream", json={"url": "https://example.com/short"}, ) assert response.status_code == 422 assert "insufficient" in response.text.lower() def test_scrape_failure(client: TestClient): """Test error handling when scraping fails.""" with patch( "app.services.article_scraper.article_scraper_service.scrape_article" ) as mock_scrape: mock_scrape.side_effect = Exception("Connection timeout") response = client.post( "/api/v3/scrape-and-summarize/stream", json={"url": "https://example.com/timeout"}, ) assert response.status_code == 502 assert "failed to scrape" in response.text.lower() def test_scrape_without_metadata(client: TestClient): """Test scraping without metadata in response.""" with patch( "app.services.article_scraper.article_scraper_service.scrape_article" ) as mock_scrape: mock_scrape.return_value = { "text": "Test article content. " * 50, "title": "Test Article", "url": "https://example.com/test", "method": "static", "scrape_time_ms": 200.0, } async def mock_stream(*args, **kwargs): yield {"content": "Summary", "done": False, "tokens_used": 1} yield {"content": "", "done": True, "tokens_used": 1, "latency_ms": 1000.0} with patch( "app.services.hf_streaming_summarizer.hf_streaming_service.summarize_text_stream", side_effect=mock_stream, ): response = client.post( "/api/v3/scrape-and-summarize/stream", json={"url": "https://example.com/test", "include_metadata": False}, ) assert response.status_code == 200 # Parse events events = [] for line in response.text.split("\n"): if line.startswith("data: "): try: events.append(json.loads(line[6:])) except json.JSONDecodeError: pass # Should not have metadata event metadata_events = [e for e in events if e.get("type") == "metadata"] assert len(metadata_events) == 0 def test_scrape_with_cache(client: TestClient): """Test caching functionality.""" from app.core.cache import scraping_cache scraping_cache.clear_all() mock_article = { "text": "Cached test article content. " * 50, "title": "Cached Article", "url": "https://example.com/cached", "method": "static", "scrape_time_ms": 100.0, } with patch( "app.services.article_scraper.article_scraper_service.scrape_article" ) as mock_scrape: mock_scrape.return_value = mock_article async def mock_stream(*args, **kwargs): yield {"content": "Summary", "done": False, "tokens_used": 1} yield {"content": "", "done": True, "tokens_used": 1} with patch( "app.services.hf_streaming_summarizer.hf_streaming_service.summarize_text_stream", side_effect=mock_stream, ): # First request - should call scraper response1 = client.post( "/api/v3/scrape-and-summarize/stream", json={"url": "https://example.com/cached", "use_cache": True}, ) assert response1.status_code == 200 assert mock_scrape.call_count == 1 # Second request - should use cache response2 = client.post( "/api/v3/scrape-and-summarize/stream", json={"url": "https://example.com/cached", "use_cache": True}, ) assert response2.status_code == 200 # scrape_article is called again but should hit cache internally assert mock_scrape.call_count == 2 def test_request_validation(): """Test request schema validation.""" from fastapi.testclient import TestClient client = TestClient(app) # Test invalid max_tokens response = client.post( "/api/v3/scrape-and-summarize/stream", json={"url": "https://example.com/test", "max_tokens": 10000}, # Too high ) assert response.status_code == 422 # Test invalid temperature response = client.post( "/api/v3/scrape-and-summarize/stream", json={"url": "https://example.com/test", "temperature": 5.0}, # Too high ) assert response.status_code == 422 # Test invalid top_p response = client.post( "/api/v3/scrape-and-summarize/stream", json={"url": "https://example.com/test", "top_p": 1.5}, # Too high ) assert response.status_code == 422