# SummarizerApp/tests/test_article_scraper.py
"""
Tests for the article scraper service.
"""
from unittest.mock import AsyncMock, Mock, patch
import pytest
from app.services.article_scraper import ArticleScraperService
@pytest.fixture
def scraper_service():
"""Create article scraper service instance."""
return ArticleScraperService()


@pytest.fixture
def sample_html():
    """Sample HTML for testing."""
    return """
    <html>
        <head>
            <title>Test Article Title</title>
        </head>
        <body>
            <article>
                <h1>Test Article</h1>
                <p>This is a test article with meaningful content that should be extracted successfully.</p>
                <p>It has multiple paragraphs to ensure proper content extraction.</p>
                <p>The content is long enough to pass quality validation checks.</p>
            </article>
        </body>
    </html>
    """


@pytest.mark.asyncio
async def test_scrape_article_success(scraper_service, sample_html):
    """Test successful article scraping."""
    with patch("httpx.AsyncClient") as mock_client:
        # Mock the HTTP response
        mock_response = Mock()
        mock_response.text = sample_html
        mock_response.status_code = 200
        mock_response.raise_for_status = Mock()

        mock_client_instance = AsyncMock()
        mock_client_instance.get.return_value = mock_response
        mock_client.return_value.__aenter__.return_value = mock_client_instance

        result = await scraper_service.scrape_article("https://example.com/article")

        assert result["text"]
        assert len(result["text"]) > 50
        assert result["url"] == "https://example.com/article"
        assert result["method"] == "static"
        assert "scrape_time_ms" in result
        assert result["scrape_time_ms"] > 0


@pytest.mark.asyncio
async def test_scrape_article_timeout(scraper_service):
    """Test timeout handling."""
    with patch("httpx.AsyncClient") as mock_client:
        mock_client_instance = AsyncMock()
        mock_client_instance.get.side_effect = httpx.TimeoutException("Timeout")
        mock_client.return_value.__aenter__.return_value = mock_client_instance

        with pytest.raises(Exception) as exc_info:
            await scraper_service.scrape_article("https://slow-site.com/article")
        assert "timeout" in str(exc_info.value).lower()


@pytest.mark.asyncio
async def test_scrape_article_http_error(scraper_service):
    """Test HTTP error handling."""
    with patch("httpx.AsyncClient") as mock_client:
        mock_response = Mock()
        mock_response.status_code = 404
        mock_response.reason_phrase = "Not Found"
        mock_response.raise_for_status.side_effect = httpx.HTTPStatusError(
            "404", request=Mock(), response=mock_response
        )

        mock_client_instance = AsyncMock()
        mock_client_instance.get.return_value = mock_response
        mock_client.return_value.__aenter__.return_value = mock_client_instance

        with pytest.raises(Exception) as exc_info:
            await scraper_service.scrape_article("https://example.com/notfound")
        assert "404" in str(exc_info.value)


def test_validate_content_quality_success(scraper_service):
    """Test content quality validation for good content."""
    good_content = "This is a well-formed article with multiple sentences. " * 10
    is_valid, reason = scraper_service._validate_content_quality(good_content)
    assert is_valid
    assert reason == "OK"


def test_validate_content_quality_too_short(scraper_service):
    """Test content quality validation for short content."""
    short_content = "Too short"
    is_valid, reason = scraper_service._validate_content_quality(short_content)
    assert not is_valid
    assert "too short" in reason.lower()


def test_validate_content_quality_mostly_whitespace(scraper_service):
    """Test content quality validation for whitespace content."""
    whitespace_content = " \n\n\n \t\t\t " * 20
    is_valid, reason = scraper_service._validate_content_quality(whitespace_content)
    assert not is_valid
    assert "whitespace" in reason.lower()


def test_validate_content_quality_no_sentences(scraper_service):
    """Test content quality validation for content without sentences."""
    no_sentences = "word " * 100  # No sentence endings
    is_valid, reason = scraper_service._validate_content_quality(no_sentences)
    assert not is_valid
    assert "sentence" in reason.lower()


def test_get_random_headers(scraper_service):
    """Test random header generation."""
    headers = scraper_service._get_random_headers()
    assert "User-Agent" in headers
    assert "Accept" in headers
    assert "Accept-Language" in headers
    assert headers["DNT"] == "1"

    # Sample several header sets. With 5 user agents, three draws are likely
    # (but not guaranteed) to differ, so assert only that each value has the
    # expected structure rather than asserting inequality.
    headers1 = scraper_service._get_random_headers()
    headers2 = scraper_service._get_random_headers()
    headers3 = scraper_service._get_random_headers()
    user_agents = [
        headers1["User-Agent"],
        headers2["User-Agent"],
        headers3["User-Agent"],
    ]
    for ua in user_agents:
        assert "Mozilla" in ua


def test_extract_site_name(scraper_service):
    """Test site name extraction from URL."""
    assert (
        scraper_service._extract_site_name("https://www.example.com/article")
        == "example.com"
    )
    assert (
        scraper_service._extract_site_name("https://example.com/article")
        == "example.com"
    )
    assert (
        scraper_service._extract_site_name("https://subdomain.example.com/article")
        == "subdomain.example.com"
    )


def test_extract_title_fallback(scraper_service):
    """Test fallback title extraction from HTML."""
    html_with_title = "<html><head><title>Test Title</title></head><body></body></html>"
    title = scraper_service._extract_title_fallback(html_with_title)
    assert title == "Test Title"

    html_no_title = "<html><head></head><body></body></html>"
    title = scraper_service._extract_title_fallback(html_no_title)
    assert title is None


@pytest.mark.asyncio
async def test_cache_hit(scraper_service):
    """Test cache hit scenario."""
    from app.core.cache import scraping_cache

    # Pre-populate the cache so scrape_article can return without fetching.
    cached_data = {
        "text": "Cached article content that is long enough to pass validation checks. "
        * 10,
        "title": "Cached Title",
        "url": "https://example.com/cached",
        "method": "static",
        "scrape_time_ms": 100.0,
        "author": None,
        "date": None,
        "site_name": "example.com",
    }
    scraping_cache.set("https://example.com/cached", cached_data)

    result = await scraper_service.scrape_article(
        "https://example.com/cached", use_cache=True
    )

    assert result["text"] == cached_data["text"]
    assert result["title"] == "Cached Title"


@pytest.mark.asyncio
async def test_cache_disabled(scraper_service, sample_html):
    """Test scraping with cache disabled."""
    from app.core.cache import scraping_cache

    scraping_cache.clear_all()

    with patch("httpx.AsyncClient") as mock_client:
        mock_response = Mock()
        mock_response.text = sample_html
        mock_response.status_code = 200
        mock_response.raise_for_status = Mock()

        mock_client_instance = AsyncMock()
        mock_client_instance.get.return_value = mock_response
        mock_client.return_value.__aenter__.return_value = mock_client_instance

        result = await scraper_service.scrape_article(
            "https://example.com/nocache", use_cache=False
        )

        assert result["text"]
        # With use_cache=False the result must not be written back to the cache.
        assert scraping_cache.get("https://example.com/nocache") is None
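

# Housekeeping sketch: the cache tests above mutate shared state in
# scraping_cache, so an autouse fixture like this keeps test ordering
# irrelevant. Assumes clear_all() (already used in test_cache_disabled) is
# cheap and safe to call before every test in this module.
@pytest.fixture(autouse=True)
def _fresh_scraping_cache():
    from app.core.cache import scraping_cache

    scraping_cache.clear_all()
    yield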