import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import re
from datetime import datetime, timedelta
from typing import Dict, Any, List, Optional
import xml.etree.ElementTree as ET

from utils import safe_pct
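# NOTE (assumption): safe_pct is defined in utils and is not shown here; it is
# presumed to return part/whole expressed as a percentage and to return 0 when
# the denominator is 0.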

class ContentAuditModule:
    def __init__(self):
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

        # CTA keywords to look for
        self.cta_keywords = [
            'contact', 'download', 'subscribe', 'buy', 'purchase', 'order',
            'register', 'sign up', 'get started', 'learn more', 'book now',
            'free trial', 'demo', 'consultation', 'quote', 'call now'
        ]
    def analyze(self, url: str, quick_scan: bool = False) -> Dict[str, Any]:
        """
        Perform a content audit for a given URL.

        Args:
            url: Website URL to analyze
            quick_scan: If True, perform limited analysis (for competitors)

        Returns:
            Dictionary containing content audit metrics
        """
        try:
            # Normalize URL
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url

            # Get sitemap URLs
            sitemap_urls = self._get_sitemap_urls(url, limit=50 if quick_scan else 200)

            # If no sitemap, crawl from homepage
            if not sitemap_urls:
                sitemap_urls = self._crawl_from_homepage(url, limit=20 if quick_scan else 50)

            # Analyze pages
            pages_analyzed = []
            for page_url in sitemap_urls[:20 if quick_scan else 200]:
                page_data = self._analyze_page(page_url)
                if page_data:
                    pages_analyzed.append(page_data)

            # Calculate aggregate metrics
            return self._calculate_metrics(url, pages_analyzed, quick_scan)
        except Exception as e:
            return self._get_fallback_data(url, str(e))
    def _get_sitemap_urls(self, base_url: str, limit: int = 200) -> List[str]:
        urls = []

        # Common sitemap locations
        sitemap_locations = [
            f"{base_url}/sitemap.xml",
            f"{base_url}/sitemap_index.xml",
            f"{base_url}/sitemaps/sitemap.xml"
        ]

        for sitemap_url in sitemap_locations:
            try:
                response = self.session.get(sitemap_url, timeout=10)
                if response.status_code == 200:
                    urls.extend(self._parse_sitemap(response.content, base_url, limit))
                    break
            except requests.RequestException:
                continue

        return urls[:limit]
    def _parse_sitemap(self, sitemap_content: bytes, base_url: str, limit: int) -> List[str]:
        urls = []
        ns = '{http://www.sitemaps.org/schemas/sitemap/0.9}'

        try:
            root = ET.fromstring(sitemap_content)

            # Handle sitemap index files: recursively parse each sub-sitemap
            for sitemap_elem in root.findall(f'.//{ns}sitemap'):
                loc_elem = sitemap_elem.find(f'{ns}loc')
                if loc_elem is not None and len(urls) < limit:
                    try:
                        response = self.session.get(loc_elem.text, timeout=10)
                        if response.status_code == 200:
                            sub_urls = self._parse_sitemap(response.content, base_url, limit - len(urls))
                            urls.extend(sub_urls)
                    except requests.RequestException:
                        continue

            # Handle direct URL entries
            for url_elem in root.findall(f'.//{ns}url'):
                if len(urls) >= limit:
                    break
                loc_elem = url_elem.find(f'{ns}loc')
                if loc_elem is not None:
                    url = loc_elem.text
                    if self._is_valid_content_url(url):
                        urls.append(url)
        except ET.ParseError:
            pass

        return urls[:limit]
    def _crawl_from_homepage(self, base_url: str, limit: int = 50) -> List[str]:
        urls = {base_url}

        try:
            response = self.session.get(base_url, timeout=10)
            if response.status_code == 200:
                soup = BeautifulSoup(response.content, 'html.parser')

                # Find all internal links
                for link in soup.find_all('a', href=True):
                    if len(urls) >= limit:
                        break
                    full_url = urljoin(base_url, link['href'])
                    if self._is_same_domain(full_url, base_url) and self._is_valid_content_url(full_url):
                        urls.add(full_url)
        except requests.RequestException:
            pass

        return list(urls)[:limit]
    def _analyze_page(self, url: str) -> Optional[Dict[str, Any]]:
        try:
            response = self.session.get(url, timeout=15)
            if response.status_code != 200:
                return None

            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract metadata
            title = soup.find('title')
            title_text = title.text.strip() if title else ""

            meta_description = soup.find('meta', attrs={'name': 'description'})
            description_text = meta_description.get('content', '').strip() if meta_description else ""

            # H1 tags
            h1_tags = soup.find_all('h1')
            h1_text = [h1.text.strip() for h1 in h1_tags]

            # Word count (main content)
            content_text = self._extract_main_content(soup)
            word_count = len(content_text.split()) if content_text else 0

            # CTA presence
            has_cta = self._detect_cta(soup)

            # Last modified (if available)
            last_modified = self._get_last_modified(response.headers, soup)

            # hreflang detection
            hreflang_data = self._detect_hreflang(soup)

            return {
                'url': url,
                'title': title_text,
                'title_length': len(title_text),
                'meta_description': description_text,
                'description_length': len(description_text),
                'h1_tags': h1_text,
                'h1_count': len(h1_text),
                'word_count': word_count,
                'has_cta': has_cta,
                'last_modified': last_modified,
                'hreflang_data': hreflang_data,
                'status_code': response.status_code
            }
        except Exception as e:
            return {
                'url': url,
                'error': str(e),
                'status_code': 0
            }
    def _extract_main_content(self, soup: BeautifulSoup) -> str:
        """Extract main content text from HTML"""
        # Remove script, style, and boilerplate navigation elements
        for element in soup(["script", "style", "nav", "header", "footer"]):
            element.decompose()

        # Prefer an explicit main-content container when one exists
        main_content = soup.find('main') or soup.find('article') or soup.find('div', class_=re.compile(r'content|main|body'))
        if main_content:
            return main_content.get_text()
        return soup.get_text()
    def _detect_cta(self, soup: BeautifulSoup) -> bool:
        text_content = soup.get_text().lower()
        for keyword in self.cta_keywords:
            if keyword in text_content:
                return True

        # Check for buttons and links with CTA-like text
        for element in soup.find_all(['button', 'a']):
            element_text = element.get_text().lower()
            for keyword in self.cta_keywords:
                if keyword in element_text:
                    return True

        return False
    def _get_last_modified(self, headers: Dict, soup: BeautifulSoup) -> str:
        # Check response headers first
        if 'last-modified' in headers:
            return headers['last-modified']

        # Fall back to meta tags
        meta_modified = soup.find('meta', attrs={'name': 'last-modified'}) or \
                        soup.find('meta', attrs={'property': 'article:modified_time'})
        if meta_modified:
            return meta_modified.get('content', '')

        return ""
    def _detect_hreflang(self, soup: BeautifulSoup) -> Dict[str, Any]:
        """Detect hreflang implementation on a page"""
        links = soup.find_all("link", rel="alternate")

        hreflangs = []
        for link in links:
            hreflang = link.get("hreflang")
            if hreflang:
                hreflangs.append({
                    'hreflang': hreflang,
                    'href': link.get('href', '')
                })

        has_x_default = any(h['hreflang'] == 'x-default' for h in hreflangs)

        return {
            'has_hreflang': len(hreflangs) > 0,
            'tags': hreflangs,
            'count': len(hreflangs),
            'has_x_default': has_x_default
        }
    def _extract_stale_pages(self, pages_data: List[Dict]) -> List[Dict[str, Any]]:
        """Extract pages that are 18+ months old"""
        eighteen_months_ago = datetime.now() - timedelta(days=540)
        stale_pages = []

        for page in pages_data:
            last_modified = page.get('last_modified', '')
            if not last_modified:
                continue

            try:
                # Parse the common date formats (HTTP-date or ISO 8601)
                if 'GMT' in last_modified:
                    modified_date = datetime.strptime(last_modified, '%a, %d %b %Y %H:%M:%S GMT')
                else:
                    # ISO format; drop any timezone info so the comparison with
                    # the naive cutoff datetime does not raise TypeError
                    modified_date = datetime.fromisoformat(last_modified.replace('Z', '+00:00')).replace(tzinfo=None)

                if modified_date <= eighteen_months_ago:
                    stale_pages.append({
                        'url': page.get('url', ''),
                        'last_modified': last_modified
                    })
            except (ValueError, TypeError):
                continue

        # Sort by oldest first and limit to 200
        stale_pages.sort(key=lambda x: x['last_modified'])
        return stale_pages[:200]
    def _analyze_hreflang(self, pages_data: List[Dict]) -> Dict[str, Any]:
        """Analyze hreflang implementation across the site"""
        pages_with_hreflang = 0
        sample_pages = []

        for page in pages_data:
            hreflang_data = page.get('hreflang_data', {})
            if hreflang_data.get('has_hreflang', False):
                pages_with_hreflang += 1

                # Collect samples (up to 5)
                if len(sample_pages) < 5:
                    sample_pages.append({
                        'url': page.get('url', ''),
                        'tags': [tag['hreflang'] for tag in hreflang_data.get('tags', [])]
                    })

        total_pages = len(pages_data)
        site_pct = safe_pct(pages_with_hreflang, total_pages)

        return {
            'site_pct': site_pct,
            'samples': sample_pages,
            'pages_with_hreflang': pages_with_hreflang,
            'total_pages_checked': total_pages
        }
    def _is_valid_content_url(self, url: str) -> bool:
        if not url:
            return False

        # Skip non-content URLs
        skip_extensions = ['.pdf', '.jpg', '.png', '.gif', '.css', '.js', '.xml']
        skip_paths = ['/wp-admin/', '/admin/', '/api/', '/feed/']

        url_lower = url.lower()
        for ext in skip_extensions:
            if url_lower.endswith(ext):
                return False
        for path in skip_paths:
            if path in url_lower:
                return False

        return True
    def _is_same_domain(self, url1: str, url2: str) -> bool:
        try:
            return urlparse(url1).netloc == urlparse(url2).netloc
        except ValueError:
            return False
    def _calculate_metrics(self, base_url: str, pages_data: List[Dict], quick_scan: bool) -> Dict[str, Any]:
        total_pages = len(pages_data)
        valid_pages = [p for p in pages_data if 'error' not in p]

        if not valid_pages:
            return self._get_fallback_data(base_url, "No valid pages found")

        # Title metrics
        pages_with_title = len([p for p in valid_pages if p.get('title')])
        avg_title_length = sum(p.get('title_length', 0) for p in valid_pages) / len(valid_pages)

        # Meta description metrics
        pages_with_description = len([p for p in valid_pages if p.get('meta_description')])
        avg_description_length = sum(p.get('description_length', 0) for p in valid_pages) / len(valid_pages)

        # H1 metrics
        pages_with_h1 = len([p for p in valid_pages if p.get('h1_count', 0) > 0])

        # Word count metrics
        word_counts = [p.get('word_count', 0) for p in valid_pages if p.get('word_count', 0) > 0]
        avg_word_count = sum(word_counts) / len(word_counts) if word_counts else 0

        # CTA metrics
        pages_with_cta = len([p for p in valid_pages if p.get('has_cta')])

        # Content freshness
        freshness_data = self._analyze_content_freshness(valid_pages)

        # Extract stale pages (18+ months old)
        stale_pages = self._extract_stale_pages(valid_pages)

        # hreflang analysis
        hreflang_analysis = self._analyze_hreflang(valid_pages)

        # Metadata completeness: share of the three metadata slots
        # (title, description, H1) filled across all analyzed pages
        meta_complete_pct = safe_pct(pages_with_title + pages_with_description + pages_with_h1, len(valid_pages) * 3)

        return {
            'url': base_url,
            'total_pages_discovered': total_pages,
            'pages_analyzed': len(valid_pages),
            'meta_complete_pct': meta_complete_pct,
            'avg_words': round(avg_word_count, 0),
            'metadata_completeness': {
                'title_coverage': safe_pct(pages_with_title, len(valid_pages)),
                'description_coverage': safe_pct(pages_with_description, len(valid_pages)),
                'h1_coverage': safe_pct(pages_with_h1, len(valid_pages)),
                'avg_title_length': round(avg_title_length, 1),
                'avg_description_length': round(avg_description_length, 1)
            },
            'content_metrics': {
                'avg_word_count': round(avg_word_count, 0),
                'cta_coverage': safe_pct(pages_with_cta, len(valid_pages))
            },
            'content_freshness': freshness_data,
            'stale_pages': stale_pages,
            'hreflang': hreflang_analysis,
            'data_source': 'Site crawl',
            'quick_scan': quick_scan
        }
    def _analyze_content_freshness(self, pages_data: List[Dict]) -> Dict[str, Any]:
        now = datetime.now()
        six_months_ago = now - timedelta(days=180)
        eighteen_months_ago = now - timedelta(days=540)

        fresh_count = 0
        moderate_count = 0
        stale_count = 0
        unknown_count = 0

        for page in pages_data:
            last_modified = page.get('last_modified', '')
            if not last_modified:
                unknown_count += 1
                continue

            try:
                # Parse the common date formats (HTTP-date or ISO 8601)
                if 'GMT' in last_modified:
                    modified_date = datetime.strptime(last_modified, '%a, %d %b %Y %H:%M:%S GMT')
                else:
                    # ISO format; drop any timezone info so the comparison with
                    # the naive cutoff datetimes does not raise TypeError
                    modified_date = datetime.fromisoformat(last_modified.replace('Z', '+00:00')).replace(tzinfo=None)

                if modified_date >= six_months_ago:
                    fresh_count += 1
                elif modified_date >= eighteen_months_ago:
                    moderate_count += 1
                else:
                    stale_count += 1
            except (ValueError, TypeError):
                unknown_count += 1

        total = len(pages_data)
        return {
            'fresh_content': {'count': fresh_count, 'percentage': safe_pct(fresh_count, total)},
            'moderate_content': {'count': moderate_count, 'percentage': safe_pct(moderate_count, total)},
            'stale_content': {'count': stale_count, 'percentage': safe_pct(stale_count, total)},
            'unknown_date': {'count': unknown_count, 'percentage': safe_pct(unknown_count, total)}
        }
    def _get_fallback_data(self, url: str, error: str) -> Dict[str, Any]:
        return {
            'url': url,
            'error': f"Content audit failed: {error}",
            'total_pages_discovered': 0,
            'pages_analyzed': 0,
            'metadata_completeness': {
                'title_coverage': 0,
                'description_coverage': 0,
                'h1_coverage': 0,
                'avg_title_length': 0,
                'avg_description_length': 0
            },
            'content_metrics': {
                'avg_word_count': 0,
                'cta_coverage': 0
            },
            'content_freshness': {
                'fresh_content': {'count': 0, 'percentage': 0},
                'moderate_content': {'count': 0, 'percentage': 0},
                'stale_content': {'count': 0, 'percentage': 0},
                'unknown_date': {'count': 0, 'percentage': 0}
            },
            'stale_pages': [],
            'hreflang': {'site_pct': 0, 'samples': []},
            'data_source': 'Site crawl',
            'meta_complete_pct': 0,
            'avg_words': 0,
            'quick_scan': False
        }
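

# ---------------------------------------------------------------------------
# Example usage (illustrative sketch, not part of the original module): run the
# audit against a hypothetical domain and print a few of the aggregate metrics
# produced by _calculate_metrics(). A failed audit returns the fallback dict,
# which carries an 'error' key.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    auditor = ContentAuditModule()
    report = auditor.analyze("example.com", quick_scan=True)

    if 'error' in report:
        print(f"Audit fell back to defaults: {report['error']}")
    print(f"Pages analyzed:        {report.get('pages_analyzed', 0)}")
    print(f"Metadata completeness: {report.get('meta_complete_pct', 0)}%")
    print(f"Average word count:    {report.get('avg_words', 0)}")
    print(f"Stale pages found:     {len(report.get('stale_pages', []))}")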