# Source: Hugging Face Space "yashgori20/ThinklySEO" (status: Running).
# File size: 18,611 bytes.

import re
import xml.etree.ElementTree as ET
from datetime import datetime, timedelta, timezone
from email.utils import parsedate_to_datetime
from typing import Dict, Any, List, Optional, Set
from urllib.parse import urljoin, urlparse, parse_qs

import requests
from bs4 import BeautifulSoup

from utils import safe_pct

class ContentAuditModule:
    """Audit the on-page content of a website for SEO reporting.

    Page URLs are discovered from the site's XML sitemap (falling back to the
    links found on the homepage). Each page is fetched and inspected, and the
    per-page results are aggregated into metadata-coverage, word-count, CTA,
    freshness and hreflang metrics.
    """

    # XML namespace prefix used by sitemaps.org documents (ElementTree form).
    SITEMAP_NS = '{http://www.sitemaps.org/schemas/sitemap/0.9}'
    # Maximum nesting of sitemap indexes followed; guards against an index
    # that (directly or indirectly) references itself.
    MAX_SITEMAP_DEPTH = 3
    # Age thresholds, in days, used to bucket pages by last-modified date.
    FRESH_DAYS = 180
    STALE_DAYS = 540
    # Upper bound on the number of stale pages reported.
    MAX_STALE_PAGES = 200
    # URL path suffixes that identify non-content resources.
    SKIP_EXTENSIONS = (
        '.pdf', '.jpg', '.jpeg', '.png', '.gif', '.svg', '.webp', '.ico',
        '.css', '.js', '.xml',
    )
    # URL fragments that identify non-content sections of a site.
    SKIP_PATHS = ('/wp-admin/', '/admin/', '/api/', '/feed/')
    # Class-name pattern used to guess the main content container.
    _MAIN_CONTENT_CLASS_RE = re.compile(r'content|main|body')

    def __init__(self):
        """Create the shared HTTP session and the CTA keyword list."""
        self.session = requests.Session()
        self.session.headers.update({
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        })

        # CTA keywords to look for
        self.cta_keywords = [
            'contact', 'download', 'subscribe', 'buy', 'purchase', 'order',
            'register', 'sign up', 'get started', 'learn more', 'book now',
            'free trial', 'demo', 'consultation', 'quote', 'call now'
        ]

    def analyze(self, url: str, quick_scan: bool = False) -> Dict[str, Any]:
        """
        Perform content audit for a given URL

        Args:
            url: Website URL to analyze; "https://" is prepended when the
                scheme is missing.
            quick_scan: If True, perform limited analysis (for competitors)

        Returns:
            Dictionary containing content audit metrics. On any failure the
            fallback structure from ``_get_fallback_data`` is returned instead
            of raising.
        """
        try:
            # Normalize URL
            if not url.startswith(('http://', 'https://')):
                url = 'https://' + url

            sitemap_limit = 50 if quick_scan else 200
            crawl_limit = 20 if quick_scan else 50
            page_limit = 20 if quick_scan else 200

            # Prefer the sitemap; fall back to links on the homepage.
            page_urls = self._get_sitemap_urls(url, limit=sitemap_limit)
            if not page_urls:
                page_urls = self._crawl_from_homepage(url, limit=crawl_limit)

            pages_analyzed = []
            for page_url in page_urls[:page_limit]:
                page_data = self._analyze_page(page_url)
                if page_data:
                    pages_analyzed.append(page_data)

            return self._calculate_metrics(url, pages_analyzed, quick_scan)

        except Exception as e:
            # Top-level boundary: callers always get a well-formed result.
            return self._get_fallback_data(url, str(e), quick_scan)

    def _get_sitemap_urls(self, base_url: str, limit: int = 200) -> List[str]:
        """Return up to ``limit`` content URLs from the first usable sitemap.

        Common sitemap locations are probed in order. A location is only
        accepted when it actually yields URLs, so a "soft 404" HTML page
        served with status 200 does not stop the search.
        """
        # Avoid "//sitemap.xml" when the base URL already ends with a slash.
        root = base_url.rstrip('/')
        sitemap_locations = [
            f"{root}/sitemap.xml",
            f"{root}/sitemap_index.xml",
            f"{root}/sitemaps/sitemap.xml"
        ]

        for sitemap_url in sitemap_locations:
            try:
                response = self.session.get(sitemap_url, timeout=10)
            except requests.RequestException:
                continue
            if response.status_code != 200:
                continue

            urls = self._parse_sitemap(response.content, base_url, limit)
            if urls:
                return urls[:limit]

        return []

    def _parse_sitemap(self, sitemap_content: bytes, base_url: str, limit: int,
                       _depth: int = 0) -> List[str]:
        """Extract content URLs from a sitemap or sitemap-index document.

        Args:
            sitemap_content: Raw XML bytes of the sitemap.
            base_url: Site base URL (passed through to nested calls).
            limit: Maximum number of URLs to return.
            _depth: Internal recursion depth for nested sitemap indexes.

        Returns:
            At most ``limit`` URLs that pass ``_is_valid_content_url``; an
            empty list when the document is not well-formed XML.
        """
        urls: List[str] = []
        if limit <= 0:
            return urls

        # NOTE(review): ElementTree is not hardened against maliciously
        # crafted XML (entity expansion); sitemaps come from remote sites.
        try:
            root = ET.fromstring(sitemap_content)
        except ET.ParseError:
            return urls

        ns = self.SITEMAP_NS

        # Handle sitemap index: follow sub-sitemaps, bounded by depth.
        if _depth < self.MAX_SITEMAP_DEPTH:
            for sitemap_elem in root.findall(f'.//{ns}sitemap'):
                if len(urls) >= limit:
                    break
                loc = (sitemap_elem.findtext(f'{ns}loc') or '').strip()
                if not loc:
                    continue
                try:
                    response = self.session.get(loc, timeout=10)
                except requests.RequestException:
                    continue
                if response.status_code == 200:
                    urls.extend(self._parse_sitemap(
                        response.content, base_url, limit - len(urls), _depth + 1))

        # Handle direct URL entries
        for url_elem in root.findall(f'.//{ns}url'):
            if len(urls) >= limit:
                break
            # <loc> text is frequently padded with whitespace/newlines.
            loc = (url_elem.findtext(f'{ns}loc') or '').strip()
            if self._is_valid_content_url(loc):
                urls.append(loc)

        return urls[:limit]

    def _crawl_from_homepage(self, base_url: str, limit: int = 50) -> List[str]:
        """Collect up to ``limit`` same-site content URLs linked from the homepage.

        The homepage itself is always the first entry. Order follows the order
        of links in the document, so the result is deterministic.
        """
        # A dict is used as an insertion-ordered set.
        found: Dict[str, None] = {base_url: None}

        try:
            response = self.session.get(base_url, timeout=10)
        except requests.RequestException:
            return list(found)[:limit]
        if response.status_code != 200:
            return list(found)[:limit]

        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all internal links
        for link in soup.find_all('a', href=True):
            if len(found) >= limit:
                break
            try:
                # Drop "#fragment" so in-page anchors are not counted as pages.
                full_url = urlparse(urljoin(base_url, link['href']))._replace(fragment='').geturl()
            except ValueError:
                # Malformed href (e.g. invalid IPv6 literal); skip it.
                continue

            if self._is_same_domain(full_url, base_url) and self._is_valid_content_url(full_url):
                found.setdefault(full_url, None)

        return list(found)[:limit]

    def _analyze_page(self, url: str) -> Optional[Dict[str, Any]]:
        """Fetch one page and extract its content/metadata signals.

        Returns:
            A dict of page metrics on success, ``None`` when the response
            status is not 200, or a dict with ``error`` and ``status_code: 0``
            when fetching/parsing raised.
        """
        try:
            response = self.session.get(url, timeout=15)
            if response.status_code != 200:
                return None

            soup = BeautifulSoup(response.content, 'html.parser')

            # Extract metadata
            title = soup.find('title')
            title_text = title.text.strip() if title else ""

            meta_description = soup.find('meta', attrs={'name': 'description'})
            description_text = meta_description.get('content', '').strip() if meta_description else ""

            # H1 tags
            h1_text = [h1.text.strip() for h1 in soup.find_all('h1')]

            # Word count (main content). This call strips script/style/nav/
            # header/footer from `soup`, so the checks below see the reduced tree.
            content_text = self._extract_main_content(soup)
            word_count = len(content_text.split()) if content_text else 0

            # CTA presence
            has_cta = self._detect_cta(soup)

            # Last modified (if available)
            last_modified = self._get_last_modified(response.headers, soup)

            # hreflang detection
            hreflang_data = self._detect_hreflang(soup)

            return {
                'url': url,
                'title': title_text,
                'title_length': len(title_text),
                'meta_description': description_text,
                'description_length': len(description_text),
                'h1_tags': h1_text,
                'h1_count': len(h1_text),
                'word_count': word_count,
                'has_cta': has_cta,
                'last_modified': last_modified,
                'hreflang_data': hreflang_data,
                'status_code': response.status_code
            }

        except Exception as e:
            # Per-page boundary: one bad page must not abort the whole audit.
            return {
                'url': url,
                'error': str(e),
                'status_code': 0
            }

    def _extract_main_content(self, soup: "BeautifulSoup") -> str:
        """Extract main content text from HTML.

        Side effect: script, style, nav, header and footer elements are
        removed from ``soup`` in place.
        """
        for element in soup(["script", "style", "nav", "header", "footer"]):
            element.decompose()

        # Try to find main content areas
        main_content = (
            soup.find('main')
            or soup.find('article')
            or soup.find('div', class_=self._MAIN_CONTENT_CLASS_RE)
        )
        container = main_content if main_content else soup

        # Join text nodes with a space so words in adjacent elements
        # (e.g. "</p><p>" in minified HTML) are not fused together.
        return container.get_text(separator=' ')

    def _detect_cta(self, soup: "BeautifulSoup") -> bool:
        """Return True when any CTA keyword appears in the page text."""
        page_text = soup.get_text().lower()
        if any(keyword in page_text for keyword in self.cta_keywords):
            return True

        # Check for buttons and links with CTA-like text.
        # NOTE(review): this looks redundant with the whole-page scan above,
        # since button/link text is part of the page text — confirm intent.
        for element in soup.find_all(['button', 'a']):
            element_text = element.get_text().lower()
            if any(keyword in element_text for keyword in self.cta_keywords):
                return True

        return False

    def _get_last_modified(self, headers: Dict, soup: "BeautifulSoup") -> str:
        """Return the page's last-modified value, or "" when none is found.

        The HTTP ``Last-Modified`` header is preferred (matched
        case-insensitively so plain dicts work too); otherwise the
        ``last-modified`` / ``article:modified_time`` meta tags are used.
        """
        # Check headers first
        for key, value in headers.items():
            if key.lower() == 'last-modified':
                return value

        # Check meta tags
        meta_modified = soup.find('meta', attrs={'name': 'last-modified'}) or \
                      soup.find('meta', attrs={'property': 'article:modified_time'})

        if meta_modified:
            return meta_modified.get('content', '')

        return ""

    def _detect_hreflang(self, soup: "BeautifulSoup") -> Dict[str, Any]:
        """Detect hreflang implementation on a page"""
        hreflangs = [
            {'hreflang': link.get("hreflang"), 'href': link.get('href', '')}
            for link in soup.find_all("link", rel="alternate")
            if link.get("hreflang")
        ]

        # Language codes are case-insensitive, so compare in lower case.
        has_x_default = any(str(h['hreflang']).lower() == 'x-default' for h in hreflangs)

        return {
            'has_hreflang': len(hreflangs) > 0,
            'tags': hreflangs,
            'count': len(hreflangs),
            'has_x_default': has_x_default
        }

    @staticmethod
    def _parse_modified_date(value: str) -> Optional[datetime]:
        """Parse a last-modified string into a timezone-aware UTC datetime.

        Accepts ISO 8601 (including a trailing "Z") and HTTP/RFC 2822 dates.
        Values without timezone information are assumed to be UTC. Returns
        ``None`` when the value is empty or cannot be parsed.
        """
        if not value or not isinstance(value, str):
            return None
        text = value.strip()

        parsed = None
        try:
            parsed = datetime.fromisoformat(text.replace('Z', '+00:00'))
        except ValueError:
            try:
                parsed = parsedate_to_datetime(text)
            except (TypeError, ValueError):
                # Older Pythons raise TypeError, newer ones ValueError.
                return None
        if parsed is None:
            return None

        # Normalise to aware UTC so values can be compared with each other
        # and with "now" without naive/aware TypeErrors.
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=timezone.utc)
        return parsed.astimezone(timezone.utc)

    def _extract_stale_pages(self, pages_data: List[Dict]) -> List[Dict[str, Any]]:
        """Extract pages that are 18+ months old.

        Returns:
            Up to ``MAX_STALE_PAGES`` dicts with ``url`` and the original
            ``last_modified`` string, ordered oldest first. Pages with a
            missing or unparseable date are skipped.
        """
        cutoff = datetime.now(timezone.utc) - timedelta(days=self.STALE_DAYS)
        dated = []

        for page in pages_data:
            last_modified = page.get('last_modified', '')
            modified_date = self._parse_modified_date(last_modified)
            if modified_date is None:
                continue

            if modified_date <= cutoff:
                dated.append((modified_date, {
                    'url': page.get('url', ''),
                    'last_modified': last_modified
                }))

        # Sort chronologically (oldest first) on the parsed date; sorting the
        # raw strings would order HTTP dates by weekday name.
        dated.sort(key=lambda item: item[0])
        return [entry for _, entry in dated[:self.MAX_STALE_PAGES]]

    def _analyze_hreflang(self, pages_data: List[Dict]) -> Dict[str, Any]:
        """Analyze hreflang implementation across the site"""
        pages_with_hreflang = 0
        sample_pages = []

        for page in pages_data:
            hreflang_data = page.get('hreflang_data', {})
            if not hreflang_data.get('has_hreflang', False):
                continue
            pages_with_hreflang += 1

            # Collect samples (up to 5)
            if len(sample_pages) < 5:
                sample_pages.append({
                    'url': page.get('url', ''),
                    'tags': [tag['hreflang'] for tag in hreflang_data.get('tags', [])]
                })

        total_pages = len(pages_data)

        return {
            'site_pct': safe_pct(pages_with_hreflang, total_pages),
            'samples': sample_pages,
            'pages_with_hreflang': pages_with_hreflang,
            'total_pages_checked': total_pages
        }

    def _is_valid_content_url(self, url: str) -> bool:
        """Return True when ``url`` looks like an HTML content page.

        Static assets are rejected by the extension of the URL *path* (so a
        query string such as ``?v=2`` does not hide the extension), and admin/
        API/feed sections are rejected by substring match on the whole URL.
        """
        if not url or not isinstance(url, str):
            return False

        url_lower = url.lower()
        try:
            path_lower = urlparse(url_lower).path
        except ValueError:
            return False

        if path_lower.endswith(self.SKIP_EXTENSIONS):
            return False

        return not any(path in url_lower for path in self.SKIP_PATHS)

    def _is_same_domain(self, url1: str, url2: str) -> bool:
        """Return True when both URLs point at the same host.

        Hosts are compared case-insensitively and a leading "www." is
        ignored, so ``https://www.example.com`` matches ``https://example.com``.
        URLs without a host never match.
        """
        def _host(url: str) -> str:
            host = urlparse(url).netloc.lower()
            return host[4:] if host.startswith('www.') else host

        try:
            host1 = _host(url1)
            host2 = _host(url2)
        except (ValueError, TypeError, AttributeError):
            return False
        return bool(host1) and host1 == host2

    def _calculate_metrics(self, base_url: str, pages_data: List[Dict], quick_scan: bool) -> Dict[str, Any]:
        """Aggregate per-page results into the site-level audit report.

        Pages carrying an ``error`` key are excluded from every metric; when
        none remain, the fallback structure is returned.
        """
        total_pages = len(pages_data)
        valid_pages = [p for p in pages_data if 'error' not in p]

        if not valid_pages:
            return self._get_fallback_data(base_url, "No valid pages found", quick_scan)

        page_count = len(valid_pages)

        # Title metrics
        pages_with_title = sum(1 for p in valid_pages if p.get('title'))
        avg_title_length = sum(p.get('title_length', 0) for p in valid_pages) / page_count

        # Meta description metrics
        pages_with_description = sum(1 for p in valid_pages if p.get('meta_description'))
        avg_description_length = sum(p.get('description_length', 0) for p in valid_pages) / page_count

        # H1 metrics
        pages_with_h1 = sum(1 for p in valid_pages if p.get('h1_count', 0) > 0)

        # Word count metrics (pages with no extracted words are ignored)
        word_counts = [p.get('word_count', 0) for p in valid_pages if p.get('word_count', 0) > 0]
        avg_word_count = sum(word_counts) / len(word_counts) if word_counts else 0

        # CTA metrics
        pages_with_cta = sum(1 for p in valid_pages if p.get('has_cta'))

        # Content freshness
        freshness_data = self._analyze_content_freshness(valid_pages)

        # Extract stale pages (18+ months old)
        stale_pages = self._extract_stale_pages(valid_pages)

        # hreflang analysis
        hreflang_analysis = self._analyze_hreflang(valid_pages)

        # Metadata completeness: share of (title, description, h1) slots filled.
        meta_complete_pct = safe_pct(
            pages_with_title + pages_with_description + pages_with_h1, page_count * 3)

        return {
            'url': base_url,
            'total_pages_discovered': total_pages,
            'pages_analyzed': page_count,
            'meta_complete_pct': meta_complete_pct,
            'avg_words': round(avg_word_count, 0),
            'metadata_completeness': {
                'title_coverage': safe_pct(pages_with_title, page_count),
                'description_coverage': safe_pct(pages_with_description, page_count),
                'h1_coverage': safe_pct(pages_with_h1, page_count),
                'avg_title_length': round(avg_title_length, 1),
                'avg_description_length': round(avg_description_length, 1)
            },
            'content_metrics': {
                'avg_word_count': round(avg_word_count, 0),
                'cta_coverage': safe_pct(pages_with_cta, page_count)
            },
            'content_freshness': freshness_data,
            'stale_pages': stale_pages,
            'hreflang': hreflang_analysis,
            'data_source': 'Site crawl',
            'quick_scan': quick_scan
        }

    def _analyze_content_freshness(self, pages_data: List[Dict]) -> Dict[str, Any]:
        """Bucket pages into fresh / moderate / stale / unknown by last-modified date.

        Fresh means modified within ``FRESH_DAYS``; moderate within
        ``STALE_DAYS``; stale is older. Missing or unparseable dates count as
        unknown.
        """
        now = datetime.now(timezone.utc)
        fresh_cutoff = now - timedelta(days=self.FRESH_DAYS)
        stale_cutoff = now - timedelta(days=self.STALE_DAYS)

        fresh_count = 0
        moderate_count = 0
        stale_count = 0
        unknown_count = 0

        for page in pages_data:
            modified_date = self._parse_modified_date(page.get('last_modified', ''))
            if modified_date is None:
                unknown_count += 1
            elif modified_date >= fresh_cutoff:
                fresh_count += 1
            elif modified_date >= stale_cutoff:
                moderate_count += 1
            else:
                stale_count += 1

        total = len(pages_data)
        return {
            'fresh_content': {'count': fresh_count, 'percentage': safe_pct(fresh_count, total)},
            'moderate_content': {'count': moderate_count, 'percentage': safe_pct(moderate_count, total)},
            'stale_content': {'count': stale_count, 'percentage': safe_pct(stale_count, total)},
            'unknown_date': {'count': unknown_count, 'percentage': safe_pct(unknown_count, total)}
        }

    def _get_fallback_data(self, url: str, error: str, quick_scan: bool = False) -> Dict[str, Any]:
        """Build a zeroed report with the same shape as a successful audit.

        Args:
            url: URL the audit was requested for.
            error: Human-readable failure reason, embedded in ``error``.
            quick_scan: Scan mode to echo back in the report.
        """
        return {
            'url': url,
            'error': f"Content audit failed: {error}",
            'total_pages_discovered': 0,
            'pages_analyzed': 0,
            'metadata_completeness': {
                'title_coverage': 0,
                'description_coverage': 0,
                'h1_coverage': 0,
                'avg_title_length': 0,
                'avg_description_length': 0
            },
            'content_metrics': {
                'avg_word_count': 0,
                'cta_coverage': 0
            },
            'content_freshness': {
                'fresh_content': {'count': 0, 'percentage': 0},
                'moderate_content': {'count': 0, 'percentage': 0},
                'stale_content': {'count': 0, 'percentage': 0},
                'unknown_date': {'count': 0, 'percentage': 0}
            },
            'stale_pages': [],
            # Same keys as _analyze_hreflang so consumers need no special case.
            'hreflang': {
                'site_pct': 0,
                'samples': [],
                'pages_with_hreflang': 0,
                'total_pages_checked': 0
            },
            'data_source': 'Site crawl',
            'meta_complete_pct': 0,
            'avg_words': 0,
            'quick_scan': quick_scan
        }