# NOTE: removed non-code UI export artifact ("Spaces: / Running / Running")
"""
Backlinks Profile Module using RapidAPI endpoints
Combines 3 RapidAPI endpoints: Best Backlink Checker, Majestic, and Domain Metrics Check
"""
import os
import requests
import time
from typing import Dict, Any, List, Optional
from urllib.parse import urlparse
from datetime import datetime, timedelta
from utils import safe_pct
class ModuleResult:
    """Standard result object for SEO modules.

    Attributes:
        success: True when the analysis produced usable data.
        data: Module payload (empty dict on failure).
        error: Human-readable failure description, or None on success.
    """

    # Fix: the original annotated ``error: str = None``; a None default
    # requires Optional[str].
    def __init__(self, success: bool, data: Dict[str, Any], error: Optional[str] = None):
        self.success = success
        self.data = data
        self.error = error

    def __repr__(self) -> str:
        # Debug-friendly summary; data is omitted because it can be large.
        return f"ModuleResult(success={self.success!r}, error={self.error!r})"
class BacklinksModule:
    """Backlink-profile analyzer backed by three RapidAPI endpoints."""

    def __init__(self):
        # Credentials and tunables all come from the environment.
        self.rapidapi_key = os.getenv('RAPIDAPI_KEY')
        self.timeout = int(os.getenv('RAPIDAPI_TIMEOUT', '30'))
        self.max_retries = int(os.getenv('BACKLINKS_MAX_RETRIES', '3'))

        # RapidAPI endpoint URLs (one per data provider).
        self.backlink_checker_url = "https://best-backlink-checker-api.p.rapidapi.com/excatbacklinks_noneng.php"
        self.majestic_url = "https://majestic1.p.rapidapi.com/url_metrics"
        self.domain_metrics_url = "https://domain-metrics-check.p.rapidapi.com/domain-metrics"

        # Headers shared by every request; each call adds its own
        # x-rapidapi-host entry on a copy of this dict.
        self.headers = {
            'x-rapidapi-key': self.rapidapi_key,
            'Accept': 'application/json',
        }
| def analyze(self, url: str, quick_scan: bool = False) -> ModuleResult: | |
| """ | |
| Analyze backlink profile using multiple RapidAPI endpoints | |
| Args: | |
| url: Target website URL | |
| quick_scan: If True, use cached data or limited analysis | |
| Returns: | |
| ModuleResult with comprehensive backlinks data | |
| """ | |
| try: | |
| if not self.rapidapi_key: | |
| return self._generate_no_api_data(url) | |
| domain = self._extract_domain(url) | |
| # Call all 3 APIs with retry logic and track status | |
| api_status = { | |
| 'working_apis': [], | |
| 'failed_apis': [], | |
| 'failed_messages': [] | |
| } | |
| print("π Trying Best Backlink Checker API...") | |
| individual_backlinks = self._get_individual_backlinks(domain, quick_scan) | |
| if individual_backlinks: | |
| api_status['working_apis'].append('Best Backlink Checker') | |
| print("β Best Backlink Checker API - SUCCESS") | |
| else: | |
| api_status['failed_apis'].append('Best Backlink Checker') | |
| api_status['failed_messages'].append("β Best Backlink Checker API failed - using mock data") | |
| print("β Best Backlink Checker API - FAILED") | |
| print("π Trying Majestic API...") | |
| majestic_metrics = self._get_majestic_metrics(domain) | |
| if majestic_metrics: | |
| api_status['working_apis'].append('Majestic') | |
| print("β Majestic API - SUCCESS") | |
| else: | |
| api_status['failed_apis'].append('Majestic') | |
| api_status['failed_messages'].append("β Majestic API failed - using mock data") | |
| print("β Majestic API - FAILED") | |
| print("π Trying Domain Metrics Check API...") | |
| domain_metrics = self._get_domain_metrics(domain) | |
| if domain_metrics: | |
| api_status['working_apis'].append('Domain Metrics Check') | |
| print("β Domain Metrics Check API - SUCCESS") | |
| else: | |
| api_status['failed_apis'].append('Domain Metrics Check') | |
| api_status['failed_messages'].append("β Domain Metrics Check API failed - using mock data") | |
| print("β Domain Metrics Check API - FAILED") | |
| # Combine and process all data | |
| combined_data = self._combine_backlink_data( | |
| domain, individual_backlinks, majestic_metrics, domain_metrics, quick_scan, api_status | |
| ) | |
| return ModuleResult(success=True, data=combined_data) | |
| except Exception as e: | |
| return ModuleResult( | |
| success=False, | |
| data={}, | |
| error=f"Backlinks analysis failed: {str(e)}" | |
| ) | |
| def _extract_domain(self, url: str) -> str: | |
| if not url.startswith(('http://', 'https://')): | |
| url = 'https://' + url | |
| domain = urlparse(url).netloc.replace('www.', '') | |
| return domain | |
| def _api_request_with_retry(self, url: str, params: Dict = None, headers: Dict = None) -> Optional[Dict]: | |
| if headers is None: | |
| headers = self.headers.copy() | |
| for attempt in range(self.max_retries): | |
| try: | |
| response = requests.get(url, params=params, headers=headers, timeout=self.timeout) | |
| if response.status_code == 200: | |
| return response.json() | |
| elif response.status_code == 429: | |
| wait_time = (attempt + 1) * 2 | |
| print(f"Rate limited, waiting {wait_time}s...") | |
| time.sleep(wait_time) | |
| continue | |
| else: | |
| print(f"API error {response.status_code}: {response.text}") | |
| except requests.exceptions.Timeout: | |
| print(f"Timeout on attempt {attempt + 1}") | |
| if attempt < self.max_retries - 1: | |
| time.sleep(2) | |
| except Exception as e: | |
| print(f"Request error: {str(e)}") | |
| if attempt < self.max_retries - 1: | |
| time.sleep(2) | |
| return None | |
| def _get_individual_backlinks(self, domain: str, quick_scan: bool = False) -> List[Dict]: | |
| """Get individual backlinks data""" | |
| try: | |
| headers = self.headers.copy() | |
| headers['x-rapidapi-host'] = 'best-backlink-checker-api.p.rapidapi.com' | |
| params = {'domain': f'https://{domain}'} | |
| data = self._api_request_with_retry(self.backlink_checker_url, params, headers) | |
| if data and isinstance(data, list): | |
| # Limit results for quick scan | |
| if quick_scan: | |
| return data[:50] | |
| return data[:500] | |
| except Exception as e: | |
| print(f"Individual backlinks API error: {str(e)}") | |
| return [] | |
| def _get_majestic_metrics(self, domain: str) -> Dict[str, Any]: | |
| try: | |
| headers = self.headers.copy() | |
| headers['x-rapidapi-host'] = 'majestic1.p.rapidapi.com' | |
| params = {'url': domain} | |
| data = self._api_request_with_retry(self.majestic_url, params, headers) | |
| if data and data.get('status') == 'success': | |
| return data | |
| except Exception as e: | |
| print(f"Majestic RapidAPI error: {str(e)}") | |
| return {} | |
| def _get_domain_metrics(self, domain: str) -> Dict[str, Any]: | |
| """Get comprehensive domain metrics""" | |
| try: | |
| headers = self.headers.copy() | |
| headers['x-rapidapi-host'] = 'domain-metrics-check.p.rapidapi.com' | |
| # API expects domain with trailing slash | |
| url = f"{self.domain_metrics_url}/{domain}/" | |
| data = self._api_request_with_retry(url, headers=headers) | |
| if data and data.get('domain'): | |
| return data | |
| except Exception as e: | |
| print(f"Domain metrics API error: {str(e)}") | |
| return {} | |
| def _combine_backlink_data(self, domain: str, individual_backlinks: List[Dict], | |
| majestic_metrics: Dict, domain_metrics: Dict, quick_scan: bool, api_status: Dict) -> Dict[str, Any]: | |
| """Combine data from all 3 APIs into comprehensive backlinks profile""" | |
| # Primary metrics (prefer Domain Metrics Check, fallback to Majestic) | |
| total_backlinks = ( | |
| int(domain_metrics.get('ahrefsBacklinks', 0)) or | |
| int(domain_metrics.get('majesticLinks', 0)) or | |
| int(majestic_metrics.get('majesticLinks', 0)) or | |
| len(individual_backlinks) | |
| ) | |
| total_ref_domains = ( | |
| int(domain_metrics.get('ahrefsRefDomains', 0)) or | |
| int(domain_metrics.get('majesticRefDomains', 0)) or | |
| int(majestic_metrics.get('majesticRefDomains', 0)) or | |
| len(set(link.get('url_from', '').split('/')[2] for link in individual_backlinks if link.get('url_from'))) | |
| ) | |
| # Authority scores (multiple sources for validation) | |
| domain_rating = ( | |
| int(domain_metrics.get('ahrefsDR', 0)) or | |
| int(domain_metrics.get('majesticTF', 0)) or | |
| int(majestic_metrics.get('majesticTF', 0)) | |
| ) | |
| # Process individual backlinks for detailed analysis | |
| referring_domains = self._extract_referring_domains(individual_backlinks) | |
| anchor_distribution = self._extract_anchor_distribution(individual_backlinks) | |
| monthly_changes = self._calculate_monthly_changes(individual_backlinks) | |
| top_backlinks = self._get_top_backlinks(individual_backlinks) | |
| # Link quality analysis | |
| quality_metrics = self._analyze_link_quality(individual_backlinks, domain_metrics) | |
| # Comprehensive backlinks data | |
| backlinks_data = { | |
| 'ref_domains': total_ref_domains, # Match expected key name | |
| 'new_backlinks_30d': monthly_changes.get('new_backlinks', 0), | |
| 'lost_backlinks_30d': None, # Explicit N/A placeholder | |
| 'total_backlinks': total_backlinks, | |
| 'total_ref_domains': total_ref_domains, | |
| 'domain_rating': domain_rating, | |
| # Authority scores from multiple sources | |
| 'authority_scores': { | |
| 'ahrefs_dr': int(domain_metrics.get('ahrefsDR', 0)), | |
| 'moz_da': int(domain_metrics.get('mozDA', 0)), | |
| 'moz_pa': int(domain_metrics.get('mozPA', 0)), | |
| 'majestic_tf': int(domain_metrics.get('majesticTF', 0) or majestic_metrics.get('majesticTF', 0)), | |
| 'majestic_cf': int(domain_metrics.get('majesticCF', 0) or majestic_metrics.get('majesticCF', 0)) | |
| }, | |
| # Detailed analysis | |
| 'referring_domains': referring_domains, | |
| 'anchor_distribution': anchor_distribution, | |
| 'monthly_changes': monthly_changes, | |
| 'top_backlinks': top_backlinks, | |
| 'quality_metrics': quality_metrics, | |
| # Educational and government links (high-quality indicators) | |
| 'edu_links': int(domain_metrics.get('majesticRefEDU', 0) or majestic_metrics.get('majesticRefEDU', 0)), | |
| 'gov_links': int(domain_metrics.get('majesticRefGov', 0) or majestic_metrics.get('majesticRefGov', 0)), | |
| # Traffic estimates (if available) | |
| 'estimated_organic_traffic': float(domain_metrics.get('ahrefsTraffic', 0)), | |
| 'organic_keywords': int(domain_metrics.get('ahrefsOrganicKeywords', 0)), | |
| # Data sources and metadata | |
| 'data_sources': self._get_data_sources(individual_backlinks, majestic_metrics, domain_metrics), | |
| 'data_source': self._get_primary_data_source(individual_backlinks, majestic_metrics, domain_metrics), | |
| 'api_status': api_status, | |
| 'last_updated': datetime.now().isoformat(), | |
| 'quick_scan': quick_scan, | |
| 'analysis_depth': 'comprehensive' if not quick_scan else 'basic' | |
| } | |
| return backlinks_data | |
| def _extract_referring_domains(self, backlinks: List[Dict]) -> List[Dict[str, Any]]: | |
| """Extract and analyze referring domains""" | |
| domain_stats = {} | |
| for link in backlinks: | |
| if not link.get('url_from'): | |
| continue | |
| try: | |
| source_domain = urlparse(link['url_from']).netloc | |
| if source_domain not in domain_stats: | |
| domain_stats[source_domain] = { | |
| 'domain': source_domain, | |
| 'backlinks': 0, | |
| 'first_seen': link.get('first_seen', ''), | |
| 'domain_authority': link.get('domain_inlink_rank', 0), | |
| 'follow_links': 0, | |
| 'nofollow_links': 0 | |
| } | |
| domain_stats[source_domain]['backlinks'] += 1 | |
| if link.get('nofollow'): | |
| domain_stats[source_domain]['nofollow_links'] += 1 | |
| else: | |
| domain_stats[source_domain]['follow_links'] += 1 | |
| except Exception: | |
| continue | |
| # Sort by backlinks count and return top domains | |
| top_domains = sorted(domain_stats.values(), key=lambda x: x['backlinks'], reverse=True) | |
| return top_domains[:20] | |
| def _extract_anchor_distribution(self, backlinks: List[Dict]) -> List[Dict[str, Any]]: | |
| """Analyze anchor text distribution""" | |
| anchor_stats = {} | |
| for link in backlinks: | |
| anchor = link.get('anchor', '').strip() | |
| if not anchor or len(anchor) > 100: | |
| continue | |
| if anchor not in anchor_stats: | |
| anchor_stats[anchor] = { | |
| 'anchor_text': anchor, | |
| 'backlinks': 0, | |
| 'follow_links': 0, | |
| 'nofollow_links': 0, | |
| 'unique_domains': set() | |
| } | |
| anchor_stats[anchor]['backlinks'] += 1 | |
| if link.get('nofollow'): | |
| anchor_stats[anchor]['nofollow_links'] += 1 | |
| else: | |
| anchor_stats[anchor]['follow_links'] += 1 | |
| # Track unique domains for this anchor | |
| try: | |
| domain = urlparse(link.get('url_from', '')).netloc | |
| anchor_stats[anchor]['unique_domains'].add(domain) | |
| except Exception: | |
| pass | |
| # Convert sets to counts and sort | |
| anchor_distribution = [] | |
| for anchor_data in anchor_stats.values(): | |
| anchor_data['unique_domains'] = len(anchor_data['unique_domains']) | |
| anchor_distribution.append(anchor_data) | |
| # Sort by backlinks count | |
| anchor_distribution.sort(key=lambda x: x['backlinks'], reverse=True) | |
| return anchor_distribution[:15] | |
| def _calculate_monthly_changes(self, backlinks: List[Dict]) -> Dict[str, int]: | |
| """Calculate monthly backlinks changes""" | |
| now = datetime.now() | |
| last_month = now - timedelta(days=30) | |
| new_links = 0 | |
| recent_links = 0 | |
| for link in backlinks: | |
| first_seen = link.get('first_seen', '') | |
| if not first_seen: | |
| continue | |
| try: | |
| link_date = datetime.strptime(first_seen, '%Y-%m-%d') | |
| if link_date >= last_month: | |
| new_links += 1 | |
| if link_date >= now - timedelta(days=90): | |
| recent_links += 1 | |
| except Exception: | |
| continue | |
| return { | |
| 'new_backlinks': new_links, | |
| 'lost_backlinks_30d': None, # Explicit N/A placeholder | |
| 'net_change': new_links, | |
| 'recent_backlinks_3m': recent_links | |
| } | |
| def _get_top_backlinks(self, backlinks: List[Dict]) -> List[Dict[str, Any]]: | |
| """Get top-quality backlinks""" | |
| # Sort by inlink_rank (higher is better) | |
| sorted_links = sorted( | |
| backlinks, | |
| key=lambda x: x.get('inlink_rank', 0), | |
| reverse=True | |
| ) | |
| top_links = [] | |
| for link in sorted_links[:10]: | |
| top_links.append({ | |
| 'source_url': link.get('url_from', ''), | |
| 'source_title': link.get('title', ''), | |
| 'anchor_text': link.get('anchor', ''), | |
| 'is_follow': not link.get('nofollow', True), | |
| 'authority_score': link.get('inlink_rank', 0), | |
| 'first_seen': link.get('first_seen', '') | |
| }) | |
| return top_links | |
| def _analyze_link_quality(self, backlinks: List[Dict], domain_metrics: Dict) -> Dict[str, Any]: | |
| """Analyze overall link quality metrics""" | |
| if not backlinks: | |
| return {'follow_ratio': 0, 'avg_authority': 0, 'quality_score': 0} | |
| follow_count = sum(1 for link in backlinks if not link.get('nofollow', True)) | |
| total_links = len(backlinks) | |
| follow_ratio = (follow_count / total_links * 100) if total_links > 0 else 0 | |
| # Average authority score | |
| authority_scores = [link.get('inlink_rank', 0) for link in backlinks if link.get('inlink_rank')] | |
| avg_authority = sum(authority_scores) / len(authority_scores) if authority_scores else 0 | |
| # Quality score (0-100) | |
| quality_score = min(100, ( | |
| (follow_ratio * 0.4) + | |
| (avg_authority * 2) + | |
| (min(20, len(set(link.get('url_from', '').split('/')[2] for link in backlinks))) * 1) | |
| )) | |
| return { | |
| 'follow_ratio': round(follow_ratio, 1), | |
| 'avg_authority': round(avg_authority, 1), | |
| 'quality_score': round(quality_score, 1), | |
| 'total_analyzed': total_links, | |
| 'edu_gov_count': int(domain_metrics.get('majesticRefEDU', 0)) + int(domain_metrics.get('majesticRefGov', 0)) | |
| } | |
| def _get_data_sources(self, individual_backlinks: List, majestic_metrics: Dict, domain_metrics: Dict) -> List[str]: | |
| sources = [] | |
| if individual_backlinks: | |
| sources.append('Best Backlink Checker API') | |
| if majestic_metrics: | |
| sources.append('Majestic RapidAPI') | |
| if domain_metrics: | |
| sources.append('Domain Metrics Check API') | |
| return sources or ['No data sources available'] | |
| def _get_primary_data_source(self, individual_backlinks: List, majestic_metrics: Dict, domain_metrics: Dict) -> str: | |
| """Get primary data source for labeling""" | |
| if domain_metrics: | |
| return 'Domain Metrics Check API' | |
| elif majestic_metrics: | |
| return 'Majestic RapidAPI' | |
| elif individual_backlinks: | |
| return 'Best Backlink Checker API' | |
| else: | |
| return 'No API credentials available' | |
| def _generate_no_api_data(self, url: str) -> ModuleResult: | |
| domain = self._extract_domain(url) | |
| no_api_data = { | |
| 'total_backlinks': 0, | |
| 'total_ref_domains': 0, | |
| 'domain_rating': 0, | |
| 'authority_scores': { | |
| 'ahrefs_dr': 0, | |
| 'moz_da': 0, | |
| 'moz_pa': 0, | |
| 'majestic_tf': 0, | |
| 'majestic_cf': 0 | |
| }, | |
| 'referring_domains': [], | |
| 'anchor_distribution': [], | |
| 'monthly_changes': { | |
| 'new_backlinks': 0, | |
| 'lost_backlinks_30d': None, # Explicit N/A | |
| 'net_change': 0 | |
| }, | |
| 'ref_domains': 0, | |
| 'new_backlinks_30d': 0, | |
| 'lost_backlinks_30d': None, | |
| 'top_backlinks': [], | |
| 'quality_metrics': { | |
| 'follow_ratio': 0, | |
| 'avg_authority': 0, | |
| 'quality_score': 0 | |
| }, | |
| 'edu_links': 0, | |
| 'gov_links': 0, | |
| 'estimated_organic_traffic': 0, | |
| 'organic_keywords': 0, | |
| 'data_sources': ['No API credentials available'], | |
| 'data_source': 'No API credentials available', | |
| 'api_status': { | |
| 'working_apis': [], | |
| 'failed_apis': ['Best Backlink Checker', 'Majestic', 'Domain Metrics Check'], | |
| 'failed_messages': [ | |
| 'β Best Backlink Checker API failed - no RAPIDAPI_KEY', | |
| 'β Majestic API failed - no RAPIDAPI_KEY', | |
| 'β Domain Metrics Check API failed - no RAPIDAPI_KEY' | |
| ] | |
| }, | |
| 'last_updated': datetime.now().isoformat(), | |
| 'placeholder': True, | |
| 'message': 'Add RAPIDAPI_KEY to your .env file to unlock comprehensive backlinks analysis using Best Backlink Checker, Majestic, and Domain Metrics Check RapidAPIs.' | |
| } | |
| return ModuleResult(success=True, data=no_api_data) |