import os
import re
import json
import pdfkit
import requests
import warnings
import tempfile
# import textract
import html2text
import inscriptis
import trafilatura
from pathlib import Path
from markdownify import markdownify
from json_repair import repair_json
from bs4 import BeautifulSoup, Comment
from html_chunking import get_html_chunks
from html_to_markdown import convert_to_markdown
from readabilipy import simple_json_from_html_string
from docling.document_converter import DocumentConverter
def clean_html(html_content: str) -> str:
    """
    Cleans up the given HTML content by:
    - Removing <script>, <style>, and other non-content tags along with their content.
    - Removing elements that contain no visible text.
    - Removing HTML comments.

    Args:
        html_content (str): The HTML content to clean.

    Returns:
        str: The cleaned HTML markup as a string.
    """
    # Parse the HTML content
    soup = BeautifulSoup(html_content, "html.parser")
    # Remove unwanted tags (note: "rel" is an attribute, not a tag, so that entry has no effect)
    for tag in soup(["script", "style", "img", "a", "table", "tr", "td", "th", "thead", "tbody",
                     "tfoot", "header", "footer", "link", "rel"]):
        tag.decompose()
    # Remove elements that do not contain any visible text
    for element in soup.find_all():
        # If the element has no text (after stripping whitespace), remove it
        if not element.get_text(strip=True):
            element.decompose()
    # Remove HTML comments
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()
    # To return plain text with normalized whitespace instead of markup,
    # uncomment the following lines:
    # text = soup.get_text(separator=" ", strip=True)
    # clean_text = re.sub(r'\s+', ' ', text)
    # return clean_text
    return str(soup)
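# Illustrative sketch (not part of the original module): a quick sanity check of
# clean_html on an inline snippet. The sample markup below is made up for the example.
def _demo_clean_html():
    sample = (
        "<html><head><script>var x = 1;</script><style>p{color:red}</style></head>"
        "<body><!-- tracking note --><p>Visible paragraph.</p>"
        "<a href='/nav'>navigation link</a></body></html>"
    )
    # Scripts, styles, links, comments and now-empty elements are stripped;
    # only the content-bearing markup remains.
    print(clean_html(sample))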
def print_content_extractors():
    print(
        [
            "Default: the plain text of the HTML page",
            "Inscriptis",
            "Trafilatura",
        ]
    )
class ContentExtractor:
    def get_text(self, html):
        return clean_html(html)

    # TODO: Clean this mess
    def url_to_html(self, url, clean=False):
        # Define custom headers to mimic a browser request
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.6",
            "Cache-Control": "max-age=0",
            "Sec-Ch-Ua": "\"Not(A:Brand\";v=\"99\", \"Brave\";v=\"133\", \"Chromium\";v=\"133\"",
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": "\"Windows\"",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1"
        }
        try:
            # Fetch the page with the custom headers
            response = requests.get(url, headers=headers, timeout=10)
            html = None
            if response.status_code == 200:
                html = response.text
            else:
                print(f"Failed to retrieve HTML. Status code: {response.status_code}")
                return None
            if clean:
                return self.get_text(html)
            return html
        except requests.exceptions.RequestException as e:
            # requests wraps connection, timeout and HTTP errors in RequestException
            print(f"Request error: {e}")
            return None
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return None
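# Illustrative sketch (not in the original module): fetching and cleaning a page
# with the default extractor. The URL is a placeholder, not one used by the project.
def _demo_default_extractor():
    extractor = ContentExtractor()
    # clean=True runs the downloaded HTML through clean_html() before returning it
    cleaned_html = extractor.url_to_html("https://example.com", clean=True)
    if cleaned_html is not None:
        print(cleaned_html[:500])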
class Inscriptis(ContentExtractor):
    def __init__(self):
        super().__init__()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Brave/119.0.0.0",
            "Accept-Language": "en-US,en;q=0.9,ar;q=0.8",
        }
        warnings.warn("\nBeware, put only clean links with no trackers, or it may produce unexpected results.")

    def get_text(self, html):
        """Extract text from HTML using inscriptis."""
        return inscriptis.get_text(html)

    def url_to_html(self, url):
        response = requests.get(url, headers=self.headers)
        return response.text
class Docling(ContentExtractor):
    def __init__(self):
        super().__init__()

    # TODO: This is an unexpected behaviour but due to docling docs website being down, it's what works for now
    def get_text(self, text_content):
        result = None
        # Write the HTML to a temporary file so docling can convert it from disk
        with tempfile.NamedTemporaryFile(mode='w+', suffix='.html', delete=False, encoding='utf-8') as tmpfile:
            tmpfile.write(text_content)
            tmpfile.flush()
            tmpfile_path = Path(tmpfile.name.replace("\\", "/"))
        try:
            converter = DocumentConverter()
            document = converter.convert(tmpfile_path).document
            # Export tables separately and append them after the main body
            tables = []
            for table_ix, table in enumerate(document.tables):
                table_text = table.export_to_markdown()
                tables.append(table_text)
            result = document.export_to_markdown()
            for table in tables:
                result += "\n\n" + table
        finally:
            os.remove(tmpfile_path)
        return result
class ReadabiliPy(ContentExtractor):
    def __init__(self):
        super().__init__()

    def get_text(self, html):
        content = simple_json_from_html_string(html, use_readability=True)
        json_object = json.dumps(content, indent=4)
        repaired = repair_json(json_object)
        return repaired
class Trafilatura(ContentExtractor):
    def __init__(self):
        super().__init__()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
        }
        warnings.warn("\nTrafilatura Content Extractor: Beware, put only clean links with no trackers, or it may produce unexpected results.")
        from copy import deepcopy
        from trafilatura.settings import DEFAULT_CONFIG
        config = deepcopy(DEFAULT_CONFIG)
        # config['DEFAULT']['MIN_EXTRACTED_SIZE'] = '5000'  # Configurable, but this value worked well for me
        self.config = config

    def url_to_html(self, url):
        response = requests.get(url, headers=self.headers)
        return response.text

    def get_text(self, html, output_format="markdown", min_extracted_size_char=20_000):
        # self.config['DEFAULT']['MIN_EXTRACTED_SIZE'] = f"{min_extracted_size_char}"
        # self.config['DEFAULT']['MIN_OUTPUT_SIZE'] = f"{min_extracted_size_char}"
        return trafilatura.extract(filecontent=html, favor_recall=True, config=self.config, output_format=output_format)
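# Illustrative sketch (not in the original module): running the Trafilatura extractor
# end-to-end on a single page. The URL is a placeholder.
def _demo_trafilatura_extractor():
    extractor = Trafilatura()
    html = extractor.url_to_html("https://example.com")
    markdown = extractor.get_text(html, output_format="markdown")
    print(markdown)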
class Markdownify(ContentExtractor):
    def get_text(self, html):
        alt = re.sub(r"\n{3,}", "\n\n", html)
        md = markdownify(alt, strip=['href', 'table', 'tr', 'td', 'header', 'footer'])
        # Drop residual image/link markdown
        md = re.sub(r'!?\[[^\]]*\]\([^)]*\)', '', md)
        # Remove extra newlines
        md = re.sub(r"\n{3,}", "\n\n", md)
        md = md.strip()
        return md
class HTML2Text(ContentExtractor):
    def get_text(self, html):
        converter = html2text.HTML2Text()
        converter.ignore_tables = True
        converter.ignore_links = True
        converter.ignore_images = True
        converter.ignore_mailto_links = True
        return converter.handle(html)
class HTML_TO_Markdown(ContentExtractor):
    def get_text(self, html):
        alt = re.sub(r"\n{3,}", "\n\n", html)
        md = convert_to_markdown(alt, strip=['href', 'table', 'tr', 'td', 'header', 'footer'])
        # Drop residual image/link markdown
        md = re.sub(r'!?\[[^\]]*\]\([^)]*\)', '', md)
        # Remove extra newlines
        md = re.sub(r"\n{3,}", "\n\n", md)
        md = md.strip()
        return md
class PDFkitDocling(ContentExtractor):
    def get_text(self, html):
        soup = BeautifulSoup(html, "html.parser")
        # Remove <a>, <link>, <img>, and other unwanted tags
        for tag in soup.find_all(['a', 'link', 'img', 'base', 'meta', 'style', 'script', 'noscript', 'head']):
            tag.decompose()
        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()
        content = str(soup)
        # PDF path to save (hard-coded; pdfkit also requires the wkhtmltopdf binary to be installed)
        pdf_path = 'test.pdf'
        # Create the PDF, then let docling convert it to markdown
        pdfkit.from_string(content, pdf_path)
        converter = DocumentConverter()
        return converter.convert(pdf_path).document.export_to_markdown()
class TrafilatraCHUNKS(ContentExtractor):
    def __init__(self):
        super().__init__()
        # self.trafi = Trafilatura()

    def get_text(self, html, max_tokens=1000):
        soup = BeautifulSoup(html, "html.parser")
        # Remove <a>, <link>, <img>, and other unwanted tags
        for tag in soup.find_all(['a', 'link', 'img', 'base', 'meta', 'style', 'script', 'noscript', 'head']):
            tag.decompose()
        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()
        content = str(soup)
        # Split the cleaned HTML into chunks, extract each chunk, and concatenate the results
        chunks = get_html_chunks(content, max_tokens=max_tokens, is_clean_html=True, attr_cutoff_len=50)
        cleaned = [trafilatura.extract(chunk) for chunk in chunks]
        cleaned = [chunk for chunk in cleaned if chunk is not None]
        combined_text = ""
        for chunk in cleaned:
            combined_text += chunk + "\n"
        return combined_text
class TrafilaCHUNKSRobust(ContentExtractor):
    def __init__(self):
        super().__init__()
        # self.trafi = Trafilatura()

    def get_text(self, html, max_tokens=1000):
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup.find_all(['style', 'script', 'head', 'img', 'base', 'noscript']):
            tag.decompose()
        # Drop elements whose attributes mention "nav" (navigation bars, menus, etc.)
        for tag in soup.find_all(lambda tag: tag.attrs and any("nav" in str(v) for v in tag.attrs.values())):
            tag.decompose()
        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()
        content = str(soup)
        # Split the cleaned HTML into chunks, extract each chunk, and concatenate the results
        chunks = get_html_chunks(content, max_tokens=max_tokens, is_clean_html=True, attr_cutoff_len=50)
        cleaned = [trafilatura.extract(chunk) for chunk in chunks]
        cleaned = [chunk for chunk in cleaned if chunk is not None]
        combined_text = ""
        for chunk in cleaned:
            combined_text += chunk + "\n"
        return combined_text
class TrafilaCHUNKSRobustV2(ContentExtractor):
    def __init__(self):
        super().__init__()
        # self.trafi = Trafilatura()

    def get_text(self, html, max_tokens=1000):
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup.find_all(['style', 'script', 'head', 'img', 'base', 'noscript']):
            tag.decompose()
        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()
        content = str(soup)
        chunks = get_html_chunks(content, max_tokens=max_tokens, is_clean_html=True, attr_cutoff_len=50)
        cleaned = [trafilatura.extract(chunk) for chunk in chunks]
        cleaned = [chunk for chunk in cleaned if chunk is not None]
        combined_text = ""
        for chunk in cleaned:
            combined_text += chunk + "\n"
        return combined_text
# Very Bad lol
# class Textract(ContentExtractor):
#     def get_text(self, html):
#         with tempfile.NamedTemporaryFile(mode='w+', suffix='.html', delete=False, encoding='utf-8') as tmpfile:
#             tmpfile.write(html)
#             tmpfile.flush()
#             tmpfile_path = tmpfile.name.replace("\\", "/")
#             tmpfile_path = Path(tmpfile_path)
#         try:
#             result = textract.process(tmpfile_path)
#         finally:
#             os.remove(tmpfile_path)
#         return result
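# Illustrative sketch (not in the original module): a small side-by-side comparison of
# a few extractors on the same page. The URL is a placeholder and the extractor
# selection is arbitrary.
if __name__ == "__main__":
    url = "https://example.com"
    raw_html = ContentExtractor().url_to_html(url)
    if raw_html:
        for extractor in (Markdownify(), HTML2Text(), Trafilatura()):
            print(f"===== {extractor.__class__.__name__} =====")
            # Some extractors may return None on failure, hence the fallback
            print((extractor.get_text(raw_html) or "")[:300])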