import os
import re
import json
import pdfkit
import requests
import warnings
import tempfile
# import textract
import html2text
import inscriptis
import trafilatura
from pathlib import Path
from markdownify import markdownify
from json_repair import repair_json
from bs4 import BeautifulSoup, Comment
from html_chunking import get_html_chunks
from html_to_markdown import convert_to_markdown
from readabilipy import simple_json_from_html_string
from docling.document_converter import DocumentConverter
def clean_html(html_content: str) -> str:
    """
    Cleans up the given HTML content by:
    - Removing <script>, <style>, and other non-content tags along with their content.
    - Removing elements that contain no visible text.
    - Removing HTML comments.

    Args:
        html_content (str): The HTML content to clean.

    Returns:
        str: The cleaned HTML markup as a string.
    """
    # Parse the HTML content
    soup = BeautifulSoup(html_content, "html.parser")
    # Remove unwanted tags (note: "rel" is an attribute, not a tag, so that entry has no effect)
    for tag in soup(["script", "style", "img", "a", "table", "tr", "td", "th", "thead", "tbody",
                     "tfoot", "header", "footer", "link", "rel"]):
        tag.decompose()
    # Remove elements that do not contain any visible text
    for element in soup.find_all():
        # If the element has no text (after stripping whitespace), remove it
        if not element.get_text(strip=True):
            element.decompose()
    # Remove HTML comments
    for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
        comment.extract()
    # To return plain text with normalized whitespace instead of markup,
    # uncomment the following lines:
    # text = soup.get_text(separator=" ", strip=True)
    # clean_text = re.sub(r'\s+', ' ', text)
    # return clean_text
    return str(soup)
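# Illustrative sketch (not part of the original module): a quick sanity check of
# clean_html on an inline snippet. The sample markup below is made up for the example.
def _demo_clean_html():
    sample = (
        "<html><head><script>var x = 1;</script><style>p{color:red}</style></head>"
        "<body><!-- tracking note --><p>Visible paragraph.</p>"
        "<a href='/nav'>navigation link</a></body></html>"
    )
    # Scripts, styles, links, comments and now-empty elements are stripped;
    # only the content-bearing markup remains.
    print(clean_html(sample))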
def print_content_extractors():
    print(
        [
            "Default: the plain text of the HTML page",
            "Inscriptis",
            "Trafilatura",
        ]
    )
class ContentExtractor:
    def get_text(self, html):
        return clean_html(html)

    # TODO: Clean this mess
    def url_to_html(self, url, clean=False):
        # Define custom headers to mimic a browser request
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/133.0.0.0 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
            "Accept-Language": "en-US,en;q=0.6",
            "Cache-Control": "max-age=0",
            "Sec-Ch-Ua": "\"Not(A:Brand\";v=\"99\", \"Brave\";v=\"133\", \"Chromium\";v=\"133\"",
            "Sec-Ch-Ua-Mobile": "?0",
            "Sec-Ch-Ua-Platform": "\"Windows\"",
            "Sec-Fetch-Dest": "document",
            "Sec-Fetch-Mode": "navigate",
            "Sec-Fetch-Site": "none",
            "Sec-Fetch-User": "?1",
            "Upgrade-Insecure-Requests": "1"
        }
        try:
            # Fetch the page with the custom headers
            response = requests.get(url, headers=headers, timeout=10)
            html = None
            if response.status_code == 200:
                html = response.text
            else:
                print(f"Failed to retrieve HTML. Status code: {response.status_code}")
                return None
            if clean:
                return self.get_text(html)
            return html
        except requests.exceptions.RequestException as e:
            # requests wraps connection, timeout and HTTP errors in RequestException
            print(f"Request error: {e}")
            return None
        except Exception as e:
            print(f"An unexpected error occurred: {e}")
            return None
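# Illustrative sketch (not in the original module): fetching and cleaning a page
# with the default extractor. The URL is a placeholder, not one used by the project.
def _demo_default_extractor():
    extractor = ContentExtractor()
    # clean=True runs the downloaded HTML through clean_html() before returning it
    cleaned_html = extractor.url_to_html("https://example.com", clean=True)
    if cleaned_html is not None:
        print(cleaned_html[:500])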
class Inscriptis(ContentExtractor):
    def __init__(self):
        super().__init__()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36 Brave/119.0.0.0",
            "Accept-Language": "en-US,en;q=0.9,ar;q=0.8",
        }
        warnings.warn("\nBeware, put only clean links with no trackers, or it may produce unexpected results.")

    def get_text(self, html):
        """Extract text from HTML using inscriptis."""
        return inscriptis.get_text(html)

    def url_to_html(self, url):
        response = requests.get(url, headers=self.headers)
        return response.text
class Docling(ContentExtractor):
    def __init__(self):
        super().__init__()

    # TODO: This is an unexpected behaviour but due to docling docs website being down, it's what works for now
    def get_text(self, text_content):
        result = None
        # Write the HTML to a temporary file so docling can convert it from disk
        with tempfile.NamedTemporaryFile(mode='w+', suffix='.html', delete=False, encoding='utf-8') as tmpfile:
            tmpfile.write(text_content)
            tmpfile.flush()
            tmpfile_path = Path(tmpfile.name.replace("\\", "/"))
        try:
            converter = DocumentConverter()
            document = converter.convert(tmpfile_path).document
            # Export tables separately and append them after the main body
            tables = []
            for table_ix, table in enumerate(document.tables):
                table_text = table.export_to_markdown()
                tables.append(table_text)
            result = document.export_to_markdown()
            for table in tables:
                result += "\n\n" + table
        finally:
            os.remove(tmpfile_path)
        return result
class ReadabiliPy(ContentExtractor):
    def __init__(self):
        super().__init__()

    def get_text(self, html):
        content = simple_json_from_html_string(html, use_readability=True)
        json_object = json.dumps(content, indent=4)
        repaired = repair_json(json_object)
        return repaired
class Trafilatura(ContentExtractor):
    def __init__(self):
        super().__init__()
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/119.0.0.0 Safari/537.36",
            "Accept-Language": "en-US,en;q=0.9",
        }
        warnings.warn("\nTrafilatura Content Extractor: Beware, put only clean links with no trackers, or it may produce unexpected results.")
        from copy import deepcopy
        from trafilatura.settings import DEFAULT_CONFIG
        config = deepcopy(DEFAULT_CONFIG)
        # config['DEFAULT']['MIN_EXTRACTED_SIZE'] = '5000'  # Configurable, but this value worked well for me
        self.config = config

    def url_to_html(self, url):
        response = requests.get(url, headers=self.headers)
        return response.text

    def get_text(self, html, output_format="markdown", min_extracted_size_char=20_000):
        # self.config['DEFAULT']['MIN_EXTRACTED_SIZE'] = f"{min_extracted_size_char}"
        # self.config['DEFAULT']['MIN_OUTPUT_SIZE'] = f"{min_extracted_size_char}"
        return trafilatura.extract(filecontent=html, favor_recall=True, config=self.config, output_format=output_format)
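# Illustrative sketch (not in the original module): running the Trafilatura extractor
# end-to-end on a single page. The URL is a placeholder.
def _demo_trafilatura_extractor():
    extractor = Trafilatura()
    html = extractor.url_to_html("https://example.com")
    markdown = extractor.get_text(html, output_format="markdown")
    print(markdown)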
class Markdownify(ContentExtractor):
    def get_text(self, html):
        alt = re.sub(r"\n{3,}", "\n\n", html)
        md = markdownify(alt, strip=['href', 'table', 'tr', 'td', 'header', 'footer'])
        # Drop residual image/link markdown
        md = re.sub(r'!?\[[^\]]*\]\([^)]*\)', '', md)
        # Remove extra newlines
        md = re.sub(r"\n{3,}", "\n\n", md)
        md = md.strip()
        return md
class HTML2Text(ContentExtractor):
    def get_text(self, html):
        converter = html2text.HTML2Text()
        converter.ignore_tables = True
        converter.ignore_links = True
        converter.ignore_images = True
        converter.ignore_mailto_links = True
        return converter.handle(html)
class HTML_TO_Markdown(ContentExtractor):
    def get_text(self, html):
        alt = re.sub(r"\n{3,}", "\n\n", html)
        md = convert_to_markdown(alt, strip=['href', 'table', 'tr', 'td', 'header', 'footer'])
        # Drop residual image/link markdown
        md = re.sub(r'!?\[[^\]]*\]\([^)]*\)', '', md)
        # Remove extra newlines
        md = re.sub(r"\n{3,}", "\n\n", md)
        md = md.strip()
        return md
class PDFkitDocling(ContentExtractor):
    def get_text(self, html):
        soup = BeautifulSoup(html, "html.parser")
        # Remove <a>, <link>, <img>, and other unwanted tags
        for tag in soup.find_all(['a', 'link', 'img', 'base', 'meta', 'style', 'script', 'noscript', 'head']):
            tag.decompose()
        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()
        content = str(soup)
        # PDF path to save (hard-coded; pdfkit also requires the wkhtmltopdf binary to be installed)
        pdf_path = 'test.pdf'
        # Create the PDF, then let docling convert it to markdown
        pdfkit.from_string(content, pdf_path)
        converter = DocumentConverter()
        return converter.convert(pdf_path).document.export_to_markdown()
class TrafilatraCHUNKS(ContentExtractor):
    def __init__(self):
        super().__init__()
        # self.trafi = Trafilatura()

    def get_text(self, html, max_tokens=1000):
        soup = BeautifulSoup(html, "html.parser")
        # Remove <a>, <link>, <img>, and other unwanted tags
        for tag in soup.find_all(['a', 'link', 'img', 'base', 'meta', 'style', 'script', 'noscript', 'head']):
            tag.decompose()
        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()
        content = str(soup)
        # Split the cleaned HTML into chunks, extract each chunk, and concatenate the results
        chunks = get_html_chunks(content, max_tokens=max_tokens, is_clean_html=True, attr_cutoff_len=50)
        cleaned = [trafilatura.extract(chunk) for chunk in chunks]
        cleaned = [chunk for chunk in cleaned if chunk is not None]
        combined_text = ""
        for chunk in cleaned:
            combined_text += chunk + "\n"
        return combined_text
class TrafilaCHUNKSRobust(ContentExtractor):
    def __init__(self):
        super().__init__()
        # self.trafi = Trafilatura()

    def get_text(self, html, max_tokens=1000):
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup.find_all(['style', 'script', 'head', 'img', 'base', 'noscript']):
            tag.decompose()
        # Drop elements whose attributes mention "nav" (navigation bars, menus, etc.)
        for tag in soup.find_all(lambda tag: tag.attrs and any("nav" in str(v) for v in tag.attrs.values())):
            tag.decompose()
        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()
        content = str(soup)
        # Split the cleaned HTML into chunks, extract each chunk, and concatenate the results
        chunks = get_html_chunks(content, max_tokens=max_tokens, is_clean_html=True, attr_cutoff_len=50)
        cleaned = [trafilatura.extract(chunk) for chunk in chunks]
        cleaned = [chunk for chunk in cleaned if chunk is not None]
        combined_text = ""
        for chunk in cleaned:
            combined_text += chunk + "\n"
        return combined_text
class TrafilaCHUNKSRobustV2(ContentExtractor):
    def __init__(self):
        super().__init__()
        # self.trafi = Trafilatura()

    def get_text(self, html, max_tokens=1000):
        soup = BeautifulSoup(html, "html.parser")
        for tag in soup.find_all(['style', 'script', 'head', 'img', 'base', 'noscript']):
            tag.decompose()
        # Remove HTML comments
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()
        content = str(soup)
        chunks = get_html_chunks(content, max_tokens=max_tokens, is_clean_html=True, attr_cutoff_len=50)
        cleaned = [trafilatura.extract(chunk) for chunk in chunks]
        cleaned = [chunk for chunk in cleaned if chunk is not None]
        combined_text = ""
        for chunk in cleaned:
            combined_text += chunk + "\n"
        return combined_text
# Very Bad lol
# class Textract(ContentExtractor):
#     def get_text(self, html):
#         with tempfile.NamedTemporaryFile(mode='w+', suffix='.html', delete=False, encoding='utf-8') as tmpfile:
#             tmpfile.write(html)
#             tmpfile.flush()
#             tmpfile_path = tmpfile.name.replace("\\", "/")
#             tmpfile_path = Path(tmpfile_path)
#         try:
#             result = textract.process(tmpfile_path)
#         finally:
#             os.remove(tmpfile_path)
#         return result
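# Illustrative sketch (not in the original module): a small side-by-side comparison of
# a few extractors on the same page. The URL is a placeholder and the extractor
# selection is arbitrary.
if __name__ == "__main__":
    url = "https://example.com"
    raw_html = ContentExtractor().url_to_html(url)
    if raw_html:
        for extractor in (Markdownify(), HTML2Text(), Trafilatura()):
            print(f"===== {extractor.__class__.__name__} =====")
            # Some extractors may return None on failure, hence the fallback
            print((extractor.get_text(raw_html) or "")[:300])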