| from __future__ import annotations | |
| import logging | |
| from urllib.parse import unquote | |
| from config import CFG, _SESS | |
| from web_helpers import extract_main_text, fetch_blocked_site | |
| _BINARY = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".zip", ".tar", | |
| ".gz", ".mp3", ".mp4", ".mkv", ".exe") | |
| _ERROR = ["wrong", "error", "try again"] | |
def _looks_like_error(txt: str) -> bool:
    """Heuristically decide whether *txt* is an error/placeholder page.

    Only short texts (< 300 chars) are considered candidates: real article
    bodies are long, whereas "wrong password" / "try again" style pages are
    short. Matching is case-insensitive so that e.g. "Error" or "Wrong" is
    also caught (the original compared case-sensitively and missed those).
    """
    if len(txt) < 300:
        lowered = txt.lower()
        return any(marker in lowered for marker in _ERROR)
    return False
def fetch_html(url: str) -> str:
    """Fetch *url* and return its extracted main text, prefixed with a tag.

    Returns:
        - "[binary omitted]" for URLs/content-types that look binary,
        - "[Error fetching url: ...]" when the page looks like an error page,
        - "[Retrieved using HTML] <text>" on success,
        - the result of fetch_blocked_site(url) on any transport failure.
    """
    if url.lower().endswith(_BINARY):
        return "[binary omitted]"
    try:
        # Stream so we can cap how much of the body we read. The `with`
        # block guarantees the connection is released even on the early
        # returns or a short read — the original leaked the connection
        # because the streamed response was never closed.
        with _SESS.get(url, stream=True, timeout=(CFG.connect_to, CFG.read_to)) as r:
            r.raise_for_status()
            ctype = (r.headers.get("content-type") or "").lower()
            if "pdf" in ctype or not ("text" in ctype or "html" in ctype):
                return "[binary omitted]"
            raw = r.raw.read(CFG.stream_html_cap, decode_content=True)
            encoding = r.encoding or "utf-8"
        html = raw.decode(encoding, errors="ignore")
        txt = extract_main_text(html).strip()
        if "wikipedia.org" in url:
            # Prepend the article title (the URL slug) when the extractor
            # dropped it, so downstream consumers still see the topic.
            slug = unquote(url.rsplit("/", 1)[-1]).replace("_", " ")
            if slug.lower() not in txt.lower():
                txt = f"{slug}\n\n{txt}"
        if _looks_like_error(txt):
            return f"[Error fetching url: {url}]"
        # Typo fix in the success marker: "Retrived" -> "Retrieved".
        return "[Retrieved using HTML] " + txt
    except Exception as e:
        # Best-effort fallback: log and try the blocked-site fetcher rather
        # than propagating transport errors to the caller.
        logging.error("Generic fetch failed %s: %s", url, e)
        return fetch_blocked_site(url)