| from __future__ import annotations | |
| import logging | |
| from urllib.parse import unquote | |
| from config import CFG, _SESS | |
| from web_helpers import extract_main_text, fetch_blocked_site | |
| _BINARY = (".jpg", ".jpeg", ".png", ".gif", ".bmp", ".zip", ".tar", | |
| ".gz", ".mp3", ".mp4", ".mkv", ".exe") | |
| _ERROR = ["wrong", "error", "try again"] | |
def _looks_like_error(txt: str) -> bool:
    """Heuristically decide whether *txt* is an error/placeholder page.

    Only short texts (< 300 chars) are considered candidates: real article
    bodies are long, whereas "wrong password" / "try again" style pages are
    short. Matching is case-insensitive so that e.g. "Error" or "Wrong" is
    also caught (the original compared case-sensitively and missed those).
    """
    if len(txt) < 300:
        lowered = txt.lower()
        return any(marker in lowered for marker in _ERROR)
    return False
def fetch_html(url: str) -> str:
    """Fetch *url* and return its extracted main text, prefixed with a tag.

    Returns:
        - "[binary omitted]" for URLs/content-types that look binary,
        - "[Error fetching url: ...]" when the page looks like an error page,
        - "[Retrieved using HTML] <text>" on success,
        - the result of fetch_blocked_site(url) on any transport failure.
    """
    if url.lower().endswith(_BINARY):
        return "[binary omitted]"
    try:
        # Stream so we can cap how much of the body we read. The `with`
        # block guarantees the connection is released even on the early
        # returns or a short read — the original leaked the connection
        # because the streamed response was never closed.
        with _SESS.get(url, stream=True, timeout=(CFG.connect_to, CFG.read_to)) as r:
            r.raise_for_status()
            ctype = (r.headers.get("content-type") or "").lower()
            if "pdf" in ctype or not ("text" in ctype or "html" in ctype):
                return "[binary omitted]"
            raw = r.raw.read(CFG.stream_html_cap, decode_content=True)
            encoding = r.encoding or "utf-8"
        html = raw.decode(encoding, errors="ignore")
        txt = extract_main_text(html).strip()
        if "wikipedia.org" in url:
            # Prepend the article title (the URL slug) when the extractor
            # dropped it, so downstream consumers still see the topic.
            slug = unquote(url.rsplit("/", 1)[-1]).replace("_", " ")
            if slug.lower() not in txt.lower():
                txt = f"{slug}\n\n{txt}"
        if _looks_like_error(txt):
            return f"[Error fetching url: {url}]"
        # Typo fix in the success marker: "Retrived" -> "Retrieved".
        return "[Retrieved using HTML] " + txt
    except Exception as e:
        # Best-effort fallback: log and try the blocked-site fetcher rather
        # than propagating transport errors to the caller.
        logging.error("Generic fetch failed %s: %s", url, e)
        return fetch_blocked_site(url)