File size: 1,752 Bytes
5ab87e0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
from __future__ import annotations
from config import CFG, _SESS
import io, logging, re, pymupdf as fitz

from web_helpers import retry, fetch_blocked_site     # ⬅️ shared
# ----------------------------------------------------------------------

class PDFExtractError(RuntimeError):
    """Raised when a PDF cannot be opened or yields no extractable text."""

@retry
def _download_pdf(url: str) -> bytes:
    """Download *url* and return its body, bounded by ``CFG.pdf_size_cap``.

    The response is streamed in 16 KiB chunks so an oversized body is
    rejected as soon as the cap is crossed instead of being read in full.
    The function is wrapped by the shared ``retry`` decorator imported from
    ``web_helpers``; the retry policy itself is defined there.

    Args:
        url: Address of the PDF to download.

    Returns:
        The raw bytes of the response body.

    Raises:
        RuntimeError: If the declared ``Content-Length`` or the number of
            bytes actually received exceeds ``CFG.pdf_size_cap``.
        Exception: Whatever ``raise_for_status()`` or the session raises for
            HTTP / transport failures.
    """
    with _SESS.get(url, stream=True, timeout=(CFG.connect_to, CFG.read_to)) as r:
        r.raise_for_status()
        # A missing, empty or malformed Content-Length is treated as
        # "unknown"; the streaming check below still enforces the cap.
        try:
            declared = int(r.headers.get("content-length", 0) or 0)
        except (TypeError, ValueError):
            declared = 0
        # Early rejection based on the size the server announced.
        if declared > 0 and declared > CFG.pdf_size_cap:
            raise RuntimeError("pdf too large")
        buf = io.BytesIO()
        for chunk in r.iter_content(16_384):
            buf.write(chunk)
            # The header may be absent or wrong, so count the real bytes too.
            if buf.tell() > CFG.pdf_size_cap:
                raise RuntimeError("pdf exceeds cap")
        return buf.getvalue()

def _extract_pdf(buf: bytes) -> str:
    """Extract plain text from in-memory PDF bytes using PyMuPDF.

    Pages are read in order until either ``CFG.pdf_pages_cap`` pages have
    been collected or the accumulated text exceeds ``CFG.pdf_chars_cap``
    characters. The joined result is truncated to ``CFG.pdf_chars_cap``.

    Args:
        buf: Raw bytes of a PDF document.

    Returns:
        The extracted text prefixed with ``"[Retrieved from PyMUPDF]"``.

    Raises:
        PDFExtractError: If the document cannot be opened, or if no text
            could be extracted (e.g. an empty or image-only PDF).
    """
    try:
        doc = fitz.open(stream=buf, filetype="pdf")
    except Exception as e:
        # Chain explicitly so the original parser error stays attached.
        raise PDFExtractError(e) from e
    parts, chars = [], 0
    try:
        for page in doc:
            if len(parts) >= CFG.pdf_pages_cap:
                break
            text = (
                page.get_text("text")
                .replace("\u00A0", " ")  # non-breaking space -> plain space
                .replace("-\n", "")      # re-join words hyphenated at line ends
            )
            parts.append(text)
            chars += len(text)
            if chars > CFG.pdf_chars_cap:
                break
    finally:
        # Release the document on every path, including extraction errors.
        doc.close()
    out = " ".join(parts).strip()[:CFG.pdf_chars_cap]
    if not out:
        raise PDFExtractError("empty / scanned pdf")
    return "[Retrieved from PyMUPDF]" + out

def fetch_pdf(url: str) -> str:
    """Return the text of the PDF at *url*, with a best-effort fallback.

    The PDF is downloaded and its text extracted. If either step raises,
    the failure is logged and the result of ``fetch_blocked_site(url)``,
    truncated to ``CFG.text_cap`` characters, is returned instead.
    """
    try:
        pdf_bytes = _download_pdf(url)
        extracted = _extract_pdf(pdf_bytes)
        return extracted
    except Exception as err:
        # Deliberate catch-all: any failure switches to the fallback fetcher.
        logging.error("PDF fetch failed for %s: %s", url, err)
        fallback_text = fetch_blocked_site(url)
        return fallback_text[:CFG.text_cap]