fetch: add private-host allowlist + env switch; include resolved IPs in error and recheck redirects; document env vars in README
Browse files
README.md
CHANGED
|
@@ -44,6 +44,14 @@ pip install -r requirements.txt
|
|
| 44 |
```
|
| 45 |
If unset, the app automatically prefers `/data` when available, otherwise `./data`.
|
| 46 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 47 |
The request counters live in `<DATA_DIR>/request_counts.json`, guarded by a file lock to support concurrent MCP calls.
|
| 48 |
|
| 49 |
## Running Locally
|
|
|
|
| 44 |
```
|
| 45 |
If unset, the app automatically prefers `/data` when available, otherwise `./data`.
|
| 46 |
|
| 47 |
+
3. (Optional) Control private/local address policy for `fetch`:
|
| 48 |
+
- `FETCH_ALLOW_PRIVATE` — set to `1`/`true` to disable the SSRF guard entirely (not recommended except for trusted, local testing).
|
| 49 |
+
- `FETCH_PRIVATE_ALLOWLIST` — comma/space separated host patterns allowed even if they resolve to private/local IPs, e.g.:
|
| 50 |
+
```bash
|
| 51 |
+
export FETCH_PRIVATE_ALLOWLIST="*.corp.local, my-proxy.internal"
|
| 52 |
+
```
|
| 53 |
+
If neither is set, the fetcher refuses URLs whose host resolves to private, loopback, link‑local, multicast, reserved, or unspecified addresses. It also re-checks the final redirect target.
|
| 54 |
+
|
| 55 |
The request counters live in `<DATA_DIR>/request_counts.json`, guarded by a file lock to support concurrent MCP calls.
|
| 56 |
|
| 57 |
## Running Locally
|
app.py
CHANGED
|
@@ -6,6 +6,7 @@ import asyncio
|
|
| 6 |
import ipaddress
|
| 7 |
import socket
|
| 8 |
from typing import Optional, Dict, Any, List, Tuple
|
|
|
|
| 9 |
from urllib.parse import urlsplit
|
| 10 |
from datetime import datetime, timezone
|
| 11 |
|
|
@@ -84,6 +85,23 @@ EXTRACT_CONCURRENCY = max(
|
|
| 84 |
SEARCH_CACHE_TTL = max(0, int(os.getenv("SEARCH_CACHE_TTL", "30")))
|
| 85 |
FETCH_CACHE_TTL = max(0, int(os.getenv("FETCH_CACHE_TTL", "300")))
|
| 86 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 87 |
_search_cache: Dict[Tuple[str, str, int], Dict[str, Any]] = {}
|
| 88 |
_fetch_cache: Dict[str, Dict[str, Any]] = {}
|
| 89 |
_search_cache_lock: Optional[asyncio.Lock] = None
|
|
@@ -162,20 +180,39 @@ def _client_ip(request: Optional[gr.Request]) -> str:
|
|
| 162 |
return "unknown"
|
| 163 |
|
| 164 |
|
| 165 |
-
|
|
|
|
| 166 |
if not host:
|
| 167 |
return False
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 168 |
|
|
|
|
| 169 |
def _resolve() -> List[str]:
|
| 170 |
try:
|
| 171 |
return list({ai[4][0] for ai in socket.getaddrinfo(host, None)})
|
| 172 |
except Exception:
|
| 173 |
return []
|
| 174 |
|
| 175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
if not addresses:
|
| 177 |
-
|
| 178 |
-
return True
|
| 179 |
|
| 180 |
for addr in addresses:
|
| 181 |
ip_obj = ipaddress.ip_address(addr)
|
|
@@ -187,8 +224,8 @@ async def _host_is_public(host: str) -> bool:
|
|
| 187 |
or ip_obj.is_reserved
|
| 188 |
or ip_obj.is_unspecified
|
| 189 |
):
|
| 190 |
-
return False
|
| 191 |
-
return True
|
| 192 |
|
| 193 |
|
| 194 |
async def _check_rate_limits(bucket: str, ip: str) -> Optional[str]:
|
|
@@ -374,9 +411,14 @@ async def fetch(
|
|
| 374 |
await record_request("fetch")
|
| 375 |
return cached
|
| 376 |
|
| 377 |
-
|
|
|
|
| 378 |
await record_request("fetch")
|
| 379 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 380 |
|
| 381 |
fetch_sema = _get_semaphore("fetch")
|
| 382 |
await fetch_sema.acquire()
|
|
@@ -397,6 +439,20 @@ async def fetch(
|
|
| 397 |
fetch_sema.release()
|
| 398 |
|
| 399 |
truncated = total > FETCH_MAX_BYTES
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 400 |
text = body.decode(encoding, errors="ignore")
|
| 401 |
|
| 402 |
extract_sema = _get_semaphore("extract")
|
|
|
|
| 6 |
import ipaddress
|
| 7 |
import socket
|
| 8 |
from typing import Optional, Dict, Any, List, Tuple
|
| 9 |
+
import fnmatch
|
| 10 |
from urllib.parse import urlsplit
|
| 11 |
from datetime import datetime, timezone
|
| 12 |
|
|
|
|
| 85 |
SEARCH_CACHE_TTL = max(0, int(os.getenv("SEARCH_CACHE_TTL", "30")))
|
| 86 |
FETCH_CACHE_TTL = max(0, int(os.getenv("FETCH_CACHE_TTL", "300")))
|
| 87 |
|
| 88 |
+
# Controls for private/local address handling in fetch()
|
| 89 |
+
def _env_flag(name: str, default: bool = False) -> bool:
|
| 90 |
+
"""Parse boolean-like env vars such as 1/true/yes/on."""
|
| 91 |
+
v = os.getenv(name)
|
| 92 |
+
if v is None:
|
| 93 |
+
return default
|
| 94 |
+
return str(v).strip().lower() in {"1", "true", "yes", "on", "y"}
|
| 95 |
+
|
| 96 |
+
# When True, allow any destination (disables SSRF guard — not recommended)
|
| 97 |
+
FETCH_ALLOW_PRIVATE = _env_flag("FETCH_ALLOW_PRIVATE", False)
|
| 98 |
+
|
| 99 |
+
# Optional comma/space separated host patterns to allow even if private, e.g.:
|
| 100 |
+
# FETCH_PRIVATE_ALLOWLIST="*.internal.example.com, my-proxy.local"
|
| 101 |
+
FETCH_PRIVATE_ALLOWLIST = [
|
| 102 |
+
p for p in re.split(r"[\s,]+", os.getenv("FETCH_PRIVATE_ALLOWLIST", "").strip()) if p
|
| 103 |
+
]
|
| 104 |
+
|
| 105 |
_search_cache: Dict[Tuple[str, str, int], Dict[str, Any]] = {}
|
| 106 |
_fetch_cache: Dict[str, Dict[str, Any]] = {}
|
| 107 |
_search_cache_lock: Optional[asyncio.Lock] = None
|
|
|
|
| 180 |
return "unknown"
|
| 181 |
|
| 182 |
|
| 183 |
+
def _host_matches_allowlist(host: str) -> bool:
|
| 184 |
+
"""Return True if host matches any pattern in FETCH_PRIVATE_ALLOWLIST."""
|
| 185 |
if not host:
|
| 186 |
return False
|
| 187 |
+
for pat in FETCH_PRIVATE_ALLOWLIST:
|
| 188 |
+
# Support bare host equality and fnmatch-style patterns (*.foo.bar)
|
| 189 |
+
if host == pat or fnmatch.fnmatch(host, pat):
|
| 190 |
+
return True
|
| 191 |
+
return False
|
| 192 |
+
|
| 193 |
|
| 194 |
+
async def _resolve_addresses(host: str) -> List[str]:
|
| 195 |
def _resolve() -> List[str]:
|
| 196 |
try:
|
| 197 |
return list({ai[4][0] for ai in socket.getaddrinfo(host, None)})
|
| 198 |
except Exception:
|
| 199 |
return []
|
| 200 |
|
| 201 |
+
return await asyncio.to_thread(_resolve)
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
async def _host_is_public(host: str) -> Tuple[bool, List[str]]:
|
| 205 |
+
"""Return (is_public, resolved_addresses).
|
| 206 |
+
|
| 207 |
+
- If resolution fails, treat as public and let HTTP request decide.
|
| 208 |
+
- Honors allowlist/env flags via the caller.
|
| 209 |
+
"""
|
| 210 |
+
if not host:
|
| 211 |
+
return False, []
|
| 212 |
+
|
| 213 |
+
addresses = await _resolve_addresses(host)
|
| 214 |
if not addresses:
|
| 215 |
+
return True, []
|
|
|
|
| 216 |
|
| 217 |
for addr in addresses:
|
| 218 |
ip_obj = ipaddress.ip_address(addr)
|
|
|
|
| 224 |
or ip_obj.is_reserved
|
| 225 |
or ip_obj.is_unspecified
|
| 226 |
):
|
| 227 |
+
return False, addresses
|
| 228 |
+
return True, addresses
|
| 229 |
|
| 230 |
|
| 231 |
async def _check_rate_limits(bucket: str, ip: str) -> Optional[str]:
|
|
|
|
| 411 |
await record_request("fetch")
|
| 412 |
return cached
|
| 413 |
|
| 414 |
+
is_public, addrs = await _host_is_public(host)
|
| 415 |
+
if not is_public and not (FETCH_ALLOW_PRIVATE or _host_matches_allowlist(host)):
|
| 416 |
await record_request("fetch")
|
| 417 |
+
detail = f" (resolved: {', '.join(addrs)})" if addrs else ""
|
| 418 |
+
return {
|
| 419 |
+
"error": "Refusing to fetch private or local addresses." + detail,
|
| 420 |
+
"host": host,
|
| 421 |
+
}
|
| 422 |
|
| 423 |
fetch_sema = _get_semaphore("fetch")
|
| 424 |
await fetch_sema.acquire()
|
|
|
|
| 439 |
fetch_sema.release()
|
| 440 |
|
| 441 |
truncated = total > FETCH_MAX_BYTES
|
| 442 |
+
# Extra guard: if final URL host ended up private due to a redirect and
|
| 443 |
+
# the user hasn't allowed private hosts, refuse to return body content.
|
| 444 |
+
try:
|
| 445 |
+
final_host = urlsplit(final_url_str).hostname or ""
|
| 446 |
+
except Exception:
|
| 447 |
+
final_host = ""
|
| 448 |
+
if final_host and not (FETCH_ALLOW_PRIVATE or _host_matches_allowlist(final_host)):
|
| 449 |
+
final_public, _ = await _host_is_public(final_host)
|
| 450 |
+
if not final_public:
|
| 451 |
+
await record_request("fetch")
|
| 452 |
+
return {
|
| 453 |
+
"error": "Refusing to fetch private or local addresses after redirect.",
|
| 454 |
+
"host": final_host,
|
| 455 |
+
}
|
| 456 |
text = body.decode(encoding, errors="ignore")
|
| 457 |
|
| 458 |
extract_sema = _get_semaphore("extract")
|