app.py (CHANGED)

@@ -11,6 +11,16 @@ import httpx
 from pydantic import BaseModel, Field, HttpUrl

 from fastmcp import FastMCP
+import logging
+
+# Logging configuration
+LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper()
+_numeric_level = getattr(logging, LOG_LEVEL, logging.INFO)
+logging.basicConfig(
+    level=_numeric_level,
+    format="%(asctime)s %(levelname)s %(name)s - %(message)s",
+)
+logger = logging.getLogger("linkedin_mcp")


 mcp = FastMCP(
@@ -70,6 +80,7 @@ def _parse_jobs_from_html(html_text: str) -> list[JobPosting]:

     # Prefer list items with data-occludable-job-id when available
     cards = tree.css("li[data-occludable-job-id], .base-search-card, .job-search-card")
+    logger.debug("HTML parse: found %d potential job cards", len(cards))
     for card in cards:
         job_id = card.attributes.get("data-occludable-job-id")

@@ -148,6 +159,7 @@ def _parse_jobs_from_html(html_text: str) -> list[JobPosting]:
     if not jobs:
         anchors = tree.css("a[href*='/jobs/view/']")
         seen_ids: set[str] = set()
+        logger.debug("HTML parse fallback: scanning %d anchors with /jobs/view/", len(anchors))
         for a in anchors:
             href = a.attributes.get("href") or ""
             if not href:
@@ -176,6 +188,7 @@ def _parse_jobs_from_html(html_text: str) -> list[JobPosting]:
             except Exception:
                 continue

+    logger.debug("HTML parse complete: %d jobs parsed", len(jobs))
     return jobs


@@ -271,18 +284,26 @@ def _search_page(
     params: dict,
 ) -> list[JobPosting]:
     base_url = "https://www.linkedin.com/jobs/search/?" + urlencode(params)
+    logger.debug("GET main page: %s", base_url)
     resp = client.get(base_url, follow_redirects=True, timeout=20.0)
     resp.raise_for_status()
+    logger.debug("Main page status=%d bytes=%d", resp.status_code, len(resp.content))
     jobs = _parse_jobs_from_html(resp.text)
+    logger.debug("Parsed %d jobs from main page", len(jobs))

     # If nothing parsed, try the fragment endpoint as a fallback regardless of page
     if len(jobs) == 0:
         fragment_url = (
             "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?" + urlencode(params)
         )
+        logger.debug("GET fragment fallback: %s", fragment_url)
         frag_resp = client.get(fragment_url, follow_redirects=True, timeout=20.0)
         if frag_resp.status_code == 200:
+            logger.debug("Fragment status=%d bytes=%d", frag_resp.status_code, len(frag_resp.content))
             jobs = _parse_jobs_from_html(frag_resp.text)
+            logger.debug("Parsed %d jobs from fragment", len(jobs))
+        else:
+            logger.debug("Fragment request returned status=%d", frag_resp.status_code)

     return jobs

@@ -326,6 +347,20 @@ def search_linkedin_jobs(

     with httpx.Client(headers=headers) as client:
         start = 0
+        logger.info(
+            "Search start: query=%r location=%r limit=%d pages=%d sort_by=%s date_posted=%s exp=%s job_types=%s remote=%s geo_id=%s cookie_present=%s",
+            query,
+            location,
+            limit,
+            pages,
+            sort_by,
+            date_posted,
+            experience_levels,
+            job_types,
+            remote,
+            geo_id,
+            bool(cookie),
+        )
         for _page in range(pages):
             active_params = _build_search_params(
                 keywords=query,
@@ -340,27 +375,39 @@ def search_linkedin_jobs(
             )

             try:
+                logger.debug("Page fetch: start=%d params=%s", start, active_params)
                 jobs = _search_page(client, params=active_params)
             except httpx.HTTPStatusError as e:
                 status = e.response.status_code
+                try:
+                    failed_url = str(e.request.url)
+                except Exception:
+                    failed_url = "<unknown>"
+                logger.warning("HTTP error status=%d url=%s", status, failed_url)
                 if status in (401, 403, 429):
+                    logger.info("Stopping due to auth/rate limit status=%d", status)
                     break
                 raise
-            except Exception:
+            except Exception as ex:
+                logger.exception("Unexpected error during page fetch: %s", ex)
                 jobs = []

             if not jobs:
+                logger.info("No jobs parsed for start=%d; stopping further requests", start)
                 break

             all_jobs.extend(jobs)
             if len(all_jobs) >= max_items:
+                logger.info("Reached max_items=%d; stopping pagination", max_items)
                 break

             start += 25
             time.sleep(0.8)

+    logger.info("Search complete: returning %d jobs", len(all_jobs[:max_items]))
     return all_jobs[:max_items]


 if __name__ == "__main__":
+    logger.info("Starting linkedin-jobs MCP server")
     mcp.run(transport="http")
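
For reference, below is a minimal standalone sketch of the env-driven logging setup this change introduces. It is not part of the diff and only assumes the Python standard library; it mirrors the pattern in app.py so the behaviour can be checked in isolation. Launching the Space with LOG_LEVEL=DEBUG in the environment should surface the new logger.debug lines, while the default remains INFO.

# Standalone sketch of the logging pattern added in this diff (not app.py itself).
import logging
import os

# LOG_LEVEL is read from the environment exactly as app.py now does,
# falling back to INFO for unset or unrecognised values.
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper()
logging.basicConfig(
    level=getattr(logging, LOG_LEVEL, logging.INFO),
    format="%(asctime)s %(levelname)s %(name)s - %(message)s",
)

logger = logging.getLogger("linkedin_mcp")
logger.debug("emitted only when LOG_LEVEL=DEBUG")
logger.info("emitted at the default INFO level")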