from __future__ import annotations

import os
import re
import time
import html
from typing import List, Optional
from urllib.parse import urlencode

import httpx
from pydantic import BaseModel, Field, HttpUrl
from fastmcp import FastMCP
import logging

# Logging configuration
LOG_LEVEL = os.getenv("LOG_LEVEL", "INFO").upper()
_numeric_level = getattr(logging, LOG_LEVEL, logging.INFO)
logging.basicConfig(
    level=_numeric_level,
    format="%(asctime)s %(levelname)s %(name)s - %(message)s",
)
logger = logging.getLogger("linkedin_mcp")
logger.setLevel(_numeric_level)
if not logger.handlers:
    _handler = logging.StreamHandler()
    _handler.setLevel(_numeric_level)
    _handler.setFormatter(logging.Formatter("%(asctime)s %(levelname)s %(name)s - %(message)s"))
    logger.addHandler(_handler)
    logger.propagate = False

mcp = FastMCP(
    name="linkedin-jobs",
    host="0.0.0.0",
    port=7860,
)


class JobPosting(BaseModel):
    title: str = Field(..., description="Job title")
    company: Optional[str] = Field(None, description="Company name if available")
    location: Optional[str] = Field(None, description="Job location if available")
    url: HttpUrl = Field(..., description="Direct link to the LinkedIn job page")
    job_id: Optional[str] = Field(None, description="LinkedIn job ID parsed from URL, if found")
    listed_text: Optional[str] = Field(None, description="Human-readable posted time text, e.g., '3 days ago'")


def _default_headers(cookie: Optional[str]) -> dict:
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/125.0.0.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache",
        "Connection": "keep-alive",
        "Referer": "https://www.linkedin.com/jobs/",
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "Upgrade-Insecure-Requests": "1",
        "sec-ch-ua": '"Chromium";v="125", "Not.A/Brand";v="24", "Google Chrome";v="125"',
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": '"macOS"',
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Dest": "document",
    }
    if cookie:
        headers["Cookie"] = cookie
    return headers


def _ensure_absolute_url(href: str) -> str:
    if href.startswith("http://") or href.startswith("https://"):
        return href
    if href.startswith("/"):
        return f"https://www.linkedin.com{href}"
    return f"https://www.linkedin.com/{href}"


def _detect_block_or_wall(text: str) -> Optional[str]:
    lowered = text.lower()
    hints = [
        "captcha",
        "are you a robot",
        "robot check",
        "unusual activity",
        "sign in",
        "signin",
        "log in",
        "please sign in",
        "you’re seeing this message because",
        "to view this page, you must",
    ]
    for hint in hints:
        if hint in lowered:
            return hint
    return None


def _summarize_body(text: str, limit: int = 300) -> str:
    collapsed = re.sub(r"\s+", " ", text).strip()
    return collapsed[:limit] + ("…" if len(collapsed) > limit else "")


def _parse_jobs_from_html(html_text: str) -> list[JobPosting]:
    try:
        from selectolax.parser import HTMLParser
    except Exception:
        raise RuntimeError(
            "selectolax is required. Ensure it is listed in requirements.txt and installed."
        )

    tree = HTMLParser(html_text)
    jobs: list[JobPosting] = []

    # Prefer list items with data-occludable-job-id when available
    cards = tree.css("li[data-occludable-job-id], .base-search-card, .job-search-card")
    logger.debug("HTML parse: found %d potential job cards", len(cards))
    for card in cards:
        job_id = card.attributes.get("data-occludable-job-id")

        # Link: any anchor pointing to /jobs/view/
        link_el = card.css_first("a[href*='/jobs/view/']") or card.css_first(
            "a.base-card__full-link, a.hidden-nested-link, a"
        )
        url = (link_el.attributes.get("href") if link_el else None) or ""
        if url:
            url = _ensure_absolute_url(url)
        if not job_id:
            job_id_match = re.search(r"/jobs/view/(\d+)", url)
            if job_id_match:
                job_id = job_id_match.group(1)

        # Title
        title_el = (
            card.css_first("h3.base-search-card__title")
            or card.css_first(".base-search-card__title")
            or card.css_first(".job-card-list__title")
            or card.css_first(".sr-only")
            or card.css_first("a[href*='/jobs/view/']")
        )
        title = (title_el.text(strip=True) if title_el else "").strip()

        # Company
        company_el = (
            card.css_first("h4.base-search-card__subtitle")
            or card.css_first(".base-search-card__subtitle")
            or card.css_first(".job-search-card__subtitle")
            or card.css_first(".hidden-nested-link+div")
            or card.css_first(".job-card-container__company-name")
            or card.css_first(".job-card-container__primary-description")
        )
        company = (company_el.text(strip=True) if company_el else None)

        # Location
        location_el = (
            card.css_first(".job-search-card__location")
            or card.css_first(".base-search-card__metadata > .job-search-card__location")
            or card.css_first(".job-card-container__metadata-item")
        )
        location = (location_el.text(strip=True) if location_el else None)

        # Time listed
        time_el = card.css_first("time, .job-search-card__listdate, .job-search-card__listdate--new")
        listed_text = (time_el.text(strip=True) if time_el else None)

        if not url or not title:
            continue

        # Clean up HTML entities and whitespace
        title = html.unescape(re.sub(r"\s+", " ", title))
        if company:
            company = html.unescape(re.sub(r"\s+", " ", company))
        if location:
            location = html.unescape(re.sub(r"\s+", " ", location))
        if listed_text:
            listed_text = html.unescape(re.sub(r"\s+", " ", listed_text))

        try:
            jobs.append(
                JobPosting(
                    title=title,
                    company=company,
                    location=location,
                    url=url,  # type: ignore[arg-type]
                    job_id=job_id,
                    listed_text=listed_text,
                )
            )
        except Exception:
            continue

    # Fallback: grab anchors if no structured cards were detected
    if not jobs:
        anchors = tree.css("a[href*='/jobs/view/']")
        seen_ids: set[str] = set()
        logger.debug("HTML parse fallback: scanning %d anchors with /jobs/view/", len(anchors))
        for a in anchors:
            href = a.attributes.get("href") or ""
            if not href:
                continue
            url = _ensure_absolute_url(href)
            job_id_match = re.search(r"/jobs/view/(\d+)", url)
            job_id = job_id_match.group(1) if job_id_match else None
            if job_id and job_id in seen_ids:
                continue
            title = a.text(strip=True)
            if not title:
                title = "LinkedIn Job"
            try:
                jobs.append(
                    JobPosting(
                        title=title,
                        company=None,
                        location=None,
                        url=url,  # type: ignore[arg-type]
                        job_id=job_id,
                        listed_text=None,
                    )
                )
                if job_id:
                    seen_ids.add(job_id)
            except Exception:
                continue

    logger.debug("HTML parse complete: %d jobs parsed", len(jobs))
    return jobs


# Mapping helpers to align with common notebook tutorials/filters
_DATE_POSTED_TO_TPR = {
    # keys accepted by our API → LinkedIn f_TPR values
    "past_24_hours": "r86400",
    "past_week": "r604800",
    "past_month": "r2592000",
}

_EXPERIENCE_TO_E = {
    "internship": "1",
    "entry": "2",
    "associate": "3",
    "mid-senior": "4",
    "director": "5",
    "executive": "6",
}

_JOBTYPE_TO_JT = {
    "full-time": "F",
    "part-time": "P",
    "contract": "C",
    "temporary": "T",
    "internship": "I",
    "volunteer": "V",
    "other": "O",
}

_REMOTE_TO_WRA = {
    "on-site": "1",
    "remote": "2",
    "hybrid": "3",
}


def _build_search_params(
    *,
    keywords: str,
    location: Optional[str],
    start: int,
    sort_by: str = "relevance",
    date_posted: Optional[str] = None,
    experience_levels: Optional[List[str]] = None,
    job_types: Optional[List[str]] = None,
    remote: Optional[str] = None,
    geo_id: Optional[int] = None,
) -> dict:
    params: dict = {
        "keywords": keywords,
        "start": start,
    }
    if location:
        params["location"] = location
    if geo_id is not None:
        params["geoId"] = str(geo_id)

    # Sort: relevance (R) or date (DD)
    if sort_by:
        sb = sort_by.lower()
        if sb in {"relevance", "r"}:
            params["sortBy"] = "R"
        elif sb in {"date", "recent", "dd"}:
            params["sortBy"] = "DD"

    # Time posted
    if date_posted:
        tpr = _DATE_POSTED_TO_TPR.get(date_posted)
        if tpr:
            params["f_TPR"] = tpr

    # Experience levels
    if experience_levels:
        codes = [code for key in experience_levels if (code := _EXPERIENCE_TO_E.get(key))]
        if codes:
            params["f_E"] = ",".join(codes)

    # Job types
    if job_types:
        codes = [code for key in job_types if (code := _JOBTYPE_TO_JT.get(key))]
        if codes:
            params["f_JT"] = ",".join(codes)

    # Workplace type (on-site / remote / hybrid)
    if remote:
        code = _REMOTE_TO_WRA.get(remote)
        if code:
            params["f_WRA"] = code

    return params
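

# Illustrative example (an added sketch, not part of the original module): the kind
# of parameter dict the mappings above and _build_search_params are expected to
# produce. The values follow directly from the dictionaries and branches above.
#
#   _build_search_params(
#       keywords="machine learning engineer",
#       location="Paris, Île-de-France, France",
#       start=0,
#       sort_by="date",
#       date_posted="past_week",
#       job_types=["full-time", "contract"],
#       remote="remote",
#   )
#   # -> {"keywords": "machine learning engineer", "start": 0,
#   #     "location": "Paris, Île-de-France, France", "sortBy": "DD",
#   #     "f_TPR": "r604800", "f_JT": "F,C", "f_WRA": "2"}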


def _search_page(
    client: httpx.Client,
    *,
    params: dict,
) -> list[JobPosting]:
    base_url = "https://www.linkedin.com/jobs/search/?" + urlencode(params)
    logger.debug("GET main page: %s", base_url)
    resp = client.get(base_url, follow_redirects=True, timeout=20.0)
    logger.debug(
        "Main page status=%d bytes=%d content-type=%s",
        resp.status_code,
        len(resp.content),
        resp.headers.get("content-type"),
    )
    jobs: list[JobPosting] = []
    if resp.status_code == 200:
        block_hint = _detect_block_or_wall(resp.text)
        if block_hint:
            logger.warning("Main page may be blocked/walled (hint=%r)", block_hint)
        jobs = _parse_jobs_from_html(resp.text)
        logger.debug("Parsed %d jobs from main page", len(jobs))
    elif resp.status_code in (999, 401, 403, 429):
        logger.warning("Main page blocked with status=%d; will try fragment", resp.status_code)
    else:
        # For other errors, raise to caller
        resp.raise_for_status()

    # If nothing parsed, try the fragment endpoint as a fallback regardless of page
    if len(jobs) == 0:
        fragment_url = (
            "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?" + urlencode(params)
        )
        logger.debug("GET fragment fallback: %s", fragment_url)
        frag_resp = client.get(fragment_url, follow_redirects=True, timeout=20.0)
        logger.debug(
            "Fragment status=%d bytes=%d content-type=%s",
            frag_resp.status_code,
            len(frag_resp.content),
            frag_resp.headers.get("content-type"),
        )
        if frag_resp.status_code == 200:
            block_hint = _detect_block_or_wall(frag_resp.text)
            if block_hint:
                logger.warning("Fragment page may be blocked/walled (hint=%r)", block_hint)
            jobs = _parse_jobs_from_html(frag_resp.text)
            logger.debug("Parsed %d jobs from fragment", len(jobs))
        else:
            logger.debug("Fragment request returned status=%d", frag_resp.status_code)
        if len(jobs) == 0:
            logger.info(
                "Zero jobs after main+fragment. Body sample: %s",
                _summarize_body(resp.text if resp is not None and resp.text else (frag_resp.text if frag_resp is not None else "")),
            )
    return jobs


def _search_linkedin_jobs_impl(
    query: str,
    location: Optional[str] = None,
    limit: int = 25,
    pages: int = 1,
    *,
    sort_by: str = "relevance",
    date_posted: Optional[str] = None,
    experience_levels: Optional[List[str]] = None,
    job_types: Optional[List[str]] = None,
    remote: Optional[str] = None,
    geo_id: Optional[int] = None,
) -> List[JobPosting]:
    """
    Search LinkedIn job postings.

    - query: Search keywords, e.g. "machine learning engineer" (keywords, not full sentences)
    - location: Optional location filter, e.g. "Paris, Île-de-France, France"
    - limit: Maximum number of jobs to return (<= 200)
    - pages: Number of pages to fetch (each page is ~25 results)
    - sort_by: "relevance" or "date" (maps to LinkedIn sortBy R/DD)
    - date_posted: one of {"past_24_hours", "past_week", "past_month"}
    - experience_levels: list of {"internship", "entry", "associate", "mid-senior", "director", "executive"}
    - job_types: list of {"full-time", "part-time", "contract", "temporary", "internship", "volunteer", "other"}
    - remote: one of {"on-site", "remote", "hybrid"}
    - geo_id: Optional numeric LinkedIn geoId for precise location targeting

    Note: LinkedIn may throttle or require authentication. You can set the environment
    variable LINKEDIN_COOKIE to a valid cookie string (e.g., including li_at) for better results.
    """
    cookie = os.environ.get("LINKEDIN_COOKIE")
    max_items = max(1, min(limit, 200))
    pages = max(1, min(pages, 8))
    headers = _default_headers(cookie)
    all_jobs: list[JobPosting] = []
    with httpx.Client(headers=headers) as client:
        start = 0
        logger.info(
            "Search start: query=%r location=%r limit=%d pages=%d sort_by=%s date_posted=%s exp=%s job_types=%s remote=%s geo_id=%s cookie_present=%s",
            query,
            location,
            limit,
            pages,
            sort_by,
            date_posted,
            experience_levels,
            job_types,
            remote,
            geo_id,
            bool(cookie),
        )
        for _page in range(pages):
            active_params = _build_search_params(
                keywords=query,
                location=location,
                start=start,
                sort_by=sort_by,
                date_posted=date_posted,
                experience_levels=experience_levels,
                job_types=job_types,
                remote=remote,
                geo_id=geo_id,
            )
            try:
                logger.debug("Page fetch: start=%d params=%s", start, active_params)
                jobs = _search_page(client, params=active_params)
            except httpx.HTTPStatusError as e:
                status = e.response.status_code
                try:
                    failed_url = str(e.request.url)
                except Exception:
                    failed_url = "<unknown>"
                logger.warning("HTTP error status=%d url=%s", status, failed_url)
                if status in (401, 403, 429):
                    logger.info("Stopping due to auth/rate limit status=%d", status)
                    break
                raise
            except Exception as ex:
                logger.exception("Unexpected error during page fetch: %s", ex)
                jobs = []
            if not jobs:
                logger.info("No jobs parsed for start=%d; stopping further requests", start)
                break
            all_jobs.extend(jobs)
            if len(all_jobs) >= max_items:
                logger.info("Reached max_items=%d; stopping pagination", max_items)
                break
            start += 25
            time.sleep(0.8)
    logger.info("Search complete: returning %d jobs", len(all_jobs[:max_items]))
    return all_jobs[:max_items]
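

# Hedged sketch (not in the original section): the log lines below report two
# registered tools, but no @mcp.tool wrapper appears in this section. The
# following is one plausible way the implementation could be exposed as the
# "search_linkedin_jobs" tool named in the log; the decorator form and the
# pass-through signature are assumptions, not the original author's code.
@mcp.tool()
def search_linkedin_jobs(
    query: str,
    location: Optional[str] = None,
    limit: int = 25,
    pages: int = 1,
    sort_by: str = "relevance",
    date_posted: Optional[str] = None,
    experience_levels: Optional[List[str]] = None,
    job_types: Optional[List[str]] = None,
    remote: Optional[str] = None,
    geo_id: Optional[int] = None,
) -> List[JobPosting]:
    """Search LinkedIn job postings; thin wrapper around _search_linkedin_jobs_impl."""
    return _search_linkedin_jobs_impl(
        query,
        location=location,
        limit=limit,
        pages=pages,
        sort_by=sort_by,
        date_posted=date_posted,
        experience_levels=experience_levels,
        job_types=job_types,
        remote=remote,
        geo_id=geo_id,
    )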


# Log tool registration explicitly for visibility in managed environments
logger.info("Tool registered: Linkedin_demo_search_linkedin_jobs")
logger.info("Tool registered: search_linkedin_jobs")


if __name__ == "__main__":
    logger.info("Starting linkedin-jobs MCP server (streamable-http) on %s:%s", "0.0.0.0", 7860)
    mcp.run(transport="streamable-http")
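

# Usage sketch (assumption: this file is the Space's entry point, e.g. app.py,
# and the dependencies fastmcp, httpx, pydantic, and selectolax are installed):
#
#   LOG_LEVEL=DEBUG LINKEDIN_COOKIE='li_at=...; JSESSIONID=...' python app.py
#
# With the streamable-http transport, FastMCP typically serves the MCP endpoint
# under /mcp (the exact path depends on the installed fastmcp version), so a
# client would connect to http://<host>:7860/mcp. LINKEDIN_COOKIE is optional;
# without it LinkedIn may throttle or return a sign-in wall, which
# _detect_block_or_wall reports as a warning.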