from __future__ import annotations

import os
import re
import time
import html
from typing import List, Optional
from urllib.parse import urlencode

import httpx
from pydantic import BaseModel, Field, HttpUrl
from fastmcp import FastMCP

mcp = FastMCP(
    name="linkedin-jobs",
    host="0.0.0.0",
    port=7860,
)

class JobPosting(BaseModel):
    title: str = Field(..., description="Job title")
    company: Optional[str] = Field(None, description="Company name if available")
    location: Optional[str] = Field(None, description="Job location if available")
    url: HttpUrl = Field(..., description="Direct link to the LinkedIn job page")
    job_id: Optional[str] = Field(None, description="LinkedIn job ID parsed from URL, if found")
    listed_text: Optional[str] = Field(None, description="Human-readable posted time text, e.g., '3 days ago'")

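# For illustration only (hypothetical values), a parsed posting might serialize to:
#   {"title": "Machine Learning Engineer", "company": "Acme",
#    "location": "Paris, France",
#    "url": "https://www.linkedin.com/jobs/view/1234567890",
#    "job_id": "1234567890", "listed_text": "3 days ago"}
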
def _default_headers(cookie: Optional[str]) -> dict:
    """Build browser-like request headers, attaching the LinkedIn cookie when provided."""
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/125.0.0.0 Safari/537.36"
        ),
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-US,en;q=0.9",
        "Cache-Control": "no-cache",
        "Pragma": "no-cache",
        "Connection": "keep-alive",
        "Referer": "https://www.linkedin.com/jobs/",
    }
    if cookie:
        headers["Cookie"] = cookie
    return headers

def _ensure_absolute_url(href: str) -> str:
    """Normalize hrefs to absolute URLs, e.g. '/jobs/view/123' -> 'https://www.linkedin.com/jobs/view/123'."""
    if href.startswith("http://") or href.startswith("https://"):
        return href
    if href.startswith("/"):
        return f"https://www.linkedin.com{href}"
    return f"https://www.linkedin.com/{href}"

def _parse_jobs_from_html(html_text: str) -> list[JobPosting]:
    """Parse job cards from a LinkedIn search results page (or guest fragment) into JobPosting objects."""
    try:
        from selectolax.parser import HTMLParser
    except Exception:
        raise RuntimeError(
            "selectolax is required. Ensure it is listed in requirements.txt and installed."
        )

    tree = HTMLParser(html_text)
    jobs: list[JobPosting] = []

    # Prefer list items with data-occludable-job-id when available
    cards = tree.css("li[data-occludable-job-id], .base-search-card, .job-search-card")
    for card in cards:
        job_id = card.attributes.get("data-occludable-job-id")

        # Link: any anchor pointing to /jobs/view/
        link_el = card.css_first("a[href*='/jobs/view/']") or card.css_first(
            "a.base-card__full-link, a.hidden-nested-link, a"
        )
        url = (link_el.attributes.get("href") if link_el else None) or ""
        if url:
            url = _ensure_absolute_url(url)
        if not job_id:
            job_id_match = re.search(r"/jobs/view/(\d+)", url)
            if job_id_match:
                job_id = job_id_match.group(1)

        # Title
        title_el = (
            card.css_first("h3.base-search-card__title")
            or card.css_first(".base-search-card__title")
            or card.css_first(".job-card-list__title")
            or card.css_first(".sr-only")
            or card.css_first("a[href*='/jobs/view/']")
        )
        title = (title_el.text(strip=True) if title_el else "").strip()

        # Company
        company_el = (
            card.css_first("h4.base-search-card__subtitle")
            or card.css_first(".base-search-card__subtitle")
            or card.css_first(".job-search-card__subtitle")
            or card.css_first(".hidden-nested-link+div")
            or card.css_first(".job-card-container__company-name")
            or card.css_first(".job-card-container__primary-description")
        )
        company = company_el.text(strip=True) if company_el else None

        # Location
        location_el = (
            card.css_first(".job-search-card__location")
            or card.css_first(".base-search-card__metadata > .job-search-card__location")
            or card.css_first(".job-card-container__metadata-item")
        )
        location = location_el.text(strip=True) if location_el else None

        # Time listed
        time_el = card.css_first("time, .job-search-card__listdate, .job-search-card__listdate--new")
        listed_text = time_el.text(strip=True) if time_el else None

        if not url or not title:
            continue

        # Clean up HTML entities and whitespace
        title = html.unescape(re.sub(r"\s+", " ", title))
        if company:
            company = html.unescape(re.sub(r"\s+", " ", company))
        if location:
            location = html.unescape(re.sub(r"\s+", " ", location))
        if listed_text:
            listed_text = html.unescape(re.sub(r"\s+", " ", listed_text))

        try:
            jobs.append(
                JobPosting(
                    title=title,
                    company=company,
                    location=location,
                    url=url,  # type: ignore[arg-type]
                    job_id=job_id,
                    listed_text=listed_text,
                )
            )
        except Exception:
            continue

    # Fallback: grab anchors if no structured cards were detected
    if not jobs:
        anchors = tree.css("a[href*='/jobs/view/']")
        seen_ids: set[str] = set()
        for a in anchors:
            href = a.attributes.get("href") or ""
            if not href:
                continue
            url = _ensure_absolute_url(href)
            job_id_match = re.search(r"/jobs/view/(\d+)", url)
            job_id = job_id_match.group(1) if job_id_match else None
            if job_id and job_id in seen_ids:
                continue
            title = a.text(strip=True)
            if not title:
                title = "LinkedIn Job"
            try:
                jobs.append(
                    JobPosting(
                        title=title,
                        company=None,
                        location=None,
                        url=url,  # type: ignore[arg-type]
                        job_id=job_id,
                        listed_text=None,
                    )
                )
                if job_id:
                    seen_ids.add(job_id)
            except Exception:
                continue

    return jobs

# Mapping helpers to align with common notebook tutorials/filters
_DATE_POSTED_TO_TPR = {
    # keys accepted by our API → LinkedIn f_TPR values
    "past_24_hours": "r86400",
    "past_week": "r604800",
    "past_month": "r2592000",
}

_EXPERIENCE_TO_E = {
    "internship": "1",
    "entry": "2",
    "associate": "3",
    "mid-senior": "4",
    "director": "5",
    "executive": "6",
}

_JOBTYPE_TO_JT = {
    "full-time": "F",
    "part-time": "P",
    "contract": "C",
    "temporary": "T",
    "internship": "I",
    "volunteer": "V",
    "other": "O",
}

_REMOTE_TO_WRA = {
    "on-site": "1",
    "remote": "2",
    "hybrid": "3",
}

def _build_search_params(
    *,
    keywords: str,
    location: Optional[str],
    start: int,
    sort_by: str = "relevance",
    date_posted: Optional[str] = None,
    experience_levels: Optional[List[str]] = None,
    job_types: Optional[List[str]] = None,
    remote: Optional[str] = None,
    geo_id: Optional[int] = None,
) -> dict:
    """Translate friendly filter names into LinkedIn search query parameters."""
    params: dict = {
        "keywords": keywords,
        "start": start,
    }
    if location:
        params["location"] = location
    if geo_id is not None:
        params["geoId"] = str(geo_id)

    # Sort: relevance (R) or date (DD)
    if sort_by and sort_by.lower() in {"relevance", "date"}:
        params["sortBy"] = "R" if sort_by.lower() == "relevance" else "DD"

    # Time posted
    if date_posted:
        tpr = _DATE_POSTED_TO_TPR.get(date_posted)
        if tpr:
            params["f_TPR"] = tpr

    # Experience levels
    if experience_levels:
        codes = [code for key in experience_levels if (code := _EXPERIENCE_TO_E.get(key))]
        if codes:
            params["f_E"] = ",".join(codes)

    # Job types
    if job_types:
        codes = [code for key in job_types if (code := _JOBTYPE_TO_JT.get(key))]
        if codes:
            params["f_JT"] = ",".join(codes)

    # Workplace type (on-site / remote / hybrid)
    if remote:
        code = _REMOTE_TO_WRA.get(remote)
        if code:
            params["f_WRA"] = code

    return params

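# Illustrative example (values are hypothetical, not taken from a real search): calling
#   _build_search_params(keywords="machine learning engineer",
#                        location="Paris, Île-de-France, France",
#                        start=0, sort_by="date", date_posted="past_week", remote="remote")
# yields roughly
#   {"keywords": "machine learning engineer", "start": 0,
#    "location": "Paris, Île-de-France, France", "sortBy": "DD",
#    "f_TPR": "r604800", "f_WRA": "2"}
# which urlencode() then turns into the query string appended to the search URL.
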
def _search_page(
    client: httpx.Client,
    *,
    params: dict,
) -> list[JobPosting]:
    """Fetch one results page and parse it, falling back to the guest fragment endpoint if needed."""
    base_url = "https://www.linkedin.com/jobs/search/?" + urlencode(params)
    resp = client.get(base_url, follow_redirects=True, timeout=20.0)
    resp.raise_for_status()
    jobs = _parse_jobs_from_html(resp.text)

    # If nothing parsed, try the fragment endpoint as a fallback regardless of page
    if len(jobs) == 0:
        fragment_url = (
            "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?" + urlencode(params)
        )
        frag_resp = client.get(fragment_url, follow_redirects=True, timeout=20.0)
        if frag_resp.status_code == 200:
            jobs = _parse_jobs_from_html(frag_resp.text)
    return jobs

# Register the search as an MCP tool so connected clients can call it.
@mcp.tool()
def search_linkedin_jobs(
    query: str,
    location: Optional[str] = None,
    limit: int = 25,
    pages: int = 1,
    *,
    sort_by: str = "relevance",
    date_posted: Optional[str] = None,
    experience_levels: Optional[List[str]] = None,
    job_types: Optional[List[str]] = None,
    remote: Optional[str] = None,
    geo_id: Optional[int] = None,
) -> List[JobPosting]:
| """ | |
| - query: Search keywords, e.g. "machine learning engineer" | |
| - location: Optional location filter, e.g. "Paris, Île-de-France, France" | |
| - limit: Maximum number of jobs to return (<= 200) | |
| - pages: Number of pages to fetch (each page is ~25 results) | |
| - sort_by: "relevance" or "date" (maps to LinkedIn sortBy R/DD) | |
| - date_posted: one of {"past_24_hours","past_week","past_month"} | |
| - experience_levels: list of {"internship","entry","associate","mid-senior","director","executive"} | |
| - job_types: list of {"full-time","part-time","contract","temporary","internship","volunteer","other"} | |
| - remote: one of {"on-site","remote","hybrid"} | |
| - geo_id: Optional numeric LinkedIn geoId for precise location targeting | |
| Note: LinkedIn may throttle or require authentication. You can set the environment | |
| variable LINKEDIN_COOKIE to a valid cookie string (e.g., including li_at) for better results. | |
| """ | |
    cookie = os.environ.get("LINKEDIN_COOKIE")
    max_items = max(1, min(limit, 200))
    pages = max(1, min(pages, 8))
    headers = _default_headers(cookie)

    all_jobs: list[JobPosting] = []
    with httpx.Client(headers=headers) as client:
        start = 0
        for _page in range(pages):
            active_params = _build_search_params(
                keywords=query,
                location=location,
                start=start,
                sort_by=sort_by,
                date_posted=date_posted,
                experience_levels=experience_levels,
                job_types=job_types,
                remote=remote,
                geo_id=geo_id,
            )
            try:
                jobs = _search_page(client, params=active_params)
            except httpx.HTTPStatusError as e:
                status = e.response.status_code
                if status in (401, 403, 429):
                    # Authentication required or rate-limited: stop paginating and return what we have
                    break
                raise
            except Exception:
                jobs = []
            if not jobs:
                break
            all_jobs.extend(jobs)
            if len(all_jobs) >= max_items:
                break
            start += 25
            time.sleep(0.8)
    return all_jobs[:max_items]

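# Example argument set for this tool (values are illustrative): a search for recent remote
# contract roles might use
#   query="data engineer", location="Berlin, Germany", limit=10,
#   sort_by="date", date_posted="past_week", job_types=["contract"], remote="remote"
# which the helpers above translate into sortBy=DD, f_TPR=r604800, f_JT=C, f_WRA=2.
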
if __name__ == "__main__":
    mcp.run(transport="http")
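
# A minimal client sketch, assuming the HTTP transport serves FastMCP's default "/mcp"
# path on port 7860 (adjust the URL for your deployment; not executed here):
#
#   import asyncio
#   from fastmcp import Client
#
#   async def main() -> None:
#       async with Client("http://localhost:7860/mcp") as client:
#           result = await client.call_tool(
#               "search_linkedin_jobs",
#               {"query": "machine learning engineer", "location": "Paris", "limit": 5},
#           )
#           print(result)
#
#   asyncio.run(main())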