Spaces:
Sleeping
Sleeping
| from __future__ import annotations | |
| import os | |
| import re | |
| import time | |
| import html | |
| from typing import List, Optional | |
| from urllib.parse import urlencode | |
| import httpx | |
| from pydantic import BaseModel, Field, HttpUrl | |
| from fastmcp import FastMCP | |
# Module-level MCP server instance for the "linkedin-jobs" service.
# The bind address (all interfaces) and port are passed at construction time.
mcp = FastMCP(name="linkedin-jobs", host="0.0.0.0", port=7861)
# One job card parsed from LinkedIn search HTML.
# Only `title` and `url` are required; every other field defaults to None when
# the corresponding element is not found on the card.
# (Documented with comments rather than a class docstring so the generated
# model schema stays exactly as before.)
class JobPosting(BaseModel):
    # Required fields.
    title: str = Field(..., description="Job title")
    # Optional card metadata.
    company: Optional[str] = Field(default=None, description="Company name if available")
    location: Optional[str] = Field(default=None, description="Job location if available")
    # Validated as an absolute HTTP(S) URL by pydantic.
    url: HttpUrl = Field(..., description="Direct link to the LinkedIn job page")
    job_id: Optional[str] = Field(default=None, description="LinkedIn job ID parsed from URL, if found")
    listed_text: Optional[str] = Field(
        default=None,
        description="Human-readable posted time text, e.g., '3 days ago'",
    )
| def _default_headers(cookie: Optional[str]) -> dict: | |
| headers = { | |
| "User-Agent": ( | |
| "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) " | |
| "AppleWebKit/537.36 (KHTML, like Gecko) " | |
| "Chrome/125.0.0.0 Safari/537.36" | |
| ), | |
| "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", | |
| "Accept-Language": "en-US,en;q=0.9", | |
| "Cache-Control": "no-cache", | |
| "Pragma": "no-cache", | |
| "Connection": "keep-alive", | |
| } | |
| if cookie: | |
| headers["Cookie"] = cookie | |
| return headers | |
| def _parse_jobs_from_html(html_text: str) -> list[JobPosting]: | |
| try: | |
| from selectolax.parser import HTMLParser | |
| except Exception: | |
| raise RuntimeError( | |
| "selectolax is required. Ensure it is listed in requirements.txt and installed." | |
| ) | |
| tree = HTMLParser(html_text) | |
| jobs: list[JobPosting] = [] | |
| # LinkedIn search uses job cards with these classes | |
| for card in tree.css(".base-search-card, .job-search-card"): | |
| link_el = card.css_first("a.base-card__full-link, a.hidden-nested-link, a") | |
| title_el = card.css_first("h3.base-search-card__title, .base-search-card__title, .sr-only") | |
| company_el = card.css_first( | |
| "h4.base-search-card__subtitle, .base-search-card__subtitle, .job-search-card__subtitle, .hidden-nested-link+div" | |
| ) | |
| location_el = card.css_first(".job-search-card__location, .base-search-card__metadata > .job-search-card__location") | |
| time_el = card.css_first("time, .job-search-card__listdate, .job-search-card__listdate--new") | |
| url = (link_el.attributes.get("href") if link_el else None) or "" | |
| title = (title_el.text(strip=True) if title_el else "").strip() | |
| company = (company_el.text(strip=True) if company_el else None) | |
| location = (location_el.text(strip=True) if location_el else None) | |
| listed_text = (time_el.text(strip=True) if time_el else None) | |
| if not url or not title: | |
| continue | |
| # Clean up HTML entities and whitespace | |
| title = html.unescape(re.sub(r"\s+", " ", title)) | |
| if company: | |
| company = html.unescape(re.sub(r"\s+", " ", company)) | |
| if location: | |
| location = html.unescape(re.sub(r"\s+", " ", location)) | |
| if listed_text: | |
| listed_text = html.unescape(re.sub(r"\s+", " ", listed_text)) | |
| # Derive job id from URL if present: /jobs/view/<id>/ | |
| job_id_match = re.search(r"/jobs/view/(\d+)", url) | |
| job_id = job_id_match.group(1) if job_id_match else None | |
| try: | |
| jobs.append( | |
| JobPosting( | |
| title=title, | |
| company=company, | |
| location=location, | |
| url=url, # type: ignore[arg-type] | |
| job_id=job_id, | |
| listed_text=listed_text, | |
| ) | |
| ) | |
| except Exception: | |
| # Skip malformed entries gracefully | |
| continue | |
| return jobs | |
def _search_page(client: httpx.Client, query: str, location: Optional[str], start: int) -> list[JobPosting]:
    """Fetch and parse one page of LinkedIn job search results.

    The main search page is requested first. Only when ``start`` is positive
    and that page yields no postings is the "see more" fragment endpoint
    tried with the same query parameters.

    Args:
        client: HTTP client used for the requests.
        query: Search keywords.
        location: Optional location filter; omitted from the query when falsy.
        start: Result offset sent as the ``start`` query parameter.

    Returns:
        The postings parsed from whichever response was used; possibly empty.

    Raises:
        httpx.HTTPStatusError: If the main search page returns an error
            status. A non-200 fragment response is ignored instead.
    """
    query_params = {"keywords": query, "start": start}
    if location:
        query_params["location"] = location
    encoded = urlencode(query_params)

    # Main search page (richer HTML for the first 25 results).
    page_resp = client.get(
        "https://www.linkedin.com/jobs/search/?" + encoded,
        follow_redirects=True,
        timeout=20.0,
    )
    page_resp.raise_for_status()
    found = _parse_jobs_from_html(page_resp.text)
    if found or start <= 0:
        return found

    # Later offsets that parsed to nothing: fall back to the fragment endpoint.
    fragment_resp = client.get(
        "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?" + encoded,
        follow_redirects=True,
        timeout=20.0,
    )
    if fragment_resp.status_code != 200:
        return found
    return _parse_jobs_from_html(fragment_resp.text)
def search_linkedin_jobs(query: str, location: Optional[str] = None, limit: int = 25, pages: int = 1) -> List[JobPosting]:
    """
    Search LinkedIn job listings and return the parsed postings.

    - query: Search keywords, e.g. "machine learning engineer"
    - location: Optional location filter, e.g. "Paris, Île-de-France, France"
    - limit: Maximum number of jobs to return (clamped to 1..200)
    - pages: Number of pages to fetch (clamped to 1..8; each page is ~25 results)

    Returns the postings in the order they were found, without repeats
    (de-duplicated by job ID, or by URL when no ID was parsed), truncated to
    the limit. Paging stops early on a 401/403/429 response, on a network
    error, or when a page contributes no new postings.

    Raises httpx.HTTPStatusError for any other HTTP error status. Non-network
    errors (for example the RuntimeError raised when the HTML parser
    dependency is missing) propagate instead of being reported as "no results".

    Note: LinkedIn may throttle or require authentication. You can set the environment
    variable LINKEDIN_COOKIE to a valid cookie string (e.g., including li_at) for better results.
    """
    # NOTE(review): nothing visible in this file registers this function with
    # `mcp` (no tool decorator), so the server may start with no tools exposed.
    # Confirm whether a registration line was lost.
    cookie = os.environ.get("LINKEDIN_COOKIE")
    max_items = max(1, min(limit, 200))
    pages = max(1, min(pages, 8))
    headers = _default_headers(cookie)

    all_jobs: list[JobPosting] = []
    seen_keys: set[str] = set()
    with httpx.Client(headers=headers) as client:
        for page_index in range(pages):
            if page_index:
                # Be polite to avoid rate-limiting; only pause between
                # requests, never after the final one.
                time.sleep(0.8)
            try:
                jobs = _search_page(client, query=query, location=location, start=page_index * 25)
            except httpx.HTTPStatusError as e:
                # If unauthorized or blocked, stop and return what we have.
                if e.response.status_code in (401, 403, 429):
                    break
                raise
            except httpx.HTTPError:
                # Network-level failure (timeout, connection error, ...):
                # stop paging and return what has been collected so far.
                break

            # Pages can overlap; keep only postings not seen before.
            fresh: list[JobPosting] = []
            for job in jobs:
                key = job.job_id or str(job.url)
                if key in seen_keys:
                    continue
                seen_keys.add(key)
                fresh.append(job)

            if not fresh:
                # Nothing new was parsed: stop to avoid hammering the site.
                break
            all_jobs.extend(fresh)
            if len(all_jobs) >= max_items:
                break
    return all_jobs[:max_items]
# Script entry point: start the MCP server only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    mcp.run(transport="http")