Jofthomas committed on
Commit
4969b87
·
verified ·
1 Parent(s): 67e1f99

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +209 -31
app.py CHANGED
@@ -41,12 +41,21 @@ def _default_headers(cookie: Optional[str]) -> dict:
41
  "Cache-Control": "no-cache",
42
  "Pragma": "no-cache",
43
  "Connection": "keep-alive",
 
44
  }
45
  if cookie:
46
  headers["Cookie"] = cookie
47
  return headers
48
 
49
 
 
 
 
 
 
 
 
 
50
  def _parse_jobs_from_html(html_text: str) -> list[JobPosting]:
51
  try:
52
  from selectolax.parser import HTMLParser
@@ -59,20 +68,54 @@ def _parse_jobs_from_html(html_text: str) -> list[JobPosting]:
59
 
60
  jobs: list[JobPosting] = []
61
 
62
- # LinkedIn search uses job cards with these classes
63
- for card in tree.css(".base-search-card, .job-search-card"):
64
- link_el = card.css_first("a.base-card__full-link, a.hidden-nested-link, a")
65
- title_el = card.css_first("h3.base-search-card__title, .base-search-card__title, .sr-only")
66
- company_el = card.css_first(
67
- "h4.base-search-card__subtitle, .base-search-card__subtitle, .job-search-card__subtitle, .hidden-nested-link+div"
68
- )
69
- location_el = card.css_first(".job-search-card__location, .base-search-card__metadata > .job-search-card__location")
70
- time_el = card.css_first("time, .job-search-card__listdate, .job-search-card__listdate--new")
71
 
 
 
 
 
72
  url = (link_el.attributes.get("href") if link_el else None) or ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  title = (title_el.text(strip=True) if title_el else "").strip()
 
 
 
 
 
 
 
 
 
 
74
  company = (company_el.text(strip=True) if company_el else None)
 
 
 
 
 
 
 
75
  location = (location_el.text(strip=True) if location_el else None)
 
 
 
76
  listed_text = (time_el.text(strip=True) if time_el else None)
77
 
78
  if not url or not title:
@@ -87,10 +130,6 @@ def _parse_jobs_from_html(html_text: str) -> list[JobPosting]:
87
  if listed_text:
88
  listed_text = html.unescape(re.sub(r"\s+", " ", listed_text))
89
 
90
- # Derive job id from URL if present: /jobs/view/<id>/
91
- job_id_match = re.search(r"/jobs/view/(\d+)", url)
92
- job_id = job_id_match.group(1) if job_id_match else None
93
-
94
  try:
95
  jobs.append(
96
  JobPosting(
@@ -103,28 +142,141 @@ def _parse_jobs_from_html(html_text: str) -> list[JobPosting]:
103
  )
104
  )
105
  except Exception:
106
- # Skip malformed entries gracefully
107
  continue
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  return jobs
110
 
111
 
112
- def _search_page(client: httpx.Client, query: str, location: Optional[str], start: int) -> list[JobPosting]:
113
- params = {
114
- "keywords": query,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  "start": start,
116
  }
117
  if location:
118
  params["location"] = location
119
-
120
- # First request the main search page (richer HTML for the first 25 results)
121
- url = "https://www.linkedin.com/jobs/search/?" + urlencode(params)
122
- resp = client.get(url, follow_redirects=True, timeout=20.0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  resp.raise_for_status()
124
  jobs = _parse_jobs_from_html(resp.text)
125
 
126
- # For subsequent starts (>0), LinkedIn often uses this fragment endpoint
127
- if start > 0 and len(jobs) == 0:
128
  fragment_url = (
129
  "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?" + urlencode(params)
130
  )
@@ -135,13 +287,31 @@ def _search_page(client: httpx.Client, query: str, location: Optional[str], star
135
  return jobs
136
 
137
 
138
- @mcp.tool(description="Search LinkedIn job listings and return structured job postings. Optionally set LINKEDIN_COOKIE env for authenticated scraping.")
139
- def search_linkedin_jobs(query: str, location: Optional[str] = None, limit: int = 25, pages: int = 1) -> List[JobPosting]:
 
 
 
 
 
 
 
 
 
 
 
 
140
  """
141
  - query: Search keywords, e.g. "machine learning engineer"
142
  - location: Optional location filter, e.g. "Paris, Île-de-France, France"
143
  - limit: Maximum number of jobs to return (<= 200)
144
  - pages: Number of pages to fetch (each page is ~25 results)
 
 
 
 
 
 
145
 
146
  Note: LinkedIn may throttle or require authentication. You can set the environment
147
  variable LINKEDIN_COOKIE to a valid cookie string (e.g., including li_at) for better results.
@@ -156,21 +326,30 @@ def search_linkedin_jobs(query: str, location: Optional[str] = None, limit: int
156
 
157
  with httpx.Client(headers=headers) as client:
158
  start = 0
159
- for page in range(pages):
 
 
 
 
 
 
 
 
 
 
 
 
160
  try:
161
- jobs = _search_page(client, query=query, location=location, start=start)
162
  except httpx.HTTPStatusError as e:
163
- # If unauthorized or blocked, break early
164
  status = e.response.status_code
165
  if status in (401, 403, 429):
166
  break
167
  raise
168
  except Exception:
169
- # transient errors: move to next page
170
  jobs = []
171
 
172
  if not jobs:
173
- # If no jobs were parsed, stop to avoid hammering
174
  break
175
 
176
  all_jobs.extend(jobs)
@@ -178,7 +357,6 @@ def search_linkedin_jobs(query: str, location: Optional[str] = None, limit: int
178
  break
179
 
180
  start += 25
181
- # Be polite to avoid rate-limiting
182
  time.sleep(0.8)
183
 
184
  return all_jobs[:max_items]
 
41
  "Cache-Control": "no-cache",
42
  "Pragma": "no-cache",
43
  "Connection": "keep-alive",
44
+ "Referer": "https://www.linkedin.com/jobs/",
45
  }
46
  if cookie:
47
  headers["Cookie"] = cookie
48
  return headers
49
 
50
 
51
+ def _ensure_absolute_url(href: str) -> str:
52
+ if href.startswith("http://") or href.startswith("https://"):
53
+ return href
54
+ if href.startswith("/"):
55
+ return f"https://www.linkedin.com{href}"
56
+ return f"https://www.linkedin.com/{href}"
57
+
58
+
59
  def _parse_jobs_from_html(html_text: str) -> list[JobPosting]:
60
  try:
61
  from selectolax.parser import HTMLParser
 
68
 
69
  jobs: list[JobPosting] = []
70
 
71
+ # Prefer list items with data-occludable-job-id when available
72
+ cards = tree.css("li[data-occludable-job-id], .base-search-card, .job-search-card")
73
+ for card in cards:
74
+ job_id = card.attributes.get("data-occludable-job-id")
 
 
 
 
 
75
 
76
+ # Link: any anchor pointing to /jobs/view/
77
+ link_el = card.css_first("a[href*='/jobs/view/']") or card.css_first(
78
+ "a.base-card__full-link, a.hidden-nested-link, a"
79
+ )
80
  url = (link_el.attributes.get("href") if link_el else None) or ""
81
+ if url:
82
+ url = _ensure_absolute_url(url)
83
+ if not job_id:
84
+ job_id_match = re.search(r"/jobs/view/(\d+)", url)
85
+ if job_id_match:
86
+ job_id = job_id_match.group(1)
87
+
88
+ # Title
89
+ title_el = (
90
+ card.css_first("h3.base-search-card__title")
91
+ or card.css_first(".base-search-card__title")
92
+ or card.css_first(".job-card-list__title")
93
+ or card.css_first(".sr-only")
94
+ or card.css_first("a[href*='/jobs/view/']")
95
+ )
96
  title = (title_el.text(strip=True) if title_el else "").strip()
97
+
98
+ # Company
99
+ company_el = (
100
+ card.css_first("h4.base-search-card__subtitle")
101
+ or card.css_first(".base-search-card__subtitle")
102
+ or card.css_first(".job-search-card__subtitle")
103
+ or card.css_first(".hidden-nested-link+div")
104
+ or card.css_first(".job-card-container__company-name")
105
+ or card.css_first(".job-card-container__primary-description")
106
+ )
107
  company = (company_el.text(strip=True) if company_el else None)
108
+
109
+ # Location
110
+ location_el = (
111
+ card.css_first(".job-search-card__location")
112
+ or card.css_first(".base-search-card__metadata > .job-search-card__location")
113
+ or card.css_first(".job-card-container__metadata-item")
114
+ )
115
  location = (location_el.text(strip=True) if location_el else None)
116
+
117
+ # Time listed
118
+ time_el = card.css_first("time, .job-search-card__listdate, .job-search-card__listdate--new")
119
  listed_text = (time_el.text(strip=True) if time_el else None)
120
 
121
  if not url or not title:
 
130
  if listed_text:
131
  listed_text = html.unescape(re.sub(r"\s+", " ", listed_text))
132
 
 
 
 
 
133
  try:
134
  jobs.append(
135
  JobPosting(
 
142
  )
143
  )
144
  except Exception:
 
145
  continue
146
 
147
+ # Fallback: grab anchors if no structured cards were detected
148
+ if not jobs:
149
+ anchors = tree.css("a[href*='/jobs/view/']")
150
+ seen_ids: set[str] = set()
151
+ for a in anchors:
152
+ href = a.attributes.get("href") or ""
153
+ if not href:
154
+ continue
155
+ url = _ensure_absolute_url(href)
156
+ job_id_match = re.search(r"/jobs/view/(\d+)", url)
157
+ job_id = job_id_match.group(1) if job_id_match else None
158
+ if job_id and job_id in seen_ids:
159
+ continue
160
+ title = a.text(strip=True)
161
+ if not title:
162
+ title = "LinkedIn Job"
163
+ try:
164
+ jobs.append(
165
+ JobPosting(
166
+ title=title,
167
+ company=None,
168
+ location=None,
169
+ url=url, # type: ignore[arg-type]
170
+ job_id=job_id,
171
+ listed_text=None,
172
+ )
173
+ )
174
+ if job_id:
175
+ seen_ids.add(job_id)
176
+ except Exception:
177
+ continue
178
+
179
  return jobs
180
 
181
 
182
+ # Mapping helpers to align with common notebook tutorials/filters
183
+ _DATE_POSTED_TO_TPR = {
184
+ # keys accepted by our API → LinkedIn f_TPR values
185
+ "past_24_hours": "r86400",
186
+ "past_week": "r604800",
187
+ "past_month": "r2592000",
188
+ }
189
+
190
+ _EXPERIENCE_TO_E = {
191
+ "internship": "1",
192
+ "entry": "2",
193
+ "associate": "3",
194
+ "mid-senior": "4",
195
+ "director": "5",
196
+ "executive": "6",
197
+ }
198
+
199
+ _JOBTYPE_TO_JT = {
200
+ "full-time": "F",
201
+ "part-time": "P",
202
+ "contract": "C",
203
+ "temporary": "T",
204
+ "internship": "I",
205
+ "volunteer": "V",
206
+ "other": "O",
207
+ }
208
+
209
+ _REMOTE_TO_WRA = {
210
+ "on-site": "1",
211
+ "remote": "2",
212
+ "hybrid": "3",
213
+ }
214
+
215
+
216
+ def _build_search_params(
217
+ *,
218
+ keywords: str,
219
+ location: Optional[str],
220
+ start: int,
221
+ sort_by: str = "relevance",
222
+ date_posted: Optional[str] = None,
223
+ experience_levels: Optional[List[str]] = None,
224
+ job_types: Optional[List[str]] = None,
225
+ remote: Optional[str] = None,
226
+ geo_id: Optional[int] = None,
227
+ ) -> dict:
228
+ params: dict = {
229
+ "keywords": keywords,
230
  "start": start,
231
  }
232
  if location:
233
  params["location"] = location
234
+ if geo_id is not None:
235
+ params["geoId"] = str(geo_id)
236
+
237
+ # Sort: relevance (R) or date (DD)
238
+ if sort_by and sort_by.lower() in {"relevance", "date"}:
239
+ params["sortBy"] = "R" if sort_by.lower() == "relevance" else "DD"
240
+
241
+ # Time posted
242
+ if date_posted:
243
+ tpr = _DATE_POSTED_TO_TPR.get(date_posted)
244
+ if tpr:
245
+ params["f_TPR"] = tpr
246
+
247
+ # Experience levels
248
+ if experience_levels:
249
+ codes = [code for key in experience_levels if (code := _EXPERIENCE_TO_E.get(key))]
250
+ if codes:
251
+ params["f_E"] = ",".join(codes)
252
+
253
+ # Job types
254
+ if job_types:
255
+ codes = [code for key in job_types if (code := _JOBTYPE_TO_JT.get(key))]
256
+ if codes:
257
+ params["f_JT"] = ",".join(codes)
258
+
259
+ # Workplace type (on-site / remote / hybrid)
260
+ if remote:
261
+ code = _REMOTE_TO_WRA.get(remote)
262
+ if code:
263
+ params["f_WRA"] = code
264
+
265
+ return params
266
+
267
+
268
+ def _search_page(
269
+ client: httpx.Client,
270
+ *,
271
+ params: dict,
272
+ ) -> list[JobPosting]:
273
+ base_url = "https://www.linkedin.com/jobs/search/?" + urlencode(params)
274
+ resp = client.get(base_url, follow_redirects=True, timeout=20.0)
275
  resp.raise_for_status()
276
  jobs = _parse_jobs_from_html(resp.text)
277
 
278
+ # If nothing parsed, try the fragment endpoint as a fallback regardless of page
279
+ if len(jobs) == 0:
280
  fragment_url = (
281
  "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?" + urlencode(params)
282
  )
 
287
  return jobs
288
 
289
 
290
+ @mcp.tool(description="Search LinkedIn job listings and return structured job postings.")
291
+ def search_linkedin_jobs(
292
+ query: str,
293
+ location: Optional[str] = None,
294
+ limit: int = 25,
295
+ pages: int = 1,
296
+ *,
297
+ sort_by: str = "relevance",
298
+ date_posted: Optional[str] = None,
299
+ experience_levels: Optional[List[str]] = None,
300
+ job_types: Optional[List[str]] = None,
301
+ remote: Optional[str] = None,
302
+ geo_id: Optional[int] = None,
303
+ ) -> List[JobPosting]:
304
  """
305
  - query: Search keywords, e.g. "machine learning engineer"
306
  - location: Optional location filter, e.g. "Paris, Île-de-France, France"
307
  - limit: Maximum number of jobs to return (<= 200)
308
  - pages: Number of pages to fetch (each page is ~25 results)
309
+ - sort_by: "relevance" or "date" (maps to LinkedIn sortBy R/DD)
310
+ - date_posted: one of {"past_24_hours","past_week","past_month"}
311
+ - experience_levels: list of {"internship","entry","associate","mid-senior","director","executive"}
312
+ - job_types: list of {"full-time","part-time","contract","temporary","internship","volunteer","other"}
313
+ - remote: one of {"on-site","remote","hybrid"}
314
+ - geo_id: Optional numeric LinkedIn geoId for precise location targeting
315
 
316
  Note: LinkedIn may throttle or require authentication. You can set the environment
317
  variable LINKEDIN_COOKIE to a valid cookie string (e.g., including li_at) for better results.
 
326
 
327
  with httpx.Client(headers=headers) as client:
328
  start = 0
329
+ for _page in range(pages):
330
+ active_params = _build_search_params(
331
+ keywords=query,
332
+ location=location,
333
+ start=start,
334
+ sort_by=sort_by,
335
+ date_posted=date_posted,
336
+ experience_levels=experience_levels,
337
+ job_types=job_types,
338
+ remote=remote,
339
+ geo_id=geo_id,
340
+ )
341
+
342
  try:
343
+ jobs = _search_page(client, params=active_params)
344
  except httpx.HTTPStatusError as e:
 
345
  status = e.response.status_code
346
  if status in (401, 403, 429):
347
  break
348
  raise
349
  except Exception:
 
350
  jobs = []
351
 
352
  if not jobs:
 
353
  break
354
 
355
  all_jobs.extend(jobs)
 
357
  break
358
 
359
  start += 25
 
360
  time.sleep(0.8)
361
 
362
  return all_jobs[:max_items]