Jofthomas committed on
Commit
4969b87
·
verified ·
1 Parent(s): 67e1f99

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +209 -31
app.py CHANGED
@@ -41,12 +41,21 @@ def _default_headers(cookie: Optional[str]) -> dict:
41
  "Cache-Control": "no-cache",
42
  "Pragma": "no-cache",
43
  "Connection": "keep-alive",
 
44
  }
45
  if cookie:
46
  headers["Cookie"] = cookie
47
  return headers
48
 
49
 
 
 
 
 
 
 
 
 
50
  def _parse_jobs_from_html(html_text: str) -> list[JobPosting]:
51
  try:
52
  from selectolax.parser import HTMLParser
@@ -59,20 +68,54 @@ def _parse_jobs_from_html(html_text: str) -> list[JobPosting]:
59
 
60
  jobs: list[JobPosting] = []
61
 
62
- # LinkedIn search uses job cards with these classes
63
- for card in tree.css(".base-search-card, .job-search-card"):
64
- link_el = card.css_first("a.base-card__full-link, a.hidden-nested-link, a")
65
- title_el = card.css_first("h3.base-search-card__title, .base-search-card__title, .sr-only")
66
- company_el = card.css_first(
67
- "h4.base-search-card__subtitle, .base-search-card__subtitle, .job-search-card__subtitle, .hidden-nested-link+div"
68
- )
69
- location_el = card.css_first(".job-search-card__location, .base-search-card__metadata > .job-search-card__location")
70
- time_el = card.css_first("time, .job-search-card__listdate, .job-search-card__listdate--new")
71
 
 
 
 
 
72
  url = (link_el.attributes.get("href") if link_el else None) or ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
73
  title = (title_el.text(strip=True) if title_el else "").strip()
 
 
 
 
 
 
 
 
 
 
74
  company = (company_el.text(strip=True) if company_el else None)
 
 
 
 
 
 
 
75
  location = (location_el.text(strip=True) if location_el else None)
 
 
 
76
  listed_text = (time_el.text(strip=True) if time_el else None)
77
 
78
  if not url or not title:
@@ -87,10 +130,6 @@ def _parse_jobs_from_html(html_text: str) -> list[JobPosting]:
87
  if listed_text:
88
  listed_text = html.unescape(re.sub(r"\s+", " ", listed_text))
89
 
90
- # Derive job id from URL if present: /jobs/view/<id>/
91
- job_id_match = re.search(r"/jobs/view/(\d+)", url)
92
- job_id = job_id_match.group(1) if job_id_match else None
93
-
94
  try:
95
  jobs.append(
96
  JobPosting(
@@ -103,28 +142,141 @@ def _parse_jobs_from_html(html_text: str) -> list[JobPosting]:
103
  )
104
  )
105
  except Exception:
106
- # Skip malformed entries gracefully
107
  continue
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  return jobs
110
 
111
 
112
- def _search_page(client: httpx.Client, query: str, location: Optional[str], start: int) -> list[JobPosting]:
113
- params = {
114
- "keywords": query,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
115
  "start": start,
116
  }
117
  if location:
118
  params["location"] = location
119
-
120
- # First request the main search page (richer HTML for the first 25 results)
121
- url = "https://www.linkedin.com/jobs/search/?" + urlencode(params)
122
- resp = client.get(url, follow_redirects=True, timeout=20.0)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  resp.raise_for_status()
124
  jobs = _parse_jobs_from_html(resp.text)
125
 
126
- # For subsequent starts (>0), LinkedIn often uses this fragment endpoint
127
- if start > 0 and len(jobs) == 0:
128
  fragment_url = (
129
  "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?" + urlencode(params)
130
  )
@@ -135,13 +287,31 @@ def _search_page(client: httpx.Client, query: str, location: Optional[str], star
135
  return jobs
136
 
137
 
138
- @mcp.tool(description="Search LinkedIn job listings and return structured job postings. Optionally set LINKEDIN_COOKIE env for authenticated scraping.")
139
- def search_linkedin_jobs(query: str, location: Optional[str] = None, limit: int = 25, pages: int = 1) -> List[JobPosting]:
 
 
 
 
 
 
 
 
 
 
 
 
140
  """
141
  - query: Search keywords, e.g. "machine learning engineer"
142
  - location: Optional location filter, e.g. "Paris, Île-de-France, France"
143
  - limit: Maximum number of jobs to return (<= 200)
144
  - pages: Number of pages to fetch (each page is ~25 results)
 
 
 
 
 
 
145
 
146
  Note: LinkedIn may throttle or require authentication. You can set the environment
147
  variable LINKEDIN_COOKIE to a valid cookie string (e.g., including li_at) for better results.
@@ -156,21 +326,30 @@ def search_linkedin_jobs(query: str, location: Optional[str] = None, limit: int
156
 
157
  with httpx.Client(headers=headers) as client:
158
  start = 0
159
- for page in range(pages):
 
 
 
 
 
 
 
 
 
 
 
 
160
  try:
161
- jobs = _search_page(client, query=query, location=location, start=start)
162
  except httpx.HTTPStatusError as e:
163
- # If unauthorized or blocked, break early
164
  status = e.response.status_code
165
  if status in (401, 403, 429):
166
  break
167
  raise
168
  except Exception:
169
- # transient errors: move to next page
170
  jobs = []
171
 
172
  if not jobs:
173
- # If no jobs were parsed, stop to avoid hammering
174
  break
175
 
176
  all_jobs.extend(jobs)
@@ -178,7 +357,6 @@ def search_linkedin_jobs(query: str, location: Optional[str] = None, limit: int
178
  break
179
 
180
  start += 25
181
- # Be polite to avoid rate-limiting
182
  time.sleep(0.8)
183
 
184
  return all_jobs[:max_items]
 
41
  "Cache-Control": "no-cache",
42
  "Pragma": "no-cache",
43
  "Connection": "keep-alive",
44
+ "Referer": "https://www.linkedin.com/jobs/",
45
  }
46
  if cookie:
47
  headers["Cookie"] = cookie
48
  return headers
49
 
50
 
51
+ def _ensure_absolute_url(href: str) -> str:
52
+ if href.startswith("http://") or href.startswith("https://"):
53
+ return href
54
+ if href.startswith("/"):
55
+ return f"https://www.linkedin.com{href}"
56
+ return f"https://www.linkedin.com/{href}"
57
+
58
+
59
  def _parse_jobs_from_html(html_text: str) -> list[JobPosting]:
60
  try:
61
  from selectolax.parser import HTMLParser
 
68
 
69
  jobs: list[JobPosting] = []
70
 
71
+ # Prefer list items with data-occludable-job-id when available
72
+ cards = tree.css("li[data-occludable-job-id], .base-search-card, .job-search-card")
73
+ for card in cards:
74
+ job_id = card.attributes.get("data-occludable-job-id")
 
 
 
 
 
75
 
76
+ # Link: any anchor pointing to /jobs/view/
77
+ link_el = card.css_first("a[href*='/jobs/view/']") or card.css_first(
78
+ "a.base-card__full-link, a.hidden-nested-link, a"
79
+ )
80
  url = (link_el.attributes.get("href") if link_el else None) or ""
81
+ if url:
82
+ url = _ensure_absolute_url(url)
83
+ if not job_id:
84
+ job_id_match = re.search(r"/jobs/view/(\d+)", url)
85
+ if job_id_match:
86
+ job_id = job_id_match.group(1)
87
+
88
+ # Title
89
+ title_el = (
90
+ card.css_first("h3.base-search-card__title")
91
+ or card.css_first(".base-search-card__title")
92
+ or card.css_first(".job-card-list__title")
93
+ or card.css_first(".sr-only")
94
+ or card.css_first("a[href*='/jobs/view/']")
95
+ )
96
  title = (title_el.text(strip=True) if title_el else "").strip()
97
+
98
+ # Company
99
+ company_el = (
100
+ card.css_first("h4.base-search-card__subtitle")
101
+ or card.css_first(".base-search-card__subtitle")
102
+ or card.css_first(".job-search-card__subtitle")
103
+ or card.css_first(".hidden-nested-link+div")
104
+ or card.css_first(".job-card-container__company-name")
105
+ or card.css_first(".job-card-container__primary-description")
106
+ )
107
  company = (company_el.text(strip=True) if company_el else None)
108
+
109
+ # Location
110
+ location_el = (
111
+ card.css_first(".job-search-card__location")
112
+ or card.css_first(".base-search-card__metadata > .job-search-card__location")
113
+ or card.css_first(".job-card-container__metadata-item")
114
+ )
115
  location = (location_el.text(strip=True) if location_el else None)
116
+
117
+ # Time listed
118
+ time_el = card.css_first("time, .job-search-card__listdate, .job-search-card__listdate--new")
119
  listed_text = (time_el.text(strip=True) if time_el else None)
120
 
121
  if not url or not title:
 
130
  if listed_text:
131
  listed_text = html.unescape(re.sub(r"\s+", " ", listed_text))
132
 
 
 
 
 
133
  try:
134
  jobs.append(
135
  JobPosting(
 
142
  )
143
  )
144
  except Exception:
 
145
  continue
146
 
147
+ # Fallback: grab anchors if no structured cards were detected
148
+ if not jobs:
149
+ anchors = tree.css("a[href*='/jobs/view/']")
150
+ seen_ids: set[str] = set()
151
+ for a in anchors:
152
+ href = a.attributes.get("href") or ""
153
+ if not href:
154
+ continue
155
+ url = _ensure_absolute_url(href)
156
+ job_id_match = re.search(r"/jobs/view/(\d+)", url)
157
+ job_id = job_id_match.group(1) if job_id_match else None
158
+ if job_id and job_id in seen_ids:
159
+ continue
160
+ title = a.text(strip=True)
161
+ if not title:
162
+ title = "LinkedIn Job"
163
+ try:
164
+ jobs.append(
165
+ JobPosting(
166
+ title=title,
167
+ company=None,
168
+ location=None,
169
+ url=url, # type: ignore[arg-type]
170
+ job_id=job_id,
171
+ listed_text=None,
172
+ )
173
+ )
174
+ if job_id:
175
+ seen_ids.add(job_id)
176
+ except Exception:
177
+ continue
178
+
179
  return jobs
180
 
181
 
182
+ # Mapping helpers to align with common notebook tutorials/filters
183
+ _DATE_POSTED_TO_TPR = {
184
+ # keys accepted by our API → LinkedIn f_TPR values
185
+ "past_24_hours": "r86400",
186
+ "past_week": "r604800",
187
+ "past_month": "r2592000",
188
+ }
189
+
190
+ _EXPERIENCE_TO_E = {
191
+ "internship": "1",
192
+ "entry": "2",
193
+ "associate": "3",
194
+ "mid-senior": "4",
195
+ "director": "5",
196
+ "executive": "6",
197
+ }
198
+
199
+ _JOBTYPE_TO_JT = {
200
+ "full-time": "F",
201
+ "part-time": "P",
202
+ "contract": "C",
203
+ "temporary": "T",
204
+ "internship": "I",
205
+ "volunteer": "V",
206
+ "other": "O",
207
+ }
208
+
209
+ _REMOTE_TO_WRA = {
210
+ "on-site": "1",
211
+ "remote": "2",
212
+ "hybrid": "3",
213
+ }
214
+
215
+
216
+ def _build_search_params(
217
+ *,
218
+ keywords: str,
219
+ location: Optional[str],
220
+ start: int,
221
+ sort_by: str = "relevance",
222
+ date_posted: Optional[str] = None,
223
+ experience_levels: Optional[List[str]] = None,
224
+ job_types: Optional[List[str]] = None,
225
+ remote: Optional[str] = None,
226
+ geo_id: Optional[int] = None,
227
+ ) -> dict:
228
+ params: dict = {
229
+ "keywords": keywords,
230
  "start": start,
231
  }
232
  if location:
233
  params["location"] = location
234
+ if geo_id is not None:
235
+ params["geoId"] = str(geo_id)
236
+
237
+ # Sort: relevance (R) or date (DD)
238
+ if sort_by and sort_by.lower() in {"relevance", "date"}:
239
+ params["sortBy"] = "R" if sort_by.lower() == "relevance" else "DD"
240
+
241
+ # Time posted
242
+ if date_posted:
243
+ tpr = _DATE_POSTED_TO_TPR.get(date_posted)
244
+ if tpr:
245
+ params["f_TPR"] = tpr
246
+
247
+ # Experience levels
248
+ if experience_levels:
249
+ codes = [code for key in experience_levels if (code := _EXPERIENCE_TO_E.get(key))]
250
+ if codes:
251
+ params["f_E"] = ",".join(codes)
252
+
253
+ # Job types
254
+ if job_types:
255
+ codes = [code for key in job_types if (code := _JOBTYPE_TO_JT.get(key))]
256
+ if codes:
257
+ params["f_JT"] = ",".join(codes)
258
+
259
+ # Workplace type (on-site / remote / hybrid)
260
+ if remote:
261
+ code = _REMOTE_TO_WRA.get(remote)
262
+ if code:
263
+ params["f_WRA"] = code
264
+
265
+ return params
266
+
267
+
268
+ def _search_page(
269
+ client: httpx.Client,
270
+ *,
271
+ params: dict,
272
+ ) -> list[JobPosting]:
273
+ base_url = "https://www.linkedin.com/jobs/search/?" + urlencode(params)
274
+ resp = client.get(base_url, follow_redirects=True, timeout=20.0)
275
  resp.raise_for_status()
276
  jobs = _parse_jobs_from_html(resp.text)
277
 
278
+ # If nothing parsed, try the fragment endpoint as a fallback regardless of page
279
+ if len(jobs) == 0:
280
  fragment_url = (
281
  "https://www.linkedin.com/jobs-guest/jobs/api/seeMoreJobPostings/search?" + urlencode(params)
282
  )
 
287
  return jobs
288
 
289
 
290
+ @mcp.tool(description="Search LinkedIn job listings and return structured job postings.")
291
+ def search_linkedin_jobs(
292
+ query: str,
293
+ location: Optional[str] = None,
294
+ limit: int = 25,
295
+ pages: int = 1,
296
+ *,
297
+ sort_by: str = "relevance",
298
+ date_posted: Optional[str] = None,
299
+ experience_levels: Optional[List[str]] = None,
300
+ job_types: Optional[List[str]] = None,
301
+ remote: Optional[str] = None,
302
+ geo_id: Optional[int] = None,
303
+ ) -> List[JobPosting]:
304
  """
305
  - query: Search keywords, e.g. "machine learning engineer"
306
  - location: Optional location filter, e.g. "Paris, Île-de-France, France"
307
  - limit: Maximum number of jobs to return (<= 200)
308
  - pages: Number of pages to fetch (each page is ~25 results)
309
+ - sort_by: "relevance" or "date" (maps to LinkedIn sortBy R/DD)
310
+ - date_posted: one of {"past_24_hours","past_week","past_month"}
311
+ - experience_levels: list of {"internship","entry","associate","mid-senior","director","executive"}
312
+ - job_types: list of {"full-time","part-time","contract","temporary","internship","volunteer","other"}
313
+ - remote: one of {"on-site","remote","hybrid"}
314
+ - geo_id: Optional numeric LinkedIn geoId for precise location targeting
315
 
316
  Note: LinkedIn may throttle or require authentication. You can set the environment
317
  variable LINKEDIN_COOKIE to a valid cookie string (e.g., including li_at) for better results.
 
326
 
327
  with httpx.Client(headers=headers) as client:
328
  start = 0
329
+ for _page in range(pages):
330
+ active_params = _build_search_params(
331
+ keywords=query,
332
+ location=location,
333
+ start=start,
334
+ sort_by=sort_by,
335
+ date_posted=date_posted,
336
+ experience_levels=experience_levels,
337
+ job_types=job_types,
338
+ remote=remote,
339
+ geo_id=geo_id,
340
+ )
341
+
342
  try:
343
+ jobs = _search_page(client, params=active_params)
344
  except httpx.HTTPStatusError as e:
 
345
  status = e.response.status_code
346
  if status in (401, 403, 429):
347
  break
348
  raise
349
  except Exception:
 
350
  jobs = []
351
 
352
  if not jobs:
 
353
  break
354
 
355
  all_jobs.extend(jobs)
 
357
  break
358
 
359
  start += 25
 
360
  time.sleep(0.8)
361
 
362
  return all_jobs[:max_items]