navkast committed on
Commit
dce8143
·
unverified ·
1 Parent(s): 85c3ddd

Async LinkedIn scraper (#5)

Browse files
src/vsp/app/bindings.py CHANGED
@@ -1,6 +1,8 @@
1
  from vsp.app.prompts.prompt_loader import PromptLoader
 
2
  from vsp.llm.openai.openai import AsyncOpenAIService
3
  from vsp.llm.openai.openai_model import OpenAIModel
4
 
5
  prompt_loader = PromptLoader()
6
  open_ai_service = AsyncOpenAIService(OpenAIModel.GPT_4_MINI)
 
 
1
  from vsp.app.prompts.prompt_loader import PromptLoader
2
+ from vsp.app.scrapers.linkedin_downloader import LinkedinDownloader
3
  from vsp.llm.openai.openai import AsyncOpenAIService
4
  from vsp.llm.openai.openai_model import OpenAIModel
5
 
6
  prompt_loader = PromptLoader()
7
  open_ai_service = AsyncOpenAIService(OpenAIModel.GPT_4_MINI)
8
# Module-level LinkedinDownloader instance, built once when this module is imported.
linkedin_downloader: LinkedinDownloader = LinkedinDownloader()
src/vsp/app/scrapers/__init__.py ADDED
File without changes
src/vsp/app/scrapers/linkedin_downloader.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ linkedin_downloader.py
3
+
4
+ This module provides functionality for asynchronously downloading LinkedIn profile data
5
+ using the RapidAPI LinkedIn API. It handles authentication, concurrency limiting, and retrying
6
+ of requests in case of failures.
7
+
8
+ Classes:
9
+ LinkedInFetchFailedError: Custom exception for LinkedIn fetch failures.
10
+ LinkedinDownloader: Main class for downloading LinkedIn profile data.
11
+
12
+ Usage:
13
+ downloader = LinkedinDownloader()
14
+ profile = await downloader.fetch_linkedin_data("https://www.linkedin.com/in/username/")
15
+ """
16
+
17
+ import asyncio
18
+ from typing import Final
19
+
20
+ import aiohttp
21
+ from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential
22
+
23
+ from vsp.app.model.linkedin.linkedin_models import LinkedinProfile
24
+ from vsp.shared import aws_clients, config, logger_factory
25
+
26
+ logger = logger_factory.get_logger(__name__)
27
+
28
+
29
class LinkedInFetchFailedError(Exception):
    """Raised when the LinkedIn RapidAPI endpoint answers with a non-200 status."""


class LinkedinDownloader:
    """
    Asynchronously download LinkedIn profile data through the RapidAPI LinkedIn API.

    The downloader authenticates with a RapidAPI key, caps the number of
    in-flight requests with a semaphore, and retries failed requests.

    Attributes:
        _URL (Final[str]): The RapidAPI endpoint for LinkedIn data.
        _X_RAPIDAPI_HOST (Final[str]): The RapidAPI host for LinkedIn API.
        _api_key (str): The RapidAPI key for authentication.
        _semaphore (asyncio.Semaphore): Semaphore for limiting concurrent requests.
    """

    _URL: Final[str] = "https://linkedin-api8.p.rapidapi.com/"
    _X_RAPIDAPI_HOST: Final[str] = "linkedin-api8.p.rapidapi.com"

    def __init__(self, max_concurrency: int = 10) -> None:
        """
        Initialize the LinkedinDownloader.

        Args:
            max_concurrency (int): Maximum number of concurrent API calls. Defaults to 10.

        Raises:
            ValueError: If the API key is not found in the Parameter Store.
            RuntimeError: If there's an error accessing the Parameter Store.
        """
        self._api_key = self._fetch_api_key()
        self._semaphore = asyncio.Semaphore(max_concurrency)

    @staticmethod
    def _fetch_api_key() -> str:
        """
        Fetch the LinkedIn RapidAPI key from AWS Parameter Store.

        Returns:
            str: The LinkedIn RapidAPI key.

        Raises:
            ValueError: If the API key is not found in the Parameter Store.
            RuntimeError: If there's an error accessing the Parameter Store.
        """
        try:
            return aws_clients.fetch_from_parameter_store(config.get_linkedin_key_path(), is_secret=True)
        except aws_clients.ParameterNotFoundError as e:
            logger.error("LinkedIn RapidAPI key not found in Parameter Store", error=str(e))
            raise ValueError("LinkedIn RapidAPI key not found") from e
        except aws_clients.ParameterStoreAccessError as e:
            logger.error("Error accessing Parameter Store", error=str(e))
            raise RuntimeError("Unable to access LinkedIn RapidAPI key") from e

    @staticmethod
    def _extract_username(linkedin_url: str) -> str:
        """
        Return the last non-empty path segment of a LinkedIn profile URL.

        Query strings, fragments and any number of trailing slashes are ignored,
        so "https://www.linkedin.com/in/user/?utm=x" yields "user". A URL with
        no path segments yields an empty string.

        Args:
            linkedin_url (str): The LinkedIn profile URL (or a bare username).

        Returns:
            str: The username portion of the URL.
        """
        # Imported locally so this block does not depend on the file's import header.
        from urllib.parse import urlsplit

        path = urlsplit(linkedin_url.strip()).path
        segments = [segment for segment in path.split("/") if segment]
        return segments[-1] if segments else ""

    def _compose_request(self, linkedin_url: str) -> tuple[dict[str, str], dict[str, str]]:
        """
        Compose the request headers and query parameters for the API call.

        Args:
            linkedin_url (str): The LinkedIn profile URL.

        Returns:
            tuple[dict[str, str], dict[str, str]]: A tuple containing the headers and query parameters.
        """
        querystring = {"username": self._extract_username(linkedin_url)}
        headers = {"X-RapidAPI-Key": self._api_key, "X-RapidAPI-Host": self._X_RAPIDAPI_HOST}
        return (headers, querystring)

    @retry(
        wait=wait_random_exponential(min=1, max=60),
        stop=stop_after_attempt(3),
        # Transport-level failures are transient too, so they are retried alongside
        # non-200 responses; with reraise=True the last exception keeps its own type.
        retry=retry_if_exception_type((LinkedInFetchFailedError, aiohttp.ClientError, asyncio.TimeoutError)),
        reraise=True,
    )
    async def fetch_linkedin_data(self, linkedin_url: str) -> LinkedinProfile:
        """
        Fetch LinkedIn profile data for a given URL.

        The call is attempted up to 3 times with randomized exponential backoff
        when the API answers with a non-200 status or the request fails at the
        transport level. The semaphore is held only for the duration of a single
        attempt, not while waiting between attempts.

        Args:
            linkedin_url (str): The LinkedIn profile URL to fetch data for.

        Returns:
            LinkedinProfile: The fetched LinkedIn profile data.

        Raises:
            LinkedInFetchFailedError: If the API still answers with a non-200 status on the final attempt.
            aiohttp.ClientError: If the request still fails at the transport level on the final attempt.
            asyncio.TimeoutError: If the request still times out on the final attempt.
        """
        async with self._semaphore:
            headers, querystring = self._compose_request(linkedin_url)
            logger.info("Fetching LinkedIn profile", url=linkedin_url)
            async with aiohttp.ClientSession() as session:
                async with session.get(self._URL, headers=headers, params=querystring) as response:
                    if response.status != 200:
                        logger.error("Failed to fetch LinkedIn profile", url=linkedin_url, status=response.status)
                        raise LinkedInFetchFailedError(f"Failed to fetch LinkedIn profile for {linkedin_url}")
                    data = await response.json()
                    return LinkedinProfile.profile_from_json(data)
src/vsp/llm/openai/openai.py CHANGED
@@ -77,7 +77,7 @@ class AsyncOpenAIService(LLMService):
77
  try:
78
  return aws_clients.fetch_from_parameter_store(config.get_openai_api_key_path(), is_secret=True)
79
  except aws_clients.ParameterNotFoundError as e:
80
- logger.error("API key not found in Parameter Store", error=str(e))
81
  raise ValueError("OpenAI API key not found") from e
82
  except aws_clients.ParameterStoreAccessError as e:
83
  logger.error("Error accessing Parameter Store", error=str(e))
 
77
  try:
78
  return aws_clients.fetch_from_parameter_store(config.get_openai_api_key_path(), is_secret=True)
79
  except aws_clients.ParameterNotFoundError as e:
80
+ logger.error("OpenAI API key not found in Parameter Store", error=str(e))
81
  raise ValueError("OpenAI API key not found") from e
82
  except aws_clients.ParameterStoreAccessError as e:
83
  logger.error("Error accessing Parameter Store", error=str(e))
src/vsp/shared/config.py CHANGED
@@ -91,3 +91,13 @@ def get_openai_api_key_path() -> str:
91
  """
92
  config = _get_config()
93
  return str(config["openai"]["openai_api_key_parameter_store_path"])
 
 
 
 
 
 
 
 
 
 
 
91
  """
92
  config = _get_config()
93
  return str(config["openai"]["openai_api_key_parameter_store_path"])
94
+
95
+
96
@cache
def get_linkedin_key_path() -> str:
    """
    Return the Parameter Store path of the LinkedIn RapidAPI key.

    The path is read from the "linkedin" section of the loaded configuration;
    the key itself is stored in AWS Parameter Store, not in the config file.
    """
    linkedin_section = _get_config()["linkedin"]
    return str(linkedin_section["linkedin_api_key_parameter_store_path"])
src/vsp/shared/config.toml CHANGED
@@ -6,4 +6,7 @@ shared_services_aws_account = "851725506657"
6
  bedrock_aws_account = "339713101814"
7
 
8
  [openai]
9
- openai_api_key_parameter_store_path = "/secrets/openai/api_key"
 
 
 
 
6
  bedrock_aws_account = "339713101814"
7
 
8
  [openai]
9
+ openai_api_key_parameter_store_path = "/secrets/openai/api_key"
10
+
11
+ [linkedin]
12
+ linkedin_api_key_parameter_store_path = "/secrets/rapidapi/linkedin"
tests/vsp/app/scrapers/__init__.py ADDED
File without changes
tests/vsp/app/scrapers/test_integration_linkedin_downloader.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+
3
+ import pytest
4
+
5
+ from vsp.app.scrapers.linkedin_downloader import LinkedinDownloader
6
+ from vsp.shared import logger_factory
7
+
8
+ logger = logger_factory.get_logger(__name__)
9
+
10
+
11
@pytest.mark.asyncio
async def test_fetch_linkedin_data_integration():
    """
    Integration test for LinkedinDownloader.

    Fetches the LinkedIn profile of 'navkast' through the real downloader and
    checks the name and education fields of the result. It requires a valid
    RapidAPI key to be set in the AWS Parameter Store.

    Note: Run this sparingly to avoid unnecessary API calls and potential rate limiting.
    """
    linkedin_url = "https://www.linkedin.com/in/navkast/"
    downloader = LinkedinDownloader()

    try:
        profile = await downloader.fetch_linkedin_data(linkedin_url)

        # Record what came back before asserting on it.
        logger.info("Fetched LinkedIn profile", first_name=profile.first_name, last_name=profile.last_name)

        assert profile.first_name == "Naveen"
        assert profile.last_name == "K."
        school_names = [education.school_name for education in profile.educations]
        assert "University of Pennsylvania" in school_names
        logger.info("Integration test passed successfully")
    except Exception as failure:
        # Log every failure (including assertion failures), then let it propagate.
        logger.error("Integration test failed", error=str(failure))
        raise


if __name__ == "__main__":
    # Allows running the integration test directly, without pytest.
    asyncio.run(test_fetch_linkedin_data_integration())