navkast
committed on
Async LinkedIn scraper (#5)
Browse files- src/vsp/app/bindings.py +2 -0
- src/vsp/app/scrapers/__init__.py +0 -0
- src/vsp/app/scrapers/linkedin_downloader.py +128 -0
- src/vsp/llm/openai/openai.py +1 -1
- src/vsp/shared/config.py +10 -0
- src/vsp/shared/config.toml +4 -1
- tests/vsp/app/scrapers/__init__.py +0 -0
- tests/vsp/app/scrapers/test_integration_linkedin_downloader.py +41 -0
src/vsp/app/bindings.py
CHANGED
|
@@ -1,6 +1,8 @@
|
|
| 1 |
from vsp.app.prompts.prompt_loader import PromptLoader
|
|
|
|
| 2 |
from vsp.llm.openai.openai import AsyncOpenAIService
|
| 3 |
from vsp.llm.openai.openai_model import OpenAIModel
|
| 4 |
|
| 5 |
prompt_loader = PromptLoader()
|
| 6 |
open_ai_service = AsyncOpenAIService(OpenAIModel.GPT_4_MINI)
|
|
|
|
|
|
| 1 |
from vsp.app.prompts.prompt_loader import PromptLoader
|
| 2 |
+
from vsp.app.scrapers.linkedin_downloader import LinkedinDownloader
|
| 3 |
from vsp.llm.openai.openai import AsyncOpenAIService
|
| 4 |
from vsp.llm.openai.openai_model import OpenAIModel
|
| 5 |
|
| 6 |
prompt_loader = PromptLoader()
|
| 7 |
open_ai_service = AsyncOpenAIService(OpenAIModel.GPT_4_MINI)
|
| 8 |
+
linkedin_downloader = LinkedinDownloader()
|
src/vsp/app/scrapers/__init__.py
ADDED
|
File without changes
|
src/vsp/app/scrapers/linkedin_downloader.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
linkedin_downloader.py
|
| 3 |
+
|
| 4 |
+
This module provides functionality for asynchronously downloading LinkedIn profile data
|
| 5 |
+
using the RapidAPI LinkedIn API. It handles authentication, rate limiting, and retrying
|
| 6 |
+
of requests in case of failures.
|
| 7 |
+
|
| 8 |
+
Classes:
|
| 9 |
+
LinkedInFetchFailedError: Custom exception for LinkedIn fetch failures.
|
| 10 |
+
LinkedinDownloader: Main class for downloading LinkedIn profile data.
|
| 11 |
+
|
| 12 |
+
Usage:
|
| 13 |
+
downloader = LinkedinDownloader()
|
| 14 |
+
profile = await downloader.fetch_linkedin_data("https://www.linkedin.com/in/username/")
|
| 15 |
+
"""
|
| 16 |
+
|
| 17 |
+
import asyncio
|
| 18 |
+
from typing import Final
|
| 19 |
+
|
| 20 |
+
import aiohttp
|
| 21 |
+
from tenacity import retry, retry_if_exception_type, stop_after_attempt, wait_random_exponential
|
| 22 |
+
|
| 23 |
+
from vsp.app.model.linkedin.linkedin_models import LinkedinProfile
|
| 24 |
+
from vsp.shared import aws_clients, config, logger_factory
|
| 25 |
+
|
| 26 |
+
logger = logger_factory.get_logger(__name__)
|
| 27 |
+
|
| 28 |
+
|
| 29 |
+
class LinkedInFetchFailedError(Exception):
    """Signals that fetching LinkedIn profile data did not succeed."""
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
class LinkedinDownloader:
|
| 34 |
+
"""
|
| 35 |
+
A class for asynchronously downloading LinkedIn profile data.
|
| 36 |
+
|
| 37 |
+
This class handles authentication, rate limiting, and retrying of requests
|
| 38 |
+
when interacting with the RapidAPI LinkedIn API.
|
| 39 |
+
|
| 40 |
+
Attributes:
|
| 41 |
+
_URL (Final[str]): The RapidAPI endpoint for LinkedIn data.
|
| 42 |
+
_X_RAPIDAPI_HOST (Final[str]): The RapidAPI host for LinkedIn API.
|
| 43 |
+
_api_key (str): The RapidAPI key for authentication.
|
| 44 |
+
_semaphore (asyncio.Semaphore): Semaphore for limiting concurrent requests.
|
| 45 |
+
"""
|
| 46 |
+
|
| 47 |
+
_URL: Final[str] = "https://linkedin-api8.p.rapidapi.com/"
|
| 48 |
+
_X_RAPIDAPI_HOST: Final[str] = "linkedin-api8.p.rapidapi.com"
|
| 49 |
+
|
| 50 |
+
def __init__(self, max_concurrency: int = 10) -> None:
    """
    Set up the downloader's API credentials and its concurrency limit.

    Args:
        max_concurrency (int): Upper bound on simultaneous API calls. Defaults to 10.
    """
    # Resolve the credential first; a failure here aborts construction.
    api_key = self._fetch_api_key()
    # The semaphore's initial value is the number of calls allowed in flight at once.
    limiter = asyncio.Semaphore(max_concurrency)
    self._api_key, self._semaphore = api_key, limiter
|
| 59 |
+
|
| 60 |
+
@staticmethod
def _fetch_api_key() -> str:
    """
    Load the LinkedIn RapidAPI key from AWS Parameter Store.

    The parameter path comes from ``config.get_linkedin_key_path()`` and the
    value is requested with ``is_secret=True``.

    Returns:
        str: The LinkedIn RapidAPI key.

    Raises:
        ValueError: If the key parameter does not exist in the Parameter Store.
        RuntimeError: If the Parameter Store could not be accessed.
    """
    try:
        api_key = aws_clients.fetch_from_parameter_store(config.get_linkedin_key_path(), is_secret=True)
    except aws_clients.ParameterNotFoundError as missing:
        logger.error("LinkedIn RapidAPI key not found in Parameter Store", error=str(missing))
        raise ValueError("LinkedIn RapidAPI key not found") from missing
    except aws_clients.ParameterStoreAccessError as access_error:
        logger.error("Error accessing Parameter Store", error=str(access_error))
        raise RuntimeError("Unable to access LinkedIn RapidAPI key") from access_error
    else:
        return api_key
|
| 80 |
+
|
| 81 |
+
def _compose_request(self, linkedin_url: str) -> tuple[dict[str, str], dict[str, str]]:
|
| 82 |
+
"""
|
| 83 |
+
Compose the request headers and query parameters for the API call.
|
| 84 |
+
|
| 85 |
+
Args:
|
| 86 |
+
linkedin_url (str): The LinkedIn profile URL.
|
| 87 |
+
|
| 88 |
+
Returns:
|
| 89 |
+
tuple[dict[str, str], dict[str, str]]: A tuple containing the headers and query parameters.
|
| 90 |
+
"""
|
| 91 |
+
username = linkedin_url.split("/")[-2] if linkedin_url.endswith("/") else linkedin_url.split("/")[-1]
|
| 92 |
+
querystring = {"username": username}
|
| 93 |
+
headers = {"X-RapidAPI-Key": self._api_key, "X-RapidAPI-Host": self._X_RAPIDAPI_HOST}
|
| 94 |
+
return (headers, querystring)
|
| 95 |
+
|
| 96 |
+
@retry(
    wait=wait_random_exponential(min=1, max=60),
    stop=stop_after_attempt(3),
    retry=retry_if_exception_type(LinkedInFetchFailedError),
    reraise=True,
)
async def fetch_linkedin_data(self, linkedin_url: str) -> LinkedinProfile:
    """
    Fetch LinkedIn profile data for a given URL.

    The call is wrapped in a retry policy: up to 3 attempts in total, with a
    randomized exponential wait (between 1 and 60 seconds) between attempts,
    retrying only when a LinkedInFetchFailedError is raised. Non-200
    responses and transport-level failures (connection errors, timeouts,
    undecodable response bodies) are all reported as
    LinkedInFetchFailedError, so they share the same retry policy and
    callers have a single exception type to handle.

    Args:
        linkedin_url (str): The LinkedIn profile URL to fetch data for.

    Returns:
        LinkedinProfile: The fetched LinkedIn profile data.

    Raises:
        LinkedInFetchFailedError: If the API call fails after all retry attempts.
    """
    # The semaphore is acquired per attempt, so it is not held during retry waits.
    async with self._semaphore:
        headers, querystring = self._compose_request(linkedin_url)
        logger.info("Fetching LinkedIn profile", url=linkedin_url)
        try:
            async with aiohttp.ClientSession() as session:
                async with session.get(self._URL, headers=headers, params=querystring) as response:
                    if response.status != 200:
                        logger.error("Failed to fetch LinkedIn profile", url=linkedin_url, status=response.status)
                        raise LinkedInFetchFailedError(f"Failed to fetch LinkedIn profile for {linkedin_url}")
                    data = await response.json()
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            # Without this translation, network errors would bypass the retry
            # predicate above and surface as raw aiohttp exceptions.
            logger.error("Failed to fetch LinkedIn profile", url=linkedin_url, error=str(e))
            raise LinkedInFetchFailedError(f"Failed to fetch LinkedIn profile for {linkedin_url}") from e
        # Parsing errors are deliberately not caught: they are not transient.
        return LinkedinProfile.profile_from_json(data)
|
src/vsp/llm/openai/openai.py
CHANGED
|
@@ -77,7 +77,7 @@ class AsyncOpenAIService(LLMService):
|
|
| 77 |
try:
|
| 78 |
return aws_clients.fetch_from_parameter_store(config.get_openai_api_key_path(), is_secret=True)
|
| 79 |
except aws_clients.ParameterNotFoundError as e:
|
| 80 |
-
logger.error("API key not found in Parameter Store", error=str(e))
|
| 81 |
raise ValueError("OpenAI API key not found") from e
|
| 82 |
except aws_clients.ParameterStoreAccessError as e:
|
| 83 |
logger.error("Error accessing Parameter Store", error=str(e))
|
|
|
|
| 77 |
try:
|
| 78 |
return aws_clients.fetch_from_parameter_store(config.get_openai_api_key_path(), is_secret=True)
|
| 79 |
except aws_clients.ParameterNotFoundError as e:
|
| 80 |
+
logger.error("OpenAI API key not found in Parameter Store", error=str(e))
|
| 81 |
raise ValueError("OpenAI API key not found") from e
|
| 82 |
except aws_clients.ParameterStoreAccessError as e:
|
| 83 |
logger.error("Error accessing Parameter Store", error=str(e))
|
src/vsp/shared/config.py
CHANGED
|
@@ -91,3 +91,13 @@ def get_openai_api_key_path() -> str:
|
|
| 91 |
"""
|
| 92 |
config = _get_config()
|
| 93 |
return str(config["openai"]["openai_api_key_parameter_store_path"])
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
"""
|
| 92 |
config = _get_config()
|
| 93 |
return str(config["openai"]["openai_api_key_parameter_store_path"])
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
@cache
def get_linkedin_key_path() -> str:
    """
    Return the AWS Parameter Store path of the LinkedIn RapidAPI key.

    The path is read from the ``linkedin`` section of the TOML configuration
    (entry ``linkedin_api_key_parameter_store_path``). The key itself is
    stored in AWS Parameter Store, not in the configuration file.
    """
    linkedin_section = _get_config()["linkedin"]
    return str(linkedin_section["linkedin_api_key_parameter_store_path"])
|
src/vsp/shared/config.toml
CHANGED
|
@@ -6,4 +6,7 @@ shared_services_aws_account = "851725506657"
|
|
| 6 |
bedrock_aws_account = "339713101814"
|
| 7 |
|
| 8 |
[openai]
|
| 9 |
-
openai_api_key_parameter_store_path = "/secrets/openai/api_key"
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
bedrock_aws_account = "339713101814"
|
| 7 |
|
| 8 |
[openai]
|
| 9 |
+
openai_api_key_parameter_store_path = "/secrets/openai/api_key"
|
| 10 |
+
|
| 11 |
+
[linkedin]
|
| 12 |
+
linkedin_api_key_parameter_store_path = "/secrets/rapidapi/linkedin"
|
tests/vsp/app/scrapers/__init__.py
ADDED
|
File without changes
|
tests/vsp/app/scrapers/test_integration_linkedin_downloader.py
ADDED
|
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
|
| 3 |
+
import pytest
|
| 4 |
+
|
| 5 |
+
from vsp.app.scrapers.linkedin_downloader import LinkedinDownloader
|
| 6 |
+
from vsp.shared import logger_factory
|
| 7 |
+
|
| 8 |
+
logger = logger_factory.get_logger(__name__)
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
@pytest.mark.asyncio
async def test_fetch_linkedin_data_integration():
    """
    Integration test for LinkedinDownloader.

    Performs a real network request for the LinkedIn profile of 'navkast' and
    checks a few known fields of the result. A valid RapidAPI key must be
    available in the AWS Parameter Store.

    Note: Run this test sparingly to avoid unnecessary API calls and potential rate limiting.
    """
    target_url = "https://www.linkedin.com/in/navkast/"
    client = LinkedinDownloader()

    try:
        fetched = await client.fetch_linkedin_data(target_url)

        # Record what came back before asserting on it.
        logger.info("Fetched LinkedIn profile", first_name=fetched.first_name, last_name=fetched.last_name)

        # Verify the known profile fields.
        assert fetched.first_name == "Naveen"
        assert fetched.last_name == "K."
        school_names = [edu.school_name for edu in fetched.educations]
        assert "University of Pennsylvania" in school_names
        logger.info("Integration test passed successfully")
    except Exception as failure:
        # Log for visibility, then let pytest see the original failure.
        logger.error("Integration test failed", error=str(failure))
        raise
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
if __name__ == "__main__":
|
| 41 |
+
asyncio.run(test_fetch_linkedin_data_integration())
|