from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from urllib.parse import urljoin
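
# Third-party dependencies (assumed PyPI names):
#   pip install selenium webdriver-manager pandas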


def scrape_shl_products():
    # Configure Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Optional: run in the background
| chrome_options.add_argument("--disable-blink-features=AutomationControlled") | |
| chrome_options.add_argument( | |
| "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" | |
| ) | |
| # Set up driver | |
| service = Service(ChromeDriverManager().install()) | |
| driver = webdriver.Chrome(service=service, options=chrome_options) | |
| base_url = "https://www.shl.com" | |
| catalog_url = "https://www.shl.com/solutions/products/product-catalog/" | |

    try:
        print("Loading SHL product catalog...")
        driver.get(catalog_url)

        # Wait for products to load
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".product-card"))
        )
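        # presence_of_element_located fires once the node exists in the DOM;
        # visibility_of_element_located is the stricter choice if rendering matters.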

        # Scroll to load all products
        print("Scrolling to load all products...")
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
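            # (a fixed 2-second pause is a heuristic; slow connections may need longer)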
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
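
        # NOTE: the selectors below (.product-card, .product-card__title, and the
        # __meta-* classes) are tied to SHL's current markup; verify them in the
        # browser's dev tools if the scrape comes back empty.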
        product_cards = driver.find_elements(By.CSS_SELECTOR, ".product-card")
        print(f"Found {len(product_cards)} products.")

        products = []
        for card in product_cards:
            try:
                product = {
                    'Assessment Name': 'Not found',
                    'URL': 'Not found',
                    'Remote Testing Support': 'No',
                    'Adaptive/IRT Support': 'No',
                    'Duration': 'Not specified',
                    'Test Type': 'Not specified'
                }

                # Name
                name_element = card.find_element(By.CSS_SELECTOR, ".product-card__title")
                product['Assessment Name'] = name_element.text

                # URL
                link_element = card.find_element(By.CSS_SELECTOR, "a[href]")
                product['URL'] = urljoin(base_url, link_element.get_attribute("href"))
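                # (get_attribute("href") normally returns an absolute URL already;
                # urljoin is a guard in case the raw href is relative)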

                # Metadata
                meta_items = card.find_elements(By.CSS_SELECTOR, ".product-card__meta-item")
                for item in meta_items:
                    try:
                        label = item.find_element(By.CSS_SELECTOR, ".product-card__meta-label").text.lower()
                        value = item.find_element(By.CSS_SELECTOR, ".product-card__meta-value").text
                        if 'remote' in label:
                            product['Remote Testing Support'] = 'Yes' if 'yes' in value.lower() else 'No'
                        elif 'adaptive' in label or 'irt' in label:
                            product['Adaptive/IRT Support'] = 'Yes' if 'yes' in value.lower() else 'No'
                        elif 'duration' in label:
                            product['Duration'] = value
                        elif 'type' in label:
                            product['Test Type'] = value
                    except NoSuchElementException:
                        continue

                products.append(product)
            except Exception as e:
                print(f"Error processing a product card: {e}")
                continue

        # Save data
        df = pd.DataFrame(products)
        df.to_csv('shl_products.csv', index=False)
        print("Data saved to shl_products.csv")
        return df

    except TimeoutException:
        print("Timeout loading the page.")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        driver.quit()
        print("Browser closed.")
| if __name__ == "__main__": | |
| print("Starting SHL scraper...") # Debug print | |
| df = scrape_shl_products() | |
| if df is not None and not df.empty: | |
| print("\nFirst 5 results:") | |
| print(df.head()) | |
| else: | |
| print("No data scraped.") | |