from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
import time
from urllib.parse import urljoin
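
# Third-party dependencies (assumed PyPI names):
#   pip install selenium webdriver-manager pandas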


def scrape_shl_products():
    # Configure Chrome options
    chrome_options = Options()
    chrome_options.add_argument("--headless")  # Optional: run in the background
| chrome_options.add_argument("--disable-blink-features=AutomationControlled") | |
| chrome_options.add_argument( | |
| "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" | |
| ) | |
| # Set up driver | |
| service = Service(ChromeDriverManager().install()) | |
| driver = webdriver.Chrome(service=service, options=chrome_options) | |
| base_url = "https://www.shl.com" | |
| catalog_url = "https://www.shl.com/solutions/products/product-catalog/" | |

    try:
        print("Loading SHL product catalog...")
        driver.get(catalog_url)

        # Wait for products to load
        WebDriverWait(driver, 15).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, ".product-card"))
        )
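        # presence_of_element_located fires once the node exists in the DOM;
        # visibility_of_element_located is the stricter choice if rendering matters.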

        # Scroll to load all products
        print("Scrolling to load all products...")
        last_height = driver.execute_script("return document.body.scrollHeight")
        while True:
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(2)
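            # (a fixed 2-second pause is a heuristic; slow connections may need longer)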
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
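
        # NOTE: the selectors below (.product-card, .product-card__title, and the
        # __meta-* classes) are tied to SHL's current markup; verify them in the
        # browser's dev tools if the scrape comes back empty.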
        product_cards = driver.find_elements(By.CSS_SELECTOR, ".product-card")
        print(f"Found {len(product_cards)} products.")

        products = []
        for card in product_cards:
            try:
                product = {
                    'Assessment Name': 'Not found',
                    'URL': 'Not found',
                    'Remote Testing Support': 'No',
                    'Adaptive/IRT Support': 'No',
                    'Duration': 'Not specified',
                    'Test Type': 'Not specified'
                }

                # Name
                name_element = card.find_element(By.CSS_SELECTOR, ".product-card__title")
                product['Assessment Name'] = name_element.text

                # URL
                link_element = card.find_element(By.CSS_SELECTOR, "a[href]")
                product['URL'] = urljoin(base_url, link_element.get_attribute("href"))
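                # (get_attribute("href") normally returns an absolute URL already;
                # urljoin is a guard in case the raw href is relative)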

                # Metadata
                meta_items = card.find_elements(By.CSS_SELECTOR, ".product-card__meta-item")
                for item in meta_items:
                    try:
                        label = item.find_element(By.CSS_SELECTOR, ".product-card__meta-label").text.lower()
                        value = item.find_element(By.CSS_SELECTOR, ".product-card__meta-value").text
                        if 'remote' in label:
                            product['Remote Testing Support'] = 'Yes' if 'yes' in value.lower() else 'No'
                        elif 'adaptive' in label or 'irt' in label:
                            product['Adaptive/IRT Support'] = 'Yes' if 'yes' in value.lower() else 'No'
                        elif 'duration' in label:
                            product['Duration'] = value
                        elif 'type' in label:
                            product['Test Type'] = value
                    except NoSuchElementException:
                        continue

                products.append(product)
            except Exception as e:
                print(f"Error processing a product card: {e}")
                continue

        # Save data
        df = pd.DataFrame(products)
        df.to_csv('shl_products.csv', index=False)
        print("Data saved to shl_products.csv")
        return df

    except TimeoutException:
        print("Timeout loading the page.")
    except Exception as e:
        print(f"An error occurred: {e}")
    finally:
        driver.quit()
        print("Browser closed.")
| if __name__ == "__main__": | |
| print("Starting SHL scraper...") # Debug print | |
| df = scrape_shl_products() | |
| if df is not None and not df.empty: | |
| print("\nFirst 5 results:") | |
| print(df.head()) | |
| else: | |
| print("No data scraped.") | |