# app.py — Analytics Vidhya course scraper (Streamlit app)
# NOTE: the following lines were Hugging Face web-page chrome accidentally
# captured into the source and are preserved here as a comment:
#   Divyansh12's picture / Update app.py / 9b04153 verified / raw / history blame / 1.97 kB
import asyncio
import nest_asyncio
import json
import streamlit as st
from sentence_transformers import SentenceTransformer
from scrapegraphai.graphs import SmartScraperMultiGraph
from langchain_groq import ChatGroq
import subprocess

# Streamlit re-executes this entire script on every user interaction, so any
# expensive one-time setup must be cached or it repeats on each rerun.


@st.cache_resource
def _install_playwright_browsers():
    """Install Playwright's browsers once per process (needed for headless scraping)."""
    subprocess.run(["playwright", "install"])
    return True


@st.cache_resource
def _load_embedding_model():
    """Load the sentence-transformer embedding model once and reuse it across reruns."""
    return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


_install_playwright_browsers()

# Apply nest_asyncio to allow nested event loops
# (Streamlit and the scraper library both drive asyncio).
nest_asyncio.apply()

# Embedding model instance shared by the scraper configuration below.
model = _load_embedding_model()
# --- Streamlit UI -----------------------------------------------------------
st.title("Course Scraper for Analytics Vidhya")

# Credentials and scraping instructions come from the user at runtime.
api_key = st.text_input("Enter your API Key:", type="password")
user_prompt = st.text_input("Enter your prompt for scraping:")

# --- Scraper configuration --------------------------------------------------
# LLM settings (Groq-hosted Llama 3), the local embedding model instance,
# and browser behavior for the underlying Playwright session.
llm_settings = {
    "model": "groq/llama3-70b-8192",
    "api_key": api_key,
    "temperature": 1,
}
graph_config = {
    "llm": llm_settings,
    "embeddings": {"model_instance": model},
    "verbose": True,
    "headless": True,  # run the browser without a visible window
}

# Build the target URLs for pages 1 through 4 of the course collection.
base_url = "https://courses.analyticsvidhya.com/collections"
urls = []
for page in range(1, 5):
    urls.append(f"{base_url}?page={page}")
# Run the scraper when the button is clicked.
if st.button("Scrape Courses"):
    # Fail fast with a clear message instead of letting the LLM backend
    # reject an empty key/prompt with an opaque exception.
    if not api_key:
        st.error("Please enter your API Key before scraping.")
    elif not user_prompt:
        st.error("Please enter a prompt for scraping.")
    else:
        try:
            # Create the multi-page scraper over all collection URLs,
            # driven by the user-defined prompt.
            smart_scraper_graph = SmartScraperMultiGraph(
                prompt=user_prompt,
                source=urls,
                config=graph_config,
            )
            # Run the scraper (blocking; may take a while for 4 pages).
            result = smart_scraper_graph.run()
            # Persist the result as JSON. Explicit UTF-8 avoids the platform
            # default encoding; ensure_ascii=False keeps non-ASCII titles readable.
            with open("courses.json", "w", encoding="utf-8") as outfile:
                json.dump(result, outfile, indent=4, ensure_ascii=False)
            # Display the results in Streamlit.
            st.success("Scraping completed successfully!")
            st.json(result)  # Display the result as a JSON object
        except Exception as e:
            # Top-level UI boundary: surface any scraper/LLM failure to the user.
            st.error(f"An error occurred: {e}")