# app.py — Analytics Vidhya course scraper (Streamlit app)
# NOTE: the following lines were Hugging Face web-page chrome accidentally
# captured into the source and are preserved here as a comment:
#   Divyansh12's picture / Update app.py / 9b04153 verified / raw / history blame / 1.97 kB
import asyncio
import nest_asyncio
import json
import streamlit as st
from sentence_transformers import SentenceTransformer
from scrapegraphai.graphs import SmartScraperMultiGraph
from langchain_groq import ChatGroq
import subprocess

# Streamlit re-executes this entire script on every user interaction, so any
# expensive one-time setup must be cached or it repeats on each rerun.


@st.cache_resource
def _install_playwright_browsers():
    """Install Playwright's browsers once per process (needed for headless scraping)."""
    subprocess.run(["playwright", "install"])
    return True


@st.cache_resource
def _load_embedding_model():
    """Load the sentence-transformer embedding model once and reuse it across reruns."""
    return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


_install_playwright_browsers()

# Apply nest_asyncio to allow nested event loops
# (Streamlit and the scraper library both drive asyncio).
nest_asyncio.apply()

# Embedding model instance shared by the scraper configuration below.
model = _load_embedding_model()
# --- Streamlit UI -----------------------------------------------------------
st.title("Course Scraper for Analytics Vidhya")

# Credentials and scraping instructions come from the user at runtime.
api_key = st.text_input("Enter your API Key:", type="password")
user_prompt = st.text_input("Enter your prompt for scraping:")

# --- Scraper configuration --------------------------------------------------
# LLM settings (Groq-hosted Llama 3), the local embedding model instance,
# and browser behavior for the underlying Playwright session.
llm_settings = {
    "model": "groq/llama3-70b-8192",
    "api_key": api_key,
    "temperature": 1,
}
graph_config = {
    "llm": llm_settings,
    "embeddings": {"model_instance": model},
    "verbose": True,
    "headless": True,  # run the browser without a visible window
}

# Build the target URLs for pages 1 through 4 of the course collection.
base_url = "https://courses.analyticsvidhya.com/collections"
urls = []
for page in range(1, 5):
    urls.append(f"{base_url}?page={page}")
# Run the scraper when the button is clicked.
if st.button("Scrape Courses"):
    # Fail fast with a clear message instead of letting the LLM backend
    # reject an empty key/prompt with an opaque exception.
    if not api_key:
        st.error("Please enter your API Key before scraping.")
    elif not user_prompt:
        st.error("Please enter a prompt for scraping.")
    else:
        try:
            # Create the multi-page scraper over all collection URLs,
            # driven by the user-defined prompt.
            smart_scraper_graph = SmartScraperMultiGraph(
                prompt=user_prompt,
                source=urls,
                config=graph_config,
            )
            # Run the scraper (blocking; may take a while for 4 pages).
            result = smart_scraper_graph.run()
            # Persist the result as JSON. Explicit UTF-8 avoids the platform
            # default encoding; ensure_ascii=False keeps non-ASCII titles readable.
            with open("courses.json", "w", encoding="utf-8") as outfile:
                json.dump(result, outfile, indent=4, ensure_ascii=False)
            # Display the results in Streamlit.
            st.success("Scraping completed successfully!")
            st.json(result)  # Display the result as a JSON object
        except Exception as e:
            # Top-level UI boundary: surface any scraper/LLM failure to the user.
            st.error(f"An error occurred: {e}")