# Streamlit app: scrapes Analytics Vidhya course listings with scrapegraphai.
import asyncio
import nest_asyncio
import json
import streamlit as st
from sentence_transformers import SentenceTransformer
from scrapegraphai.graphs import SmartScraperMultiGraph
from langchain_groq import ChatGroq
import subprocess


@st.cache_resource
def _setup_environment() -> SentenceTransformer:
    """One-time process setup: install Playwright browsers and load the embedder.

    Streamlit re-executes this script from the top on every widget
    interaction; ``st.cache_resource`` ensures the subprocess call and the
    model download/load happen only once per server process.

    Returns:
        The loaded sentence-transformers embedding model.
    """
    # Playwright needs its browser binaries; idempotent but slow, so cached.
    subprocess.run(["playwright", "install"])
    return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


# Apply nest_asyncio to allow nested event loops (scrapegraphai drives an
# asyncio loop inside Streamlit's own runtime).
nest_asyncio.apply()

# Load the sentence transformer model (cached across reruns).
model = _setup_environment()
# ---- Streamlit UI ----
st.title("Course Scraper for Analytics Vidhya")

# Collect the Groq API key and the scraping instruction from the user.
api_key = st.text_input("Enter your API Key:", type="password")
user_prompt = st.text_input("Enter your prompt for scraping:")

# Configuration handed to scrapegraphai: which hosted LLM to call, which
# local embedding model to reuse, and how the browser should behave.
graph_config = {
    "llm": {
        "model": "groq/llama3-70b-8192",
        "api_key": api_key,
        "temperature": 1,
    },
    "embeddings": {"model_instance": model},
    "verbose": True,
    "headless": True,  # run the scraping browser without a visible window
}

# Target the first four listing pages of the course collection.
base_url = "https://courses.analyticsvidhya.com/collections"
urls = [f"{base_url}?page={page}" for page in range(1, 5)]
# Run the scraper when the button is clicked
if st.button("Scrape Courses"):
    # Guard clause: the Groq call needs a key and the scraper needs an
    # instruction — fail fast with a clear message instead of letting the
    # run die deep inside the graph with an opaque auth/prompt error.
    if not api_key or not user_prompt:
        st.error("Please enter both an API key and a scraping prompt.")
    else:
        try:
            # Create the SmartScraperMultiGraph instance over all page URLs
            smart_scraper_graph = SmartScraperMultiGraph(
                prompt=user_prompt,  # Use the user-defined prompt
                source=urls,
                config=graph_config,
            )
            # Run the scraper (blocks until every page is processed)
            result = smart_scraper_graph.run()
            # Persist the result; explicit UTF-8 + ensure_ascii=False keeps
            # non-ASCII course titles readable instead of \uXXXX-escaped.
            with open("courses.json", "w", encoding="utf-8") as outfile:
                json.dump(result, outfile, indent=4, ensure_ascii=False)
            # Display the results in Streamlit
            st.success("Scraping completed successfully!")
            st.json(result)  # Display the result as a JSON object
        except Exception as e:
            # Top-level UI boundary: surface the failure to the user
            # rather than crashing the Streamlit session.
            st.error(f"An error occurred: {e}")