"""Streamlit app that scrapes Analytics Vidhya course listings with ScrapeGraphAI.

The user supplies a Groq API key and a natural-language prompt; pressing the
button runs SmartScraperMultiGraph over the first 4 collection pages, saves the
result to courses.json, and renders it in the UI.
"""

import asyncio
import json
import subprocess

import nest_asyncio
import streamlit as st
from langchain_groq import ChatGroq
from scrapegraphai.graphs import SmartScraperMultiGraph
from sentence_transformers import SentenceTransformer


@st.cache_resource
def _install_playwright() -> None:
    """Install Playwright browsers once per server process, not on every rerun.

    Streamlit re-executes the whole script on each widget interaction; without
    caching, the install subprocess would run on every click/keystroke.
    """
    # check=True surfaces a failed install instead of silently continuing.
    subprocess.run(["playwright", "install"], check=True)


@st.cache_resource
def _load_embedding_model() -> SentenceTransformer:
    """Load the sentence-transformer embedding model once and reuse it across reruns."""
    return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


_install_playwright()

# Apply nest_asyncio to allow nested event loops (Streamlit and the scraper's
# browser automation both drive asyncio).
nest_asyncio.apply()

# Cached: loading the model is expensive and must not repeat on every rerun.
model = _load_embedding_model()

# Streamlit Application
st.title("Course Scraper for Analytics Vidhya")

# API Key Input
api_key = st.text_input("Enter your API Key:", type="password")

# Prompt Input
user_prompt = st.text_input("Enter your prompt for scraping:")

# Scraping Configuration
graph_config = {
    "llm": {
        "model": "groq/llama3-70b-8192",
        "api_key": api_key,
        "temperature": 1,
    },
    "embeddings": {"model_instance": model},
    "verbose": True,
    "headless": True,  # Set to True to run in headless mode
}

# Generate URLs for the first 4 pages (1 to 4)
base_url = "https://courses.analyticsvidhya.com/collections"
urls = [f"{base_url}?page={i}" for i in range(1, 5)]  # Adjusting to scrape only the first 4 pages

# Run the scraper when the button is clicked
if st.button("Scrape Courses"):
    # Guard: both inputs are required for the scrape to succeed; fail fast with
    # a clear message instead of a downstream API/auth error.
    if not api_key or not user_prompt:
        st.warning("Please provide both an API key and a prompt before scraping.")
    else:
        try:
            # Create the SmartScraperGraph instance
            smart_scraper_graph = SmartScraperMultiGraph(
                prompt=user_prompt,  # Use the user-defined prompt
                source=urls,
                config=graph_config,
            )

            # Run the scraper
            result = smart_scraper_graph.run()

            # Save the result as a JSON file; ensure_ascii=False keeps any
            # non-ASCII course titles human-readable in the output file.
            with open("courses.json", "w", encoding="utf-8") as outfile:
                json.dump(result, outfile, indent=4, ensure_ascii=False)

            # Display the results in Streamlit
            st.success("Scraping completed successfully!")
            st.json(result)  # Display the result as a JSON object
        except Exception as e:  # boundary: surface any scraper failure in the UI
            st.error(f"An error occurred: {e}")