# Streamlit app: scrapes Analytics Vidhya course listings with scrapegraphai.
import asyncio
import nest_asyncio
import json
import streamlit as st
from sentence_transformers import SentenceTransformer
from scrapegraphai.graphs import SmartScraperMultiGraph
from langchain_groq import ChatGroq
import subprocess


@st.cache_resource
def _setup_environment() -> SentenceTransformer:
    """One-time process setup: install Playwright browsers and load the embedder.

    Streamlit re-executes this script from the top on every widget
    interaction; ``st.cache_resource`` ensures the subprocess call and the
    model download/load happen only once per server process.

    Returns:
        The loaded sentence-transformers embedding model.
    """
    # Playwright needs its browser binaries; idempotent but slow, so cached.
    subprocess.run(["playwright", "install"])
    return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


# Apply nest_asyncio to allow nested event loops (scrapegraphai drives an
# asyncio loop inside Streamlit's own runtime).
nest_asyncio.apply()

# Load the sentence transformer model (cached across reruns).
model = _setup_environment()
# ---- Streamlit UI ----
st.title("Course Scraper for Analytics Vidhya")

# Collect the Groq API key and the scraping instruction from the user.
api_key = st.text_input("Enter your API Key:", type="password")
user_prompt = st.text_input("Enter your prompt for scraping:")

# Configuration handed to scrapegraphai: which hosted LLM to call, which
# local embedding model to reuse, and how the browser should behave.
graph_config = {
    "llm": {
        "model": "groq/llama3-70b-8192",
        "api_key": api_key,
        "temperature": 1,
    },
    "embeddings": {"model_instance": model},
    "verbose": True,
    "headless": True,  # run the scraping browser without a visible window
}

# Target the first four listing pages of the course collection.
base_url = "https://courses.analyticsvidhya.com/collections"
urls = [f"{base_url}?page={page}" for page in range(1, 5)]
# Run the scraper when the button is clicked
if st.button("Scrape Courses"):
    # Guard clause: the Groq call needs a key and the scraper needs an
    # instruction — fail fast with a clear message instead of letting the
    # run die deep inside the graph with an opaque auth/prompt error.
    if not api_key or not user_prompt:
        st.error("Please enter both an API key and a scraping prompt.")
    else:
        try:
            # Create the SmartScraperMultiGraph instance over all page URLs
            smart_scraper_graph = SmartScraperMultiGraph(
                prompt=user_prompt,  # Use the user-defined prompt
                source=urls,
                config=graph_config,
            )
            # Run the scraper (blocks until every page is processed)
            result = smart_scraper_graph.run()
            # Persist the result; explicit UTF-8 + ensure_ascii=False keeps
            # non-ASCII course titles readable instead of \uXXXX-escaped.
            with open("courses.json", "w", encoding="utf-8") as outfile:
                json.dump(result, outfile, indent=4, ensure_ascii=False)
            # Display the results in Streamlit
            st.success("Scraping completed successfully!")
            st.json(result)  # Display the result as a JSON object
        except Exception as e:
            # Top-level UI boundary: surface the failure to the user
            # rather than crashing the Streamlit session.
            st.error(f"An error occurred: {e}")