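"""Streamlit app that scrapes course listings from Analytics Vidhya.

Uses ScrapeGraphAI's SmartScraperMultiGraph driven by a Groq-hosted Llama 3
model, with a local sentence-transformers model supplying embeddings.
"""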
import json
import subprocess

import nest_asyncio
import streamlit as st
from scrapegraphai.graphs import SmartScraperMultiGraph
from sentence_transformers import SentenceTransformer

# Install Playwright's browser binaries; ScrapeGraphAI drives a headless
# browser to fetch pages, so this needs to run once on a fresh deployment
subprocess.run(["playwright", "install"])

# Apply nest_asyncio so the scraper's asyncio event loop can run inside Streamlit's
nest_asyncio.apply()

# Cache the sentence-transformer model across Streamlit reruns so it is not
# re-initialized on every user interaction
@st.cache_resource
def load_embedding_model():
    return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

model = load_embedding_model()

# Streamlit Application
st.title("Course Scraper for Analytics Vidhya")

# API Key Input
api_key = st.text_input("Enter your Groq API key:", type="password")

# Prompt Input
user_prompt = st.text_input("Enter your prompt for scraping:")

# Scraping Configuration
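# "llm" selects the chat model ScrapeGraphAI calls for extraction; "embeddings"
# supplies a local model instance, so no embedding API calls are made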
graph_config = {
    "llm": {
        "model": "groq/llama3-70b-8192",
        "api_key": api_key,
        "temperature": 1
    },
    "embeddings": {
        "model_instance": model
    },
    "verbose": True,
    "headless": True  # Run the Playwright browser without a visible window
}

# Build URLs for the first four collection pages (1 through 4)
base_url = "https://courses.analyticsvidhya.com/collections"
urls = [f"{base_url}?page={i}" for i in range(1, 5)]

# Run the scraper when the button is clicked
if st.button("Scrape Courses"):
    # Require both inputs before launching the scrape
    if not api_key or not user_prompt:
        st.error("Please enter an API key and a prompt before scraping.")
        st.stop()

    try:
        # Create the SmartScraperMultiGraph instance
        smart_scraper_graph = SmartScraperMultiGraph(
            prompt=user_prompt,  # Use the user-defined prompt
            source=urls,
            config=graph_config
        )

        # Run the scraper
        result = smart_scraper_graph.run()
        
        # Save the result as a JSON file
        with open("courses.json", "w", encoding="utf-8") as outfile:
            json.dump(result, outfile, indent=4)

        # Display the results in Streamlit
        st.success("Scraping completed successfully!")
        st.json(result)  # Display the result as a JSON object

    except Exception as e:
        st.error(f"An error occurred: {e}")
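
# To run locally (file name assumed to be app.py; install dependencies first):
#   pip install streamlit scrapegraphai sentence-transformers nest-asyncio playwright
#   streamlit run app.py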