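# Spaces: Sleeping
# NOTE(review): the line above appears to be hosting-page status text captured with the source, not program code.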
"""Streamlit app that scrapes Analytics Vidhya course listings.

The user supplies an API key and a free-form prompt. When the button is
pressed the app runs a ``SmartScraperMultiGraph`` over the first four
collection pages, writes the result to ``courses.json`` and renders it on
the page.
"""

import asyncio  # unused in this script; kept because other code may rely on it
import json
import subprocess

import nest_asyncio
import streamlit as st
from langchain_groq import ChatGroq  # unused in this script; kept as in the original
from scrapegraphai.graphs import SmartScraperMultiGraph
from sentence_transformers import SentenceTransformer


@st.cache_resource(show_spinner="Installing Playwright browsers...")
def _install_playwright_browsers() -> int:
    """Run ``playwright install`` and return its exit code.

    Streamlit re-executes the script on every widget interaction; caching
    the call keeps the install from being repeated on each rerun.

    Returns:
        The exit status of the ``playwright install`` command.

    Raises:
        FileNotFoundError: If the ``playwright`` executable is not on PATH.
    """
    completed = subprocess.run(["playwright", "install"], check=False)
    return completed.returncode


@st.cache_resource(show_spinner="Loading embedding model...")
def _load_embedding_model() -> SentenceTransformer:
    """Load the sentence-transformer model once and reuse it across reruns."""
    return SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")


# Surface a failed browser install instead of ignoring the exit status.
if _install_playwright_browsers() != 0:
    st.warning(
        "`playwright install` exited with a non-zero status; scraping may fail."
    )

# Apply nest_asyncio to allow nested event loops.
nest_asyncio.apply()

# Sentence-transformer model handed to the scraper as its embeddings backend.
model = _load_embedding_model()

# Streamlit application
st.title("Course Scraper for Analytics Vidhya")

# API key input (masked in the UI)
api_key = st.text_input("Enter your API Key:", type="password")

# Prompt input
user_prompt = st.text_input("Enter your prompt for scraping:")

# Scraping configuration
graph_config = {
    "llm": {
        "model": "groq/llama3-70b-8192",
        # Strip stray whitespace that often comes along with a pasted key.
        "api_key": api_key.strip(),
        # NOTE(review): a temperature of 1 may make extraction output less
        # repeatable -- confirm this value is intended.
        "temperature": 1,
    },
    "embeddings": {
        "model_instance": model,
    },
    "verbose": True,
    "headless": True,  # run in headless mode
}

# URLs for the first 4 pages (1 to 4) of the course collection.
base_url = "https://courses.analyticsvidhya.com/collections"
urls = [f"{base_url}?page={page}" for page in range(1, 5)]

# Run the scraper when the button is clicked.
if st.button("Scrape Courses"):
    if not api_key.strip():
        # Without a key the LLM call cannot succeed; fail fast with a hint.
        st.warning("Please enter your API key before scraping.")
    elif not user_prompt.strip():
        st.warning("Please enter a prompt describing what to scrape.")
    else:
        try:
            with st.spinner("Scraping courses..."):
                # Create the SmartScraperMultiGraph instance.
                smart_scraper_graph = SmartScraperMultiGraph(
                    prompt=user_prompt,  # use the user-defined prompt
                    source=urls,
                    config=graph_config,
                )
                # Run the scraper.
                result = smart_scraper_graph.run()

            # Save the result as a JSON file (UTF-8 so non-ASCII text stays
            # readable in the file).
            with open("courses.json", "w", encoding="utf-8") as outfile:
                json.dump(result, outfile, indent=4, ensure_ascii=False)

            # Display the results in Streamlit.
            st.success("Scraping completed successfully!")
            st.json(result)
        except Exception as e:  # UI boundary: report any failure to the user
            st.error(f"An error occurred: {e}")