import os
import pickle
import time

import requests
import streamlit as st
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma, FAISS
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_groq import ChatGroq

load_dotenv()  # Load environment variables from .env file

st.title("RockyBot: News Research Tool 📈")
st.sidebar.title("News Article URLs")

# Collect URLs from user input
urls = [st.sidebar.text_input(f"URL {i+1}") for i in range(3)]
process_url_clicked = st.sidebar.button("Process URLs")

file_path = "faiss_store_openai.pkl"
main_placeholder = st.empty()
llm = ChatGroq(model_name="llama-3.3-70b-versatile", temperature=0.9, max_tokens=500)
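# ChatGroq reads its key from the GROQ_API_KEY environment variable, so the
# .env file loaded above should contain a line like (value hypothetical):
# GROQ_API_KEY=gsk_...
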
def fetch_web_content(url):
    """Fetches text content from a given URL using BeautifulSoup."""
    try:
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "html.parser")
        return soup.get_text()
    except Exception as e:
        # Report the failure in the UI rather than returning error text,
        # which would otherwise be embedded as if it were article content
        st.warning(f"Error fetching {url}: {e}")
        return ""
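# get_text() flattens the entire page, navigation and footers included. A
# common refinement with the same BeautifulSoup API is to keep only the
# paragraph text:
# "\n".join(p.get_text(strip=True) for p in soup.find_all("p"))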

if process_url_clicked:
    main_placeholder.text("Data Loading...Started...✅✅✅")
    # Fetch content from the non-empty URLs; failed fetches come back empty
    data = [fetch_web_content(url) for url in urls if url.strip()]
    data = [text for text in data if text]
    main_placeholder.text("Data Loading...Completed...✅✅✅")

    # Split data into chunks of at most ~1000 characters
    text_splitter = RecursiveCharacterTextSplitter(
        separators=['\n\n', '\n', '.', ','],
        chunk_size=1000
    )
    main_placeholder.text("Text Splitting...Started...✅✅✅")
    # Wrap the raw strings as Documents so the splitter can process them
    docs = [Document(page_content=text) for text in data]
    docs = text_splitter.split_documents(docs)
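    # A chunk_overlap (e.g. 200 characters) is often added so sentences that
    # straddle a chunk boundary appear in both chunks; omitted here to keep
    # the original settings:
    # RecursiveCharacterTextSplitter(separators=['\n\n', '\n', '.', ','],
    #                                chunk_size=1000, chunk_overlap=200)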
    # Create embeddings and index the chunks in a FAISS vector store
    embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
    # vectorstore_huggingface = Chroma.from_documents(docs, embedding_model)  # Chroma alternative
    vectorstore_huggingface = FAISS.from_documents(docs, embedding_model)
    main_placeholder.text("Embedding Vector Started Building...✅✅✅")
    time.sleep(2)  # Keep the status message visible briefly

    # Save the vector store to a pickle file for reuse across reruns
    with open(file_path, "wb") as f:
        pickle.dump(vectorstore_huggingface, f)
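    # Note: pickling a FAISS store fails on some faiss builds (the underlying
    # index object is not always picklable). A sketch of the native
    # persistence API, assuming this langchain version exposes it:
    # vectorstore_huggingface.save_local("faiss_store")
    # ...restored later with FAISS.load_local("faiss_store", embedding_model)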

# User query input
query = st.text_input("Question: ")
if query:
    if os.path.exists(file_path):
        with open(file_path, "rb") as f:
            vectorstore = pickle.load(f)
        chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever())
        result = chain({"question": query}, return_only_outputs=True)
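        # result is a dict with "answer" and "sources" keys; the sources
        # string comes from the chain's prompt, so its exact format can vary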
        # Display answer
        st.header("Answer")
        st.write(result["answer"])

        # Display sources, if available
        sources = result.get("sources", "")
        if sources:
            st.subheader("Sources:")
            for source in sources.split("\n"):
                st.write(source)
    else:
        st.info("No processed data found. Enter URLs and click 'Process URLs' first.")
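
# Run locally with: streamlit run app.py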