Spaces:

Deaksh
/

research-tool

Sleeping

App Files Files Community

Deaksh committed on Feb 18

Commit

2c7dad0

verified ·

1 Parent(s): b7c716e

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -76

app.py CHANGED Viewed

@@ -1,13 +1,14 @@
-import requests
 import os
 import streamlit as st
 import pickle
 import time
 from langchain_community.embeddings import HuggingFaceEmbeddings
 from langchain.chains import RetrievalQAWithSourcesChain
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.document_loaders import UnstructuredURLLoader
 from langchain_groq import ChatGroq
 from langchain.vectorstores import FAISS
 from dotenv import load_dotenv
@@ -27,100 +28,36 @@ file_path = "faiss_store_openai.pkl"
 main_placeholder = st.empty()
 llm = ChatGroq(model_name="llama-3.3-70b-versatile", temperature=0.9, max_tokens=500)
-# Debugging: Check if URLs are accessible
-def check_url(url):
-    try:
-        response = requests.get(url)
-        if response.status_code == 200:
-            return True
-        else:
-            return False
-    except Exception as e:
-        return False
 if process_url_clicked:
-    # Debugging: Verify URL accessibility
-    valid_urls = []
-    for url in urls:
-        if check_url(url):
-            valid_urls.append(url)
-        else:
-            main_placeholder.text(f"URL is not accessible: {url}")
-    if not valid_urls:
-        main_placeholder.text("None of the URLs are accessible.")
-    # Load data from URLs
-    loader = UnstructuredURLLoader(urls=valid_urls)
     main_placeholder.text("Data Loading...Started...✅✅✅")
-    try:
-        data = loader.load()
-    except Exception as e:
-        main_placeholder.text(f"Error loading data: {e}")
-    # Split data into chunks
     text_splitter = RecursiveCharacterTextSplitter(
         separators=['\n\n', '\n', '.', ','],
         chunk_size=1000
     )
     main_placeholder.text("Text Splitter...Started...✅✅✅")
     docs = text_splitter.split_documents(data)
-    # Debugging: Check if docs is empty
-    if not docs:
-        main_placeholder.text("No valid documents found! Please check the URLs.")
-    # Debugging: Check the content of docs
-    for doc in docs:
-        main_placeholder.text(f"Document content: {doc.page_content[:200]}")  # Show first 200 chars of each document
-    # Create embeddings using HuggingFaceEmbeddings
     embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
     main_placeholder.text("Embedding Vector Started Building...✅✅✅")
-    # Generate embeddings
-    embeddings = embedding_model.embed_documents([doc.page_content for doc in docs])
-    # Debugging: Check if embeddings are generated
-    if not embeddings:
-        main_placeholder.text("No embeddings were generated! Check the embedding model or document content.")
-    # Check the size of embeddings
-    main_placeholder.text(f"Generated {len(embeddings)} embeddings.")
-    # Convert embeddings to numpy array (needed by FAISS)
-    embeddings_np = np.array(embeddings).astype(np.float32)
-    # Check the shape of embeddings
-    main_placeholder.text(f"Shape of embeddings: {embeddings_np.shape}")
-    # Create FAISS index
-    if len(embeddings) > 0:
-        dimension = len(embeddings[0])  # Embedding vector dimension
-        index = FAISS(dimension)
-        index.add(embeddings_np)  # Add embeddings to FAISS index
-        # Wrap FAISS index using LangChain FAISS wrapper
-        vectorstore_huggingface = FAISS(embedding_function=embedding_model, index=index)
-        # Save the FAISS index to a pickle file
-        with open(file_path, "wb") as f:
-            pickle.dump(vectorstore_huggingface, f)
-        time.sleep(2)
-    else:
-        main_placeholder.text("Embeddings could not be generated, skipping FAISS index creation.")
 query = main_placeholder.text_input("Question: ")
 if query:
     if os.path.exists(file_path):
-        # Load the FAISS index from the pickle file
         with open(file_path, "rb") as f:
             vectorstore = pickle.load(f)
             chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever())
             result = chain({"question": query}, return_only_outputs=True)
-            # Display the answer
             st.header("Answer")
             st.write(result["answer"])
@@ -136,3 +73,4 @@ if query:

 import os
 import streamlit as st
 import pickle
 import time
 from langchain_community.embeddings import HuggingFaceEmbeddings
+from langchain import OpenAI
 from langchain.chains import RetrievalQAWithSourcesChain
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain.document_loaders import UnstructuredURLLoader
 from langchain_groq import ChatGroq
+from langchain.embeddings import OpenAIEmbeddings
 from langchain.vectorstores import FAISS
 from dotenv import load_dotenv
 main_placeholder = st.empty()
 llm = ChatGroq(model_name="llama-3.3-70b-versatile", temperature=0.9, max_tokens=500)
 if process_url_clicked:
+    # load data
+    loader = UnstructuredURLLoader(urls=urls)
     main_placeholder.text("Data Loading...Started...✅✅✅")
+    data = loader.load()
+    # split data
     text_splitter = RecursiveCharacterTextSplitter(
         separators=['\n\n', '\n', '.', ','],
         chunk_size=1000
     )
     main_placeholder.text("Text Splitter...Started...✅✅✅")
     docs = text_splitter.split_documents(data)
+    # create embeddings and save it to FAISS index
     embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
+    vectorstore_huggingface = FAISS.from_documents(docs, embedding_model)
     main_placeholder.text("Embedding Vector Started Building...✅✅✅")
+    time.sleep(2)
+    # Save the FAISS index to a pickle file
+    with open(file_path, "wb") as f:
+        pickle.dump(vectorstore_huggingface, f)
 query = main_placeholder.text_input("Question: ")
 if query:
     if os.path.exists(file_path):
         with open(file_path, "rb") as f:
             vectorstore = pickle.load(f)
             chain = RetrievalQAWithSourcesChain.from_llm(llm=llm, retriever=vectorstore.as_retriever())
             result = chain({"question": query}, return_only_outputs=True)
+            # result will be a dictionary of this format --> {"answer": "", "sources": [] }
             st.header("Answer")
             st.write(result["answer"])