Spaces:

FridayMaster
/

CHATBOT1

Sleeping

App Files Files Community

FridayMaster committed on Aug 5, 2024

Commit

4c4e926

verified ·

1 Parent(s): 7dbc572

Update app.py

Browse files

Files changed (1) hide show

app.py +33 -11

app.py CHANGED Viewed

@@ -1,12 +1,14 @@
 import pandas as pd
 import fitz  # PyMuPDF for PDF extraction
 import spacy
-from langchain.chains import ConversationalRetrievalChain  # Ensure this class is available or use an alternative
-from langchain.llms import OpenAI
-from langchain.vectorstores import FAISS
-import torch
 from transformers import AutoTokenizer, AutoModel
 import gradio as gr
 # Load and preprocess PDF text
 def extract_text_from_pdf(pdf_path):
@@ -18,7 +20,8 @@ def extract_text_from_pdf(pdf_path):
     return text
 # Extract text from the PDF
-pdf_text = extract_text_from_pdf('Getting_Started_with_Ubuntu_16.04.pdf')  # Replace with your PDF path
 # Convert the text to a DataFrame
 df = pd.DataFrame({'text': [pdf_text]})
@@ -35,7 +38,7 @@ class CustomEmbeddingModel:
             embeddings = self.model(**inputs).last_hidden_state.mean(dim=1)
         return embeddings[0].numpy()
-embedding_model = CustomEmbeddingModel('distilbert-base-uncased')  # Replace with your model name
 # Load Spacy model for preprocessing
 nlp = spacy.load("en_core_web_sm")
@@ -50,15 +53,34 @@ df['text'] = df['text'].apply(preprocess_text)
 df['text_embeddings'] = df['text'].apply(lambda x: embedding_model.embed_text(x))
 # Create FAISS vector store
-documents = df['text'].tolist()
-embeddings = df['text_embeddings'].tolist()
-vector_store = FAISS.from_documents(documents, embeddings)
 # Create LangChain model and chain
 llm_model = OpenAI('gpt-3.5-turbo')  # You can replace this with a different LLM if desired
-retriever = vector_store.as_retriever()
-# Create a conversational chain
 chain = ConversationalRetrievalChain.from_llm(llm_model, retriever=retriever)
 # Function to generate a response

 import pandas as pd
 import fitz  # PyMuPDF for PDF extraction
 import spacy
+from nltk.corpus import stopwords
 from transformers import AutoTokenizer, AutoModel
+import torch
 import gradio as gr
+import numpy as np
+from faiss import IndexFlatL2, normalize_L2
+from langchain.llms import OpenAI
+from langchain.chains import ConversationalRetrievalChain
 # Load and preprocess PDF text
 def extract_text_from_pdf(pdf_path):
     return text
 # Extract text from the PDF
+pdf_path = 'Getting_Started_with_Ubuntu_16.04.pdf'  # Reference to the PDF file in the same directory
+pdf_text = extract_text_from_pdf(pdf_path)
 # Convert the text to a DataFrame
 df = pd.DataFrame({'text': [pdf_text]})
             embeddings = self.model(**inputs).last_hidden_state.mean(dim=1)
         return embeddings[0].numpy()
+embedding_model = CustomEmbeddingModel('FridayMaster/fine_tune_embedding')  # Replace with your model name
 # Load Spacy model for preprocessing
 nlp = spacy.load("en_core_web_sm")
 df['text_embeddings'] = df['text'].apply(lambda x: embedding_model.embed_text(x))
 # Create FAISS vector store
+class SimpleFAISSIndex:
+    """Minimal exact nearest-neighbour index over L2-normalised vectors.
+
+    Every vector is scaled to unit length before it is added to an
+    ``IndexFlatL2``, so the squared L2 distances returned by ``search``
+    rank results in the same order as cosine similarity would.
+    """
+
+    def __init__(self, embeddings):
+        """Build the index from a 2-D array holding one embedding per row."""
+        # normalize_L2 rewrites its argument in place and FAISS only accepts
+        # C-contiguous float32 matrices, so work on a private converted copy
+        # instead of silently modifying the caller's array.
+        vectors = np.array(embeddings, dtype=np.float32, order='C')
+        self.index = IndexFlatL2(vectors.shape[1])
+        normalize_L2(vectors)
+        self.index.add(vectors)
+
+    def search(self, query_embedding, k=1):
+        """Return ``(indices, distances)`` for the ``k`` nearest vectors.
+
+        ``query_embedding`` is a 2-D batch of queries; only the results for
+        its first row are returned.
+        """
+        # Same copy-and-convert step as in __init__: the caller's query is
+        # left untouched and the dtype/layout FAISS needs is guaranteed.
+        query = np.array(query_embedding, dtype=np.float32, order='C')
+        normalize_L2(query)
+        distances, indices = self.index.search(query, k)
+        return indices[0], distances[0]
+embeddings = np.array(df['text_embeddings'].tolist())
+vector_store = SimpleFAISSIndex(embeddings)
 # Create LangChain model and chain
 llm_model = OpenAI('gpt-3.5-turbo')  # You can replace this with a different LLM if desired
+# NOTE(review): this object is later passed as ``retriever=`` to
+# ConversationalRetrievalChain.from_llm, which presumably expects a LangChain
+# BaseRetriever; this class only exposes retrieve() -- confirm compatibility.
+class SimpleRetriever:
+    """Look up the stored documents closest to a free-text query."""
+
+    def __init__(self, vector_store, documents):
+        """Keep the index and its documents.
+
+        ``documents`` is assumed to be in the same order as the rows that
+        were added to ``vector_store``, since results are mapped back to
+        documents by position.
+        """
+        self.vector_store = vector_store
+        self.documents = documents
+
+    def retrieve(self, query):
+        """Return the list of stored documents nearest to ``query``."""
+        # The index is searched with a 2-D batch, so reshape the embedding
+        # into a single-row matrix.
+        query_embedding = embedding_model.embed_text(query).reshape(1, -1)
+        indices, _ = self.vector_store.search(query_embedding)
+        # FAISS pads missing results with -1; skip those, otherwise Python's
+        # negative indexing would silently return the last document.
+        return [self.documents[idx] for idx in indices if idx >= 0]
+retriever = SimpleRetriever(vector_store, df['text'].tolist())
 chain = ConversationalRetrievalChain.from_llm(llm_model, retriever=retriever)
 # Function to generate a response