import os
from typing import List, Union

from langchain.vectorstores import Chroma
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings


class PrepareVectorDB:
    """
    A class for preparing and saving a VectorDB using HuggingFace embeddings.

    This class facilitates the process of loading documents, chunking them,
    and creating a VectorDB with HuggingFace embeddings. It provides methods
    to prepare and save the VectorDB.

    Parameters:
        data_directory (str or List[str]): A directory containing PDF
            documents, or a list of paths to individual PDF files.
        persist_directory (str): The directory in which to save the VectorDB.
        chunk_size (int): The maximum size (in characters) of each chunk.
        chunk_overlap (int): The number of characters shared between
            consecutive chunks.
    """

    def __init__(
        self,
        data_directory: Union[str, List[str]],
        persist_directory: str,
        chunk_size: int,
        chunk_overlap: int
    ) -> None:
        """
        Initialize the PrepareVectorDB instance.

        Parameters:
            data_directory (str or List[str]): A directory containing PDF
                documents, or a list of paths to individual PDF files.
            persist_directory (str): The directory in which to save the VectorDB.
            chunk_size (int): The maximum size (in characters) of each chunk.
            chunk_overlap (int): The number of characters shared between
                consecutive chunks.
        """
        # Other options: CharacterTextSplitter, TokenTextSplitter, etc.
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
            separators=["\n\n", "\n", " ", ""]
        )
        self.data_directory = data_directory
        self.persist_directory = persist_directory
        self.embedding_function = HuggingFaceEmbeddings(
            model_name="NeuML/pubmedbert-base-embeddings",
            # cache_folder=os.getenv('SENTENCE_TRANSFORMERS_HOME')
        )
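
    # How the splitter behaves: RecursiveCharacterTextSplitter tries the
    # separators in order ("\n\n", "\n", " ", ""), so paragraphs are kept
    # intact where possible and the split only falls back to word- or
    # character-level cuts when a piece still exceeds chunk_size. The
    # chunk_overlap repeats the tail of one chunk at the head of the next,
    # which helps queries that land near a chunk boundary. Illustrative
    # values (an assumption, not prescribed by this class): chunk_size=1500
    # with chunk_overlap=150 repeats roughly the last 10% of each chunk.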

    def __load_all_documents(self) -> List:
        """
        Load all documents from the specified directory or list of file paths.

        Returns:
            List: A list of loaded documents.
        """
        doc_counter = 0
        if isinstance(self.data_directory, list):
            print("Loading the uploaded documents...")
            docs = []
            for doc_path in self.data_directory:
                docs.extend(PyPDFLoader(doc_path).load())
                doc_counter += 1
            print("Number of loaded documents:", doc_counter)
            print("Number of pages:", len(docs), "\n\n")
        else:
            print("Loading documents from the data directory...")
            document_list = os.listdir(self.data_directory)
            docs = []
            for doc_name in document_list:
                docs.extend(PyPDFLoader(os.path.join(
                    self.data_directory, doc_name)).load())
                doc_counter += 1
            print("Number of loaded documents:", doc_counter)
            print("Number of pages:", len(docs), "\n\n")
        return docs
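
    # PyPDFLoader yields one Document per PDF page, which is why the page
    # count printed above can be much larger than the document count. Note
    # that the directory branch assumes every file in data_directory is a
    # PDF; a non-PDF file there would cause PyPDFLoader to raise an error.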

    def __chunk_documents(self, docs: List) -> List:
        """
        Chunk the loaded documents using the configured text splitter.

        Parameters:
            docs (List): The list of loaded documents.

        Returns:
            List: A list of chunked documents.
        """
        print("Chunking documents...")
        chunked_documents = self.text_splitter.split_documents(docs)
        print("Number of chunks:", len(chunked_documents), "\n\n")
        return chunked_documents
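
    # split_documents carries each page's metadata (e.g. source path and
    # page number) over to every chunk derived from it, so the provenance
    # of a chunk can still be traced after retrieval.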

    def prepare_and_save_vectordb(self):
        """
        Load, chunk, and create a VectorDB with HuggingFace embeddings,
        and save it.

        Returns:
            Chroma: The created VectorDB.
        """
        docs = self.__load_all_documents()
        chunked_documents = self.__chunk_documents(docs)
        print("Preparing vectordb...")
        vectordb = Chroma.from_documents(
            documents=chunked_documents,
            embedding=self.embedding_function,
            persist_directory=self.persist_directory
        )
        print("VectorDB is created and saved.")
        print("Number of vectors in vectordb:",
              vectordb._collection.count(), "\n\n")
        return vectordb
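

# A minimal usage sketch. The paths and chunking values below are
# illustrative assumptions, not values taken from this module.
if __name__ == "__main__":
    prepare = PrepareVectorDB(
        data_directory="data/docs",          # hypothetical folder of PDFs
        persist_directory="data/vectordb",   # hypothetical output location
        chunk_size=1500,
        chunk_overlap=150,
    )
    vectordb = prepare.prepare_and_save_vectordb()
    # The persisted DB can later be reloaded with the same embedding function:
    # db = Chroma(persist_directory="data/vectordb",
    #             embedding_function=prepare.embedding_function)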