Spaces:
Sleeping
Sleeping
| from langchain.vectorstores import Chroma | |
| from langchain.embeddings import HuggingFaceEmbeddings | |
| from src.logger import logger | |
class RetrievalModule:
    """Embedding-based document retrieval backed by a persisted Chroma vector store."""

    def __init__(self, embedding_model: str = "all-MiniLM-L6-v2", persist_dir: str = "./chroma_db"):
        """Set up the embedding function; the vector store itself is built lazily.

        Args:
            embedding_model: sentence-transformers model name passed to
                HuggingFaceEmbeddings.
            persist_dir: directory where Chroma persists its index on disk.
        """
        self.embeddings = HuggingFaceEmbeddings(model_name=embedding_model)
        self.vector_store = None  # created by build_vector_store()
        self.persist_dir = persist_dir  # Persistent storage

    def build_vector_store(self, texts: list[str]) -> None:
        """Embed `texts` into a Chroma vector store and persist it to disk.

        No-op (with a warning) when `texts` is empty, so an empty index is
        never built by accident.
        """
        if not texts:
            logger.warning("No texts provided. Skipping vector store creation.")
            return
        self.vector_store = Chroma.from_texts(
            texts, self.embeddings, persist_directory=self.persist_dir
        )
        self.vector_store.persist()  # flush the index to persist_dir
        logger.info("Chroma vector store successfully built.")

    def retrieve_relevant(self, query: str, k: int = 2) -> list[str]:
        """Return the page contents of the top-`k` documents most similar to `query`.

        Returns an empty list (with a warning) when the store has not been
        built yet via `build_vector_store`.
        """
        if not self.vector_store:
            logger.warning("Vector store is empty. Run `build_vector_store` first.")
            return []
        top_docs = self.vector_store.similarity_search(query, k=k)
        # similarity_search returns [] when nothing matches, so the
        # comprehension needs no empty-result special case.
        retrieved = [doc.page_content for doc in top_docs]
        # Lazy %-style args: the message is only formatted if INFO is enabled.
        logger.info("Retrieved %d relevant papers for query: '%s'.", len(retrieved), query)
        return retrieved