import os
import sys

sys.path.append(sys.path[0].replace('scripts', ''))

import pandas as pd
import numpy as np
from config.data_paths import VECTORDB_PATH
from typing import Sequence, List, Tuple
import faiss
from sentence_transformers import SentenceTransformer


class Vectorizer:
    def __init__(self, model_name: str) -> None:
        """
        Initialize the vectorizer with a pre-trained embedding model.

        Args:
            model_name: The name of the pre-trained embedding model (compatible with sentence-transformers).
        """
        self.model = SentenceTransformer(model_name)

    def transform(self, prompts: Sequence[str], build_index: bool = False) -> np.ndarray:
        """
        Transform texts into numerical vectors using the specified model.

        Args:
            prompts: The sequence of raw corpus prompts.
            build_index: If True, also build and persist a FAISS index of the embeddings.

        Returns:
            Vectorized prompts as a numpy array.
        """
        embeddings = self.model.encode(prompts, show_progress_bar=True)
        # Normalize so that inner products correspond to cosine similarity.
        embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
        if build_index:
            if os.path.isfile(os.path.join(VECTORDB_PATH, 'prompts_index.faiss')):
                print('Embeddings already stored in vector db')
            else:
                index = self._build_index(embeddings)
                faiss.write_index(index, os.path.join(VECTORDB_PATH, 'prompts_index.faiss'))
        return embeddings

    def _build_index(self, embeddings: np.ndarray) -> faiss.IndexFlatIP:
        """
        Build and return a FAISS index for the given embeddings.

        Args:
            embeddings: A numpy array of prompt embeddings.

        Returns:
            FAISS index for efficient similarity search.
        """
        index = faiss.IndexFlatIP(embeddings.shape[1])  # Inner product on normalized vectors == cosine similarity
        index.add(embeddings)
        return index
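
# Illustrative sketch (not part of the original module): standalone use of Vectorizer.
# The model name matches the PromptSearchEngine default below; the prompts are made-up examples.
#
#     vectorizer = Vectorizer('all-MiniLM-L6-v2')
#     vectors = vectorizer.transform(['a photo of a dog', 'an oil painting of a ship at sea'])
#     print(vectors.shape)  # (2, 384) -- all-MiniLM-L6-v2 produces 384-dimensional embeddings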


def cosine_similarity(query_vector: np.ndarray, corpus_vectors: np.ndarray) -> np.ndarray:
    """
    Calculate cosine similarity between a query vector and each corpus vector.

    Both inputs are assumed to be unit-normalized, so the dot product equals cosine similarity.

    Args:
        query_vector: Vectorized prompt query of shape (1, D).
        corpus_vectors: Vectorized prompt corpus of shape (N, D).

    Returns:
        A vector of shape (N,) with values in range [-1, 1] where 1 is maximum similarity.
    """
    return np.dot(corpus_vectors, query_vector.T).flatten()
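
# Quick illustration (hypothetical values): for unit-normalized vectors the dot product is the cosine.
#
#     q = np.array([[1.0, 0.0]])
#     corpus = np.array([[1.0, 0.0], [0.0, 1.0]])
#     cosine_similarity(q, corpus)  # -> array([1., 0.])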


class PromptSearchEngine:
    def __init__(self, corpus: str, model_name: str = 'all-MiniLM-L6-v2', use_index: bool = False) -> None:
        """
        Initialize the search engine by vectorizing the prompt corpus.

        The vectorized prompt corpus is used to find the top n most similar prompts.

        Args:
            corpus: Path to the parquet dataset with raw prompts.
            model_name: The name of the pre-trained embedding model.
            use_index: If True, retrieve with a persisted FAISS index instead of brute-force cosine similarity.
        """
        self.use_index = use_index
        self.prompts = pd.read_parquet(corpus)['prompt'].to_list()
        self.vectorizer = Vectorizer(model_name)
        # Build the index up front (when requested) for faster retrieval later.
        self.embeddings = self.vectorizer.transform(self.prompts, build_index=use_index)
        if use_index:
            self.index = faiss.read_index(os.path.join(VECTORDB_PATH, 'prompts_index.faiss'))

    def most_similar(self, query: str, n: int = 5) -> List[dict]:
        """
        Return the top n most similar prompts from the corpus.

        The query prompt is vectorized with the Vectorizer; similarity is then computed either with the
        FAISS index or with the cosine_similarity function, depending on use_index.

        Args:
            query: The raw query prompt input from the user.
            n: The number of similar prompts to return from the corpus.

        Returns:
            A list of dicts with keys 'prompt' and 'score', sorted by descending similarity.
            Note that returned prompts are verbatim.
        """
        query_vector = self.vectorizer.transform([query])
        if self.use_index:
            distances, indices = self.index.search(query_vector, n)
            return [{'prompt': self.prompts[idx], 'score': float(distances[0][i])}
                    for i, idx in enumerate(indices[0])]
        else:
            similarities = cosine_similarity(query_vector, self.embeddings)
            top_indices = np.argsort(-similarities)[:n]  # Indices sorted by descending similarity
            return [{'prompt': self.prompts[i], 'score': float(similarities[i])} for i in top_indices]
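

# Minimal usage sketch (not part of the original module). The parquet path below is hypothetical and
# only needs to contain a 'prompt' column; with use_index=True, VECTORDB_PATH must already exist.
if __name__ == '__main__':
    engine = PromptSearchEngine('data/prompts.parquet', use_index=False)
    for result in engine.most_similar('a cat wearing sunglasses, studio lighting', n=3):
        print(f"{result['score']:.3f}  {result['prompt']}")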