import os
import sys

sys.path.append(sys.path[0].replace('scripts', ''))

import pandas as pd
import numpy as np
from config.data_paths import VECTORDB_PATH
from typing import Sequence, List, Tuple
import faiss
from sentence_transformers import SentenceTransformer


class Vectorizer:
    def __init__(self, model_name: str) -> None:
        """
        Initialize the vectorizer with a pre-trained embedding model.

        Args:
            model_name: The name of the pre-trained embedding model (compatible with sentence-transformers).
        """
        self.model = SentenceTransformer(model_name)

    def transform(self, prompts: Sequence[str], build_index: bool = False) -> np.ndarray:
        """
        Transform texts into numerical vectors using the specified model.

        Args:
            prompts: The sequence of raw corpus prompts.
            build_index: If True, also build and persist a FAISS index of the embeddings.

        Returns:
            Vectorized prompts as a numpy array.
        """
        embeddings = self.model.encode(prompts, show_progress_bar=True)
        # Normalize so that inner products correspond to cosine similarity.
        embeddings = embeddings / np.linalg.norm(embeddings, axis=1, keepdims=True)
        if build_index:
            if os.path.isfile(os.path.join(VECTORDB_PATH, 'prompts_index.faiss')):
                print('Embeddings already stored in vector db')
            else:
                index = self._build_index(embeddings)
                faiss.write_index(index, os.path.join(VECTORDB_PATH, 'prompts_index.faiss'))
        return embeddings

    def _build_index(self, embeddings: np.ndarray) -> faiss.IndexFlatIP:
        """
        Build and return a FAISS index for the given embeddings.

        Args:
            embeddings: A numpy array of prompt embeddings.

        Returns:
            FAISS index for efficient similarity search.
        """
        index = faiss.IndexFlatIP(embeddings.shape[1])  # Inner product on normalized vectors == cosine similarity
        index.add(embeddings)
        return index
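
# Illustrative sketch (not part of the original module): standalone use of Vectorizer.
# The model name matches the PromptSearchEngine default below; the prompts are made-up examples.
#
#     vectorizer = Vectorizer('all-MiniLM-L6-v2')
#     vectors = vectorizer.transform(['a photo of a dog', 'an oil painting of a ship at sea'])
#     print(vectors.shape)  # (2, 384) -- all-MiniLM-L6-v2 produces 384-dimensional embeddings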


def cosine_similarity(query_vector: np.ndarray, corpus_vectors: np.ndarray) -> np.ndarray:
    """
    Calculate cosine similarity between a query vector and each corpus vector.

    Both inputs are assumed to be unit-normalized, so the dot product equals cosine similarity.

    Args:
        query_vector: Vectorized prompt query of shape (1, D).
        corpus_vectors: Vectorized prompt corpus of shape (N, D).

    Returns:
        A vector of shape (N,) with values in range [-1, 1] where 1 is maximum similarity.
    """
    return np.dot(corpus_vectors, query_vector.T).flatten()
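
# Quick illustration (hypothetical values): for unit-normalized vectors the dot product is the cosine.
#
#     q = np.array([[1.0, 0.0]])
#     corpus = np.array([[1.0, 0.0], [0.0, 1.0]])
#     cosine_similarity(q, corpus)  # -> array([1., 0.])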


class PromptSearchEngine:
    def __init__(self, corpus: str, model_name: str = 'all-MiniLM-L6-v2', use_index: bool = False) -> None:
        """
        Initialize the search engine by vectorizing the prompt corpus.

        The vectorized prompt corpus is used to find the top n most similar prompts.

        Args:
            corpus: Path to the parquet dataset with raw prompts.
            model_name: The name of the pre-trained embedding model.
            use_index: If True, retrieve with a persisted FAISS index instead of brute-force cosine similarity.
        """
        self.use_index = use_index
        self.prompts = pd.read_parquet(corpus)['prompt'].to_list()
        self.vectorizer = Vectorizer(model_name)
        # Build the index up front (when requested) for faster retrieval later.
        self.embeddings = self.vectorizer.transform(self.prompts, build_index=use_index)
        if use_index:
            self.index = faiss.read_index(os.path.join(VECTORDB_PATH, 'prompts_index.faiss'))

    def most_similar(self, query: str, n: int = 5) -> List[dict]:
        """
        Return the top n most similar prompts from the corpus.

        The query prompt is vectorized with the Vectorizer; similarity is then computed either with the
        FAISS index or with the cosine_similarity function, depending on use_index.

        Args:
            query: The raw query prompt input from the user.
            n: The number of similar prompts to return from the corpus.

        Returns:
            A list of dicts with keys 'prompt' and 'score', sorted by descending similarity.
            Note that returned prompts are verbatim.
        """
        query_vector = self.vectorizer.transform([query])
        if self.use_index:
            distances, indices = self.index.search(query_vector, n)
            return [{'prompt': self.prompts[idx], 'score': float(distances[0][i])}
                    for i, idx in enumerate(indices[0])]
        else:
            similarities = cosine_similarity(query_vector, self.embeddings)
            top_indices = np.argsort(-similarities)[:n]  # Indices sorted by descending similarity
            return [{'prompt': self.prompts[i], 'score': float(similarities[i])} for i in top_indices]
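

# Minimal usage sketch (not part of the original module). The parquet path below is hypothetical and
# only needs to contain a 'prompt' column; with use_index=True, VECTORDB_PATH must already exist.
if __name__ == '__main__':
    engine = PromptSearchEngine('data/prompts.parquet', use_index=False)
    for result in engine.most_similar('a cat wearing sunglasses, studio lighting', n=3):
        print(f"{result['score']:.3f}  {result['prompt']}")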