Spaces:
Sleeping
Sleeping
Gemini
fix: Explicitly disable ChromaDB telemetry. Set anonymized_telemetry=False in the ChromaDB client initialization in both chroma_utils.py and vector_repository.py to resolve persistent posthog errors.
fa21e69
| import chromadb | |
| from chromadb.config import Settings | |
| from chromadb.errors import IDAlreadyExistsError | |
| from typing import List, Dict, Optional | |
| from abc import ABC, abstractmethod | |
| from dataclasses import dataclass | |
@dataclass
class VectorDocument:
    """A single record stored in, or returned from, a vector repository.

    Attributes:
        id: Unique identifier of the document within its collection.
        text: Raw document text.
        metadata: Arbitrary key/value metadata stored alongside the text.
        embedding: Optional precomputed embedding vector. ``None`` means no
            vector is attached to this document.
    """

    id: str
    text: str
    metadata: Dict
    embedding: Optional[List[float]] = None
class BaseVectorRepository(ABC):
    """Abstract interface every vector-store backend must implement.

    Both methods are abstract: a subclass that does not override ``add`` and
    ``search`` cannot be instantiated.
    """

    @abstractmethod
    def add(self, documents: List["VectorDocument"]) -> None:
        """Store ``documents`` in the underlying vector store."""

    @abstractmethod
    def search(
        self, query_vector: List[float], limit: int = 5
    ) -> List["VectorDocument"]:
        """Return up to ``limit`` documents most similar to ``query_vector``."""
class ChromaRepository(BaseVectorRepository):
    """Vector repository backed by a persistent, on-disk ChromaDB collection."""

    def __init__(self, collection_name: str, persist_directory: str = "./chroma_db"):
        """Open the named collection, creating it if it does not exist yet.

        Args:
            collection_name: Name of the Chroma collection to use.
            persist_directory: Directory in which Chroma persists its data.
        """
        # Anonymized telemetry is switched off explicitly for this client.
        settings = Settings(anonymized_telemetry=False)
        self.client = chromadb.PersistentClient(
            path=persist_directory, settings=settings
        )
        # A single get-or-create call replaces get/except ValueError/create:
        # the exception get_collection raises for a missing collection is not
        # ValueError in every ChromaDB release, and a separate check-then-create
        # leaves a window in which another client can create the collection.
        self.collection = self.client.get_or_create_collection(
            name=collection_name,
            metadata={"hnsw:space": "cosine", "dimension": 1536},
        )

    def add(self, documents: List[VectorDocument]) -> None:
        """Add documents to the vector store.

        Precomputed embeddings are forwarded only when *every* document
        carries one; otherwise ChromaDB generates embeddings for the whole
        batch and any partially supplied embeddings are not sent.

        Args:
            documents: Documents to insert. An empty list is a no-op.
        """
        if not documents:
            return

        ids = [doc.id for doc in documents]
        texts = [doc.text for doc in documents]
        metadatas = [doc.metadata for doc in documents]

        if all(doc.embedding is not None for doc in documents):
            self.collection.add(
                ids=ids,
                documents=texts,
                metadatas=metadatas,
                embeddings=[doc.embedding for doc in documents],
            )
        else:
            # Let ChromaDB handle embedding generation.
            self.collection.add(ids=ids, documents=texts, metadatas=metadatas)

    def search(self, query_vector: List[float], limit: int = 5) -> List[VectorDocument]:
        """Search similar documents using a query vector.

        Args:
            query_vector: Embedding to compare stored documents against.
            limit: Maximum number of results to request.

        Returns:
            Matching documents in the order ChromaDB returned them. Their
            ``embedding`` field is always ``None``.
        """
        results = self.collection.query(
            query_embeddings=[query_vector],
            n_results=limit,
            include=["documents", "metadatas", "distances"],
        )
        # Exactly one query vector is sent, so only the first result row of
        # each field is read.
        return [
            VectorDocument(
                id=doc_id,
                text=text,
                metadata=metadata,
                embedding=None,  # embeddings are not requested via `include`
            )
            for doc_id, text, metadata in zip(
                results["ids"][0],
                results["documents"][0],
                results["metadatas"][0],
            )
        ]

    def get_by_ids(self, ids: List[str]) -> List[VectorDocument]:
        """Retrieve documents by their IDs.

        Args:
            ids: Identifiers to look up. An empty list returns ``[]`` without
                querying the collection.

        Returns:
            The documents ChromaDB returned for ``ids``; their ``embedding``
            field is always ``None``.
        """
        # Same convention as add(): empty input is a no-op, so an empty id
        # list is never forwarded to ChromaDB.
        if not ids:
            return []

        results = self.collection.get(ids=ids, include=["documents", "metadatas"])
        return [
            VectorDocument(id=doc_id, text=text, metadata=metadata, embedding=None)
            for doc_id, text, metadata in zip(
                results["ids"], results["documents"], results["metadatas"]
            )
        ]

    def delete(self, ids: List[str]) -> None:
        """Delete documents by their IDs.

        Args:
            ids: Identifiers to remove. An empty list is a no-op.
        """
        # Never forward an empty id list: "delete nothing" must not depend on
        # how the installed ChromaDB release interprets an empty filter.
        if not ids:
            return
        self.collection.delete(ids=ids)