Gemini
fix: Explicitly disable ChromaDB telemetry\n\n- Set anonymized_telemetry=False in ChromaDB client initialization in both chroma_utils.py and vector_repository.py to resolve persistent posthog errors.
fa21e69
import chromadb
from chromadb.config import Settings
from chromadb.errors import IDAlreadyExistsError
from typing import List, Dict, Optional
from abc import ABC, abstractmethod
from dataclasses import dataclass
@dataclass
class VectorDocument:
id: str
text: str
metadata: Dict
embedding: Optional[List[float]] = None
class BaseVectorRepository(ABC):
@abstractmethod
def add(self, documents: List[VectorDocument]) -> None:
pass
@abstractmethod
def search(self, query_vector: List[float], limit: int = 5) -> List[VectorDocument]:
pass
class ChromaRepository(BaseVectorRepository):
def __init__(self, collection_name: str, persist_directory: str = "./chroma_db"):
settings = Settings(anonymized_telemetry=False)
self.client = chromadb.PersistentClient(path=persist_directory, settings=settings)
# Check if collection exists, create it if it doesn't
try:
self.collection = self.client.get_collection(name=collection_name)
except ValueError: # ValueError is thrown when Collection does not exist
self.collection = self.client.create_collection(
name=collection_name,
metadata={"hnsw:space": "cosine", "dimension": 1536},
)
def add(self, documents: List[VectorDocument]) -> None:
"""
Add documents to the vector store
"""
if not documents:
return
ids = [doc.id for doc in documents]
texts = [doc.text for doc in documents]
metadatas = [doc.metadata for doc in documents]
embeddings = [doc.embedding for doc in documents if doc.embedding is not None]
# If embeddings are provided, use them directly
if embeddings and len(embeddings) == len(documents):
self.collection.add(
ids=ids, documents=texts, metadatas=metadatas, embeddings=embeddings
)
else:
# Let ChromaDB handle embedding generation
self.collection.add(ids=ids, documents=texts, metadatas=metadatas)
def search(self, query_vector: List[float], limit: int = 5) -> List[VectorDocument]:
"""
Search similar documents using a query vector
"""
results = self.collection.query(
query_embeddings=[query_vector],
n_results=limit,
include=["documents", "metadatas", "distances"],
)
documents = []
for i in range(len(results["ids"][0])):
doc = VectorDocument(
id=results["ids"][0][i],
text=results["documents"][0][i],
metadata=results["metadatas"][0][i],
embedding=None, # ChromaDB doesn't return embeddings in search results
)
documents.append(doc)
return documents
def get_by_ids(self, ids: List[str]) -> List[VectorDocument]:
"""
Retrieve documents by their IDs
"""
results = self.collection.get(ids=ids, include=["documents", "metadatas"])
documents = []
for i in range(len(results["ids"])):
doc = VectorDocument(
id=results["ids"][i],
text=results["documents"][i],
metadata=results["metadatas"][i],
embedding=None,
)
documents.append(doc)
return documents
def delete(self, ids: List[str]) -> None:
"""
Delete documents by their IDs
"""
self.collection.delete(ids=ids)