| from backend.qdrant import client | |
| from backend.embed_models import dense_model, colbert_model | |
| from qdrant_client import models | |
def _replace_collection(collection_name, vectors_config):
    """Drop ``collection_name`` if it exists, then create it with ``vectors_config``."""
    # ``recreate_collection`` is deprecated in qdrant-client; the explicit
    # exists -> delete -> create sequence is its documented replacement.
    if client.collection_exists(collection_name=collection_name):
        client.delete_collection(collection_name=collection_name)
    client.create_collection(
        collection_name=collection_name,
        vectors_config=vectors_config,
    )


def setup_collections():
    """(Re)create the ``pdf_dense`` and ``pdf_colbert`` collections.

    Any existing collection with either name is deleted first, so all
    previously stored points in those collections are discarded.

    * ``pdf_dense`` holds one named vector, ``"embedding"``, sized from the
      client's reported embedding size for
      ``sentence-transformers/all-MiniLM-L6-v2`` and compared by cosine distance.
    * ``pdf_colbert`` holds an unnamed multivector (128 dimensions per
      sub-vector) compared by cosine distance with MAX_SIM aggregation.
    """
    # NOTE(review): presumably this is the same model that ``dense_model``
    # wraps -- confirm, since the collection size must match its output.
    dense_dim = client.get_embedding_size("sentence-transformers/all-MiniLM-L6-v2")

    print("creating collection")
    _replace_collection(
        "pdf_dense",
        {
            "embedding": models.VectorParams(
                size=dense_dim,
                distance=models.Distance.COSINE,
            )
        },
    )

    print("creating collection")
    _replace_collection(
        "pdf_colbert",
        models.VectorParams(
            # NOTE(review): 128 is assumed to be the per-token width emitted
            # by ``colbert_model`` -- confirm against the model in use.
            size=128,
            distance=models.Distance.COSINE,
            multivector_config=models.MultiVectorConfig(
                comparator=models.MultiVectorComparator.MAX_SIM
            ),
        ),
    )
def index_documents(chunks):
    """Embed ``chunks`` with both models and upload them to Qdrant.

    Each chunk becomes one point in ``pdf_dense`` (named vector
    ``"embedding"``) and one point in ``pdf_colbert`` (unnamed vector), both
    carrying the payload ``{"chunk": chunk}``.

    Point ids are the 0-based position of the chunk within ``chunks``.
    NOTE(review): a later call therefore reuses the same ids, which presumably
    overwrites the points from an earlier call -- confirm this is intended.

    Args:
        chunks: Iterable of chunk values accepted by ``dense_model.embed`` and
            ``colbert_model.embed``. Any iterable works, including one-shot
            iterators and generators.

    Returns:
        None. Nothing is embedded or uploaded when ``chunks`` is empty.
    """
    # The chunks are traversed three times below (two embed calls plus zip).
    # Materializing once keeps a generator argument from being exhausted after
    # the first pass, which would silently index nothing for ColBERT.
    texts = list(chunks)
    if not texts:
        return

    dense_embs = list(dense_model.embed(texts))
    colbert_embs = list(colbert_model.embed(texts))

    dense_points = [
        models.PointStruct(id=idx, payload={"chunk": chunk}, vector={"embedding": vec})
        for idx, (chunk, vec) in enumerate(zip(texts, dense_embs))
    ]
    colbert_points = [
        models.PointStruct(id=idx, payload={"chunk": chunk}, vector=vec)
        for idx, (chunk, vec) in enumerate(zip(texts, colbert_embs))
    ]

    client.upload_points("pdf_dense", dense_points)
    client.upload_points("pdf_colbert", colbert_points)