File size: 1,417 Bytes
ff58d3c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
from backend.qdrant import client
from backend.embed_models import dense_model, colbert_model
from qdrant_client import models

def _recreate_collection(name, vectors_config):
    """Drop collection *name* if it exists, then create it with *vectors_config*."""
    # Explicit equivalent of the deprecated ``client.recreate_collection``:
    # delete first so the new collection always starts empty.
    if client.collection_exists(name):
        client.delete_collection(name)
    client.create_collection(collection_name=name, vectors_config=vectors_config)


def setup_collections():
    """Create (or reset) the two Qdrant collections used for PDF chunks.

    * ``pdf_dense`` holds one named vector, ``"embedding"``, whose size is
      looked up via ``client.get_embedding_size`` for
      ``sentence-transformers/all-MiniLM-L6-v2``; cosine distance.
    * ``pdf_colbert`` holds an unnamed multivector of size 128 per vector,
      cosine distance, compared with ``MAX_SIM``.

    Any existing collection with either name is dropped first, so all
    previously stored points in them are lost.
    """
    dense_dim = client.get_embedding_size("sentence-transformers/all-MiniLM-L6-v2")
    print("creating collection")
    _recreate_collection(
        "pdf_dense",
        {
            "embedding": models.VectorParams(
                size=dense_dim,
                distance=models.Distance.COSINE,
            )
        },
    )
    print("creating collection")

    _recreate_collection(
        "pdf_colbert",
        models.VectorParams(
            # NOTE(review): 128 is presumably the per-token dimension of
            # ``colbert_model`` -- confirm it matches the configured model.
            size=128,
            distance=models.Distance.COSINE,
            multivector_config=models.MultiVectorConfig(
                comparator=models.MultiVectorComparator.MAX_SIM
            ),
        ),
    )

def index_documents(chunks):
    """Embed *chunks* with both models and upload them to Qdrant.

    Every chunk produces one point in ``pdf_dense`` (vector stored under the
    name ``"embedding"``) and one point in ``pdf_colbert`` (unnamed vector).
    A point's id is the chunk's position in *chunks*, starting at 0, and its
    payload is ``{"chunk": chunk}``.

    Args:
        chunks: Iterable of chunks to index (presumably text strings --
            confirm against the caller). It is materialized once, so
            single-pass iterables such as generators are supported.

    Returns:
        None. Nothing is embedded or uploaded when *chunks* is empty.
    """
    # ``chunks`` is consumed several times below (two embed calls, two zips);
    # a generator would be exhausted after the first pass.
    chunks = list(chunks)
    if not chunks:
        return

    dense_embs = list(dense_model.embed(chunks))
    colbert_embs = list(colbert_model.embed(chunks))

    # NOTE(review): ids restart at 0 on every call, so points uploaded by an
    # earlier call would presumably be overwritten -- confirm this is intended.
    dense_points = [
        models.PointStruct(id=idx, payload={"chunk": chunk}, vector={"embedding": vec})
        for idx, (chunk, vec) in enumerate(zip(chunks, dense_embs))
    ]
    colbert_points = [
        models.PointStruct(id=idx, payload={"chunk": chunk}, vector=vec)
        for idx, (chunk, vec) in enumerate(zip(chunks, colbert_embs))
    ]

    client.upload_points("pdf_dense", dense_points)
    client.upload_points("pdf_colbert", colbert_points)