Vallabhpatil777 committed (verified)
Commit 833b888 · 1 Parent(s): 8e24417

Upload 12 files (#1)

- Upload 12 files (e998abec7346526da097cc568cf90badf620d1bd)

backend/__pycache__/embed_models.cpython-311.pyc ADDED
Binary file (941 Bytes)

backend/__pycache__/indexer.cpython-311.pyc ADDED
Binary file (2.91 kB)

backend/__pycache__/pdf_utils.cpython-311.pyc ADDED
Binary file (1.16 kB)

backend/__pycache__/qdrant.cpython-311.pyc ADDED
Binary file (580 Bytes)

backend/__pycache__/search.cpython-311.pyc ADDED
Binary file (2.42 kB)

backend/embed_models.py ADDED
@@ -0,0 +1,15 @@
+ import os
+
+ from dotenv import load_dotenv
+ from fastembed import TextEmbedding, LateInteractionTextEmbedding
+ from fastembed.rerank.cross_encoder import TextCrossEncoder
+ from huggingface_hub import login
+
+ # Pull HUGGINGFACE_HUB_TOKEN (and the Qdrant credentials) from a local .env file.
+ load_dotenv()
+ login(token=os.getenv("HUGGINGFACE_HUB_TOKEN"))
+
+ # Shared model instances: dense retrieval, ColBERT late interaction, cross-encoder reranking.
+ dense_model = TextEmbedding("sentence-transformers/all-MiniLM-L6-v2")
+ colbert_model = LateInteractionTextEmbedding("colbert-ir/colbertv2.0")
+ cross_encoder = TextCrossEncoder("jinaai/jina-reranker-v2-base-multilingual")
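
A quick smoke test for this module (a minimal sketch; the input strings are arbitrary, and the shape comments assume MiniLM's 384-d output and ColBERTv2's 128-d token vectors):

from backend.embed_models import dense_model, colbert_model, cross_encoder

dense = list(dense_model.embed(["hello world"]))[0]    # numpy array, shape (384,)
multi = list(colbert_model.embed(["hello world"]))[0]  # one 128-d row per token
scores = list(cross_encoder.rerank("greeting", ["hello world", "invoice total"]))
print(dense.shape, multi.shape, scores)

Importing the module performs the Hugging Face login and downloads the models on first use, so the first run needs network access.
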
backend/indexer.py ADDED
@@ -0,0 +1,43 @@
+ from backend.qdrant import client
+ from backend.embed_models import dense_model, colbert_model
+ from qdrant_client import models
+
+ def setup_collections():
+     # Look up the dense model's output dimension (needs qdrant-client's fastembed extra).
+     dense_dim = client.get_embedding_size("sentence-transformers/all-MiniLM-L6-v2")
+
+     print("creating collection pdf_dense")
+     client.recreate_collection(
+         collection_name="pdf_dense",
+         vectors_config={"embedding": models.VectorParams(size=dense_dim, distance=models.Distance.COSINE)}
+     )
+
+     print("creating collection pdf_colbert")
+     client.recreate_collection(
+         collection_name="pdf_colbert",
+         vectors_config=models.VectorParams(
+             size=128,  # ColBERTv2 token embeddings are 128-dimensional
+             distance=models.Distance.COSINE,
+             multivector_config=models.MultiVectorConfig(
+                 comparator=models.MultiVectorComparator.MAX_SIM
+             )
+         )
+     )
+
+ def index_documents(chunks):
+     dense_embs = list(dense_model.embed(chunks))
+     colbert_embs = list(colbert_model.embed(chunks))
+
+     # "pdf_dense" uses a named vector, so each vector is passed as a dict.
+     dense_points = [
+         models.PointStruct(id=idx, payload={"chunk": chunk}, vector={"embedding": vec.tolist()})
+         for idx, (chunk, vec) in enumerate(zip(chunks, dense_embs))
+     ]
+     # "pdf_colbert" uses an unnamed multivector: a list of 128-d rows, one per token.
+     colbert_points = [
+         models.PointStruct(id=idx, payload={"chunk": chunk}, vector=vec.tolist())
+         for idx, (chunk, vec) in enumerate(zip(chunks, colbert_embs))
+     ]
+
+     client.upload_points("pdf_dense", dense_points)
+     client.upload_points("pdf_colbert", colbert_points)
backend/pdf_utils.py ADDED
@@ -0,0 +1,33 @@
+ # Earlier pdfplumber-based chunker, kept for reference:
+ # import pdfplumber
+ #
+ # def extract_chunks_from_pdf(pdf_path, chunk_size=100):
+ #     text = ""
+ #     with pdfplumber.open(pdf_path) as pdf:
+ #         for page in pdf.pages:
+ #             content = page.extract_text()
+ #             if content:
+ #                 text += content + "\n"
+ #     words = text.split()
+ #     return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
+
+ from langchain_community.document_loaders import PyPDFLoader  # moved out of core langchain in 0.2
+ from langchain.text_splitter import RecursiveCharacterTextSplitter
+
+
+ def extract_chunks_with_langchain(pdf_path, chunk_size=500, chunk_overlap=100):
+     # Step 1: Load the PDF
+     loader = PyPDFLoader(pdf_path)
+     documents = loader.load()  # Returns a list of Document objects
+
+     # Step 2: Split the text intelligently
+     splitter = RecursiveCharacterTextSplitter(
+         chunk_size=chunk_size,
+         chunk_overlap=chunk_overlap,
+         separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
+     )
+
+     chunks = splitter.split_documents(documents)
+
+     # Optional: Return just the text content
+     return [chunk.page_content for chunk in chunks]
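
Example usage (a sketch; "sample.pdf" is a placeholder path, and the chunk sizes are measured in characters by RecursiveCharacterTextSplitter):

from backend.pdf_utils import extract_chunks_with_langchain

chunks = extract_chunks_with_langchain("sample.pdf", chunk_size=500, chunk_overlap=100)
print(f"{len(chunks)} chunks; first: {chunks[0][:120]!r}")
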
backend/qdrant.py ADDED
@@ -0,0 +1,14 @@
+ import os
+ from dotenv import load_dotenv
+ from qdrant_client import QdrantClient
+ # Load .env file
+ load_dotenv()
+
+ # Fetch credentials from environment
+ QDRANT_URL = os.getenv("QDRANT_URL")
+ QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")
+
+ client = QdrantClient(
+     url=QDRANT_URL,
+     api_key=QDRANT_API_KEY,
+ )
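
A quick connectivity check before indexing anything (assumes the .env provides QDRANT_URL and QDRANT_API_KEY for a reachable Qdrant Cloud cluster):

from backend.qdrant import client

print(client.get_collections())  # fails fast if the URL or API key is wrong
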
backend/search.py ADDED
@@ -0,0 +1,34 @@
+ from backend.qdrant import client
+ from backend.embed_models import dense_model, colbert_model, cross_encoder
+
+ def search_and_rerank(query, top_k=5):
+     # Stage 1: dense retrieval against the named "embedding" vector.
+     query_vec = list(dense_model.query_embed(query))[0]
+     raw_results = client.query_points(
+         collection_name="pdf_dense",
+         using="embedding",
+         query=query_vec,
+         limit=top_k,
+         with_payload=True
+     )
+     chunks = [pt.payload["chunk"] for pt in raw_results.points]
+
+     # Stage 2: rerank the retrieved chunks with the cross-encoder.
+     cross_scores = list(cross_encoder.rerank(query, chunks))
+     cross_ranks = sorted(zip(chunks, cross_scores), key=lambda x: x[1], reverse=True)
+
+     # Stage 3: independent ColBERT late-interaction search (MaxSim scored server-side).
+     query_colbert = list(colbert_model.query_embed(query))[0]
+     colbert_results = client.query_points(
+         collection_name="pdf_colbert",
+         query=query_colbert,
+         limit=top_k,
+         with_payload=True
+     )
+     colbert_ranks = [(pt.payload["chunk"], pt.score) for pt in colbert_results.points]
+
+     return {
+         "raw": [(pt.payload["chunk"], pt.score) for pt in raw_results.points],
+         "cross": cross_ranks,
+         "colbert": colbert_ranks
+     }
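
Calling it directly, after a PDF has been indexed (the query string is illustrative):

from backend.search import search_and_rerank

results = search_and_rerank("What is late interaction?", top_k=3)
for chunk, score in results["cross"]:
    print(f"{score:.3f}  {chunk[:80]}")

Returning all three rankings side by side makes it easy to see how far the cross-encoder and ColBERT orderings diverge from the raw dense results.
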
main.py ADDED
@@ -0,0 +1,49 @@
+ import streamlit as st
+ import tempfile
+ from backend.pdf_utils import extract_chunks_with_langchain
+ from backend.indexer import setup_collections, index_documents
+ from backend.search import search_and_rerank
+
+ st.title("Qdrant PDF Search")
+
+ if "indexed" not in st.session_state:
+     st.session_state.indexed = False
+
+ uploaded = st.file_uploader("Upload a PDF", type=["pdf"])
+
+ if uploaded:
+     # Persist the upload to disk so PyPDFLoader can open it by path.
+     with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as tmp:
+         tmp.write(uploaded.read())
+         pdf_path = tmp.name
+
+     st.success("PDF uploaded!")
+     chunks = extract_chunks_with_langchain(pdf_path)
+
+     if st.button("Index PDF in Qdrant Cloud"):
+         with st.spinner("Indexing..."):
+             setup_collections()
+             index_documents(chunks)
+             st.session_state.indexed = True  # Mark as indexed
+         st.success("Indexed successfully!")
+
+     # Only show the query input *after* indexing is done
+     if st.session_state.indexed:
+         query = st.text_input("Enter your search query:")
+
+         if query:
+             results = search_and_rerank(query)
+
+             st.subheader("Raw Dense Results")
+             for chunk, score in results["raw"]:
+                 st.markdown(f"**{score:.3f}** - {chunk[:200]}...")
+
+             st.subheader("Cross-Encoder Reranked")
+             for chunk, score in results["cross"]:
+                 st.markdown(f"**{score:.3f}** - {chunk[:200]}...")
+
+             st.subheader("ColBERT Reranked")
+             for chunk, score in results["colbert"]:
+                 st.markdown(f"**{score:.3f}** - {chunk[:200]}...")
+ else:
+     st.info("Please upload and index a PDF before searching.")
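
With the dependencies below installed (pip install -r requirements.txt) and a .env file providing QDRANT_URL, QDRANT_API_KEY, and HUGGINGFACE_HUB_TOKEN, the app launches with streamlit run main.py.
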
requirements.txt CHANGED
@@ -1,3 +1,9 @@
- altair
- pandas
- streamlit
+ qdrant-client[fastembed]>=1.14.2
+ pdfplumber
+ sentence-transformers
+ streamlit
+ python-dotenv
+ langchain
+ pypdf
+ tiktoken
+ langchain-community