Upload 12 files (#1)
Commit e998abec7346526da097cc568cf90badf620d1bd
- backend/__pycache__/embed_models.cpython-311.pyc +0 -0
- backend/__pycache__/indexer.cpython-311.pyc +0 -0
- backend/__pycache__/pdf_utils.cpython-311.pyc +0 -0
- backend/__pycache__/qdrant.cpython-311.pyc +0 -0
- backend/__pycache__/search.cpython-311.pyc +0 -0
- backend/embed_models.py +12 -0
- backend/indexer.py +40 -0
- backend/pdf_utils.py +31 -0
- backend/qdrant.py +14 -0
- backend/search.py +30 -0
- main.py +48 -0
- requirements.txt +9 -3
backend/__pycache__/embed_models.cpython-311.pyc
ADDED — Binary file (941 Bytes)

backend/__pycache__/indexer.cpython-311.pyc
ADDED — Binary file (2.91 kB)

backend/__pycache__/pdf_utils.cpython-311.pyc
ADDED — Binary file (1.16 kB)

backend/__pycache__/qdrant.cpython-311.pyc
ADDED — Binary file (580 Bytes)

backend/__pycache__/search.cpython-311.pyc
ADDED — Binary file (2.42 kB)
backend/embed_models.py
ADDED

```python
import os

from dotenv import load_dotenv
from fastembed import TextEmbedding, LateInteractionTextEmbedding
from fastembed.rerank.cross_encoder import TextCrossEncoder
from huggingface_hub import login

# Read HUGGINGFACE_HUB_TOKEN from .env before authenticating.
load_dotenv()
login(token=os.getenv("HUGGINGFACE_HUB_TOKEN"))

# Dense retriever, late-interaction (ColBERT) model, and cross-encoder reranker.
dense_model = TextEmbedding("sentence-transformers/all-MiniLM-L6-v2")
colbert_model = LateInteractionTextEmbedding("colbert-ir/colbertv2.0")
cross_encoder = TextCrossEncoder("jinaai/jina-reranker-v2-base-multilingual")
```
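Not part of the commit, but a quick sanity check clarifies what each of these three models returns; a minimal sketch, assuming fastembed's defaults for these checkpoints (384-dim dense vectors for MiniLM, 128-dim per-token vectors for ColBERT):

```python
# Smoke test for the three models (sample strings are placeholders).
from backend.embed_models import dense_model, colbert_model, cross_encoder

docs = ["Qdrant is a vector database.", "FastEmbed generates embeddings locally."]

dense = list(dense_model.embed(docs))
print(dense[0].shape)    # (384,) — one vector per document

colbert = list(colbert_model.embed(docs))
print(colbert[0].shape)  # (num_tokens, 128) — one 128-dim vector per token

scores = list(cross_encoder.rerank("what is qdrant?", docs))
print(scores)            # one relevance score per document, higher is better
```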
backend/indexer.py
ADDED

```python
from qdrant_client import models

from backend.embed_models import dense_model, colbert_model
from backend.qdrant import client


def setup_collections():
    dense_dim = client.get_embedding_size("sentence-transformers/all-MiniLM-L6-v2")

    print("creating dense collection")
    client.recreate_collection(
        collection_name="pdf_dense",
        vectors_config={"embedding": models.VectorParams(size=dense_dim, distance=models.Distance.COSINE)},
    )

    print("creating colbert collection")
    client.recreate_collection(
        collection_name="pdf_colbert",
        vectors_config=models.VectorParams(
            size=128,
            distance=models.Distance.COSINE,
            multivector_config=models.MultiVectorConfig(
                comparator=models.MultiVectorComparator.MAX_SIM
            ),
        ),
    )


def index_documents(chunks):
    dense_embs = list(dense_model.embed(chunks))
    colbert_embs = list(colbert_model.embed(chunks))

    # fastembed returns numpy arrays; convert to plain lists so the client can serialize them.
    dense_points = [
        models.PointStruct(id=idx, payload={"chunk": chunk}, vector={"embedding": vec.tolist()})
        for idx, (chunk, vec) in enumerate(zip(chunks, dense_embs))
    ]
    # ColBERT yields one vector per token, stored as a multivector (list of lists).
    colbert_points = [
        models.PointStruct(id=idx, payload={"chunk": chunk}, vector=vec.tolist())
        for idx, (chunk, vec) in enumerate(zip(chunks, colbert_embs))
    ]

    client.upload_points("pdf_dense", dense_points)
    client.upload_points("pdf_colbert", colbert_points)
```
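One caveat: `recreate_collection` drops any existing data on every call and is deprecated in newer qdrant-client releases. A sketch of a non-destructive alternative for the dense collection (the `ensure_dense_collection` helper is hypothetical, not part of this commit):

```python
from qdrant_client import models
from backend.qdrant import client


def ensure_dense_collection():
    # Create the collection only if it does not already exist, preserving indexed data.
    if not client.collection_exists("pdf_dense"):
        client.create_collection(
            collection_name="pdf_dense",
            vectors_config={
                "embedding": models.VectorParams(
                    size=client.get_embedding_size("sentence-transformers/all-MiniLM-L6-v2"),
                    distance=models.Distance.COSINE,
                )
            },
        )
```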
backend/pdf_utils.py
ADDED

```python
# import pdfplumber
#
# def extract_chunks_from_pdf(pdf_path, chunk_size=100):
#     text = ""
#     with pdfplumber.open(pdf_path) as pdf:
#         for page in pdf.pages:
#             content = page.extract_text()
#             if content:
#                 text += content + "\n"
#     words = text.split()
#     return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]

# PyPDFLoader lives in langchain_community in current LangChain releases;
# the old `langchain.document_loaders` path is deprecated.
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


def extract_chunks_with_langchain(pdf_path, chunk_size=500, chunk_overlap=100):
    # Step 1: Load the PDF
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()  # Returns a list of Document objects

    # Step 2: Split the text intelligently
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""]
    )

    chunks = splitter.split_documents(documents)

    # Optional: Return just the text content
    return [chunk.page_content for chunk in chunks]
```
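A minimal usage sketch of the chunker (the file path is a placeholder): with `chunk_size=500` and `chunk_overlap=100`, roughly the last 100 characters of each chunk reappear at the start of the next, so a sentence cut at a boundary survives intact in at least one chunk:

```python
from backend.pdf_utils import extract_chunks_with_langchain

chunks = extract_chunks_with_langchain("sample.pdf", chunk_size=500, chunk_overlap=100)
print(len(chunks), "chunks")
print(chunks[0][:120])  # preview the first chunk
```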
backend/qdrant.py
ADDED

```python
import os

from dotenv import load_dotenv
from qdrant_client import QdrantClient

# Load .env file
load_dotenv()

# Fetch credentials from environment
QDRANT_URL = os.getenv("QDRANT_URL")
QDRANT_API_KEY = os.getenv("QDRANT_API_KEY")

client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
)
```
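For local experiments without Qdrant Cloud credentials, qdrant-client can also run fully in-process; a sketch, not part of this commit:

```python
from qdrant_client import QdrantClient

# In-memory mode: no server, data discarded on exit.
client = QdrantClient(":memory:")
# Or persist locally on disk instead:
# client = QdrantClient(path="./qdrant_data")
```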
backend/search.py
ADDED

```python
from backend.embed_models import dense_model, colbert_model, cross_encoder
from backend.qdrant import client


def search_and_rerank(query, top_k=5):
    # Stage 1: dense retrieval from the MiniLM collection.
    query_vec = list(dense_model.query_embed(query))[0]
    raw_results = client.query_points(
        collection_name="pdf_dense",
        using="embedding",
        query=query_vec.tolist(),
        limit=top_k,
        with_payload=True,
    )
    chunks = [pt.payload["chunk"] for pt in raw_results.points]

    # Stage 2: rerank the retrieved chunks with the cross-encoder.
    cross_scores = list(cross_encoder.rerank(query, chunks))
    cross_ranks = sorted(zip(chunks, cross_scores), key=lambda x: x[1], reverse=True)

    # Stage 3: independent late-interaction (ColBERT) retrieval.
    query_colbert = list(colbert_model.query_embed(query))[0]
    colbert_results = client.query_points(
        collection_name="pdf_colbert",
        query=query_colbert.tolist(),
        limit=top_k,
        with_payload=True,
    )
    colbert_ranks = [(pt.payload["chunk"], pt.score) for pt in colbert_results.points]

    return {
        "raw": [(pt.payload["chunk"], pt.score) for pt in raw_results.points],
        "cross": cross_ranks,
        "colbert": colbert_ranks,
    }
```
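A sketch of calling the pipeline end-to-end once `index_documents` has run (the query string is a placeholder):

```python
from backend.search import search_and_rerank

results = search_and_rerank("What does the document say about pricing?", top_k=5)
# Cross-encoder ordering, best match first.
for chunk, score in results["cross"][:3]:
    print(f"{score:.3f}  {chunk[:80]}")
```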
main.py
ADDED

```python
import tempfile

import streamlit as st

from backend.indexer import setup_collections, index_documents
from backend.pdf_utils import extract_chunks_with_langchain
from backend.search import search_and_rerank

st.title("Qdrant PDF Search")

if "indexed" not in st.session_state:
    st.session_state.indexed = False

uploaded = st.file_uploader("Upload a PDF", type=["pdf"])

if uploaded:
    with tempfile.NamedTemporaryFile(delete=False) as tmp:
        tmp.write(uploaded.read())
        pdf_path = tmp.name

    st.success("PDF uploaded!")
    chunks = extract_chunks_with_langchain(pdf_path)

    if st.button("Index PDF in Qdrant Cloud"):
        with st.spinner("Indexing..."):
            setup_collections()
            index_documents(chunks)
            st.session_state.indexed = True  # Mark as indexed
        st.success("Indexed successfully!")

# Only show query input *after* indexing is done
if st.session_state.indexed:
    query = st.text_input("Enter your search query:")

    if query:
        results = search_and_rerank(query)

        st.subheader("Raw Dense Results")
        for chunk, score in results["raw"]:
            st.markdown(f"**{score:.3f}** - {chunk[:200]}...")

        st.subheader("Cross-Encoder Reranked")
        for chunk, score in results["cross"]:
            st.markdown(f"**{score:.3f}** - {chunk[:200]}...")

        st.subheader("ColBERT Reranked")
        for chunk, score in results["colbert"]:
            st.markdown(f"**{score:.3f}** - {chunk[:200]}...")
else:
    st.info("Please upload and index a PDF before searching.")
```
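Running the app presumably requires a `.env` file at the project root defining `QDRANT_URL`, `QDRANT_API_KEY`, and `HUGGINGFACE_HUB_TOKEN` (all three are read via `python-dotenv` in `backend/qdrant.py` and `backend/embed_models.py`), after which the UI starts with `streamlit run main.py`; the `backend.*` imports assume the command is run from the repository root.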
requirements.txt
CHANGED

```text
qdrant-client[fastembed]>=1.14.2
pdfplumber
sentence-transformers
streamlit
python-dotenv
langchain
pypdf
tiktoken
langchain-community
```