# PDF_Search_Qdrant / backend / pdf_utils.py
# import pdfplumber
#
# def extract_chunks_from_pdf(pdf_path, chunk_size=100):
# text = ""
# with pdfplumber.open(pdf_path) as pdf:
# for page in pdf.pages:
# content = page.extract_text()
# if content:
# text += content + "\n"
# words = text.split()
# return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
def extract_chunks_with_langchain(pdf_path, chunk_size=500, chunk_overlap=100):
    """Load a PDF and split its text into overlapping chunks via LangChain.

    Parameters
    ----------
    pdf_path : str
        Path to the PDF file to load.
    chunk_size : int, optional
        Maximum characters per chunk (default 500).
    chunk_overlap : int, optional
        Characters shared between consecutive chunks (default 100).

    Returns
    -------
    list[str]
        The plain-text content of each chunk, in document order.
    """
    # Load every page of the PDF as a LangChain Document object.
    pages = PyPDFLoader(pdf_path).load()

    # Prefer splitting at paragraph/line/sentence boundaries before
    # falling back to ever-smaller separators (word, then character).
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    )

    # Keep only the raw text of each chunk; metadata is discarded.
    return [doc.page_content for doc in text_splitter.split_documents(pages)]