from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


def extract_chunks_with_langchain(
    pdf_path: str, chunk_size: int = 500, chunk_overlap: int = 100
) -> list[str]:
    """Load a PDF and split its text into overlapping character chunks.

    Args:
        pdf_path: Path to the PDF file to load.
        chunk_size: Maximum number of characters per chunk.
        chunk_overlap: Number of characters shared between adjacent
            chunks, preserving context across chunk boundaries.

    Returns:
        The text content of each chunk as plain strings, in document order.
    """
    # Step 1: Load the PDF — each page becomes one Document object.
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()

    # Step 2: Split on progressively finer separators so chunks break at
    # natural boundaries (paragraphs first, then lines, then sentence
    # punctuation, then words) before falling back to a hard character cut.
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    )
    chunks = splitter.split_documents(documents)

    # Callers want raw text, not Document objects, so unwrap page_content.
    return [chunk.page_content for chunk in chunks]