# Earlier approach: manual extraction with pdfplumber, chunked by a fixed word
# count. Kept for reference; superseded by the LangChain version below.
# import pdfplumber
#
# def extract_chunks_from_pdf(pdf_path, chunk_size=100):
#     text = ""
#     with pdfplumber.open(pdf_path) as pdf:
#         for page in pdf.pages:
#             content = page.extract_text()
#             if content:
#                 text += content + "\n"
#     words = text.split()
#     return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
# Note: on LangChain >= 0.2 these imports live in langchain_community.document_loaders
# and langchain_text_splitters; PyPDFLoader also requires the pypdf package.


def extract_chunks_with_langchain(pdf_path, chunk_size=500, chunk_overlap=100):
    # Step 1: Load the PDF (one Document per page)
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()  # Returns a list of Document objects

    # Step 2: Split the text recursively, preferring paragraph and sentence
    # boundaries before falling back to words and single characters
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        separators=["\n\n", "\n", ".", "!", "?", ",", " ", ""],
    )
    chunks = splitter.split_documents(documents)

    # Return just the text content of each chunk
    return [chunk.page_content for chunk in chunks]
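
# Example usage: a minimal sketch assuming a local PDF at "sample.pdf"
# ("sample.pdf" is a placeholder path, not part of the snippet above).
if __name__ == "__main__":
    chunks = extract_chunks_with_langchain("sample.pdf", chunk_size=500, chunk_overlap=100)
    print(f"Extracted {len(chunks)} chunks")
    if chunks:
        print(chunks[0][:200])  # Preview the first 200 characters of the first chunk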