File size: 1,106 Bytes
833b888
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
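# Earlier pdfplumber-based implementation (fixed-size word chunks), kept
# commented out for reference only.
#
# import pdfplumber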
#
# def extract_chunks_from_pdf(pdf_path, chunk_size=100):
#     text = ""
#     with pdfplumber.open(pdf_path) as pdf:
#         for page in pdf.pages:
#             content = page.extract_text()
#             if content:
#                 text += content + "\n"
#     words = text.split()
#     return [" ".join(words[i:i+chunk_size]) for i in range(0, len(words), chunk_size)]
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter


def extract_chunks_with_langchain(pdf_path, chunk_size=500, chunk_overlap=100):
    """Load a PDF and return its text as a list of chunk strings.

    The file is read with ``PyPDFLoader`` and the loaded documents are then
    split by a ``RecursiveCharacterTextSplitter``.

    Args:
        pdf_path: Location of the PDF; handed unchanged to ``PyPDFLoader``.
        chunk_size: Forwarded to the splitter as its ``chunk_size``.
        chunk_overlap: Forwarded to the splitter as its ``chunk_overlap``.

    Returns:
        A list with the ``page_content`` of every chunk the splitter
        produced, in the order the splitter returned them. Only the text is
        kept; the chunk objects themselves are not returned.
    """
    # Read the PDF first; ``load()`` yields a list of Document objects.
    pages = PyPDFLoader(pdf_path).load()

    # Candidate separators handed to the splitter, from paragraph breaks down
    # to the empty string.
    split_points = ["\n\n", "\n", ".", "!", "?", ",", " ", ""]
    text_splitter = RecursiveCharacterTextSplitter(
        separators=split_points,
        chunk_overlap=chunk_overlap,
        chunk_size=chunk_size,
    )
    pieces = text_splitter.split_documents(pages)

    # Callers receive plain strings rather than Document objects.
    return [piece.page_content for piece in pieces]