""" PDF Parser Service for RAG Chatbot Extracts text from PDF and splits into chunks for indexing """ import pypdfium2 as pdfium from typing import List, Dict, Optional import re from dataclasses import dataclass @dataclass class PDFChunk: """Represents a chunk of text from PDF""" text: str page_number: int chunk_index: int metadata: Dict class PDFParser: """Parse PDF files and prepare for RAG indexing""" def __init__( self, chunk_size: int = 500, # words per chunk chunk_overlap: int = 50, # words overlap between chunks min_chunk_size: int = 50 # minimum words in a chunk ): self.chunk_size = chunk_size self.chunk_overlap = chunk_overlap self.min_chunk_size = min_chunk_size def extract_text_from_pdf(self, pdf_path: str) -> Dict[int, str]: """ Extract text from PDF file Args: pdf_path: Path to PDF file Returns: Dictionary mapping page number to text content """ pdf_text = {} try: pdf = pdfium.PdfDocument(pdf_path) for page_num in range(len(pdf)): page = pdf[page_num] textpage = page.get_textpage() text = textpage.get_text_range() # Clean text text = self._clean_text(text) pdf_text[page_num + 1] = text # 1-indexed pages return pdf_text except Exception as e: raise Exception(f"Error reading PDF: {str(e)}") def _clean_text(self, text: str) -> str: """Clean extracted text""" # Remove excessive whitespace text = re.sub(r'\s+', ' ', text) # Remove special characters that might cause issues text = text.replace('\x00', '') return text.strip() def chunk_text(self, text: str, page_number: int) -> List[PDFChunk]: """ Split text into overlapping chunks Args: text: Text to chunk page_number: Page number this text came from Returns: List of PDFChunk objects """ # Split into words words = text.split() if len(words) < self.min_chunk_size: # Text too short, return as single chunk if len(words) > 0: return [PDFChunk( text=text, page_number=page_number, chunk_index=0, metadata={'page': page_number, 'chunk': 0} )] return [] chunks = [] chunk_index = 0 start = 0 while start < len(words): # Get chunk end = min(start + self.chunk_size, len(words)) chunk_words = words[start:end] chunk_text = ' '.join(chunk_words) chunks.append(PDFChunk( text=chunk_text, page_number=page_number, chunk_index=chunk_index, metadata={ 'page': page_number, 'chunk': chunk_index, 'start_word': start, 'end_word': end } )) chunk_index += 1 # Move start position with overlap start = end - self.chunk_overlap # Avoid infinite loop if start >= len(words) - self.min_chunk_size: break return chunks def parse_pdf( self, pdf_path: str, document_metadata: Optional[Dict] = None ) -> List[PDFChunk]: """ Parse entire PDF into chunks Args: pdf_path: Path to PDF file document_metadata: Additional metadata for the document Returns: List of all chunks from the PDF """ # Extract text from all pages pages_text = self.extract_text_from_pdf(pdf_path) # Chunk each page all_chunks = [] for page_num, text in pages_text.items(): chunks = self.chunk_text(text, page_num) # Add document metadata if document_metadata: for chunk in chunks: chunk.metadata.update(document_metadata) all_chunks.extend(chunks) return all_chunks def parse_pdf_bytes( self, pdf_bytes: bytes, document_metadata: Optional[Dict] = None ) -> List[PDFChunk]: """ Parse PDF from bytes (for uploaded files) Args: pdf_bytes: PDF file as bytes document_metadata: Additional metadata Returns: List of chunks """ import tempfile import os # Save to temp file with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp: tmp.write(pdf_bytes) tmp_path = tmp.name try: chunks = self.parse_pdf(tmp_path, document_metadata) return chunks finally: # Clean up temp file if os.path.exists(tmp_path): os.unlink(tmp_path) def get_pdf_info(self, pdf_path: str) -> Dict: """ Get basic info about PDF Args: pdf_path: Path to PDF file Returns: Dictionary with PDF information """ try: pdf = pdfium.PdfDocument(pdf_path) info = { 'num_pages': len(pdf), 'file_path': pdf_path, } return info except Exception as e: raise Exception(f"Error reading PDF info: {str(e)}") class PDFIndexer: """Index PDF chunks into RAG system""" def __init__(self, embedding_service, qdrant_service, documents_collection): self.embedding_service = embedding_service self.qdrant_service = qdrant_service self.documents_collection = documents_collection self.parser = PDFParser() def index_pdf( self, pdf_path: str, document_id: str, document_metadata: Optional[Dict] = None ) -> Dict: """ Index entire PDF into RAG system Args: pdf_path: Path to PDF file document_id: Unique ID for this document document_metadata: Additional metadata (title, author, etc.) Returns: Indexing results """ # Parse PDF chunks = self.parser.parse_pdf(pdf_path, document_metadata) # Index each chunk indexed_count = 0 chunk_ids = [] for chunk in chunks: # Generate unique ID for chunk chunk_id = f"{document_id}_p{chunk.page_number}_c{chunk.chunk_index}" # Generate embedding embedding = self.embedding_service.encode_text(chunk.text) # Prepare metadata metadata = { 'text': chunk.text, 'document_id': document_id, 'page': chunk.page_number, 'chunk_index': chunk.chunk_index, 'source': 'pdf', **chunk.metadata } # Index to Qdrant self.qdrant_service.index_data( doc_id=chunk_id, embedding=embedding, metadata=metadata ) chunk_ids.append(chunk_id) indexed_count += 1 # Save document info to MongoDB doc_info = { 'document_id': document_id, 'type': 'pdf', 'file_path': pdf_path, 'num_chunks': indexed_count, 'chunk_ids': chunk_ids, 'metadata': document_metadata or {}, 'pdf_info': self.parser.get_pdf_info(pdf_path) } self.documents_collection.insert_one(doc_info) return { 'success': True, 'document_id': document_id, 'chunks_indexed': indexed_count, 'chunk_ids': chunk_ids[:5] # Return first 5 as sample } def index_pdf_bytes( self, pdf_bytes: bytes, document_id: str, filename: str, document_metadata: Optional[Dict] = None ) -> Dict: """ Index PDF from bytes (for uploaded files) Args: pdf_bytes: PDF file as bytes document_id: Unique ID for this document filename: Original filename document_metadata: Additional metadata Returns: Indexing results """ # Parse PDF metadata = document_metadata or {} metadata['filename'] = filename chunks = self.parser.parse_pdf_bytes(pdf_bytes, metadata) # Index each chunk indexed_count = 0 chunk_ids = [] for chunk in chunks: # Generate unique ID for chunk chunk_id = f"{document_id}_p{chunk.page_number}_c{chunk.chunk_index}" # Generate embedding embedding = self.embedding_service.encode_text(chunk.text) # Prepare metadata metadata = { 'text': chunk.text, 'document_id': document_id, 'page': chunk.page_number, 'chunk_index': chunk.chunk_index, 'source': 'pdf', 'filename': filename, **chunk.metadata } # Index to Qdrant self.qdrant_service.index_data( doc_id=chunk_id, embedding=embedding, metadata=metadata ) chunk_ids.append(chunk_id) indexed_count += 1 # Save document info to MongoDB doc_info = { 'document_id': document_id, 'type': 'pdf', 'filename': filename, 'num_chunks': indexed_count, 'chunk_ids': chunk_ids, 'metadata': metadata } self.documents_collection.insert_one(doc_info) return { 'success': True, 'document_id': document_id, 'filename': filename, 'chunks_indexed': indexed_count, 'chunk_ids': chunk_ids[:5] }