| """ | |
| Enhanced Multimodal PDF Parser for PDFs with Text + Image URLs | |
| Extracts text, detects image URLs, and links them together | |
| """ | |
| import pypdfium2 as pdfium | |
| from typing import List, Dict, Optional, Tuple | |
| import re | |
| from dataclasses import dataclass, field | |
@dataclass
class MultimodalChunk:
    """Represents a chunk with text and associated images"""
    text: str
    page_number: int
    chunk_index: int
    image_urls: List[str] = field(default_factory=list)
    metadata: Dict = field(default_factory=dict)
class MultimodalPDFParser:
    """
    Enhanced PDF Parser that extracts text and image URLs
    Perfect for user guides with screenshots and visual instructions
    """

    def __init__(
        self,
        chunk_size: int = 500,
        chunk_overlap: int = 50,
        min_chunk_size: int = 50,
        extract_images: bool = True
    ):
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_chunk_size = min_chunk_size
        self.extract_images = extract_images

        # URL patterns
        self.url_patterns = [
            # Standard URLs (parentheses excluded so markdown-wrapped links
            # are not captured with a trailing ')')
            r'https?://[^\s<>"{}|\\^`\[\]()]+',
            # Markdown images: ![alt](url)
            r'!\[.*?\]\((https?://[^\s)]+)\)',
            # HTML images: <img src="url">
            r'<img[^>]+src=["\']([^"\']+)["\']',
            # Direct image extensions
            r'https?://[^\s<>"{}|\\^`\[\]()]+\.(?:jpg|jpeg|png|gif|bmp|svg|webp)',
        ]
    def extract_image_urls(self, text: str) -> List[str]:
        """
        Extract all image URLs from text

        Args:
            text: Text content

        Returns:
            List of image URLs found
        """
        urls = []
        for pattern in self.url_patterns:
            matches = re.findall(pattern, text, re.IGNORECASE)
            urls.extend(matches)

        # Remove duplicates while preserving order
        seen = set()
        unique_urls = []
        for url in urls:
            if url not in seen:
                seen.add(url)
                unique_urls.append(url)

        return unique_urls
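
    # Illustrative note (not in the original module): for patterns with a
    # capturing group, re.findall returns just the captured URL, so
    #   extract_image_urls('See ![fig](https://example.com/fig.png)')
    # returns ['https://example.com/fig.png'] after deduplication, even
    # though several of the patterns above match the same link.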
    def extract_text_from_pdf(self, pdf_path: str) -> Dict[int, Tuple[str, List[str]]]:
        """
        Extract text and image URLs from PDF

        Args:
            pdf_path: Path to PDF file

        Returns:
            Dictionary mapping page number (1-indexed) to (text, image_urls) tuple
        """
        pdf_pages = {}

        try:
            pdf = pdfium.PdfDocument(pdf_path)

            for page_num in range(len(pdf)):
                page = pdf[page_num]
                textpage = page.get_textpage()
                text = textpage.get_text_range()

                # Clean text
                text = self._clean_text(text)

                # Extract image URLs if enabled
                image_urls = []
                if self.extract_images:
                    image_urls = self.extract_image_urls(text)

                pdf_pages[page_num + 1] = (text, image_urls)

            return pdf_pages

        except Exception as e:
            raise RuntimeError(f"Error reading PDF: {e}") from e
    def _clean_text(self, text: str) -> str:
        """Clean extracted text"""
        # Collapse excessive whitespace
        text = re.sub(r'\s+', ' ', text)

        # Remove NUL characters left over from extraction
        text = text.replace('\x00', '')

        return text.strip()
    def chunk_text_with_images(
        self,
        text: str,
        image_urls: List[str],
        page_number: int
    ) -> List[MultimodalChunk]:
        """
        Split text into chunks and associate images with relevant chunks

        Args:
            text: Text to chunk
            image_urls: Image URLs from the page
            page_number: Page number

        Returns:
            List of MultimodalChunk objects
        """
        # Split into words
        words = text.split()

        # Short pages become a single chunk carrying all of the page's images
        if len(words) < self.min_chunk_size:
            if len(words) > 0:
                return [MultimodalChunk(
                    text=text,
                    page_number=page_number,
                    chunk_index=0,
                    image_urls=image_urls,
                    metadata={'page': page_number, 'chunk': 0}
                )]
            return []

        chunks = []
        chunk_index = 0
        start = 0

        # Rough per-chunk image budget for the even-distribution fallback
        images_per_chunk = len(image_urls) // max(1, len(words) // self.chunk_size) if image_urls else 0
        image_index = 0

        while start < len(words):
            end = min(start + self.chunk_size, len(words))
            chunk_words = words[start:end]
            chunk_text = ' '.join(chunk_words)

            # Assign images to this chunk: prefer URLs that appear verbatim
            # in the chunk text, otherwise distribute the pool evenly
            chunk_images = []
            if image_urls:
                for url in image_urls:
                    if url in chunk_text:
                        chunk_images.append(url)

                if not chunk_images and image_index < len(image_urls):
                    num_imgs = min(images_per_chunk + 1, len(image_urls) - image_index)
                    chunk_images = image_urls[image_index:image_index + num_imgs]
                    image_index += num_imgs

            chunks.append(MultimodalChunk(
                text=chunk_text,
                page_number=page_number,
                chunk_index=chunk_index,
                image_urls=chunk_images,
                metadata={
                    'page': page_number,
                    'chunk': chunk_index,
                    'start_word': start,
                    'end_word': end,
                    'has_images': len(chunk_images) > 0,
                    'num_images': len(chunk_images)
                }
            ))

            chunk_index += 1

            # Stop once the last word has been consumed; stepping back by the
            # overlap here would otherwise re-emit the tail forever whenever
            # chunk_overlap exceeds min_chunk_size
            if end >= len(words):
                break
            start = end - self.chunk_overlap

        return chunks
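
    # Note on the even-distribution fallback above: each chunk whose text
    # contains no inline URL takes up to images_per_chunk + 1 URLs from the
    # remaining pool, in page order, so earlier chunks can absorb more of a
    # page's images than later ones.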
    def parse_pdf(
        self,
        pdf_path: str,
        document_metadata: Optional[Dict] = None
    ) -> List[MultimodalChunk]:
        """
        Parse PDF into multimodal chunks

        Args:
            pdf_path: Path to PDF file
            document_metadata: Additional metadata

        Returns:
            List of MultimodalChunk objects
        """
        pages_data = self.extract_text_from_pdf(pdf_path)

        all_chunks = []
        for page_num, (text, image_urls) in pages_data.items():
            chunks = self.chunk_text_with_images(text, image_urls, page_num)

            # Add document metadata
            if document_metadata:
                for chunk in chunks:
                    chunk.metadata.update(document_metadata)

            all_chunks.extend(chunks)

        return all_chunks
    def parse_pdf_bytes(
        self,
        pdf_bytes: bytes,
        document_metadata: Optional[Dict] = None
    ) -> List[MultimodalChunk]:
        """Parse PDF from bytes by spooling them to a temporary file"""
        with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as tmp:
            tmp.write(pdf_bytes)
            tmp_path = tmp.name

        try:
            return self.parse_pdf(tmp_path, document_metadata)
        finally:
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)
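
# Usage sketch for byte input (illustrative; the path and metadata are
# placeholders, e.g. bytes arriving from a web upload handler):
#   with open("guide.pdf", "rb") as fh:
#       chunks = MultimodalPDFParser().parse_pdf_bytes(fh.read(),
#                                                      {"source": "upload"})
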
class MultimodalPDFIndexer:
    """Index multimodal PDF chunks into RAG system"""

    def __init__(self, embedding_service, qdrant_service, documents_collection):
        self.embedding_service = embedding_service
        self.qdrant_service = qdrant_service
        self.documents_collection = documents_collection
        self.parser = MultimodalPDFParser()

    def index_pdf(
        self,
        pdf_path: str,
        document_id: str,
        document_metadata: Optional[Dict] = None
    ) -> Dict:
        """Index PDF with image URLs"""
        chunks = self.parser.parse_pdf(pdf_path, document_metadata)

        indexed_count = 0
        chunk_ids = []
        total_images = 0

        for chunk in chunks:
            chunk_id = f"{document_id}_p{chunk.page_number}_c{chunk.chunk_index}"

            # Generate embedding (text-based)
            embedding = self.embedding_service.encode_text(chunk.text)

            # Prepare metadata with image URLs
            metadata = {
                'text': chunk.text,
                'document_id': document_id,
                'page': chunk.page_number,
                'chunk_index': chunk.chunk_index,
                'source': 'pdf',
                'has_images': len(chunk.image_urls) > 0,
                'image_urls': chunk.image_urls,  # Store image URLs with the chunk
                'num_images': len(chunk.image_urls),
                **chunk.metadata
            }

            # Index to Qdrant
            self.qdrant_service.index_data(
                doc_id=chunk_id,
                embedding=embedding,
                metadata=metadata
            )

            chunk_ids.append(chunk_id)
            indexed_count += 1
            total_images += len(chunk.image_urls)

        # Save document info
        doc_info = {
            'document_id': document_id,
            'type': 'multimodal_pdf',
            'file_path': pdf_path,
            'num_chunks': indexed_count,
            'total_images': total_images,
            'chunk_ids': chunk_ids,
            'metadata': document_metadata or {}
        }
        self.documents_collection.insert_one(doc_info)

        return {
            'success': True,
            'document_id': document_id,
            'chunks_indexed': indexed_count,
            'images_found': total_images,
            'chunk_ids': chunk_ids[:5]  # preview only; the full list is in doc_info
        }
    def index_pdf_bytes(
        self,
        pdf_bytes: bytes,
        document_id: str,
        filename: str,
        document_metadata: Optional[Dict] = None
    ) -> Dict:
        """Index PDF from bytes"""
        # Copy so the caller's dict is not mutated
        doc_metadata = dict(document_metadata or {})
        doc_metadata['filename'] = filename

        chunks = self.parser.parse_pdf_bytes(pdf_bytes, doc_metadata)

        indexed_count = 0
        chunk_ids = []
        total_images = 0

        for chunk in chunks:
            chunk_id = f"{document_id}_p{chunk.page_number}_c{chunk.chunk_index}"
            embedding = self.embedding_service.encode_text(chunk.text)

            # Per-chunk payload; kept distinct from doc_metadata so the
            # document record below is not overwritten by the last chunk
            payload = {
                'text': chunk.text,
                'document_id': document_id,
                'page': chunk.page_number,
                'chunk_index': chunk.chunk_index,
                'source': 'multimodal_pdf',
                'filename': filename,
                'has_images': len(chunk.image_urls) > 0,
                'image_urls': chunk.image_urls,
                'num_images': len(chunk.image_urls),
                **chunk.metadata
            }

            self.qdrant_service.index_data(
                doc_id=chunk_id,
                embedding=embedding,
                metadata=payload
            )

            chunk_ids.append(chunk_id)
            indexed_count += 1
            total_images += len(chunk.image_urls)

        doc_info = {
            'document_id': document_id,
            'type': 'multimodal_pdf',
            'filename': filename,
            'num_chunks': indexed_count,
            'total_images': total_images,
            'chunk_ids': chunk_ids,
            'metadata': doc_metadata
        }
        self.documents_collection.insert_one(doc_info)

        return {
            'success': True,
            'document_id': document_id,
            'filename': filename,
            'chunks_indexed': indexed_count,
            'images_found': total_images,
            'chunk_ids': chunk_ids[:5]
        }
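

if __name__ == "__main__":
    # Minimal smoke test for the parser alone (illustrative): pass a PDF path
    # on the command line. The indexer additionally needs live embedding,
    # Qdrant, and MongoDB services, so it is not exercised here.
    import sys

    for parsed in MultimodalPDFParser().parse_pdf(sys.argv[1]):
        print(
            f"page {parsed.page_number} chunk {parsed.chunk_index}: "
            f"{len(parsed.image_urls)} image(s)"
        )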