# Source: Hugging Face Space minhvtt/ChatbotRAG (status: Sleeping)
# File size: 11,941 Bytes, revision 500cf95

"""
Enhanced Multimodal PDF Parser for PDFs with Text + Image URLs
Extracts text, detects image URLs, and links them together
"""

import pypdfium2 as pdfium
from typing import List, Dict, Optional, Tuple
import re
from dataclasses import dataclass, field


@dataclass
class MultimodalChunk:
    """
    One chunk of page text together with the image URLs attached to it.

    Instances are plain data holders: the dataclass-generated constructor
    accepts the fields below in declaration order, and the two collection
    fields default to a fresh empty list / dict per instance.
    """
    # Text content of the chunk.
    text: str
    # Page the chunk was taken from (the parser in this file numbers pages
    # starting at 1).
    page_number: int
    # Position of the chunk within its page (the parser in this file starts
    # counting at 0 for every page).
    chunk_index: int
    # URLs associated with this chunk; empty when none were assigned.
    image_urls: List[str] = field(default_factory=list)
    # Free-form extra information about the chunk (e.g. page/chunk numbers
    # and any document-level metadata merged in by the parser).
    metadata: Dict = field(default_factory=dict)


class MultimodalPDFParser:
    """
    PDF parser that extracts page text and the URLs embedded in that text.

    Every page is split into overlapping, word-based chunks and each chunk
    carries the URLs associated with it, so a consumer can present the
    referenced images next to the text they belong to.
    """

    # Characters that normally belong to the surrounding sentence, not to the
    # URL, when they show up at the very end of a match.
    _TRAILING_PUNCTUATION = '.,;:!?\'"'

    def __init__(
        self,
        chunk_size: int = 500,
        chunk_overlap: int = 50,
        min_chunk_size: int = 50,
        extract_images: bool = True
    ):
        """
        Configure chunking and URL extraction.

        Args:
            chunk_size: Target number of words per chunk.
            chunk_overlap: Number of words shared by two consecutive chunks.
            min_chunk_size: Pages with fewer words than this become a single
                chunk; a trailing remainder of at most this many words is
                merged into the preceding chunk.
            extract_images: When False, no URLs are extracted from the text.

        Raises:
            ValueError: If ``chunk_size`` is not positive or ``chunk_overlap``
                is not smaller than ``chunk_size`` (such settings cannot make
                forward progress through the text).
        """
        if chunk_size <= 0:
            raise ValueError("chunk_size must be a positive integer")
        if chunk_overlap >= chunk_size:
            raise ValueError("chunk_overlap must be smaller than chunk_size")

        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.min_chunk_size = min_chunk_size
        self.extract_images = extract_images

        # URL patterns
        self.url_patterns = [
            # Standard URLs
            r'https?://[^\s<>"{}|\\^`\[\]]+',
            # Markdown images: ![alt](url)
            r'!\[.*?\]\((https?://[^\s)]+)\)',
            # HTML images: <img src="url">
            r'<img[^>]+src=["\']([^"\']+)["\']',
            # Direct image extensions
            r'https?://[^\s<>"{}|\\^`\[\]]+\.(?:jpg|jpeg|png|gif|bmp|svg|webp)',
        ]

    @classmethod
    def _strip_trailing_punctuation(cls, url: str) -> str:
        """Drop sentence punctuation and unbalanced ')' from the end of a URL."""
        while url:
            last = url[-1]
            if last in cls._TRAILING_PUNCTUATION:
                url = url[:-1]
            elif last == ')' and url.count(')') > url.count('('):
                # A closing parenthesis without a partner inside the URL comes
                # from the surrounding text, e.g. "(see https://x/y.png)" or
                # the tail of a markdown link. Balanced ones are kept.
                url = url[:-1]
            else:
                break
        return url

    def extract_image_urls(self, text: str) -> List[str]:
        """
        Extract the URLs found in text.

        Every pattern in ``self.url_patterns`` is applied case-insensitively.
        Note that the first pattern matches any http(s) URL, not only those
        with an image extension. Trailing sentence punctuation is removed
        from each match so that the same URL found by several patterns
        collapses into one entry.

        Args:
            text: Text content

        Returns:
            List of unique URLs, ordered by pattern and then by position
        """
        found = []
        for pattern in self.url_patterns:
            for match in re.findall(pattern, text, re.IGNORECASE):
                url = self._strip_trailing_punctuation(match)
                if url:
                    found.append(url)

        # dict keys keep insertion order, which removes duplicates while
        # preserving the order of first appearance.
        return list(dict.fromkeys(found))

    @staticmethod
    def _read_page_text(pdf, page_index: int) -> str:
        """Return the raw text of one page, releasing its handles afterwards."""
        page = pdf[page_index]
        try:
            textpage = page.get_textpage()
            try:
                return textpage.get_text_range()
            finally:
                textpage.close()
        finally:
            page.close()

    def extract_text_from_pdf(self, pdf_path: str) -> Dict[int, Tuple[str, List[str]]]:
        """
        Extract cleaned text and URLs from every page of a PDF.

        Args:
            pdf_path: Path to PDF file

        Returns:
            Dictionary mapping 1-based page number to (text, image_urls) tuple

        Raises:
            Exception: If the document cannot be opened or read; the original
                error is attached as the cause.
        """
        pdf_pages = {}

        try:
            pdf = pdfium.PdfDocument(pdf_path)
            try:
                for page_index in range(len(pdf)):
                    text = self._clean_text(self._read_page_text(pdf, page_index))

                    # Extract image URLs if enabled
                    image_urls = []
                    if self.extract_images:
                        image_urls = self.extract_image_urls(text)

                    pdf_pages[page_index + 1] = (text, image_urls)
            finally:
                # Release the document handle even when a page fails, so the
                # underlying file is not kept open.
                pdf.close()
        except Exception as e:
            raise Exception(f"Error reading PDF: {str(e)}") from e

        return pdf_pages

    def _clean_text(self, text: str) -> str:
        """Remove NUL characters and collapse whitespace runs to one space."""
        # Strip NULs first: removing them after the whitespace pass would
        # leave two adjacent spaces where a NUL sat between blanks.
        text = text.replace('\x00', '')
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def chunk_text_with_images(
        self,
        text: str,
        image_urls: List[str],
        page_number: int
    ) -> List["MultimodalChunk"]:
        """
        Split text into overlapping word chunks and attach URLs to them.

        A chunk receives every URL that literally occurs in its text. A chunk
        containing none of the URLs takes the next not-yet-distributed URLs
        from ``image_urls`` instead, until that list is used up.

        All words of ``text`` end up in at least one chunk: a remainder of at
        most ``min_chunk_size`` words is merged into the preceding chunk
        instead of being discarded.

        Args:
            text: Text to chunk
            image_urls: Image URLs from the page
            page_number: Page number

        Returns:
            List of MultimodalChunk objects (empty when text has no words)
        """
        words = text.split()
        total = len(words)

        if total == 0:
            return []

        if total < self.min_chunk_size:
            # Short page: one chunk holding the whole text and all images.
            # The list is copied so the chunk does not alias the caller's.
            page_images = list(image_urls)
            return [MultimodalChunk(
                text=text,
                page_number=page_number,
                chunk_index=0,
                image_urls=page_images,
                metadata={
                    'page': page_number,
                    'chunk': 0,
                    'start_word': 0,
                    'end_word': total,
                    'has_images': len(page_images) > 0,
                    'num_images': len(page_images)
                }
            )]

        chunks = []
        chunk_index = 0
        start = 0

        # Always advance by at least one word so the loop terminates even if
        # the attributes were changed to an invalid combination after init.
        stride = max(1, self.chunk_size - self.chunk_overlap)

        # Calculate how to distribute images across chunks
        images_per_chunk = len(image_urls) // max(1, total // self.chunk_size) if image_urls else 0
        image_index = 0

        while start < total:
            end = min(start + self.chunk_size, total)
            next_start = start + stride

            # If what would remain for the next chunk is too small to stand
            # on its own, extend this chunk to the end of the page.
            if end < total and total - next_start <= self.min_chunk_size:
                end = total

            chunk_text = ' '.join(words[start:end])

            # Prefer URLs that literally appear in this chunk's text.
            chunk_images = [url for url in image_urls if url in chunk_text]

            # Otherwise hand out the next undistributed images.
            if image_urls and not chunk_images and image_index < len(image_urls):
                num_imgs = min(images_per_chunk + 1, len(image_urls) - image_index)
                chunk_images = image_urls[image_index:image_index + num_imgs]
                image_index += num_imgs

            chunks.append(MultimodalChunk(
                text=chunk_text,
                page_number=page_number,
                chunk_index=chunk_index,
                image_urls=chunk_images,
                metadata={
                    'page': page_number,
                    'chunk': chunk_index,
                    'start_word': start,
                    'end_word': end,
                    'has_images': len(chunk_images) > 0,
                    'num_images': len(chunk_images)
                }
            ))

            # Stop once the last word has been emitted; continuing would only
            # re-emit the overlap region.
            if end >= total:
                break

            chunk_index += 1
            start = next_start

        return chunks

    def parse_pdf(
        self,
        pdf_path: str,
        document_metadata: Optional[Dict] = None
    ) -> List["MultimodalChunk"]:
        """
        Parse PDF into multimodal chunks

        Args:
            pdf_path: Path to PDF file
            document_metadata: Additional metadata merged into the metadata
                of every chunk

        Returns:
            List of MultimodalChunk objects in page order
        """
        pages_data = self.extract_text_from_pdf(pdf_path)

        all_chunks = []
        for page_num, (text, image_urls) in pages_data.items():
            chunks = self.chunk_text_with_images(text, image_urls, page_num)

            # Add document metadata
            if document_metadata:
                for chunk in chunks:
                    chunk.metadata.update(document_metadata)

            all_chunks.extend(chunks)

        return all_chunks

    def parse_pdf_bytes(
        self,
        pdf_bytes: bytes,
        document_metadata: Optional[Dict] = None
    ) -> List["MultimodalChunk"]:
        """
        Parse a PDF given as raw bytes.

        The bytes are written to a temporary ``.pdf`` file that is parsed with
        ``parse_pdf`` and removed afterwards, also when writing or parsing
        fails.

        Args:
            pdf_bytes: Content of the PDF file
            document_metadata: Additional metadata

        Returns:
            List of MultimodalChunk objects
        """
        import os
        import tempfile

        tmp = tempfile.NamedTemporaryFile(delete=False, suffix='.pdf')
        tmp_path = tmp.name
        try:
            # Close the file before parsing so it can be reopened by path.
            with tmp:
                tmp.write(pdf_bytes)
            return self.parse_pdf(tmp_path, document_metadata)
        finally:
            if os.path.exists(tmp_path):
                os.unlink(tmp_path)


class MultimodalPDFIndexer:
    """
    Index multimodal PDF chunks into a RAG system.

    Each chunk is embedded from its text, stored in the vector store with a
    payload that includes its image URLs, and a summary record for the whole
    document is inserted into the documents collection.
    """

    def __init__(self, embedding_service, qdrant_service, documents_collection):
        """
        Args:
            embedding_service: Object providing ``encode_text(text)``.
            qdrant_service: Object providing
                ``index_data(doc_id=..., embedding=..., metadata=...)``.
            documents_collection: Object providing ``insert_one(document)``.
        """
        self.embedding_service = embedding_service
        self.qdrant_service = qdrant_service
        self.documents_collection = documents_collection
        self.parser = MultimodalPDFParser()

    @staticmethod
    def _build_payload(chunk, document_id: str, source: str, filename: Optional[str] = None) -> Dict:
        """Assemble the vector-store payload for one chunk."""
        payload = {
            'text': chunk.text,
            'document_id': document_id,
            'page': chunk.page_number,
            'chunk_index': chunk.chunk_index,
            'source': source,
        }
        if filename is not None:
            payload['filename'] = filename
        payload.update({
            'has_images': len(chunk.image_urls) > 0,
            'image_urls': chunk.image_urls,  # Store image URLs!
            'num_images': len(chunk.image_urls),
        })
        # Chunk metadata is applied last, so its keys take precedence over
        # the fields set above.
        payload.update(chunk.metadata)
        return payload

    def _index_chunks(self, chunks, document_id: str, source: str, filename: Optional[str] = None):
        """Embed and store every chunk; return (chunk_ids, total_images)."""
        chunk_ids = []
        total_images = 0

        for chunk in chunks:
            chunk_id = f"{document_id}_p{chunk.page_number}_c{chunk.chunk_index}"

            # Generate embedding (text-based)
            embedding = self.embedding_service.encode_text(chunk.text)

            # Index to Qdrant
            self.qdrant_service.index_data(
                doc_id=chunk_id,
                embedding=embedding,
                metadata=self._build_payload(chunk, document_id, source, filename)
            )

            chunk_ids.append(chunk_id)
            total_images += len(chunk.image_urls)

        return chunk_ids, total_images

    def index_pdf(
        self,
        pdf_path: str,
        document_id: str,
        document_metadata: Optional[Dict] = None
    ) -> Dict:
        """
        Index a PDF file, including the image URLs found in it.

        Args:
            pdf_path: Path to PDF file
            document_id: Identifier used as prefix of every chunk id
            document_metadata: Additional metadata stored with the chunks and
                with the document record

        Returns:
            Summary dict with the number of chunks indexed, the number of
            image URLs found and the first five chunk ids
        """
        chunks = self.parser.parse_pdf(pdf_path, document_metadata)
        chunk_ids, total_images = self._index_chunks(chunks, document_id, 'pdf')
        indexed_count = len(chunk_ids)

        # Save document info
        doc_info = {
            'document_id': document_id,
            'type': 'multimodal_pdf',
            'file_path': pdf_path,
            'num_chunks': indexed_count,
            'total_images': total_images,
            'chunk_ids': chunk_ids,
            'metadata': document_metadata or {}
        }
        self.documents_collection.insert_one(doc_info)

        return {
            'success': True,
            'document_id': document_id,
            'chunks_indexed': indexed_count,
            'images_found': total_images,
            'chunk_ids': chunk_ids[:5]
        }

    def index_pdf_bytes(
        self,
        pdf_bytes: bytes,
        document_id: str,
        filename: str,
        document_metadata: Optional[Dict] = None
    ) -> Dict:
        """
        Index a PDF given as raw bytes.

        Args:
            pdf_bytes: Content of the PDF file
            document_id: Identifier used as prefix of every chunk id
            filename: Name recorded with the chunks and the document record
            document_metadata: Additional metadata; the caller's dict is left
                untouched

        Returns:
            Summary dict with the filename, the number of chunks indexed, the
            number of image URLs found and the first five chunk ids
        """
        # Work on a copy so the caller's dict is not modified, and keep it
        # under its own name so the per-chunk payload cannot overwrite it
        # before the document record is written.
        doc_metadata = dict(document_metadata or {})
        doc_metadata['filename'] = filename

        chunks = self.parser.parse_pdf_bytes(pdf_bytes, doc_metadata)
        chunk_ids, total_images = self._index_chunks(
            chunks, document_id, 'multimodal_pdf', filename
        )
        indexed_count = len(chunk_ids)

        doc_info = {
            'document_id': document_id,
            'type': 'multimodal_pdf',
            'filename': filename,
            'num_chunks': indexed_count,
            'total_images': total_images,
            'chunk_ids': chunk_ids,
            'metadata': doc_metadata
        }
        self.documents_collection.insert_one(doc_info)

        return {
            'success': True,
            'document_id': document_id,
            'filename': filename,
            'chunks_indexed': indexed_count,
            'images_found': total_images,
            'chunk_ids': chunk_ids[:5]
        }