Spaces:

raksama19
/

Test-Dolphin-PDF

Runtime error

File size: 18,145 Bytes

"""
DOLPHIN PDF Document AI - Final Version
Optimized for HuggingFace Spaces NVIDIA T4 Small deployment
"""

import gradio as gr
import json
import markdown
import cv2
import numpy as np
from PIL import Image
from transformers import AutoProcessor, VisionEncoderDecoderModel
import torch
import os
import tempfile
import uuid
import base64
import io
from utils.utils import *
from utils.markdown_utils import MarkdownConverter

# Math extension is optional for enhanced math rendering
MATH_EXTENSION_AVAILABLE = False
try:
    from mdx_math import MathExtension
    MATH_EXTENSION_AVAILABLE = True
except ImportError:
    pass


class DOLPHIN:
    """Thin wrapper pairing a Hugging Face processor with a vision encoder-decoder model.

    The model is requested in float16; when CUDA is not available it is
    converted back to float32 and all inputs are kept on the CPU.
    """

    def __init__(self, model_id_or_path):
        """Load the processor, model and tokenizer from *model_id_or_path*.

        Args:
            model_id_or_path: Value forwarded unchanged to both
                ``from_pretrained`` calls.
        """
        self.processor = AutoProcessor.from_pretrained(model_id_or_path)

        has_cuda = torch.cuda.is_available()
        self.model = VisionEncoderDecoderModel.from_pretrained(
            model_id_or_path,
            torch_dtype=torch.float16,
            device_map="auto" if has_cuda else None
        )
        self.model.eval()

        if has_cuda:
            self.device = "cuda"
        else:
            self.device = "cpu"
            # Weights were requested as float16; switch to float32 when running on CPU
            self.model = self.model.float()

        self.tokenizer = self.processor.tokenizer

    def chat(self, prompt, image):
        """Run the model on one image, or on a list of images, with the given prompt(s).

        Args:
            prompt: A single prompt, or a list of prompts. When *image* is a
                list and *prompt* is not, the prompt is repeated per image.
            image: A single image or a list of images handed to the processor.

        Returns:
            The cleaned decoded string for a single image, or a list of such
            strings when *image* is a list.
        """
        single_input = not isinstance(image, list)

        if single_input:
            images, prompts = [image], [prompt]
        else:
            images = image
            prompts = prompt if isinstance(prompt, list) else [prompt] * len(images)

        pixel_values = self.processor(images, return_tensors="pt", padding=True).pixel_values
        if torch.cuda.is_available():
            # Match the float16 weights used on the GPU
            pixel_values = pixel_values.half()
        pixel_values = pixel_values.to(self.device)

        # Wrap every prompt in the start token / answer marker used as decoder prefix
        prompts = [f"<s>{p} <Answer/>" for p in prompts]
        encoded_prompts = self.tokenizer(
            prompts,
            add_special_tokens=False,
            return_tensors="pt"
        )
        prompt_ids = encoded_prompts.input_ids.to(self.device)
        prompt_mask = encoded_prompts.attention_mask.to(self.device)

        generation_kwargs = dict(
            pixel_values=pixel_values,
            decoder_input_ids=prompt_ids,
            decoder_attention_mask=prompt_mask,
            min_length=1,
            max_length=1024,  # Reduced for T4 Small
            pad_token_id=self.tokenizer.pad_token_id,
            eos_token_id=self.tokenizer.eos_token_id,
            use_cache=True,
            bad_words_ids=[[self.tokenizer.unk_token_id]],
            return_dict_in_generate=True,
            do_sample=False,
            num_beams=1,
            repetition_penalty=1.1,
            temperature=1.0
        )
        with torch.no_grad():
            generated = self.model.generate(**generation_kwargs)

        decoded = self.tokenizer.batch_decode(generated.sequences, skip_special_tokens=False)

        # Strip the echoed prompt and the padding / end-of-sequence markers
        results = [
            text.replace(prompts[idx], "").replace("<pad>", "").replace("</s>", "").strip()
            for idx, text in enumerate(decoded)
        ]

        return results[0] if single_input else results


def convert_pdf_to_images_gradio(pdf_file):
    """Convert an uploaded PDF into a list of RGB PIL images, one per page.

    Args:
        pdf_file: Either a filesystem path (``str``) or an object exposing
            ``read()`` that returns the PDF bytes.

    Returns:
        A list of ``PIL.Image`` objects in page order.

    Raises:
        Exception: If the PDF cannot be opened or rendered. The message is
            prefixed with ``"Error converting PDF: "`` and the underlying
            error is attached as ``__cause__``.
    """
    try:
        import pymupdf

        if isinstance(pdf_file, str):
            pdf_document = pymupdf.open(pdf_file)
        else:
            pdf_bytes = pdf_file.read()
            pdf_document = pymupdf.open(stream=pdf_bytes, filetype="pdf")

        try:
            # The same 2.0 x 2.0 scaling matrix is applied to every page
            scale = pymupdf.Matrix(2.0, 2.0)
            images = []
            for page in pdf_document:
                pix = page.get_pixmap(matrix=scale)
                img_data = pix.tobytes("png")
                images.append(Image.open(io.BytesIO(img_data)).convert("RGB"))
            return images
        finally:
            # Release the document even when rendering a page fails
            pdf_document.close()

    except Exception as e:
        # Keep the original error reachable for debugging via exception chaining
        raise Exception(f"Error converting PDF: {str(e)}") from e


def process_pdf_document(pdf_file, model, progress=gr.Progress()):
    """Process an uploaded PDF page by page and return the combined markdown.

    Each page is rendered to an image, its layout is requested from *model*,
    the detected elements are recognised, and the result is converted to
    markdown. Pages are joined with a horizontal rule and a page heading.

    Args:
        pdf_file: Uploaded PDF, passed to ``convert_pdf_to_images_gradio``.
        model: Object exposing ``chat(prompt, image)``.
        progress: Gradio progress callback.

    Returns:
        A ``(text, status)`` tuple. ``status`` is ``"processing_complete"`` on
        success (``text`` is the markdown), ``"error"`` when an exception was
        raised (``text`` is the error message), and ``""`` when there was no
        file or no page could be extracted.
    """
    if pdf_file is None:
        return "No PDF file uploaded", ""

    try:
        progress(0.1, desc="Converting PDF to images...")
        images = convert_pdf_to_images_gradio(pdf_file)

        if not images:
            return "Failed to convert PDF to images", ""

        total_pages = len(images)
        all_results = []

        for page_number, pil_image in enumerate(images, start=1):
            # Page work spans the 0.1 - 0.9 range of the progress bar
            progress(page_number / total_pages * 0.8 + 0.1,
                    desc=f"Processing page {page_number}/{total_pages}...")

            layout_output = model.chat("Parse the reading order of this document.", pil_image)

            padded_image, dims = prepare_image(pil_image)
            recognition_results = process_elements_optimized(
                layout_output,
                padded_image,
                dims,
                model,
                max_batch_size=2  # Smaller batch for T4 Small
            )

            try:
                markdown_converter = MarkdownConverter()
                markdown_content = markdown_converter.convert(recognition_results)
            except Exception as conversion_error:
                # Best effort: fall back to the simple renderer, but only for
                # ordinary errors and without hiding why the converter failed.
                print(
                    f"Markdown conversion failed on page {page_number}, "
                    f"using fallback: {conversion_error}"
                )
                markdown_content = generate_fallback_markdown(recognition_results)

            all_results.append({
                "page_number": page_number,
                "markdown": markdown_content
            })

        progress(1.0, desc="Processing complete!")

        combined_markdown = "\n\n---\n\n".join(
            f"# Page {result['page_number']}\n\n{result['markdown']}"
            for result in all_results
        )

        return combined_markdown, "processing_complete"

    except Exception as e:
        error_msg = f"Error processing PDF: {str(e)}"
        return error_msg, "error"


def process_elements_optimized(layout_results, padded_image, dims, model, max_batch_size=2):
    """Crop every layout element and recognise its content.

    Figures are embedded directly as base64 PNG data URIs; text and table
    crops are sent to *model* in batches. Elements whose processing raises
    are reported with ``print`` and skipped.

    Args:
        layout_results: Raw layout string, parsed with ``parse_layout_string``.
        padded_image: Image array the crops are sliced from.
        dims: Dimension info forwarded to ``process_coordinates``.
        model: Object exposing ``chat(prompts, images)``.
        max_batch_size: Upper bound on crops per model call.

    Returns:
        A list of result dicts sorted by their ``"reading_order"``.
    """
    parsed_layout = parse_layout_string(layout_results)

    figures, texts, tables = [], [], []
    last_box = None
    order = 0

    for bbox, label in parsed_layout:
        try:
            x1, y1, x2, y2, orig_x1, orig_y1, orig_x2, orig_y2, last_box = process_coordinates(
                bbox, padded_image, dims, last_box
            )

            region = padded_image[y1:y2, x1:x2]
            # Ignore empty crops and crops of 3 pixels or fewer in either dimension
            if region.size > 0 and region.shape[0] > 3 and region.shape[1] > 3:
                crop_image = Image.fromarray(cv2.cvtColor(region, cv2.COLOR_BGR2RGB))
                source_box = [orig_x1, orig_y1, orig_x2, orig_y2]

                if label == "fig":
                    crop_image = crop_margin(crop_image)

                    png_buffer = io.BytesIO()
                    crop_image.save(png_buffer, format="PNG")
                    encoded_png = base64.b64encode(png_buffer.getvalue()).decode()
                    data_uri = f"data:image/png;base64,{encoded_png}"

                    figures.append({
                        "label": label,
                        "text": f"![Figure {order}]({data_uri})",
                        "bbox": source_box,
                        "reading_order": order,
                    })
                else:
                    pending = {
                        "crop": crop_image,
                        "label": label,
                        "bbox": source_box,
                        "reading_order": order,
                    }
                    target = tables if label == "tab" else texts
                    target.append(pending)

            order += 1

        except Exception as e:
            print(f"Error processing element {label}: {str(e)}")
            continue

    recognition_results = figures.copy()

    # Text crops first, then tables; each group gets its own instruction
    pending_groups = (
        (texts, "Read text in the image."),
        (tables, "Parse the table in the image."),
    )
    for group, instruction in pending_groups:
        if group:
            recognition_results.extend(
                process_element_batch_optimized(group, model, instruction, max_batch_size)
            )

    recognition_results.sort(key=lambda x: x.get("reading_order", 0))
    return recognition_results


def process_element_batch_optimized(elements, model, prompt, max_batch_size=2):
    """Recognise *elements* in small batches to limit GPU memory use.

    Args:
        elements: Dicts with ``"crop"``, ``"label"``, ``"bbox"`` and
            ``"reading_order"`` keys.
        model: Object exposing ``chat(prompts, images)`` that returns one
            string per image.
        prompt: Instruction sent with every crop.
        max_batch_size: Upper bound on crops per ``model.chat`` call. Values
            below 1 are treated as 1.

    Returns:
        One dict per element with ``"label"``, ``"bbox"``, ``"text"`` (the
        stripped model output) and ``"reading_order"``. An empty list when
        *elements* is empty.
    """
    if not elements:
        return []

    # range() rejects a zero step and a negative step would skip every element,
    # so the effective batch size is clamped to at least 1.
    batch_size = max(1, min(len(elements), max_batch_size))

    results = []
    for start in range(0, len(elements), batch_size):
        batch_elements = elements[start:start + batch_size]
        crops_list = [elem["crop"] for elem in batch_elements]
        prompts_list = [prompt] * len(crops_list)

        batch_results = model.chat(prompts_list, crops_list)

        results.extend(
            {
                "label": elem["label"],
                "bbox": elem["bbox"],
                "text": text.strip(),
                "reading_order": elem["reading_order"],
            }
            for elem, text in zip(batch_elements, batch_results)
        )

        # Drop references to the crops before asking CUDA to release cached memory
        del crops_list, batch_elements
        if torch.cuda.is_available():
            torch.cuda.empty_cache()

    return results


def generate_fallback_markdown(recognition_results):
    """Build plain markdown from recognition results when the converter fails.

    Tables are surrounded by blank lines; paragraphs, titles, sections,
    sub-sections and figures are emitted followed by a blank line. Elements
    with any other label are left out.
    """
    inline_labels = ("para", "title", "sec", "sub_sec", "fig")

    pieces = []
    for item in recognition_results:
        kind = item["label"]
        if kind == "tab":
            pieces.append(f"\n\n{item['text']}\n\n")
        elif kind in inline_labels:
            pieces.append(f"{item['text']}\n\n")
    return "".join(pieces)


# Initialize model
# Use the local ./hf_model path when it exists, otherwise the "ByteDance/DOLPHIN" identifier
model_path = "./hf_model" if os.path.exists("./hf_model") else "ByteDance/DOLPHIN"

# A load failure must not prevent the module from importing: the UI is still
# built and shows the failure through model_status.
try:
    dolphin_model = DOLPHIN(model_path)
    print(f"Model loaded successfully from {model_path}")
    model_status = f"✅ Model ready (Device: {dolphin_model.device})"
except Exception as load_error:
    print(f"Error loading model: {load_error}")
    dolphin_model = None
    model_status = f"❌ Model failed to load: {str(load_error)}"


# Global state for managing tabs: the latest processed markdown and whether
# the last run produced a document.
processed_markdown, show_results_tab = "", False


def process_uploaded_pdf(pdf_file, progress=gr.Progress()):
    """Gradio handler: process the uploaded PDF and update the shared state.

    On success the combined markdown is stored in the module-level
    ``processed_markdown`` and ``show_results_tab`` is set to ``True``. On every
    other path both globals are reset, so a failed run never leaves the
    previous document behind.

    Args:
        pdf_file: Value of the file input (``None`` when nothing is uploaded).
        progress: Gradio progress callback, forwarded to ``process_pdf_document``.

    Returns:
        A ``(status_message, tab_update)`` tuple; ``tab_update`` carries the
        ``visible`` flag for the results tab.
    """
    global processed_markdown, show_results_tab

    # Start from a clean slate; only a fully successful run repopulates the state.
    processed_markdown = ""
    show_results_tab = False

    # NOTE(review): the output bound to this update is a gr.TabItem; only the
    # `visible` field of the returned gr.Tabs is relied on - confirm against
    # the Gradio version in use.
    if dolphin_model is None:
        return "❌ Model not loaded", gr.Tabs(visible=False)

    if pdf_file is None:
        return "❌ No PDF uploaded", gr.Tabs(visible=False)

    try:
        combined_markdown, status = process_pdf_document(pdf_file, dolphin_model, progress)
    except Exception as e:
        error_msg = f"❌ Error processing PDF: {str(e)}"
        return error_msg, gr.Tabs(visible=False)

    if status != "processing_complete":
        # combined_markdown holds the failure message in this case
        return combined_markdown, gr.Tabs(visible=False)

    processed_markdown = combined_markdown
    show_results_tab = True
    return "✅ PDF processed successfully! Check the 'Document' tab above.", gr.Tabs(visible=True)


def get_processed_markdown():
    """Return the stored markdown, or a placeholder message when it is empty."""
    # Read-only access to the module-level value; an empty string falls through
    return processed_markdown or "No document processed yet."


def clear_all():
    """Reset the shared state and return updates that restore the initial UI.

    Returns:
        ``(None, ready_message, tab_update)``: a cleared file input, the
        ready status text and an update that hides the results tab.
    """
    global processed_markdown, show_results_tab
    processed_markdown, show_results_tab = "", False

    hidden_results = gr.Tabs(visible=False)
    return None, "✅ Ready to process your PDF", hidden_results


# Create Gradio interface
with gr.Blocks(
    title="DOLPHIN PDF AI", 
    theme=gr.themes.Soft(),
    css="""
    @import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700&display=swap');
    
    * {
        font-family: 'Inter', -apple-system, BlinkMacSystemFont, 'Segoe UI', sans-serif !important;
    }
    
    .main-container { 
        max-width: 1000px; 
        margin: 0 auto; 
    }
    .upload-container { 
        text-align: center; 
        padding: 40px 20px;
        border: 2px dashed #e0e0e0;
        border-radius: 15px;
        margin: 20px 0;
    }
    .upload-button {
        font-size: 18px !important;
        padding: 15px 30px !important;
        margin: 20px 0 !important;
        font-weight: 600 !important;
    }
    .status-message {
        text-align: center;
        padding: 15px;
        margin: 10px 0;
        border-radius: 8px;
        font-weight: 500;
    }
    .chatbot-container {
        max-height: 600px;
    }
    h1, h2, h3 {
        font-weight: 700 !important;
    }
    #progress-container {
        margin: 10px 0;
        min-height: 20px;
    }
    """
) as demo:
    # Layout: a Home tab with the upload controls, plus Document and Chat tabs
    # that start hidden and are toggled by the event handlers below.

    with gr.Tabs() as main_tabs:
        # Home Tab
        with gr.TabItem("🏠 Home", id="home"):
            gr.Markdown(
                "# Scholar Express\n"
                "### Upload a research paper to get a web-friendly version, an AI chatbot, and a podcast summary. Because of our reliance on Generative AI, some errors are inevitable.\n"
                f"**Status:** {model_status}"
            )
            
            with gr.Column(elem_classes="upload-container"):
                gr.Markdown("## 📄 Upload Your PDF Document")
                
                pdf_input = gr.File(
                    file_types=[".pdf"],
                    label="",
                    height=150,
                    elem_id="pdf_upload"
                )
                
                process_btn = gr.Button(
                    "🚀 Process PDF", 
                    variant="primary", 
                    size="lg",
                    elem_classes="upload-button"
                )
                
                clear_btn = gr.Button(
                    "🗑️ Clear", 
                    variant="secondary"
                )
            
            # Dedicated progress space
            progress_space = gr.HTML(
                value="",
                visible=False,
                elem_id="progress-container"
            )
            
            # Status output (hidden during processing)
            status_output = gr.Markdown(
                "✅ Ready to process your PDF",
                elem_classes="status-message"
            )
        
        # Results Tab (initially hidden)
        with gr.TabItem("📖 Document", id="results", visible=False) as results_tab:
            gr.Markdown("## Processed Document")
            
            markdown_display = gr.Markdown(
                value="",
                latex_delimiters=[
                    {"left": "$$", "right": "$$", "display": True},
                    {"left": "$", "right": "$", "display": False}
                ],
                height=700
            )
        
        # Chatbot Tab (initially hidden)
        with gr.TabItem("💬 Chat", id="chat", visible=False) as chat_tab:
            gr.Markdown("## Ask Questions About Your Document")
            
            chatbot = gr.Chatbot(
                value=[],
                height=500,
                elem_classes="chatbot-container",
                placeholder="Your conversation will appear here once you process a document..."
            )
            
            with gr.Row():
                msg_input = gr.Textbox(
                    placeholder="Ask a question about the processed document...",
                    scale=4,
                    container=False
                )
                send_btn = gr.Button("Send", variant="primary", scale=1)
            
            gr.Markdown(
                "*Chat functionality will be implemented in the next version*",
                elem_id="chat-notice"
            )
    
    # Event handlers
    # Process: run the pipeline, refresh the Document tab content, then expose
    # the Chat tab only if the run actually produced a document.
    process_btn.click(
        fn=process_uploaded_pdf,
        inputs=[pdf_input],
        outputs=[status_output, results_tab],
        show_progress=True
    ).then(
        fn=get_processed_markdown,
        outputs=[markdown_display]
    ).then(
        # show_results_tab is set by process_uploaded_pdf; reading it here keeps
        # the Chat tab hidden after a failed run instead of always showing it.
        fn=lambda: gr.TabItem(visible=show_results_tab),
        outputs=[chat_tab]
    )
    
    # Clear: reset inputs and status, then hide the progress area and Chat tab
    clear_btn.click(
        fn=clear_all,
        outputs=[pdf_input, status_output, results_tab]
    ).then(
        fn=lambda: gr.HTML(visible=False),
        outputs=[progress_space]
    ).then(
        fn=lambda: gr.TabItem(visible=False),
        outputs=[chat_tab]
    )
    
    # Placeholder chat functionality
    def placeholder_chat(message, history):
        """Append a fixed "coming soon" exchange to *history*; *message* is not used."""
        return history + [["Coming soon: AI-powered document Q&A", "This feature will allow you to ask questions about your processed PDF document."]]
    
    # Send: add the placeholder exchange, then empty the text box
    send_btn.click(
        fn=placeholder_chat,
        inputs=[msg_input, chatbot],
        outputs=[chatbot]
    ).then(
        lambda: "",
        outputs=[msg_input]
    )


if __name__ == "__main__":
    # Launch settings collected in one place and passed to demo.launch
    launch_options = dict(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
        max_threads=1,  # Single thread for T4 Small
        inbrowser=False,
        quiet=True,
    )
    demo.launch(**launch_options)