Spaces:
Running
on
Zero
Running
on
Zero
| import spaces | |
| import gradio as gr | |
| import json | |
| import os | |
| from pathlib import Path | |
| import logging | |
| from docling.document_converter import DocumentConverter | |
| from docling.datamodel.base_models import InputFormat, DocumentStream | |
| from docling.datamodel.pipeline_options import PdfPipelineOptions, TableFormerMode | |
| from docling.document_converter import PdfFormatOption | |
| import requests | |
| from urllib.parse import urlparse | |
| from datetime import datetime | |
| import tempfile | |
| from docx import Document | |
| from docx.shared import Inches | |
| import markdown | |
# Module-level logger used by the conversion code below.
logger = logging.getLogger(__name__)
# Configure the root logger at DEBUG so the debug/exception calls made during
# conversion are emitted.
logging.basicConfig(level=logging.DEBUG)
def is_valid_url(url):
    """Return True when *url* parses with both a scheme and a network location.

    Args:
        url: Candidate URL, normally the text entered in the URL box.

    Returns:
        bool: True if ``urlparse`` yields a non-empty scheme and netloc;
        False otherwise, including when the value cannot be parsed at all.
    """
    try:
        parsed = urlparse(url)
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed authority, e.g. an unbalanced IPv6 bracket.
        # TypeError / AttributeError: input that is neither str nor bytes.
        return False
    return bool(parsed.scheme and parsed.netloc)
def markdown_to_docx(markdown_content):
    """Convert markdown text into a DOCX document, one line at a time.

    Only a small, line-oriented subset of markdown is recognised:

    * ``#`` to ``######`` followed by a space -> heading of the matching level.
    * ``* `` or ``- `` prefix -> paragraph with the ``List Bullet`` style.
    * ``<digits>. `` prefix (``1. ``, ``2. ``, ``10. `` ...) -> paragraph with
      the ``List Number`` style.
    * Any other non-blank line -> plain paragraph.
    * Blank line -> empty paragraph.

    Inline markup (emphasis, links, tables, ...) is not interpreted; the text
    of each line is copied through as-is.

    Args:
        markdown_content: Markdown source as a single string.

    Returns:
        The ``Document`` object populated with the converted content.
    """
    import re  # local import so the block does not depend on file-level edits

    heading_prefix = re.compile(r'(#{1,6}) ')
    numbered_prefix = re.compile(r'[0-9]+\. ')

    doc = Document()
    for line in markdown_content.split('\n'):
        heading = heading_prefix.match(line)
        if heading:
            # The number of '#' characters is the heading level.
            doc.add_heading(line[heading.end():], level=len(heading.group(1)))
            continue

        if line.startswith(('* ', '- ')):
            doc.add_paragraph(line[2:], style='List Bullet')
            continue

        numbered = numbered_prefix.match(line)
        if numbered:
            # Drop the "N. " marker; the list style supplies the numbering.
            doc.add_paragraph(line[numbered.end():], style='List Number')
            continue

        if line.strip():
            doc.add_paragraph(line)
        else:
            # Preserve vertical spacing from blank lines.
            doc.add_paragraph()
    return doc
def create_output_files(content, original_name):
    """Write *content* to temporary Markdown, JSON and DOCX files.

    The files are created with ``delete=False`` so they survive after this
    function returns; nothing here removes them.

    Args:
        content: Markdown text produced by the conversion.
        original_name: Name of the source document; stored as the ``title``
            field of the JSON output.

    Returns:
        dict: Maps ``'markdown'``, ``'json'`` and ``'docx'`` to the path of
        the corresponding temporary file.
    """
    files = {}

    # Markdown: write through the temp-file handle itself so it is closed
    # deterministically instead of being left open until garbage collection.
    with tempfile.NamedTemporaryFile(
        "w", encoding="utf-8", suffix='.md', delete=False
    ) as md_file:
        md_file.write(content)
    files['markdown'] = md_file.name

    # JSON: the markdown wrapped with a title and a conversion timestamp.
    json_content = {
        "title": original_name,
        "content": content,
        "metadata": {
            "conversion_date": datetime.now().isoformat()
        }
    }
    with tempfile.NamedTemporaryFile(
        "w", encoding="utf-8", suffix='.json', delete=False
    ) as json_file:
        json.dump(json_content, json_file, ensure_ascii=False, indent=2)
    files['json'] = json_file.name

    # DOCX: reserve a path, close the handle, then let the document object
    # write to that path itself.
    with tempfile.NamedTemporaryFile(suffix='.docx', delete=False) as docx_file:
        docx_path = docx_file.name
    markdown_to_docx(content).save(docx_path)
    files['docx'] = docx_path

    return files
def process_document(input_type, file_input, url_input, use_gpu, table_mode):
    """Convert an uploaded file or a URL and return the UI outputs.

    Args:
        input_type: ``"file"`` or ``"url"``; any other value is rejected.
        file_input: Path of the uploaded file (used when input_type is "file").
        url_input: URL text (used when input_type is "url").
        use_gpu: Received from the UI but not read anywhere in this function.
        table_mode: Truthy selects ``TableFormerMode.ACCURATE``; falsy selects
            ``TableFormerMode.FAST``.

    Returns:
        tuple: ``(markdown_path, json_path, docx_path, markdown_text, status)``.
        On a validation failure or any exception the first four items are
        ``None`` and ``status`` carries the message.
    """
    no_outputs = (None, None, None, None)
    try:
        logger.debug(f"Processing with input type: {input_type}")
        logger.debug(f"File input: {file_input}")

        # Build the PDF pipeline; table structure recognition is always on.
        options = PdfPipelineOptions(do_table_structure=True)
        options.table_structure_options.mode = (
            TableFormerMode.ACCURATE if table_mode else TableFormerMode.FAST
        )
        pdf_format = PdfFormatOption(pipeline_options=options)
        converter = DocumentConverter(format_options={InputFormat.PDF: pdf_format})

        # Resolve the conversion source and a display name for the outputs.
        if input_type == "file":
            if file_input is None:
                return (*no_outputs, "Please upload a file")
            source = file_input
            original_name = Path(file_input).name
        elif input_type == "url":
            if not (url_input and is_valid_url(url_input)):
                return (*no_outputs, "Please enter a valid URL")
            # NOTE(review): url_input is user-supplied and passed straight to
            # the converter; presumably it is fetched server-side - confirm
            # whether host restrictions are needed.
            source = url_input
            original_name = Path(urlparse(url_input).path).name or "url_document"
        else:
            return (*no_outputs, "Invalid input type")

        logger.debug(f"Converting document: {source}")
        conversion = converter.convert(source)
        markdown_content = conversion.document.export_to_markdown()

        # Materialise the downloadable files from the markdown export.
        paths = create_output_files(markdown_content, original_name)
        status = "Conversion completed successfully! Use the download buttons below to get your files."
        return (
            paths['markdown'],
            paths['json'],
            paths['docx'],
            markdown_content,
            status,
        )
    except Exception as e:
        # Top-level UI boundary: log the traceback, surface a message instead
        # of raising.
        logger.exception("Error occurred during conversion")
        return (
            *no_outputs,
            f"Error during conversion: {str(e)}\nCheck the console for detailed error logs.",
        )
# Page header markup: a centered title banner followed by the CSS rules for
# elements carrying the "duplicate-button" class. The two parts are kept as
# separate literals and joined into the single string the UI renders.
_TITLE_BANNER_HTML = """
<div style="text-align: center; max-width: 800px; margin: 0 auto;">
<h1 style="color: #FFD700; font-size: 2.5rem; margin-bottom: 0.5rem;">Professional Document Converter</h1>
<p style="color: #FFA500; font-size: 1.1rem; margin-bottom: 1.5rem;">Convert documents from files or URLs to various formats</p>
<p style="color: #87CEEB; font-size: 0.9rem;">Please like this Space if you find it useful! Your support is appreciated 🙏</p>
</div>
"""

_DUPLICATE_BUTTON_CSS = """<style>
.duplicate-button {
margin: 0.5em auto 1em;
display: block;
background-color: #FFD700 !important;
color: black !important;
border: none !important;
font-weight: bold !important;
}
.duplicate-button:hover {
background-color: #FFA500 !important;
transform: translateY(-2px);
transition: all 0.2s ease;
}
</style>
"""

title_html = _TITLE_BANNER_HTML + _DUPLICATE_BUTTON_CSS
# Assemble the Gradio UI. The CSS passed to Blocks applies `display: none` to
# `footer` elements; the custom footer added at the end is a <div>, so that
# rule does not match it.
with gr.Blocks(css="footer {display: none}") as demo:
    gr.HTML(title_html)

    # Duplicate button, styled through the "duplicate-button" class.
    gr.DuplicateButton(
        elem_classes="duplicate-button",
        value="Duplicate Space for private use",
    )

    with gr.Row():
        # Left column: input selection and processing options.
        with gr.Column(scale=1):
            input_type = gr.Radio(
                label="Input Type",
                choices=["file", "url"],
                value="file",
            )
            # Upload widget; hands the handler a filesystem path.
            file_input = gr.File(
                label="Upload Document",
                type="filepath",
                file_types=[".pdf", ".PDF"],
            )
            url_input = gr.Textbox(
                placeholder="https://arxiv.org/pdf/2408.09869",
                label="Or Enter URL",
            )
            use_gpu = gr.Checkbox(value=True, label="Use GPU")
            table_mode = gr.Checkbox(
                value=False,
                label="Use Accurate Table Mode (Slower but better)",
            )
            convert_btn = gr.Button("Convert Document", variant="primary")

        # Right column: status line, markdown preview and download slots.
        with gr.Column(scale=2):
            status_message = gr.Markdown("")
            preview = gr.Markdown("", label="Preview")
            with gr.Group() as download_group:
                gr.Markdown("### Download Files")
                with gr.Row():
                    markdown_output = gr.File(label="Download Markdown")
                    json_output = gr.File(label="Download JSON")
                    docx_output = gr.File(label="Download DOCX")

    # Wire the button to the conversion handler. The list orders mirror the
    # handler's parameter order and the order of its returned tuple.
    conversion_inputs = [input_type, file_input, url_input, use_gpu, table_mode]
    conversion_outputs = [markdown_output, json_output, docx_output, preview, status_message]
    convert_btn.click(
        fn=process_document,
        inputs=conversion_inputs,
        outputs=conversion_outputs,
    )

    # Custom footer with author and project links.
    footer = """
<div style="text-align: center; margin: 2rem auto; padding: 1rem; border-top: 1px solid #FFD700; max-width: 800px;">
<div style="margin-bottom: 1rem;">
<a href="https://www.linkedin.com/in/pejman-ebrahimi-4a60151a7/" target="_blank" style="text-decoration: none; color: #FFD700; margin: 0 10px;">LinkedIn</a> |
<a href="https://github.com/arad1367" target="_blank" style="text-decoration: none; color: #FFD700; margin: 0 10px;">GitHub</a> |
<a href="https://arad1367.pythonanywhere.com/" target="_blank" style="text-decoration: none; color: #FFD700; margin: 0 10px;">PhD Defense Demo</a> |
<a href="https://github.com/DS4SD/docling" target="_blank" style="text-decoration: none; color: #FFD700; margin: 0 10px;">Docling Project</a>
</div>
<p style="color: #FFA500; margin-top: 0.5rem;">Made with 💖 by Pejman Ebrahimi</p>
</div>
"""
    gr.HTML(footer)
# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    # max_size bounds the request queue at 5 entries; it is a queue length,
    # not a timeout.
    demo.queue(max_size=5)

    launch_options = {
        "show_error": True,
        "share": False,
        "debug": True,
        "show_api": False,
        "server_name": "0.0.0.0",
    }
    demo.launch(**launch_options)