Spaces:
Running
Running
import logging
import mimetypes
import os
import tempfile
import traceback
from urllib.parse import urlparse

import gradio as gr
import pandas as pd
import requests

from dockling_parser import DocumentParser
from dockling_parser.exceptions import ParserError, UnsupportedFormatError
# User-facing copy for the page header, intro blurb and footer.
# NOTE(review): the glyph in front of the title and of the "Features" heading
# looks like a mis-encoded emoji whose trailing bytes were lost, so the intended
# character cannot be recovered from this file. It is kept as found - confirm
# the intended emoji and replace it.
TITLE = "๐ Smart Document Parser"

# Intro paragraph rendered directly under the title.
DESCRIPTION = """
A powerful document parsing application that automatically extracts structured information from various document formats.
Upload a document or provide a URL (PDF, DOCX, TXT, HTML, Markdown) and get structured information automatically.
"""

# Footer Markdown. The blank line before "Made with" keeps the credit line out
# of the last bullet (Markdown would otherwise treat it as a continuation).
ARTICLE = """
## ๐ Features
- Multiple Format Support: PDF, DOCX, TXT, HTML, and Markdown
- Support for File Upload and URLs
- Rich Information Extraction
- Smart Processing with Confidence Scoring
- Automatic Format Detection

Made with \u2764\ufe0f using Docling and Gradio
"""
# Warning sign emoji (U+26A0 + variation selector) that prefixes every headline.
# Written with escapes so the bytes cannot be mangled by a re-encoding again.
_WARNING = "\u26a0\ufe0f"

# Placeholder texts for the sections / entities / confidence outputs, shared by
# every error entry because no document was parsed.
_EMPTY_RESULT_FIELDS = (
    "No sections available",
    "No entities available",
    "Confidence Score: 0.0",
)


def _error_entry(headline: str, detail: str) -> tuple:
    """Build one error entry: (headline, detail, sections, entities, confidence)."""
    return (f"{_WARNING} {headline}", detail, *_EMPTY_RESULT_FIELDS)


# Canned responses keyed by failure kind. Each value is a 5-tuple ordered like
# the five outputs wired to the submit button.
ERROR_MESSAGES = {
    "no_input": _error_entry(
        "No input provided",
        "Please upload a document or provide a URL.",
    ),
    "invalid_url": _error_entry(
        "Invalid URL",
        "Please provide a valid URL to a document.",
    ),
    "download_error": _error_entry(
        "Failed to download document",
        "Could not download the document from the provided URL.",
    ),
    "unsupported_format": _error_entry(
        "Unsupported file format",
        "Please provide a file in one of the supported formats: PDF, DOCX, TXT, HTML, or MD.",
    ),
    "processing_error": _error_entry(
        "Error processing document",
        "An error occurred while processing the document. Please try again with a different file.",
    ),
}
# Initialize the document parser once at import time; process_input() calls
# parser.parse() on this shared module-level instance for every request.
parser = DocumentParser()
def download_file(url: str, timeout: float = 30.0) -> str:
    """Download the document at *url* into a temporary file and return its path.

    The temporary file keeps the extension found in the URL path (``.pdf`` when
    the path has no file name). It is created with ``delete=False``, so the
    caller is responsible for removing it.

    Args:
        url: Absolute ``http`` or ``https`` URL of the document.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the request indefinitely.

    Returns:
        Path of the temporary file holding the downloaded bytes.

    Raises:
        Exception: On any failure (bad URL, network error, HTTP error status,
            write error). The message starts with ``"Failed to download file:"``
            and the original exception is attached as ``__cause__``.
    """
    tmp_path = None
    try:
        parsed_url = urlparse(url)
        # Only fetch web URLs; reject file://, ftp://, bare words, empty input.
        if parsed_url.scheme not in ("http", "https") or not parsed_url.netloc:
            raise ValueError(f"Unsupported or malformed URL: {url!r}")

        # Derive the temp-file suffix from the URL's file name.
        filename = os.path.basename(parsed_url.path) or "document.pdf"
        suffix = os.path.splitext(filename)[1]

        response = requests.get(url, allow_redirects=True, timeout=timeout)
        response.raise_for_status()

        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
            tmp_path = tmp_file.name
            tmp_file.write(response.content)
        return tmp_path
    except Exception as e:
        # Do not leave a partially written temp file behind.
        if tmp_path is not None:
            try:
                os.unlink(tmp_path)
            except OSError:
                pass
        raise Exception(f"Failed to download file: {str(e)}") from e
_logger = logging.getLogger(__name__)

# Warning sign emoji (U+26A0 + variation selector) used as the error prefix.
_WARNING_PREFIX = "\u26a0\ufe0f"


def _error_outputs(headline: str, detail: str = "") -> tuple:
    """Build the five UI outputs for a failed request.

    The second output feeds a Dataframe component, so the detail text is wrapped
    in a one-row table instead of being passed as a bare string.
    """
    return (
        headline,
        pd.DataFrame([{"Property": "Error", "Value": detail or headline}]),
        "No sections available",
        "No entities available",
        "Confidence Score: 0.0",
    )


def process_input(file_input, url_input):
    """Parse an uploaded file or a document fetched from a URL.

    When a URL is given it takes precedence over the uploaded file.

    Args:
        file_input: Path of the uploaded file, or ``None`` when nothing was
            uploaded.
        url_input: URL typed by the user; empty or whitespace-only means "no URL".

    Returns:
        A 5-tuple in the order of the UI outputs: extracted content (str),
        metadata table (``pandas.DataFrame`` with ``Property``/``Value``
        columns), sections text, entities text, and a confidence line. Failures
        are reported through the same 5-tuple shape; this function does not
        raise.
    """
    url = url_input.strip() if isinstance(url_input, str) else ""

    if file_input is None and not url:
        return _error_outputs(*ERROR_MESSAGES["no_input"][:2])

    temp_file = None
    try:
        if url:
            parsed = urlparse(url)
            if parsed.scheme not in ("http", "https") or not parsed.netloc:
                return _error_outputs(*ERROR_MESSAGES["invalid_url"][:2])
            # Only the download is guarded here, so that parser failures on the
            # downloaded file are reported as parse errors, not download errors.
            try:
                temp_file = download_file(url)
            except Exception:
                _logger.exception("Download failed for %s", url)
                return _error_outputs(*ERROR_MESSAGES["download_error"][:2])
            source = temp_file
        else:
            source = file_input

        result = parser.parse(source)

        # Metadata as a two-column table.
        metadata_df = pd.DataFrame(
            [{"Property": k, "Value": str(v)} for k, v in result.metadata.dict().items()]
        )

        # Structured sections, numbered from 1.
        sections = result.structured_content.get('sections', [])
        if sections:
            sections_text = "\n\n".join(
                f"Section {i}:\n{section}" for i, section in enumerate(sections, start=1)
            )
        else:
            sections_text = "No sections detected"

        # Named entities, one "type: a, b, c" line per entity type.
        entities = result.structured_content.get('entities', {})
        if entities:
            entities_text = "\n".join(
                f"{entity_type}: {', '.join(map(str, entities_list))}"
                for entity_type, entities_list in entities.items()
            )
        else:
            entities_text = "No entities detected"

        return (
            result.content,
            metadata_df,
            sections_text,
            entities_text,
            f"Confidence Score: {result.confidence_score:.2f}",
        )
    except (UnsupportedFormatError, ParserError) as e:
        return _error_outputs(f"{_WARNING_PREFIX} {str(e)}")
    except Exception as e:
        _logger.exception("Unexpected error while processing document")
        return _error_outputs(
            f"{_WARNING_PREFIX} Unexpected error: {str(e)}\n{traceback.format_exc()}"
        )
    finally:
        # Best-effort cleanup of the downloaded temporary file.
        if temp_file and os.path.exists(temp_file):
            try:
                os.unlink(temp_file)
            except OSError:
                _logger.warning("Could not remove temporary file %s", temp_file)
# Help text rendered below the results. Kept at column 0 so that no code
# indentation ends up inside the Markdown source.
# NOTE(review): the glyph after "###" (and in three of the tab labels below)
# looks like a mis-encoded emoji whose trailing bytes were lost; it is kept as
# found - confirm the intended characters.
_FORMATS_HELP = """
### ๐ Supported Formats
- PDF Documents (*.pdf)
- Word Documents (*.docx)
- Text Files (*.txt)
- HTML Files (*.html)
- Markdown Files (*.md)
### ๐ Example URLs
- ArXiv PDFs: https://arxiv.org/pdf/2408.08921.pdf
- Research Papers
- Documentation
"""

# Create Gradio interface
# NOTE(review): the original indentation was lost, so the nesting below is a
# reconstruction: inputs and the confidence box side by side in one row, result
# tabs across the full width underneath. Confirm against the deployed layout.
with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
    gr.Markdown(f"# {TITLE}")
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        # Left column: the two alternative inputs and the submit button.
        with gr.Column():
            file_input = gr.File(
                label="Upload Document",
                file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
                type="filepath"
            )
            url_input = gr.Textbox(
                label="Or Enter Document URL",
                placeholder="https://example.com/document.pdf"
            )
            submit_btn = gr.Button("Process Document", variant="primary")

        # Right column: confidence line returned by process_input.
        with gr.Column():
            confidence = gr.Textbox(label="Processing Confidence")

    with gr.Tabs():
        with gr.TabItem("๐ Content"):
            content_output = gr.Textbox(
                label="Extracted Content",
                lines=10,
                max_lines=30
            )
        with gr.TabItem("๐ Metadata"):
            metadata_output = gr.Dataframe(
                label="Document Metadata",
                headers=["Property", "Value"]
            )
        with gr.TabItem("๐ Sections"):
            sections_output = gr.Textbox(
                label="Document Sections",
                lines=10,
                max_lines=30
            )
        # Label emoji (U+1F3F7 + variation selector), repaired from mis-encoded bytes.
        with gr.TabItem("\U0001F3F7\uFE0F Entities"):
            entities_output = gr.Textbox(
                label="Named Entities",
                lines=5,
                max_lines=15
            )

    # Handle file submission: the output order must match the 5-tuple returned
    # by process_input.
    submit_btn.click(
        fn=process_input,
        inputs=[file_input, url_input],
        outputs=[
            content_output,
            metadata_output,
            sections_output,
            entities_output,
            confidence
        ]
    )

    gr.Markdown(_FORMATS_HELP)
    gr.Markdown(ARTICLE)

# Launch the app only when run as a script, not when imported.
if __name__ == "__main__":
    iface.launch()