Commit 15fdcff · 0 parent(s)
Initial commit with document parser
- .cursorrules +46 -0
- .github/workflows/sync-to-hub.yml +20 -0
- README-HF.md +46 -0
- README.md +81 -0
- app.py +160 -0
- config.json +12 -0
- dockling_parser/__init__.py +11 -0
- dockling_parser/exceptions.py +19 -0
- dockling_parser/parser.py +94 -0
- dockling_parser/types.py +25 -0
- requirements.txt +10 -0
.cursorrules
ADDED
@@ -0,0 +1,46 @@

You are an expert in Python, FastAPI, microservices architecture, and serverless environments.

Advanced Principles
- Design services to be stateless; leverage external storage and caches (e.g., Redis) for state persistence.
- Implement API gateways and reverse proxies (e.g., NGINX, Traefik) for handling traffic to microservices.
- Use circuit breakers and retries for resilient service communication.
- Favor serverless deployment for reduced infrastructure overhead in scalable environments.
- Use asynchronous workers (e.g., Celery, RQ) for handling background tasks efficiently.

Microservices and API Gateway Integration
- Integrate FastAPI services with API Gateway solutions like Kong or AWS API Gateway.
- Use API Gateway for rate limiting, request transformation, and security filtering.
- Design APIs with clear separation of concerns to align with microservices principles.
- Implement inter-service communication using message brokers (e.g., RabbitMQ, Kafka) for event-driven architectures.

Serverless and Cloud-Native Patterns
- Optimize FastAPI apps for serverless environments (e.g., AWS Lambda, Azure Functions) by minimizing cold start times.
- Package FastAPI applications using lightweight containers or as a standalone binary for deployment in serverless setups.
- Use managed services (e.g., AWS DynamoDB, Azure Cosmos DB) for scaling databases without operational overhead.
- Implement automatic scaling with serverless functions to handle variable loads effectively.

Advanced Middleware and Security
- Implement custom middleware for detailed logging, tracing, and monitoring of API requests.
- Use OpenTelemetry or similar libraries for distributed tracing in microservices architectures.
- Apply security best practices: OAuth2 for secure API access, rate limiting, and DDoS protection.
- Use security headers (e.g., CORS, CSP) and implement content validation using tools like OWASP ZAP.

Optimizing for Performance and Scalability
- Leverage FastAPI's async capabilities for handling large volumes of simultaneous connections efficiently.
- Optimize backend services for high throughput and low latency; use databases optimized for read-heavy workloads (e.g., Elasticsearch).
- Use caching layers (e.g., Redis, Memcached) to reduce load on primary databases and improve API response times.
- Apply load balancing and service mesh technologies (e.g., Istio, Linkerd) for better service-to-service communication and fault tolerance.

Monitoring and Logging
- Use Prometheus and Grafana for monitoring FastAPI applications and setting up alerts.
- Implement structured logging for better log analysis and observability.
- Integrate with centralized logging systems (e.g., ELK Stack, AWS CloudWatch) for aggregated logging and monitoring.

Key Conventions
1. Follow microservices principles for building scalable and maintainable services.
2. Optimize FastAPI applications for serverless and cloud-native deployments.
3. Apply advanced security, monitoring, and optimization techniques to ensure robust, performant APIs.

Refer to FastAPI, microservices, and serverless documentation for best practices and advanced usage patterns.
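The middleware and logging rules above stay at the level of guidance. As a rough illustration only (the app and the /health endpoint below are made up for the example and are not part of this repository), a request-logging middleware in FastAPI can be as small as:

```python
import logging
import time

from fastapi import FastAPI, Request

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("request-logger")

app = FastAPI()

@app.middleware("http")
async def log_requests(request: Request, call_next):
    """Log method, path, status code, and latency for every request."""
    start = time.perf_counter()
    response = await call_next(request)
    elapsed_ms = (time.perf_counter() - start) * 1000
    logger.info("%s %s -> %d (%.1f ms)", request.method, request.url.path,
                response.status_code, elapsed_ms)
    return response

@app.get("/health")
async def health():
    # Placeholder endpoint so the middleware has something to wrap.
    return {"status": "ok"}
```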
.github/workflows/sync-to-hub.yml
ADDED
@@ -0,0 +1,20 @@
name: Sync to Hugging Face Hub
on:
  push:
    branches: [main]

  # to run this workflow manually from the Actions tab
  workflow_dispatch:

jobs:
  sync-to-hub:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v3
        with:
          fetch-depth: 0
          lfs: true
      - name: Push to hub
        env:
          HF_TOKEN: ${{ secrets.HF_TOKEN }}
        run: git push https://rahulkumar:$HF_TOKEN@huggingface.co/spaces/rahulkumar/dockling-parser main
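The workflow syncs by pushing over HTTPS with the HF_TOKEN secret. Since requirements.txt already pins huggingface-hub, roughly the same sync could be done from Python; the sketch below is only an illustration of that alternative, assuming a write-scoped token is exported as HF_TOKEN locally.

```python
import os

from huggingface_hub import HfApi

# Assumes a write-scoped token is exported as HF_TOKEN, mirroring the workflow secret.
api = HfApi(token=os.environ["HF_TOKEN"])

# Upload the working directory to the same Space the workflow pushes to.
api.upload_folder(
    folder_path=".",
    repo_id="rahulkumar/dockling-parser",
    repo_type="space",
    commit_message="Sync from local checkout",
)
```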
README-HF.md
ADDED
@@ -0,0 +1,46 @@
# 📄 Smart Document Parser

A powerful document parsing application that automatically extracts structured information from various document formats.

## 🚀 Features

- **Multiple Format Support**: PDF, DOCX, TXT, HTML, and Markdown
- **Rich Information Extraction**:
  - Document content with preserved formatting
  - Comprehensive metadata
  - Section breakdown
  - Named entity recognition
- **Smart Processing**:
  - Automatic format detection
  - Confidence scoring
  - Error handling

## 🎯 How to Use

1. **Upload Document**: Click the upload button or drag & drop your document
2. **Process**: Click "Process Document"
3. **View Results**: Explore the extracted information in different tabs:
   - 📝 Content: Main document text
   - 📊 Metadata: Document properties
   - 📑 Sections: Document structure
   - 🏷️ Entities: Named entities

## 📋 Supported Formats

- PDF Documents (*.pdf)
- Word Documents (*.docx)
- Text Files (*.txt)
- HTML Files (*.html)
- Markdown Files (*.md)

## 🛠️ Technical Details

Built with:
- Docling: Advanced document processing
- Gradio: Interactive web interface
- Pydantic: Type-safe data handling
- Hugging Face Spaces: Cloud deployment

## 📝 License

MIT License
README.md
ADDED
@@ -0,0 +1,81 @@
# Dockling Parser

A powerful multiformat document parsing module built on top of Docling. This module provides a unified interface for parsing various document formats including PDF, DOCX, TXT, HTML, and Markdown.

## Features

- Unified interface for multiple document formats
- Rich metadata extraction
- Structured content parsing
- Format detection using MIME types
- Error handling and validation
- Type-safe using Pydantic models
- Web interface using Gradio

## Installation

```bash
pip install -r requirements.txt
```

## Usage

### Python API

```python
from dockling_parser import DocumentParser

# Initialize parser
parser = DocumentParser()

# Parse a document
result = parser.parse("path/to/document.pdf")

# Access parsed content
print(result.content)             # Get main text content
print(result.metadata)            # Get document metadata
print(result.structured_content)  # Get structured content (sections, paragraphs, etc.)

# Check format support
is_supported = parser.supports_format("application/pdf")
```

### Web Interface

The package includes a Gradio-based web interface for easy document parsing:

```bash
python app.py
```

This will launch a web interface with the following features:
- Drag-and-drop document upload
- Support for multiple document formats
- Automatic format detection
- Structured output display:
  - Document content
  - Metadata table
  - Section breakdown
  - Named entity recognition
- Confidence scoring

## Supported Formats

- PDF (application/pdf)
- DOCX (application/vnd.openxmlformats-officedocument.wordprocessingml.document)
- Plain Text (text/plain)
- HTML (text/html)
- Markdown (text/markdown)

## Error Handling

The module provides specific exceptions for different error cases:

- `UnsupportedFormatError`: When the document format is not supported
- `ParseError`: When document parsing fails
- `ValidationError`: When document validation fails
- `EncodingError`: When document encoding issues occur

## License

MIT License
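The README lists the exception types but does not show them in use. A short sketch of how they are meant to be caught, using only names exported by the package (the document path is a placeholder):

```python
from dockling_parser import DocumentParser, ParserError
from dockling_parser.exceptions import UnsupportedFormatError, ParseError

parser = DocumentParser()

try:
    result = parser.parse("path/to/document.pdf")  # placeholder path
    print(result.metadata.filename, result.confidence_score)
except UnsupportedFormatError as exc:
    print(f"Format not supported: {exc}")
except ParseError as exc:
    print(f"Parsing failed: {exc}")
except ParserError as exc:
    # Base class catches any other parser-specific error (ValidationError, EncodingError).
    print(f"Parser error: {exc}")
```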
app.py
ADDED
@@ -0,0 +1,160 @@
import os
import gradio as gr
import pandas as pd
from dockling_parser import DocumentParser
from dockling_parser.exceptions import ParserError
import tempfile

TITLE = "📄 Smart Document Parser"
DESCRIPTION = """
A powerful document parsing application that automatically extracts structured information from various document formats.
Upload any document (PDF, DOCX, TXT, HTML, Markdown) and get structured information extracted automatically.
"""

ARTICLE = """
## 🚀 Features

- Multiple Format Support: PDF, DOCX, TXT, HTML, and Markdown
- Rich Information Extraction
- Smart Processing with Confidence Scoring
- Automatic Format Detection

Made with ❤️ using Docling and Gradio
"""

# Initialize the document parser
parser = DocumentParser()

def process_document(file):
    """Process uploaded document and return structured information"""
    try:
        # Create a temporary file to handle the upload
        with tempfile.NamedTemporaryFile(delete=False, suffix=os.path.splitext(file.name)[1]) as tmp_file:
            tmp_file.write(file.read())
            temp_path = tmp_file.name

        # Parse the document
        result = parser.parse(temp_path)

        # Clean up temporary file
        os.unlink(temp_path)

        # Prepare the outputs
        metadata_df = pd.DataFrame([{
            "Property": k,
            "Value": str(v)
        } for k, v in result.metadata.dict().items()])

        # Extract structured content
        sections = result.structured_content.get('sections', [])
        sections_text = "\n\n".join([f"Section {i+1}:\n{section}" for i, section in enumerate(sections)])

        # Format entities if available
        entities = result.structured_content.get('entities', {})
        entities_text = "\n".join([f"{entity_type}: {', '.join(entities_list)}"
                                   for entity_type, entities_list in entities.items()]) if entities else "No entities detected"

        return (
            result.content,    # Main content
            metadata_df,       # Metadata as table
            sections_text,     # Structured sections
            entities_text,     # Named entities
            f"Confidence Score: {result.confidence_score:.2f}"  # Confidence score
        )

    except ParserError as e:
        return (
            f"Error parsing document: {str(e)}",
            pd.DataFrame(),
            "No sections available",
            "No entities available",
            "Confidence Score: 0.0"
        )
    except Exception as e:
        return (
            f"Unexpected error: {str(e)}",
            pd.DataFrame(),
            "No sections available",
            "No entities available",
            "Confidence Score: 0.0"
        )
    finally:
        # Ensure temporary file is cleaned up
        if 'temp_path' in locals() and os.path.exists(temp_path):
            try:
                os.unlink(temp_path)
            except:
                pass

# Create Gradio interface
with gr.Blocks(title=TITLE, theme=gr.themes.Soft()) as iface:
    gr.Markdown(f"# {TITLE}")
    gr.Markdown(DESCRIPTION)

    with gr.Row():
        with gr.Column():
            file_input = gr.File(
                label="Upload Document",
                file_types=[".pdf", ".docx", ".txt", ".html", ".md"],
                type="file"
            )
            submit_btn = gr.Button("Process Document", variant="primary")

        with gr.Column():
            confidence = gr.Textbox(label="Processing Confidence")

    with gr.Tabs():
        with gr.TabItem("📝 Content"):
            content_output = gr.Textbox(
                label="Extracted Content",
                lines=10,
                max_lines=30
            )

        with gr.TabItem("📊 Metadata"):
            metadata_output = gr.Dataframe(
                label="Document Metadata",
                headers=["Property", "Value"]
            )

        with gr.TabItem("📑 Sections"):
            sections_output = gr.Textbox(
                label="Document Sections",
                lines=10,
                max_lines=30
            )

        with gr.TabItem("🏷️ Entities"):
            entities_output = gr.Textbox(
                label="Named Entities",
                lines=5,
                max_lines=15
            )

    # Handle file submission
    submit_btn.click(
        fn=process_document,
        inputs=[file_input],
        outputs=[
            content_output,
            metadata_output,
            sections_output,
            entities_output,
            confidence
        ]
    )

    gr.Markdown("""
    ### 📌 Supported Formats
    - PDF Documents (*.pdf)
    - Word Documents (*.docx)
    - Text Files (*.txt)
    - HTML Files (*.html)
    - Markdown Files (*.md)
    """)

    gr.Markdown(ARTICLE)

# Launch the app
if __name__ == "__main__":
    iface.launch()
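process_document above stages each upload through a named temporary file before handing it to the parser. Shown in isolation below, under the assumption that the helper name and sample bytes are purely illustrative:

```python
import os
import tempfile

def save_upload_to_temp(data: bytes, suffix: str) -> str:
    """Write uploaded bytes to a named temporary file and return its path,
    mirroring how process_document stages uploads for DocumentParser.parse()."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
        tmp_file.write(data)
        return tmp_file.name

temp_path = save_upload_to_temp(b"hello world", ".txt")
print(temp_path)      # e.g. /tmp/tmpabc123.txt
os.unlink(temp_path)  # caller cleans up, as in app.py's finally block
```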
config.json
ADDED
@@ -0,0 +1,12 @@
{
    "title": "Smart Document Parser",
    "emoji": "📄",
    "colorFrom": "blue",
    "colorTo": "indigo",
    "sdk": "gradio",
    "sdk_version": "4.0.0",
    "python_version": "3.10",
    "app_file": "app.py",
    "pinned": false,
    "license": "mit"
}
dockling_parser/__init__.py
ADDED
@@ -0,0 +1,11 @@
"""
Dockling Parser - A multiformat document parsing module using Docling
"""

__version__ = "0.1.0"

from .parser import DocumentParser
from .types import ParsedDocument, DocumentMetadata
from .exceptions import ParserError

__all__ = ["DocumentParser", "ParsedDocument", "DocumentMetadata", "ParserError"]
dockling_parser/exceptions.py
ADDED
@@ -0,0 +1,19 @@
class ParserError(Exception):
    """Base exception for parser errors"""
    pass

class UnsupportedFormatError(ParserError):
    """Raised when document format is not supported"""
    pass

class ParseError(ParserError):
    """Raised when document parsing fails"""
    pass

class ValidationError(ParserError):
    """Raised when document validation fails"""
    pass

class EncodingError(ParserError):
    """Raised when document encoding cannot be determined or is not supported"""
    pass
dockling_parser/parser.py
ADDED
@@ -0,0 +1,94 @@
import os
from pathlib import Path
from typing import Optional, Dict, Any, Union
import magic
import docling as dl
from datetime import datetime

from .types import ParsedDocument, DocumentMetadata
from .exceptions import UnsupportedFormatError, ParseError

class DocumentParser:
    """
    A multiformat document parser using Docling
    """

    SUPPORTED_FORMATS = {
        'application/pdf': 'pdf',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document': 'docx',
        'text/plain': 'txt',
        'text/html': 'html',
        'text/markdown': 'md'
    }

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        self.config = config or {}
        self.docling = dl.Docling()

    def parse(self, file_path: Union[str, Path]) -> ParsedDocument:
        """
        Parse a document file and return structured content

        Args:
            file_path: Path to the document file

        Returns:
            ParsedDocument object containing parsed content and metadata

        Raises:
            UnsupportedFormatError: If the file format is not supported
            ParseError: If parsing fails
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        mime_type = magic.from_file(str(file_path), mime=True)
        if mime_type not in self.SUPPORTED_FORMATS:
            raise UnsupportedFormatError(f"Unsupported file format: {mime_type}")

        try:
            # Get file metadata
            stats = file_path.stat()
            metadata = DocumentMetadata(
                filename=file_path.name,
                file_type=self.SUPPORTED_FORMATS[mime_type],
                size_bytes=stats.st_size,
                created_at=datetime.fromtimestamp(stats.st_ctime),
                modified_at=datetime.fromtimestamp(stats.st_mtime),
                mime_type=mime_type
            )

            # Parse document using Docling
            doc = self.docling.parse(str(file_path))

            # Extract content and structure
            content = doc.text
            structured_content = {
                'sections': doc.sections,
                'paragraphs': doc.paragraphs,
                'entities': doc.entities,
                'metadata': doc.metadata
            }

            # Update metadata with document-specific information
            if doc.metadata:
                metadata.title = doc.metadata.get('title')
                metadata.author = doc.metadata.get('author')
                metadata.pages = doc.metadata.get('pages')
                metadata.extra.update(doc.metadata)

            return ParsedDocument(
                content=content,
                metadata=metadata,
                raw_text=doc.raw_text,
                structured_content=structured_content,
                confidence_score=doc.confidence if hasattr(doc, 'confidence') else 1.0
            )

        except Exception as e:
            raise ParseError(f"Failed to parse document: {str(e)}") from e

    def supports_format(self, mime_type: str) -> bool:
        """Check if a given MIME type is supported"""
        return mime_type in self.SUPPORTED_FORMATS
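Format detection in DocumentParser.parse relies on libmagic rather than file extensions. A small sketch of checking support before parsing, using the same magic call the parser uses internally (the document path is a placeholder):

```python
import magic

from dockling_parser import DocumentParser

parser = DocumentParser()

path = "path/to/document.pdf"  # placeholder path
mime_type = magic.from_file(path, mime=True)  # same call parser.parse() makes internally

if parser.supports_format(mime_type):
    result = parser.parse(path)
    print(result.metadata.mime_type, result.metadata.file_type)
else:
    print(f"{mime_type} is not one of: {sorted(DocumentParser.SUPPORTED_FORMATS)}")
```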
dockling_parser/types.py
ADDED
@@ -0,0 +1,25 @@
from datetime import datetime
from typing import Optional, Dict, Any
from pydantic import BaseModel, Field

class DocumentMetadata(BaseModel):
    """Metadata for parsed documents"""
    filename: str
    file_type: str
    size_bytes: int
    created_at: Optional[datetime] = None
    modified_at: Optional[datetime] = None
    author: Optional[str] = None
    title: Optional[str] = None
    pages: Optional[int] = None
    encoding: Optional[str] = None
    mime_type: str
    extra: Dict[str, Any] = Field(default_factory=dict)

class ParsedDocument(BaseModel):
    """Represents a parsed document with its content and metadata"""
    content: str
    metadata: DocumentMetadata
    raw_text: Optional[str] = None
    structured_content: Optional[Dict[str, Any]] = None
    confidence_score: float = Field(ge=0.0, le=1.0, default=1.0)
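The Pydantic models above enforce their own constraints; in particular, confidence_score is bounded to [0.0, 1.0] and defaults to 1.0. A small sketch of building the models directly, where every field value is a made-up example:

```python
from pydantic import ValidationError as PydanticValidationError

from dockling_parser.types import DocumentMetadata, ParsedDocument

metadata = DocumentMetadata(
    filename="example.txt",  # made-up values for illustration
    file_type="txt",
    size_bytes=42,
    mime_type="text/plain",
)

doc = ParsedDocument(content="Hello, world!", metadata=metadata)
print(doc.confidence_score)  # defaults to 1.0

try:
    ParsedDocument(content="", metadata=metadata, confidence_score=1.5)
except PydanticValidationError:
    # Pydantic's own ValidationError, distinct from dockling_parser.exceptions.ValidationError.
    print("Rejected: confidence_score must be between 0.0 and 1.0")
```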
requirements.txt
ADDED
@@ -0,0 +1,10 @@
docling>=0.1.0
pydantic>=2.0.0
python-magic-bin>=0.4.14
python-docx>=0.8.11
PyPDF2>=3.0.0
beautifulsoup4>=4.12.0
lxml>=4.9.0
gradio>=4.0.0
pandas>=1.5.0
huggingface-hub>=0.19.0