Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import os | |
| import shutil | |
| import fitz | |
| from PIL import Image | |
| import numpy as np | |
| import cv2 | |
| import pytesseract | |
| from pytesseract import Output | |
| import zipfile | |
| from pdf2image import convert_from_path | |
| import google.generativeai as genai | |
| import json | |
| from docx import Document | |
| from docx.shared import Pt, RGBColor, Inches | |
| from docx.enum.text import WD_ALIGN_PARAGRAPH | |
| from docx.enum.section import WD_SECTION | |
| from docx.oxml import OxmlElement | |
| from docx.oxml.ns import qn | |
| from typing import Dict, Any, List, Union # Ajout des imports typing nécessaires | |
| import logging | |
| # helpers functions | |
| from helpers.rapport_generator import * | |
| from helpers.text_extraction import * | |
| from helpers.gemini_functions import * | |
def authenticate(username, password):
    """Check login credentials against the HF_USERNAME / HF_PASSWORD env vars.

    Args:
        username: Username submitted through the login form.
        password: Password submitted through the login form.

    Returns:
        True only when both environment variables are set to non-empty
        values and both submitted values match them exactly; False otherwise.
    """
    # Function-scope import keeps this block self-contained.
    import hmac

    expected_username = os.getenv("HF_USERNAME")
    expected_password = os.getenv("HF_PASSWORD")

    # Fail closed: with unset variables os.getenv returns None, and a plain
    # `==` would accept None (or empty) credentials.
    if not expected_username or not expected_password:
        return False
    if not isinstance(username, str) or not isinstance(password, str):
        return False

    # compare_digest only accepts non-ASCII text as bytes, so encode first.
    # Both comparisons are always evaluated so timing does not reveal which
    # of the two fields was wrong.
    username_ok = hmac.compare_digest(
        username.encode("utf-8"), expected_username.encode("utf-8")
    )
    password_ok = hmac.compare_digest(
        password.encode("utf-8"), expected_password.encode("utf-8")
    )
    return username_ok and password_ok
| # Main Processing Function | |
def process_pdf(pdf_file):
    """Run the full extraction pipeline on an uploaded PDF.

    Each page of the PDF is rendered to an image and passed to
    ``process_image`` and ``save_extracted_text``; the annotated page images
    are zipped; the text file is handed to ``extract_data_with_gemini``; the
    returned data is written to JSON and rendered to a DOCX report by
    ``RapportGenerator``.

    Args:
        pdf_file: Path of the uploaded PDF (``str`` / ``os.PathLike``), or an
            object exposing that path through a ``name`` attribute.

    Returns:
        A 4-tuple ``(text_file_path, zip_path, json_path, docx_path)`` of
        paths inside the ``temp_processing`` working directory.

    Raises:
        gr.Error: If no file was supplied or any step of the pipeline fails.
    """
    # Accept both a plain path and a file-like wrapper carrying `.name`.
    if isinstance(pdf_file, (str, os.PathLike)):
        pdf_path = pdf_file
    else:
        pdf_path = getattr(pdf_file, "name", None)
    if not pdf_path:
        # Checked before touching the working directory so an empty
        # submission does not wipe the previous run's outputs.
        raise gr.Error("Error processing PDF: no PDF file was provided")

    template_dir = os.path.join(os.getcwd(), "templates")
    temp_dir = os.path.join(os.getcwd(), "temp_processing")
    output_dir = os.path.join(temp_dir, 'output_images')

    # Start every run from an empty working directory.
    # NOTE(review): this directory is shared by all requests, so concurrent
    # runs would overwrite each other's files - confirm the app is only
    # ever used by one user at a time.
    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)
    os.makedirs(output_dir, exist_ok=True)

    path_to_data_to_extract = os.path.join(template_dir, "data_to_extract.json")
    # Presumably written by save_extracted_text - verify against the helper.
    text_file_path = os.path.join(output_dir, 'extracted_text.txt')

    try:
        # Convert PDF to images and process each page.
        images = convert_from_path(pdf_path)
        annotated_images = []
        for i, img in enumerate(images):
            temp_img_path = os.path.join(temp_dir, f'temp_page_{i}.png')
            img.save(temp_img_path)
            blocks, annotated_image_path = process_image(temp_img_path, output_dir, i)
            annotated_images.append(annotated_image_path)
            # The helper receives a 1-based page number.
            save_extracted_text(blocks, i + 1, output_dir)

        # Bundle the annotated page images into a flat ZIP archive.
        zip_path = os.path.join(temp_dir, "annotated_images.zip")
        with zipfile.ZipFile(zip_path, 'w') as zipf:
            for img_path in annotated_images:
                zipf.write(img_path, os.path.basename(img_path))

        # Structured extraction with Gemini.
        extracted_data = extract_data_with_gemini(text_file_path, path_to_data_to_extract)

        # Save extracted data to a JSON file (also the report's input).
        json_path = os.path.join(temp_dir, "extracted_data.json")
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(extracted_data, f, ensure_ascii=False, indent=2)

        # Generate the DOCX report.
        docx_path = os.path.join(temp_dir, "rapport_extraction.docx")
        try:
            generator = RapportGenerator(json_path, docx_path)
            generator.generate_report()
        except Exception as e:
            raise gr.Error(f"Error processing rapport: {str(e)}") from e

        return text_file_path, zip_path, json_path, docx_path
    except gr.Error:
        # Already a user-facing error (e.g. the report step above); do not
        # wrap it a second time in the generic "Error processing PDF" text.
        raise
    except Exception as e:
        raise gr.Error(f"Error processing PDF: {str(e)}") from e
| # Gradio Interface | |
# Custom stylesheet passed to the interface: page font and gradient buttons.
css = """
.gradio-container {
font-family: 'IBM Plex Sans', sans-serif;
}
.gr-button {
color: white;
border-radius: 8px;
background: linear-gradient(45deg, #7928CA, #FF0080);
border: none;
}
"""

# Single-input interface: one PDF upload in, four downloadable files out
# (text, ZIP of annotated images, JSON data, DOCX report), in the same order
# as the tuple returned by process_pdf.
demo = gr.Interface(
    title="ORDONNANCE DE REFERE<br>Extraction de texte PDF et création d'un rapport DOCX",
    description="""
Téléchargez un document PDF pour :
1. Extraire le contenu textuel
2. Obtenir des images annotées montrant les blocs de texte détectés
3. Extraire des données structurées grâce à une analyse IA
4. Générer un rapport formaté au format DOCX
Prend en charge les documents multi-pages et les documents juridiques français.
""",
    fn=process_pdf,
    inputs=[
        gr.File(
            type="filepath",
            file_types=[".pdf"],
            label="Télécharger un document PDF",
        ),
    ],
    outputs=[
        gr.File(label=output_label)
        for output_label in (
            "Texte extrait (TXT)",
            "Images annotées (ZIP)",
            "Données extraites (JSON)",
            "Rapport généré (DOCX)",
        )
    ],
    examples=[],
    cache_examples=False,
    css=css,
    theme=gr.themes.Soft(),
)
| # Launch the app | |
# Start the app only when this file is run as a script.
if __name__ == "__main__":
    # launch() is called exactly once: its return value is not the interface
    # object, so chaining a second `.launch()` onto it raises AttributeError
    # as soon as the first call returns.
    demo.launch(
        debug=True,
        auth=authenticate,  # callable login check defined in this file
    )