import os

# pinned torch/torchvision builds for CUDA 11.1 (Detectron2 below needs a matching torch install)
os.system('pip install -q torch==1.10.0+cu111 torchvision==0.11.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html')

# Detectron2 provides the visual backbone required by LayoutLMv2/LayoutXLM
os.system('pip install git+https://github.com/facebookresearch/detectron2.git')

import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

import gradio as gr
import re
import string

from operator import itemgetter
import collections

import pypdf
from pypdf import PdfReader
from pypdf.errors import PdfReadError

import pypdfium2 as pdfium
import langdetect
from langdetect import detect_langs

import pandas as pd
import numpy as np
import random
import tempfile
import itertools

from matplotlib import font_manager
from PIL import Image, ImageDraw, ImageFont
import cv2

import pathlib
from pathlib import Path
import shutil

from functools import partial

# the helpers of the APP (pdf_to_images, extraction_data_from_image, CustomDataset, ...) live in files/functions.py
import sys
sys.path.insert(0, 'files/')

import functions
from functions import *

os.system('python -m pip install --upgrade pip')

# models finetuned on DocLayNet base at paragraph level (chunk size: 512 tokens)
model_id_lilt = "pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512"
model_id_layoutxlm = "pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512"

# tokenizer of the LayoutXLM model
tokenizer_id_layoutxlm = "xlm-roberta-base"

import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification

# LiLT base combined with XLM-RoBERTa base
tokenizer_lilt = AutoTokenizer.from_pretrained(model_id_lilt)
model_lilt = AutoModelForTokenClassification.from_pretrained(model_id_lilt)
model_lilt.to(device)
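
# LayoutXLM base combined with XLM-RoBERTa base (LayoutLMv2 architecture with a Detectron2 visual backbone)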
from transformers import LayoutLMv2ForTokenClassification
model_layoutxlm = LayoutLMv2ForTokenClassification.from_pretrained(model_id_layoutxlm)
model_layoutxlm.to(device)

from transformers import LayoutLMv2FeatureExtractor
# apply_ocr=False: the texts and bounding boxes are extracted upstream, not by the feature extractor
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)

tokenizer_layoutxlm = AutoTokenizer.from_pretrained(tokenizer_id_layoutxlm)

id2label_lilt = model_lilt.config.id2label
label2id_lilt = model_lilt.config.label2id
num_labels_lilt = len(id2label_lilt)

id2label_layoutxlm = model_layoutxlm.config.id2label
label2id_layoutxlm = model_layoutxlm.config.label2id
num_labels_layoutxlm = len(id2label_layoutxlm)

def app_outputs_by_model(uploaded_pdf, model_id, model, tokenizer, max_length, id2label, cls_box, sep_box):
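    """Run one model on the uploaded PDF and build the APP outputs.

    Converts the PDF pages to images, extracts texts and paragraph bounding
    boxes, runs token classification and aggregates the predictions at
    paragraph level, then returns the output message, annotated image file(s)
    and image(s), CSV file(s) and dataframe(s). The helpers (pdf_to_images,
    extraction_data_from_image, ...) and the globals max_imgboxes, image_blank,
    cls_box and sep_box_* presumably come from files/functions.py (star import above).
    """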
    filename, msg, images = pdf_to_images(uploaded_pdf)
    num_images = len(images)

    if not msg.startswith("Error with the PDF"):

        # extraction of texts, paragraphs and bounding boxes from the page images
        dataset, texts_lines, texts_pars, texts_lines_par, row_indexes, par_boxes, line_boxes, lines_par_boxes = extraction_data_from_image(images)

        prepare_inference_features_partial = partial(prepare_inference_features_paragraph, tokenizer=tokenizer, max_length=max_length, cls_box=cls_box, sep_box=sep_box)
        encoded_dataset = dataset.map(prepare_inference_features_partial, batched=True, batch_size=64, remove_columns=dataset.column_names)
        custom_encoded_dataset = CustomDataset(encoded_dataset, tokenizer)

        # inference at token level, then aggregation of the predictions at paragraph level
        outputs, images_ids_list, chunk_ids, input_ids, bboxes = predictions_token_level(images, custom_encoded_dataset, model_id, model)
        probs_bbox, bboxes_list_dict, input_ids_dict_dict, probs_dict_dict, df = predictions_paragraph_level(max_length, tokenizer, id2label, dataset, outputs, images_ids_list, chunk_ids, input_ids, bboxes, cls_box, sep_box)

        images = get_labeled_images(id2label, dataset, images_ids_list, bboxes_list_dict, probs_dict_dict)

        # save the annotated images
        img_files = list()
        for i in range(num_images):
            if filename != "files/blank.png":
                img_file = f"img_{i}_" + filename.replace(".pdf", ".png")
            else:
                img_file = filename.replace(".pdf", ".png")
            img_file = img_file.replace("/", "_")
            images[i].save(img_file)
            img_files.append(img_file)

        # pad or truncate the outputs to exactly max_imgboxes pages
        if num_images < max_imgboxes:
            img_files += [image_blank]*(max_imgboxes - num_images)
            images += [Image.open(image_blank)]*(max_imgboxes - num_images)
            for count in range(max_imgboxes - num_images):
                df[num_images + count] = pd.DataFrame()
        else:
            img_files = img_files[:max_imgboxes]
            images = images[:max_imgboxes]
            df = dict(itertools.islice(df.items(), max_imgboxes))

        # export each page dataframe to CSV
        csv_files = list()
        for i in range(max_imgboxes):
            csv_file = f"csv_{i}_" + filename.replace(".pdf", ".csv")
            csv_file = csv_file.replace("/", "_")
            csv_files.append(gr.File.update(value=csv_file, visible=True))
            df[i].to_csv(csv_file, encoding="utf-8", index=False)

        if max_imgboxes >= 2:
            return msg, img_files[0], img_files[1], images[0], images[1], csv_files[0], csv_files[1], df[0], df[1]
        else:
            return msg, img_files[0], images[0], csv_files[0], df[0]
    else:
        # the PDF could not be read: return blank image(s) and empty CSV/dataframe(s)
        img_files, images, csv_files = [""]*max_imgboxes, [""]*max_imgboxes, [""]*max_imgboxes
        if max_imgboxes >= 2:
            img_files[0], img_files[1] = image_blank, image_blank
            images[0], images[1] = Image.open(image_blank), Image.open(image_blank)
            csv_file = "csv_wo_content.csv"
            csv_files[0], csv_files[1] = gr.File.update(value=csv_file, visible=True), gr.File.update(value=csv_file, visible=True)
            # write the empty CSV once and return the empty dataframes
            # (DataFrame.to_csv(path) returns None, so its return value must not be assigned to df)
            df, df_empty = dict(), pd.DataFrame()
            df_empty.to_csv(csv_file, encoding="utf-8", index=False)
            df[0], df[1] = df_empty, df_empty

            return msg, img_files[0], img_files[1], images[0], images[1], csv_files[0], csv_files[1], df[0], df[1]
        else:
            img_files[0] = image_blank
            images[0] = Image.open(image_blank)
            csv_file = "csv_wo_content.csv"
            csv_files[0] = gr.File.update(value=csv_file, visible=True)
            df, df_empty = dict(), pd.DataFrame()
            df_empty.to_csv(csv_file, encoding="utf-8", index=False)
            df[0] = df_empty

            return msg, img_files[0], images[0], csv_files[0], df[0]

def app_outputs(uploaded_pdf):
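    """Run both models on the same PDF and concatenate their outputs for the UI."""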
    msg_lilt, img_files_lilt, images_lilt, csv_files_lilt, df_lilt = app_outputs_by_model(
        uploaded_pdf,
        model_id=model_id_lilt, model=model_lilt, tokenizer=tokenizer_lilt,
        max_length=max_length_lilt, id2label=id2label_lilt, cls_box=cls_box, sep_box=sep_box_lilt)

    msg_layoutxlm, img_files_layoutxlm, images_layoutxlm, csv_files_layoutxlm, df_layoutxlm = app_outputs_by_model(
        uploaded_pdf,
        model_id=model_id_layoutxlm, model=model_layoutxlm, tokenizer=tokenizer_layoutxlm,
        max_length=max_length_layoutxlm, id2label=id2label_layoutxlm, cls_box=cls_box, sep_box=sep_box_layoutxlm)

    return msg_lilt, msg_layoutxlm, img_files_lilt, img_files_layoutxlm, images_lilt, images_layoutxlm, csv_files_lilt, csv_files_layoutxlm, df_lilt, df_layoutxlm
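
# Gradio APP: display the LiLT and LayoutXLM results side by side (first PDF page only)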
with gr.Blocks(title="Inference APP for Document Understanding at paragraph level (v1 - LiLT base vs LayoutXLM base)", css=".gradio-container") as demo:
    gr.HTML("""
    <div style="font-family:'Times New Roman', 'Serif'; font-size:26pt; font-weight:bold; text-align:center;"><h1>Inference APP for Document Understanding at paragraph level (v1 - LiLT base vs LayoutXLM base)</h1></div>
    <div style="margin-top: 40px"><p>(04/01/2023) This Inference APP compares - only on the first PDF page - 2 Document Understanding models finetuned on the dataset <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/datasets/pierreguillou/DocLayNet-base" target="_blank">DocLayNet base</a> at paragraph level (chunk size of 512 tokens): <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512" target="_blank">LiLT base combined with XLM-RoBERTa base</a> and <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512" target="_blank">LayoutXLM base combined with XLM-RoBERTa base</a>.</p></div>
    <div><p>To test these 2 models separately, use their corresponding APP on Hugging Face Spaces: <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v1" target="_blank">LiLT base APP (v1 - paragraph level)</a> and <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v2" target="_blank">LayoutXLM base APP (v2 - paragraph level)</a>.</p></div><div style="margin-top: 20px"><p>Links to Document Understanding APPs:</p><ul><li>Line level: <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-linelevel-v1" target="_blank">v1 (LiLT base)</a> | <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-linelevel-v2" target="_blank">v2 (LayoutXLM base)</a> | <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-comparison-APP-Document-Understanding-at-paragraphlevel-v1" target="_blank">v2 vs v1 (LayoutXLM vs LiLT)</a> | <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v3" target="_blank">Ensemble v1 (LiLT & LayoutXLM)</a></li><li>Paragraph level: <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v1" target="_blank">v1 (LiLT base)</a> | <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://huggingface.co/spaces/pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v2" target="_blank">v2 (LayoutXLM base)</a></li></ul></div>
    <div style="margin-top: 20px"><p>More information about the DocLayNet datasets, the finetuning of the models and this APP in the following blog posts:</p><ul><li>(03/31/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-and-fine-tuning-notebook-for-document-understanding-at-paragraph-level-3507af80573d" target="_blank">Document AI | Inference APP and fine-tuning notebook for Document Understanding at paragraph level with LayoutXLM base</a></li><li>(03/25/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-app-to-compare-the-document-understanding-lilt-and-layoutxlm-base-models-at-line-1c53eb481a15" target="_blank">Document AI | APP to compare the Document Understanding LiLT and LayoutXLM (base) models at line level</a></li><li>(03/05/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-and-fine-tuning-notebook-for-document-understanding-at-line-level-with-b08fdca5f4dc" target="_blank">Document AI | Inference APP and fine-tuning notebook for Document Understanding at line level with LayoutXLM base</a></li><li>(02/14/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-inference-app-for-document-understanding-at-line-level-a35bbfa98893" target="_blank">Document AI | Inference APP for Document Understanding at line level</a></li><li>(02/10/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-document-understanding-model-at-line-level-with-lilt-tesseract-and-doclaynet-dataset-347107a643b8" target="_blank">Document AI | Document Understanding model at line level with LiLT, Tesseract and DocLayNet dataset</a></li><li>(01/31/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-doclaynet-image-viewer-app-3ac54c19956" target="_blank">Document AI | DocLayNet image viewer APP</a></li><li>(01/27/2023) <a style="text-decoration: none; border-bottom: #64b5f6 0.125em solid; color: #64b5f6" href="https://medium.com/@pierre_guillou/document-ai-processing-of-doclaynet-dataset-to-be-used-by-layout-models-of-the-hugging-face-hub-308d8bd81cdb" target="_blank">Document AI | Processing of DocLayNet dataset to be used by layout models of the Hugging Face hub (finetuning, inference)</a></li></ul></div>
    """)

    with gr.Row():
        pdf_file = gr.File(label="PDF")
    with gr.Row():
        submit_btn = gr.Button("Get layout detection by LiLT and LayoutXLM on the first PDF page")
        reset_btn = gr.Button(value="Clear")
    with gr.Row():
        output_messages = []
        with gr.Column():
            output_msg = gr.Textbox(label="LiLT output message")
            output_messages.append(output_msg)
        with gr.Column():
            output_msg = gr.Textbox(label="LayoutXLM output message")
            output_messages.append(output_msg)
    with gr.Row():
        fileboxes = []
        with gr.Column():
            file_path = gr.File(visible=True, label="LiLT image file")
            fileboxes.append(file_path)
        with gr.Column():
            file_path = gr.File(visible=True, label="LayoutXLM image file")
            fileboxes.append(file_path)
    with gr.Row():
        imgboxes = []
        with gr.Column():
            img = gr.Image(type="pil", label="LiLT Image")
            imgboxes.append(img)
        with gr.Column():
            img = gr.Image(type="pil", label="LayoutXLM Image")
            imgboxes.append(img)
    with gr.Row():
        csvboxes = []
        with gr.Column():
            csv = gr.File(visible=True, label="LiLT csv file at paragraph level")
            csvboxes.append(csv)
        with gr.Column():
            csv = gr.File(visible=True, label="LayoutXLM csv file at paragraph level")
            csvboxes.append(csv)
    with gr.Row():
        dfboxes = []
        with gr.Column():
            df = gr.Dataframe(
                headers=["bounding boxes", "texts", "labels"],
                datatype=["str", "str", "str"],
                col_count=(3, "fixed"),
                visible=True,
                label="LiLT data",
                type="pandas",
                wrap=True
            )
            dfboxes.append(df)
        with gr.Column():
            df = gr.Dataframe(
                headers=["bounding boxes", "texts", "labels"],
                datatype=["str", "str", "str"],
                col_count=(3, "fixed"),
                visible=True,
                label="LayoutXLM data",
                type="pandas",
                wrap=True
            )
            dfboxes.append(df)

    outputboxes = output_messages + fileboxes + imgboxes + csvboxes + dfboxes

    submit_btn.click(app_outputs, inputs=[pdf_file], outputs=outputboxes)
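
    # the Clear button resets the input PDF and all output components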
    reset_btn.click(
        lambda: [pdf_file.update(value=None)] + [output_msg.update(value=None) for output_msg in output_messages] + [filebox.update(value=None) for filebox in fileboxes] + [imgbox.update(value=None) for imgbox in imgboxes] + [csvbox.update(value=None) for csvbox in csvboxes] + [dfbox.update(value=None) for dfbox in dfboxes],
        inputs=[],
        outputs=[pdf_file] + output_messages + fileboxes + imgboxes + csvboxes + dfboxes
    )

    gr.Examples(
        [["files/example.pdf"]],
        [pdf_file],
        outputboxes,
        fn=app_outputs,
        cache_examples=True,
    )

demo.launch()