import os

# Pin torch/torchvision builds compatible with Detectron2 (CUDA 11.1 wheels)
os.system('pip install -q torch==1.10.0+cu111 torchvision==0.11.0+cu111 -f https://download.pytorch.org/whl/torch_stable.html')

# Install Detectron2, required by LayoutLMv2/LayoutXLM for the visual backbone
os.system('pip install git+https://github.com/facebookresearch/detectron2.git')

import detectron2
from detectron2.utils.logger import setup_logger
setup_logger()

import gradio as gr
import re
import string
import torch

from operator import itemgetter
import collections

import pypdf
from pypdf import PdfReader
from pypdf.errors import PdfReadError

import pypdfium2 as pdfium
import langdetect
from langdetect import detect_langs

import pandas as pd
import numpy as np
import random
import tempfile
import itertools

from matplotlib import font_manager
from PIL import Image, ImageDraw, ImageFont
import cv2

import pathlib
from pathlib import Path
import shutil

# Debug output: OS version and Tesseract packages available in the environment
print(os.popen('cat /etc/debian_version').read())
print(os.popen('cat /etc/issue').read())
print(os.popen('apt search tesseract').read())

import pytesseract
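
# A minimal sketch (not part of the original script) of how pytesseract is typically
# used to get words and their pixel bounding boxes from a page image; the helper name
# and the `lang` default are illustrative assumptions.
def ocr_words_and_boxes_sketch(image, lang="eng"):
    # Output.DICT returns parallel lists under 'text', 'left', 'top', 'width', 'height', ...
    data = pytesseract.image_to_data(image, lang=lang, output_type=pytesseract.Output.DICT)
    words, boxes = [], []
    for text, left, top, w, h in zip(data["text"], data["left"], data["top"], data["width"], data["height"]):
        if text.strip():  # skip empty OCR cells
            words.append(text)
            boxes.append([left, top, left + w, top + h])  # [x0, y0, x1, y1] in pixels
    return words, boxes
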
# Display color for each DocLayNet layout label (used to draw predicted boxes)
label2color = {
    'Caption': 'brown',
    'Footnote': 'orange',
    'Formula': 'gray',
    'List-item': 'yellow',
    'Page-footer': 'red',
    'Page-header': 'red',
    'Picture': 'violet',
    'Section-header': 'orange',
    'Table': 'green',
    'Text': 'blue',
    'Title': 'pink'
}
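
# A minimal sketch (illustrative, not the original drawing code) of how label2color
# can be used to draw predicted paragraph boxes on a page image with PIL:
def draw_labeled_boxes_sketch(image, boxes, labels):
    draw = ImageDraw.Draw(image)
    for box, label in zip(boxes, labels):
        color = label2color.get(label, "black")  # unknown labels fall back to black
        draw.rectangle(box, outline=color, width=2)
        draw.text((box[0], max(0, box[1] - 10)), label, fill=color)
    return image
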
# Dummy boxes for the special CLS and SEP tokens, in the 0-1000 normalized
# coordinate space expected by LayoutLM-family models
cls_box = [0, 0, 0, 0]
sep_box = [1000, 1000, 1000, 1000]
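
# A minimal sketch of the usual pixel -> 0-1000 box normalization (the helper name
# is an illustrative assumption, not the original code):
def normalize_box_sketch(box, width, height):
    # box = [x0, y0, x1, y1] in pixels; width/height = page image size in pixels
    return [
        int(1000 * box[0] / width),
        int(1000 * box[1] / height),
        int(1000 * box[2] / width),
        int(1000 * box[3] / height),
    ]
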
# Checkpoints fine-tuned on DocLayNet-base at paragraph level (512-token chunks).
# The code below loads both a LiLT and a LayoutXLM model; the LiLT repo id is an
# assumption inferred from the naming scheme of the LayoutXLM checkpoint, so
# replace it with the actual repo id if it differs.
model_id_lilt = "pierreguillou/lilt-xlm-roberta-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512"
model_id_layoutxlm = "pierreguillou/layout-xlm-base-finetuned-with-DocLayNet-base-at-paragraphlevel-ml512"

# Both checkpoints use the XLM-RoBERTa tokenizer
tokenizer_id_layoutxlm = "xlm-roberta-base"

# Infer the chunk size from the checkpoint name ("ml384" or "ml512")
if "384" in model_id_layoutxlm:
    max_length = 384
elif "512" in model_id_layoutxlm:
    max_length = 512
else:
    raise ValueError("Error with max_length of chunks!")

# Token overlap between consecutive chunks of a long document
doc_stride = 128
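
# A minimal sketch (illustrative) of how max_length and doc_stride combine to split a
# long, pre-tokenized document into overlapping chunks with a Hugging Face tokenizer:
def chunk_words_sketch(tokenizer, words):
    return tokenizer(
        words,
        is_split_into_words=True,
        truncation=True,
        max_length=max_length,
        stride=doc_stride,               # consecutive chunks overlap by doc_stride tokens
        return_overflowing_tokens=True,  # emit one encoding per chunk
    )
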
# Maximum number of page images processed/displayed per document
max_imgboxes = 2

# Download the example files from the original Hugging Face Space into a local folder
examples_dir = 'files/'
Path(examples_dir).mkdir(parents=True, exist_ok=True)

from huggingface_hub import hf_hub_download
files = ["example.pdf", "blank.pdf", "blank.png", "languages_iso.csv", "languages_tesseract.csv", "wo_content.png"]
for file_name in files:
    path_to_file = hf_hub_download(
        repo_id="pierreguillou/Inference-APP-Document-Understanding-at-paragraphlevel-v2",
        filename="files/" + file_name,
        repo_type="space"
    )
    shutil.copy(path_to_file, examples_dir)

# Paths to example assets
image_wo_content = examples_dir + "wo_content.png"
pdf_blank = examples_dir + "blank.pdf"
image_blank = examples_dir + "blank.png"

# Language tables: Tesseract language names/codes and their ISO (langdetect) equivalents
t = examples_dir + "languages_tesseract.csv"
l = examples_dir + "languages_iso.csv"

df_t = pd.read_csv(t)
df_l = pd.read_csv(l)

# Normalize language names (lowercase, strip punctuation) so the two tables can be matched by name
langs_t = df_t["Language"].to_list()
langs_t = [lang_t.lower().strip().translate(str.maketrans('', '', string.punctuation)) for lang_t in langs_t]
langs_l = df_l["Language"].to_list()
langs_l = [lang_l.lower().strip().translate(str.maketrans('', '', string.punctuation)) for lang_l in langs_l]
langscode_t = df_t["LangCode"].to_list()
langscode_l = df_l["LangCode"].to_list()

# Map Tesseract language codes to langdetect (ISO) codes by matching normalized names
Tesseract2langdetect = dict()
for lang_t, langcode_t in zip(langs_t, langscode_t):
    try:
        # the ISO table lists plain "chinese" rather than "Chinese - Simplified"
        if lang_t == "Chinese - Simplified".lower().strip().translate(str.maketrans('', '', string.punctuation)): lang_t = "chinese"
        index = langs_l.index(lang_t)
        langcode_l = langscode_l[index]
        Tesseract2langdetect[langcode_t] = langcode_l
    except ValueError:  # language name not present in the ISO table
        continue

# Inverse mapping: langdetect (ISO) code -> Tesseract code
langdetect2Tesseract = {v: k for k, v in Tesseract2langdetect.items()}
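
# A minimal sketch (illustrative) of how the mapping is used downstream: detect the
# language of OCR text with langdetect, then convert the ISO code to a Tesseract code.
# The helper name and the "eng" fallback are assumptions, not the original code.
def tesseract_lang_sketch(text, default="eng"):
    try:
        iso_code = detect_langs(text)[0].lang  # most probable language, e.g. "fr"
        return langdetect2Tesseract.get(iso_code, default)
    except langdetect.LangDetectException:  # empty or undetectable text
        return default
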
# Run inference on GPU when available (torch is already imported above)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

import transformers
from transformers import AutoTokenizer, AutoModelForTokenClassification

# LiLT model and tokenizer
tokenizer_lilt = AutoTokenizer.from_pretrained(model_id_lilt)
model_lilt = AutoModelForTokenClassification.from_pretrained(model_id_lilt)
model_lilt.to(device)

# LayoutXLM model (LayoutLMv2 architecture)
from transformers import LayoutLMv2ForTokenClassification
model_layoutxlm = LayoutLMv2ForTokenClassification.from_pretrained(model_id_layoutxlm)
model_layoutxlm.to(device)

# The feature extractor only resizes/normalizes page images; OCR is done with Tesseract
from transformers import LayoutLMv2FeatureExtractor
feature_extractor = LayoutLMv2FeatureExtractor(apply_ocr=False)

tokenizer_layoutxlm = AutoTokenizer.from_pretrained(tokenizer_id_layoutxlm)
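
# A minimal usage sketch (illustrative): with apply_ocr=False the feature extractor
# returns only resized pixel values; for LayoutLMv2-style extractors they live under
# the "image" key.
def page_pixel_values_sketch(page_image):
    return feature_extractor(images=page_image, return_tensors="pt")["image"]  # (1, 3, 224, 224)
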
# Label maps from the model configs (DocLayNet label set)
id2label_lilt = model_lilt.config.id2label
label2id_lilt = model_lilt.config.label2id
num_labels_lilt = len(id2label_lilt)

id2label_layoutxlm = model_layoutxlm.config.id2label
label2id_layoutxlm = model_layoutxlm.config.label2id
num_labels_layoutxlm = len(id2label_layoutxlm)
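
# Illustrative sanity check: both checkpoints are expected to share the same
# DocLayNet label set, so the two label maps should agree with each other
# (and with the keys of label2color).
print("num_labels (LiLT / LayoutXLM):", num_labels_lilt, "/", num_labels_layoutxlm)
print("labels:", sorted(id2label_layoutxlm.values()))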