Spaces:

gaunernst
/

layoutlm-docvqa-paddleocr

Sleeping

App Files Files Community

layoutlm-docvqa-paddleocr / app.py

gaunernst

switch to Docker space

71a9e68 almost 2 years ago

raw

history blame

2.19 kB

	import cv2
	import gradio as gr
	import numpy as np
	from paddleocr import PaddleOCR
	from PIL import Image
	from transformers import pipeline
	from transformers.pipelines.document_question_answering import apply_tesseract

	# Document question-answering pipeline loaded from the
	# "impira/layoutlm-document-qa" checkpoint; built once at import time.
	PIPE = pipeline("document-question-answering", "impira/layoutlm-document-qa")

	# Shared PaddleOCR engine, also built once and reused by every predict() call.
	OCR = PaddleOCR(
	    use_angle_cls=True,
	    lang="en",
	    det_limit_side_len=10_000,
	    det_db_score_mode="slow",
	    # Previously misspelled as "enable_mlkdnn", which is not an option
	    # PaddleOCR defines; the MKL-DNN switch is spelled "enable_mkldnn".
	    enable_mkldnn=True,
	)


	# Labels offered by the OCR-engine radio button and matched in predict().
	PADDLE_OCR_LABEL = "PaddleOCR (en)"
	TESSERACT_LABEL = "Tesseract (HF default)"


	def predict(image: Image.Image, question: str, ocr_engine: str):
	    """Answer ``question`` about ``image`` using words from the chosen OCR engine.

	    Args:
	        image: Document page as a PIL image.
	        question: Question forwarded to the document-QA pipeline.
	        ocr_engine: Either ``PADDLE_OCR_LABEL`` or ``TESSERACT_LABEL``.

	    Returns:
	        A ``(answer, score, annotated_image)`` tuple. ``annotated_image`` is a
	        NumPy copy of the input with the OCR boxes drawn on it. When the OCR
	        engine yields no words, ``("", 0.0, annotated_image)`` is returned
	        without calling the QA pipeline.

	    Raises:
	        ValueError: If ``ocr_engine`` is not one of the supported labels.
	    """
	    # np.array (not np.asarray) guarantees a writable copy: the array obtained
	    # from a PIL image via asarray can be read-only, and the cv2 drawing calls
	    # below write into it in place.
	    image_np = np.array(image)
	    box_color = (0, 255, 255)
	    box_thickness = 3

	    if ocr_engine == PADDLE_OCR_LABEL:
	        # Result for the single input image; treat a None/empty result
	        # (no text detected) as "no lines" instead of crashing below.
	        ocr_result = OCR.ocr(image_np)[0] or []
	        words = [line[1][0] for line in ocr_result]
	        # (n_boxes, 4, 2); the reshape keeps that rank even when n_boxes == 0.
	        polygons = np.asarray([line[0] for line in ocr_result], dtype=float).reshape(-1, 4, 2)

	        for polygon in polygons:
	            cv2.polylines(
	                image_np,
	                [polygon.reshape(-1, 1, 2).astype(np.int32)],
	                True,
	                box_color,
	                box_thickness,
	            )

	        # Axis-aligned bounds of each quadrilateral, rescaled to the 0-1000
	        # coordinate range (the Tesseract branch maps the same range back to
	        # pixels when drawing).
	        xs = polygons[:, :, 0]
	        ys = polygons[:, :, 1]
	        x1 = xs.min(1) * 1000 / image.width
	        y1 = ys.min(1) * 1000 / image.height
	        x2 = xs.max(1) * 1000 / image.width
	        y2 = ys.max(1) * 1000 / image.height

	        # (n_boxes, 4) in xyxy format, as plain Python ints so both engines
	        # hand the pipeline the same kind of box.
	        boxes = np.stack([x1, y1, x2, y2], axis=1).astype(int).tolist()

	    elif ocr_engine == TESSERACT_LABEL:
	        words, boxes = apply_tesseract(image, None, "")

	        # Boxes are scaled by 1000; convert back to pixels only for drawing.
	        for x1, y1, x2, y2 in boxes:
	            top_left = (int(x1 * image.width / 1000), int(y1 * image.height / 1000))
	            bottom_right = (int(x2 * image.width / 1000), int(y2 * image.height / 1000))
	            cv2.rectangle(image_np, top_left, bottom_right, box_color, box_thickness)

	    else:
	        raise ValueError(f"Unsupported ocr_engine={ocr_engine}")

	    if not words:
	        # Nothing was recognised, so there is nothing to run QA over.
	        return "", 0.0, image_np

	    word_boxes = list(zip(words, boxes))
	    result = PIPE(image, question, word_boxes)[0]
	    return result["answer"], result["score"], image_np


	# Build the web UI around predict() and start serving it.
	gr.Interface(
	    fn=predict,
	    inputs=[
	        gr.Image(type="pil"),
	        "text",
	        # Preselect an engine: with no initial value, submitting without a
	        # choice passes no label to predict(), which then raises ValueError.
	        gr.Radio([PADDLE_OCR_LABEL, TESSERACT_LABEL], value=PADDLE_OCR_LABEL),
	    ],
	    outputs=[
	        gr.Textbox(label="Answer"),
	        gr.Number(label="Score"),
	        gr.Image(label="OCR results"),
	    ],
	).launch()