vk committed
Commit · 9a0940b
1 Parent(s): 0b577b8

bounding box sorting

Browse files:
- app.py +33 -2
- models/{ocr_fp16.bin → ocr.bin} +0 -0
- models/{ocr_fp16.xml → ocr.xml} +0 -0
- ocr_inference.py +3 -27
- text_detection.py +6 -42
- utils.py +18 -1
app.py
CHANGED
@@ -1,19 +1,50 @@
 from text_detection import Text_Detection
+from ocr_inference import OCR
+import numpy as np
+import cv2
 import gradio as gr
 
 def get_response(input_img):
 
-
+    if hasattr(input_img,'shape'):
+        src_img=input_img.copy()
+        outputs=text_detector.predict([input_img])
+
+
+        texts = ["Found texts:"]
+        dt_boxes = outputs[0][0]['points']
+        dt_scores = outputs[1][0]['scores']
+
+        if len(dt_boxes) > 0:
+            j = 0
+            for score, box in zip(dt_scores, dt_boxes):
+                pts = np.array(box).astype(np.int32).reshape((-1, 1, 2))
+                mask = np.zeros(src_img.shape[:2], dtype=np.uint8)
+                cv2.fillPoly(mask, [pts], 255)
+
+                # Extract the region
+                result = cv2.bitwise_and(src_img, src_img, mask=mask)
+
+                # Find bounding box and crop
+                x, y, w, h = cv2.boundingRect(pts)
+                cropped = result[y:y + h, x:x + w, :]
+                # cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2)
+                # cv2.imwrite(f"cropped/output_{i}_{j}.jpg",cropped)
+                texts.append(ocr.predict([cropped]))
+                j += 1
+
+        return "\n".join(texts)
 
 
 if __name__ == "__main__":
     text_detector=Text_Detection('models/text_detector.xml')
+    ocr=OCR('models/ocr.xml')
     iface = gr.Interface(
         fn=get_response,
         inputs=gr.Image(type="numpy"),  # Accepts image input
         outputs=gr.Textbox(),
         title="SVTR-OCR-App",
-        description="Upload images for accurate English
+        description="Upload images for fast & accurate English(Latin) OCR. Works best with image format of invoice, documents"
     )
 
     iface.launch(share=True)
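The loop added to get_response isolates each detected quadrilateral before recognition: the four corner points are rasterized into a binary mask with cv2.fillPoly, pixels outside the polygon are blanked with cv2.bitwise_and, and the result is cropped to the quad's axis-aligned bounding rectangle. A minimal self-contained sketch of that crop step — the gray test image and hard-coded box are illustrative, not from this repo:

import cv2
import numpy as np

# Illustrative input: a flat gray image and one quadrilateral "detection".
img = np.full((200, 200, 3), 128, dtype=np.uint8)
box = [[30, 40], [160, 50], [155, 90], [25, 80]]  # four corners, roughly clockwise

pts = np.array(box).astype(np.int32).reshape((-1, 1, 2))

# Rasterize the quad into a mask and blank everything outside it.
mask = np.zeros(img.shape[:2], dtype=np.uint8)
cv2.fillPoly(mask, [pts], 255)
region = cv2.bitwise_and(img, img, mask=mask)

# Crop to the quad's axis-aligned bounding rectangle.
x, y, w, h = cv2.boundingRect(pts)
cropped = region[y:y + h, x:x + w, :]
print(cropped.shape)  # (51, 136, 3) for the box above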
models/{ocr_fp16.bin → ocr.bin}
RENAMED
File without changes

models/{ocr_fp16.xml → ocr.xml}
RENAMED
File without changes
ocr_inference.py
CHANGED
@@ -16,7 +16,7 @@ import numpy as np
 from openvino.runtime import Core
 import math
 import cv2
-from utils import CTCLabelDecode
+from utils import CTCLabelDecode,img_decode
 
 
 
@@ -43,32 +43,8 @@ class OCR():
         self.dynamic_width=False
 
 
-
 
 
-    def img_decode(self,img):
-
-        img = np.frombuffer(img, dtype='uint8')
-        img=cv2.imdecode(img, 1)
-        #print(img.shape)
-
-        return img
-
-    def preprocess_img(self,img):
-
-        grayscale_image = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
-
-        # Create an empty array of shape (height, width, 3) for the stacked image
-        stacked_image = np.zeros((grayscale_image.shape[0], grayscale_image.shape[1], 3), dtype=np.uint8)
-
-        # Assign the grayscale image to each channel of the stacked image
-        stacked_image[:, :, 0] = grayscale_image
-        stacked_image[:, :, 1] = grayscale_image
-        stacked_image[:, :, 2] = grayscale_image
-
-
-        return self.resize_norm_img(stacked_image)
-
     def resize_norm_img(self,img,
                         padding=True,
                         interpolation=cv2.INTER_LINEAR):
@@ -125,13 +101,13 @@ class OCR():
 
         if hasattr(item,'shape'):
 
-            imgs.append(np.expand_dims(self.preprocess_img(item),axis=0))
+            imgs.append(np.expand_dims(self.resize_norm_img(item),axis=0))
 
         elif isinstance(item,str):
 
             with open(item, 'rb') as f:
                 content=f.read()
-            imgs.append(np.expand_dims(self.preprocess_img(self.img_decode(content)),axis=0))
+            imgs.append(np.expand_dims(self.resize_norm_img(img_decode(content)),axis=0))
 
         else:
             return "Error: Invalid Input"
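As the dispatch in predict shows, OCR accepts either in-memory arrays (anything with a .shape attribute, fed straight to resize_norm_img) or file paths (read as bytes and decoded via the shared utils.img_decode). A usage sketch, under the assumption that a local sample.jpg exists:

from ocr_inference import OCR
import cv2

ocr = OCR('models/ocr.xml')  # same model path app.py now uses

img = cv2.imread('sample.jpg')      # assumed test image
print(ocr.predict([img]))           # ndarray input

print(ocr.predict(['sample.jpg']))  # str input: file is read and decoded with img_decode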
text_detection.py
CHANGED
@@ -13,9 +13,9 @@ limitations under the License.
 
 import numpy as np
 from openvino.runtime import Core
-from utils import DBPostProcess
+from utils import DBPostProcess,img_decode
 import cv2
-from ocr_inference import OCR
+
 
 
 
@@ -29,8 +29,6 @@ class Text_Detection():
 
         model = ie.read_model(model=model_path)
         self.compiled_model = ie.compile_model(model=model, device_name="CPU")
-        self.ocr=OCR('models/ocr_fp16.xml')
-
         self.input_layer = self.compiled_model.input(0)
         self.output_layer = self.compiled_model.output(0)
         self.show_frame = None
@@ -42,16 +40,7 @@ class Text_Detection():
         self.std= [0.229, 0.224, 0.225]
         self.postprocess_detection=DBPostProcess()
 
-    def img_decode(self, img):
-
-        img = np.frombuffer(img, dtype='uint8')
-        img = cv2.imdecode(img, 1)
-        # print(img.shape)
 
-        return img
-
-    def preprocess_img(self, img):
-        return self.resize_norm_img(img)
 
     def resize_norm_img(self, img,):
 
@@ -122,15 +111,15 @@ class Text_Detection():
         for item in src:
 
             if hasattr(item, 'shape'):
-                preprocessed_data=self.preprocess_img(item)
+                preprocessed_data=self.resize_norm_img(item)
                 src_imgs.append(item)
 
             elif isinstance(item, str):
 
                 with open(item, 'rb') as f:
                     content = f.read()
-                decoded_img=self.img_decode(content)
-                preprocessed_data = self.preprocess_img(decoded_img)
+                decoded_img=img_decode(content)
+                preprocessed_data = self.resize_norm_img(decoded_img)
                 src_imgs.append(decoded_img)
 
             else:
@@ -145,32 +134,7 @@ class Text_Detection():
 
         outputs = self.compiled_model([blob])[self.output_layer]
         outputs=self.postprocess_detection(outputs,shape_list)
-
-
-        for i,src_img in enumerate(src_imgs):
-            dt_boxes = outputs[0][i]['points']
-            dt_scores= outputs[1][i]['scores']
-
-
-            if len(dt_boxes) > 0:
-                j=0
-                for score, box in zip(dt_scores, dt_boxes):
-                    pts = np.array(box).astype(np.int32).reshape((-1, 1, 2))
-                    mask = np.zeros(src_img.shape[:2], dtype=np.uint8)
-                    cv2.fillPoly(mask, [pts], 255)
-
-                    # Extract the region
-                    result = cv2.bitwise_and(src_img,src_img, mask=mask)
-
-                    # Find bounding box and crop
-                    x, y, w, h = cv2.boundingRect(pts)
-                    cropped = result[y:y + h, x:x + w,:]
-                    #cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2)
-                    #cv2.imwrite(f"cropped/output_{i}_{j}.jpg",cropped)
-                    texts.append(self.ocr.predict([cropped]))
-                    j+=1
-
-        return "\n".join(texts)
+        return outputs
 
 
 
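With the recognition loop moved out, Text_Detection.predict now stops after DBPostProcess and returns its raw [boxes_batch, scores_batch] output; callers unpack it the way app.py does. A small consumption sketch (the invoice.jpg test image is an assumption):

from text_detection import Text_Detection
import cv2

text_detector = Text_Detection('models/text_detector.xml')
img = cv2.imread('invoice.jpg')     # assumed test image

outputs = text_detector.predict([img])
dt_boxes = outputs[0][0]['points']  # boxes for image 0, sorted top-to-bottom
dt_scores = outputs[1][0]['scores'] # matching confidence scores

for score, box in zip(dt_scores, dt_boxes):
    print(f"score={score:.3f} box={box}")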
utils.py
CHANGED
@@ -19,6 +19,12 @@ from shapely.geometry import Polygon
 import pyclipper
 
 
+def img_decode(img):
+    img = np.frombuffer(img, dtype='uint8')
+    img = cv2.imdecode(img, 1)
+    # print(img.shape)
+
+    return img
 
 
 class DBPostProcess(object):
@@ -213,6 +219,17 @@ class DBPostProcess(object):
         return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]
 
 
+    def sort_boxes(self,boxes):
+
+        ymin_list=[]
+        for box in boxes:
+            xmin,ymin=box[0]
+            ymin_list.append(ymin)
+        ymin_sorted_indices=np.argsort(ymin_list)
+        boxes=[boxes[i] for i in ymin_sorted_indices]
+        return boxes
+
+
     def __call__(self,pred, shape_list):
         pred = pred[:, 0, :, :]
         segmentation = pred > self.thresh
@@ -231,12 +248,12 @@ class DBPostProcess(object):
                 boxes, scores = self.polygons_from_bitmap(pred[batch_index],
                                                           mask, src_w, src_h)
             elif self.box_type == 'quad':
-                print(mask.shape)
                 boxes, scores = self.boxes_from_bitmap(pred[batch_index], mask,
                                                        src_w, src_h)
             else:
                 raise ValueError("box_type can only be one of ['quad', 'poly']")
 
+            boxes=self.sort_boxes(boxes)
            boxes_batch.append({'points': boxes})
            scores_batch.append({'scores':scores})
        return [boxes_batch,scores_batch]
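sort_boxes is the commit's namesake: it orders detections by the y-coordinate of each box's first (top-left) point so the OCR output reads top to bottom. Note that it ignores x entirely, so boxes that share a text line keep whatever order the detector produced; a full reading-order sort would break ties on x. A standalone sketch of the same logic on made-up boxes:

import numpy as np

def sort_boxes(boxes):
    # Sort quads by the y of their first point (the top-left corner).
    ymins = [box[0][1] for box in boxes]
    return [boxes[i] for i in np.argsort(ymins)]

# Three made-up quads, listed out of reading order.
boxes = [
    [[10, 120], [90, 120], [90, 140], [10, 140]],  # bottom line
    [[10,  10], [90,  10], [90,  30], [10,  30]],  # top line
    [[10,  60], [90,  60], [90,  80], [10,  80]],  # middle line
]
for b in sort_boxes(boxes):
    print(b[0])  # [10, 10] then [10, 60] then [10, 120]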