vk committed
Commit · 9a0940b
1 Parent(s): 0b577b8

bounding box sorting

Browse files:
- app.py +33 -2
- models/{ocr_fp16.bin → ocr.bin} +0 -0
- models/{ocr_fp16.xml → ocr.xml} +0 -0
- ocr_inference.py +3 -27
- text_detection.py +6 -42
- utils.py +18 -1
app.py
CHANGED
@@ -1,19 +1,50 @@
 from text_detection import Text_Detection
+from ocr_inference import OCR
+import numpy as np
+import cv2
 import gradio as gr
 
 def get_response(input_img):
 
-
+    if hasattr(input_img,'shape'):
+        src_img=input_img.copy()
+        outputs=text_detector.predict([input_img])
+
+
+        texts = ["Found texts:"]
+        dt_boxes = outputs[0][0]['points']
+        dt_scores = outputs[1][0]['scores']
+
+        if len(dt_boxes) > 0:
+            j = 0
+            for score, box in zip(dt_scores, dt_boxes):
+                pts = np.array(box).astype(np.int32).reshape((-1, 1, 2))
+                mask = np.zeros(src_img.shape[:2], dtype=np.uint8)
+                cv2.fillPoly(mask, [pts], 255)
+
+                # Extract the region
+                result = cv2.bitwise_and(src_img, src_img, mask=mask)
+
+                # Find bounding box and crop
+                x, y, w, h = cv2.boundingRect(pts)
+                cropped = result[y:y + h, x:x + w, :]
+                # cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2)
+                # cv2.imwrite(f"cropped/output_{i}_{j}.jpg",cropped)
+                texts.append(ocr.predict([cropped]))
+                j += 1
+
+        return "\n".join(texts)
 
 
 if __name__ == "__main__":
     text_detector=Text_Detection('models/text_detector.xml')
+    ocr=OCR('models/ocr.xml')
     iface = gr.Interface(
         fn=get_response,
         inputs=gr.Image(type="numpy"),  # Accepts image input
         outputs=gr.Textbox(),
         title="SVTR-OCR-App",
-        description="Upload images for accurate English
+        description="Upload images for fast & accurate English(Latin) OCR. Works best with image format of invoice, documents"
     )
 
     iface.launch(share=True)
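The loop added to get_response isolates each detected quadrilateral before recognition: the four corner points are rasterized into a binary mask with cv2.fillPoly, pixels outside the polygon are blanked with cv2.bitwise_and, and the result is cropped to the quad's axis-aligned bounding rectangle. A minimal self-contained sketch of that crop step — the gray test image and hard-coded box are illustrative, not from this repo:

import cv2
import numpy as np

# Illustrative input: a flat gray image and one quadrilateral "detection".
img = np.full((200, 200, 3), 128, dtype=np.uint8)
box = [[30, 40], [160, 50], [155, 90], [25, 80]]  # four corners, roughly clockwise

pts = np.array(box).astype(np.int32).reshape((-1, 1, 2))

# Rasterize the quad into a mask and blank everything outside it.
mask = np.zeros(img.shape[:2], dtype=np.uint8)
cv2.fillPoly(mask, [pts], 255)
region = cv2.bitwise_and(img, img, mask=mask)

# Crop to the quad's axis-aligned bounding rectangle.
x, y, w, h = cv2.boundingRect(pts)
cropped = region[y:y + h, x:x + w, :]
print(cropped.shape)  # (51, 136, 3) for the box above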
models/{ocr_fp16.bin → ocr.bin}
RENAMED
File without changes

models/{ocr_fp16.xml → ocr.xml}
RENAMED
File without changes
ocr_inference.py
CHANGED
@@ -16,7 +16,7 @@ import numpy as np
 from openvino.runtime import Core
 import math
 import cv2
-from utils import CTCLabelDecode
+from utils import CTCLabelDecode,img_decode
 
 
 
@@ -43,32 +43,8 @@ class OCR():
         self.dynamic_width=False
 
 
-
 
 
-    def img_decode(self,img):
-
-        img = np.frombuffer(img, dtype='uint8')
-        img=cv2.imdecode(img, 1)
-        #print(img.shape)
-
-        return img
-
-    def preprocess_img(self,img):
-
-        grayscale_image = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
-
-        # Create an empty array of shape (height, width, 3) for the stacked image
-        stacked_image = np.zeros((grayscale_image.shape[0], grayscale_image.shape[1], 3), dtype=np.uint8)
-
-        # Assign the grayscale image to each channel of the stacked image
-        stacked_image[:, :, 0] = grayscale_image
-        stacked_image[:, :, 1] = grayscale_image
-        stacked_image[:, :, 2] = grayscale_image
-
-
-        return self.resize_norm_img(stacked_image)
-
     def resize_norm_img(self,img,
                         padding=True,
                         interpolation=cv2.INTER_LINEAR):
@@ -125,13 +101,13 @@ class OCR():
 
         if hasattr(item,'shape'):
 
-            imgs.append(np.expand_dims(self.preprocess_img(item),axis=0))
+            imgs.append(np.expand_dims(self.resize_norm_img(item),axis=0))
 
         elif isinstance(item,str):
 
             with open(item, 'rb') as f:
                 content=f.read()
-            imgs.append(np.expand_dims(self.preprocess_img(self.img_decode(content)),axis=0))
+            imgs.append(np.expand_dims(self.resize_norm_img(img_decode(content)),axis=0))
 
         else:
             return "Error: Invalid Input"
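As the dispatch in predict shows, OCR accepts either in-memory arrays (anything with a .shape attribute, fed straight to resize_norm_img) or file paths (read as bytes and decoded via the shared utils.img_decode). A usage sketch, under the assumption that a local sample.jpg exists:

from ocr_inference import OCR
import cv2

ocr = OCR('models/ocr.xml')  # same model path app.py now uses

img = cv2.imread('sample.jpg')      # assumed test image
print(ocr.predict([img]))           # ndarray input

print(ocr.predict(['sample.jpg']))  # str input: file is read and decoded with img_decode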
text_detection.py
CHANGED
@@ -13,9 +13,9 @@ limitations under the License.
 
 import numpy as np
 from openvino.runtime import Core
-from utils import DBPostProcess
+from utils import DBPostProcess,img_decode
 import cv2
-from ocr_inference import OCR
+
 
 
 
@@ -29,8 +29,6 @@ class Text_Detection():
 
         model = ie.read_model(model=model_path)
         self.compiled_model = ie.compile_model(model=model, device_name="CPU")
-        self.ocr=OCR('models/ocr_fp16.xml')
-
         self.input_layer = self.compiled_model.input(0)
         self.output_layer = self.compiled_model.output(0)
         self.show_frame = None
@@ -42,16 +40,7 @@ class Text_Detection():
         self.std= [0.229, 0.224, 0.225]
         self.postprocess_detection=DBPostProcess()
 
-    def img_decode(self, img):
-
-        img = np.frombuffer(img, dtype='uint8')
-        img = cv2.imdecode(img, 1)
-        # print(img.shape)
 
-        return img
-
-    def preprocess_img(self, img):
-        return self.resize_norm_img(img)
 
     def resize_norm_img(self, img,):
 
@@ -122,15 +111,15 @@ class Text_Detection():
         for item in src:
 
             if hasattr(item, 'shape'):
-                preprocessed_data=self.preprocess_img(item)
+                preprocessed_data=self.resize_norm_img(item)
                 src_imgs.append(item)
 
             elif isinstance(item, str):
 
                 with open(item, 'rb') as f:
                     content = f.read()
-                decoded_img=self.img_decode(content)
-                preprocessed_data = self.preprocess_img(decoded_img)
+                decoded_img=img_decode(content)
+                preprocessed_data = self.resize_norm_img(decoded_img)
                 src_imgs.append(decoded_img)
 
             else:
@@ -145,32 +134,7 @@ class Text_Detection():
 
         outputs = self.compiled_model([blob])[self.output_layer]
         outputs=self.postprocess_detection(outputs,shape_list)
-
-
-        for i,src_img in enumerate(src_imgs):
-            dt_boxes = outputs[0][i]['points']
-            dt_scores= outputs[1][i]['scores']
-
-
-            if len(dt_boxes) > 0:
-                j=0
-                for score, box in zip(dt_scores, dt_boxes):
-                    pts = np.array(box).astype(np.int32).reshape((-1, 1, 2))
-                    mask = np.zeros(src_img.shape[:2], dtype=np.uint8)
-                    cv2.fillPoly(mask, [pts], 255)
-
-                    # Extract the region
-                    result = cv2.bitwise_and(src_img,src_img, mask=mask)
-
-                    # Find bounding box and crop
-                    x, y, w, h = cv2.boundingRect(pts)
-                    cropped = result[y:y + h, x:x + w,:]
-                    #cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2)
-                    #cv2.imwrite(f"cropped/output_{i}_{j}.jpg",cropped)
-                    texts.append(self.ocr.predict([cropped]))
-                    j+=1
-
-        return "\n".join(texts)
+        return outputs
 
 
 
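With the recognition loop moved out, Text_Detection.predict now stops after DBPostProcess and returns its raw [boxes_batch, scores_batch] output; callers unpack it the way app.py does. A small consumption sketch (the invoice.jpg test image is an assumption):

from text_detection import Text_Detection
import cv2

text_detector = Text_Detection('models/text_detector.xml')
img = cv2.imread('invoice.jpg')     # assumed test image

outputs = text_detector.predict([img])
dt_boxes = outputs[0][0]['points']  # boxes for image 0, sorted top-to-bottom
dt_scores = outputs[1][0]['scores'] # matching confidence scores

for score, box in zip(dt_scores, dt_boxes):
    print(f"score={score:.3f} box={box}")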
utils.py
CHANGED
@@ -19,6 +19,12 @@ from shapely.geometry import Polygon
 import pyclipper
 
 
+def img_decode(img):
+    img = np.frombuffer(img, dtype='uint8')
+    img = cv2.imdecode(img, 1)
+    # print(img.shape)
+
+    return img
 
 
 class DBPostProcess(object):
@@ -213,6 +219,17 @@ class DBPostProcess(object):
         return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]
 
 
+    def sort_boxes(self,boxes):
+
+        ymin_list=[]
+        for box in boxes:
+            xmin,ymin=box[0]
+            ymin_list.append(ymin)
+        ymin_sorted_indices=np.argsort(ymin_list)
+        boxes=[boxes[i] for i in ymin_sorted_indices]
+        return boxes
+
+
     def __call__(self,pred, shape_list):
         pred = pred[:, 0, :, :]
         segmentation = pred > self.thresh
@@ -231,12 +248,12 @@ class DBPostProcess(object):
                 boxes, scores = self.polygons_from_bitmap(pred[batch_index],
                                                           mask, src_w, src_h)
             elif self.box_type == 'quad':
-                print(mask.shape)
                 boxes, scores = self.boxes_from_bitmap(pred[batch_index], mask,
                                                        src_w, src_h)
             else:
                 raise ValueError("box_type can only be one of ['quad', 'poly']")
 
+            boxes=self.sort_boxes(boxes)
            boxes_batch.append({'points': boxes})
            scores_batch.append({'scores':scores})
        return [boxes_batch,scores_batch]
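sort_boxes is the commit's namesake: it orders detections by the y-coordinate of each box's first (top-left) point so the OCR output reads top to bottom. Note that it ignores x entirely, so boxes that share a text line keep whatever order the detector produced; a full reading-order sort would break ties on x. A standalone sketch of the same logic on made-up boxes:

import numpy as np

def sort_boxes(boxes):
    # Sort quads by the y of their first point (the top-left corner).
    ymins = [box[0][1] for box in boxes]
    return [boxes[i] for i in np.argsort(ymins)]

# Three made-up quads, listed out of reading order.
boxes = [
    [[10, 120], [90, 120], [90, 140], [10, 140]],  # bottom line
    [[10,  10], [90,  10], [90,  30], [10,  30]],  # top line
    [[10,  60], [90,  60], [90,  80], [10,  80]],  # middle line
]
for b in sort_boxes(boxes):
    print(b[0])  # [10, 10] then [10, 60] then [10, 120]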