b/acabs.py

import cv2
import pytesseract
import numpy as np
import os


# tesseract dependencies
pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
tessdata_dir_config = '--tessdata-dir "C:\\Program Files\\Tesseract-OCR\\tessdata"'

# the window must exist before cv2.resizeWindow is called, otherwise OpenCV raises a null-window error
cv2.namedWindow('output', cv2.WINDOW_NORMAL)
cv2.resizeWindow('output', 400, 400)


# OCR the image, one text block at a time
def ACABS(imgg):
    img = cv2.imread(imgg)

    output = img.copy()
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # crop 10 columns on the right and on the left of the image to
    # avoid bugs with the segmentation algorithm on the border of the image
    h, w = gray.shape
    gray = np.array(gray[0:0 + h, 10: w - 10])

    # clean the image using Otsu's method on the inverted binarized image
    ret1, th1 = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    boxs = create_bounding_box(th1, output)
    crop_top, crop_bottom = crop(boxs, gray, 0.10)
    text = ocr(gray, crop_top, crop_bottom, boxs)

    return text


# apply dilation and erosion, then
# create the bounding boxes of the text blocks
def create_bounding_box(thresh, output):
    boxs = []

    # assign a rectangular kernel size
    k1 = np.ones((15, 15), 'uint8')

    par_img = cv2.dilate(thresh, k1, iterations=3)
    par_img2 = cv2.erode(par_img, kernel=np.ones((5, 5), 'uint8'), iterations=3)

    (contours, _) = cv2.findContours(par_img2.copy(), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)

    # create the bounding boxes
    for cnt in reversed(contours):
        x, y, w, h = cv2.boundingRect(cnt)
        boxs.append([x, y, w, h])

        # draw the bounding box on the image
        # cv2.rectangle(output, (x, y), (x + w, y + h), (0, 255, 0), 1)

    # cv2.imwrite("../segmentation/adaptative_reding_exemple/repport_9_box.jpg", output)
    return boxs
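
# Shape of the list returned by create_bounding_box (the numbers below are purely illustrative):
#   boxs == [[x, y, w, h], ...]   e.g. [[12, 34, 250, 40], [12, 90, 500, 120]]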


# define the cropping lines
def crop(boxs, gray, percent):

    h, w = gray.shape

    # define the percentage of vertical (Y) crop on the image
    top_crop_percent = percent
    bottom_crop_percent = percent
    crop_top = h * top_crop_percent
    crop_bottom = h * bottom_crop_percent

    # debug copy of the image used by the drawing calls below
    output = gray.copy()

    # Adapt the crop to avoid cutting through the middle of a bounding box:
    # if the crop line falls between y and y + h of a box, the crop becomes y + h
    # box[0] : x
    # box[1] : y
    # box[2] : w
    # box[3] : h
    change_bottom = False
    for box in boxs:

        if inbetween(box[1], crop_top, box[1] + box[3]) == crop_top:
            crop_top = box[1] + box[3]

        if inbetween(box[1], h - crop_bottom, box[1] + box[3]) == h - crop_bottom:
            crop_bottom = box[1] + box[3]
            change_bottom = True

        cv2.rectangle(output, (box[0], box[1]), (box[0] + box[2], box[1] + box[3]), (0, 255, 0), 10)

    # round to avoid errors with non-integer coordinates
    crop_top = round(crop_top)
    crop_bottom = round(crop_bottom)

    # if the bottom crop hasn't changed, convert it from a bottom offset to an absolute Y coordinate
    if not change_bottom:
        crop_bottom = h - crop_bottom

    # draw the bottom and top threshold lines on the image
    # bottom
    # output = cv2.line(output, (0, crop_bottom), (w, crop_bottom), (0, 0, 255), 5)
    # top
    # output = cv2.line(output, (0, crop_top), (w, crop_top), (0, 0, 255), 5)

    # cv2.imwrite("../segmentation/adaptative_reding_exemple/repport_9_box.jpg", output)

    return crop_top, crop_bottom


# return the middle value of (min, val, max); it equals val exactly when val lies between min and max
def inbetween(min, val, max):
    return sorted([min, val, max])[1]
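
# Quick illustration of how crop() uses inbetween() (the numbers are purely illustrative):
#   inbetween(40, 120, 300) -> 120   the crop line (120) falls inside the box [40, 300], so it is moved to y + h
#   inbetween(40, 20, 300)  -> 40    the crop line (20) falls outside the box, so it is left unchanged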


# extract the text of each bounding box
def ocr(gray, crop_top, crop_bottom, cropped_boxs):

    text = ""

    for box in cropped_boxs:
        # box[0] : x
        # box[1] : y
        # box[2] : w
        # box[3] : h

        if (box[1] > crop_top) and (box[1] + box[3] <= crop_bottom):
            ocr_box = np.array(gray[box[1]:box[1] + box[3], box[0]:box[0] + box[2]])

            # check if the array contains only 255
            # if all(all(p == 255 for p in lines) for lines in ocr_box):

            # skip the box if more than 95% of the array is 255 (an almost blank box)
            flattened = np.ravel(ocr_box)
            if not (np.sum(flattened == 255) / len(flattened) > 0.95):

                # cv2.imshow('ok', ocr_box)
                # cv2.waitKey(0)
                # pass the tessdata directory configuration defined at the top of the file
                data = pytesseract.image_to_string(ocr_box, lang='fra', config=tessdata_dir_config)

                # \jump is used as a delimiter of each text block
                # in the NER annotator software
                data = data.replace(' \n\n', '')
                data = data.replace(' \n\x0c', '')
                data = data.replace('\x0c', '')

                text += data

    return text
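

# Minimal usage sketch: "report.jpg" is a hypothetical example path, assuming a
# scanned report image is available locally; replace it with a real file before running.
if __name__ == "__main__":
    sample_path = "report.jpg"  # hypothetical example path
    if os.path.exists(sample_path):
        print(ACABS(sample_path))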