Diff of /acabs.py [000000] .. [036ed5]

Switch to unified view

a b/acabs.py
1
import cv2
2
import pytesseract
3
import numpy as np
4
import os
5
6
7
# Tesseract dependencies.
# Point pytesseract at the default Windows install location only when that
# executable actually exists; otherwise leave pytesseract's own default in
# place so the `tesseract` binary is looked up on PATH.
_TESSERACT_CMD = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
if os.path.isfile(_TESSERACT_CMD):
    pytesseract.pytesseract.tesseract_cmd = _TESSERACT_CMD
tessdata_dir_config = '--tessdata-dir "C:\\Program Files\\Tesseract-OCR\\tessdata"'

# Best-effort sizing of the 'output' debug window. This module never creates
# that window itself, and recent OpenCV builds raise cv2.error when asked to
# resize a window that does not exist (or when built without GUI support), so
# a failure here must not prevent the module from being imported.
try:
    cv2.resizeWindow('output', 400, 400)
except cv2.error:
    pass
12
13
# OCR in text block
14
def ACABS(imgg):
    """Segment an image into text blocks and OCR the blocks of its body.

    The image is loaded, converted to grayscale, trimmed by 10 columns on the
    left and on the right, binarised with an inverted Otsu threshold, split
    into bounding boxes, and the boxes lying between the adaptive top and
    bottom cut lines (10% of the height each) are sent to the OCR step.

    Args:
        imgg: Path of the image file, handed to ``cv2.imread``.

    Returns:
        The text produced by ``ocr`` for the retained boxes.

    Raises:
        IOError: If ``cv2.imread`` cannot read or decode the file.
    """
    img = cv2.imread(imgg)
    # cv2.imread signals failure by returning None instead of raising.
    if img is None:
        raise IOError("cannot read image: {!r}".format(imgg))

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # Crop 10 columns on the right and left of the image to avoid bugs with
    # the segmentation algorithm on the border of the image.
    _, width = gray.shape
    gray = gray[:, 10:width - 10].copy()

    # Clean the image using Otsu's method on the inverted binarised image.
    _, binary = cv2.threshold(
        gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # `img` is not needed past this point, so it is handed over directly as
    # the debug canvas. NOTE(review): box coordinates are relative to the
    # trimmed image, i.e. shifted by 10 px against this canvas.
    boxs = create_bounding_box(binary, img)
    crop_top, crop_bottom = crop(boxs, gray, 0.10)
    return ocr(gray, crop_top, crop_bottom, boxs)
33
34
35
# apply dilation and erosion
36
# create bounding box
37
def create_bounding_box(thresh, output):
    """Merge nearby foreground pixels into blocks and return their boxes.

    The binary image is dilated (15x15 kernel, 3 iterations) and then eroded
    (5x5 kernel, 3 iterations) so that neighbouring foreground pixels fuse
    into connected blobs; one bounding rectangle is returned per contour.

    Args:
        thresh: Binary image to segment (foreground non-zero).
        output: Image kept for optional debug drawing; currently unused.

    Returns:
        A list of ``[x, y, w, h]`` lists, in the reverse of the order in
        which ``cv2.findContours`` reports the contours.
    """
    dilate_kernel = np.ones((15, 15), 'uint8')
    erode_kernel = np.ones((5, 5), 'uint8')

    merged = cv2.dilate(thresh, dilate_kernel, iterations=3)
    merged = cv2.erode(merged, kernel=erode_kernel, iterations=3)

    # findContours returns (contours, hierarchy) on OpenCV 2.x/4.x but
    # (image, contours, hierarchy) on 3.x; the contour list is always the
    # second-to-last element. `merged` is a private intermediate, so no
    # defensive copy is needed even on versions that modify their input.
    contours = cv2.findContours(
        merged, cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)[-2]

    # Reversed order is presumably meant to approximate top-to-bottom reading
    # order -- TODO confirm against the OpenCV version in use.
    return [list(cv2.boundingRect(cnt)) for cnt in reversed(contours)]
58
59
#define line of cropping
60
def crop(boxs, gray, percent):
    """Compute the top and bottom cut lines, snapped to bounding boxes.

    Both lines start ``percent`` of the image height away from their edge.
    A line that would cut through a bounding box is moved to the lower edge
    (``y + h``) of that box, so no box is split by a cut line.

    Args:
        boxs: Iterable of boxes indexed as ``[x, y, w, h]``.
        gray: Image array; only its height (``gray.shape[0]``) is used.
        percent: Fraction of the height to crop at the top and at the bottom.

    Returns:
        ``(crop_top, crop_bottom)`` as integer row indices measured from the
        top of the image.
    """
    height = gray.shape[0]
    margin = height * percent

    # Both lines are tracked as absolute rows for the whole loop. The bottom
    # line used to start as a distance from the bottom edge and then be
    # overwritten with an absolute row, so later boxes were tested against a
    # meaningless position and could drag the bottom line to the page top.
    top_line = margin
    bottom_line = height - margin
    bottom_moved = False

    for box in boxs:
        box_start = box[1]
        box_end = box[1] + box[3]

        if box_start <= top_line <= box_end:
            top_line = box_end

        if box_start <= bottom_line <= box_end:
            bottom_line = box_end
            bottom_moved = True

    # Round to get integer row indices.
    crop_top = round(top_line)
    if bottom_moved:
        crop_bottom = round(bottom_line)
    else:
        # Untouched bottom line: round the margin first, then subtract, which
        # keeps the historical result for heights where the margin ends in .5.
        crop_bottom = height - round(margin)

    return crop_top, crop_bottom
108
109
110
#return the median of min, val and max (it equals val when val lies between min and max)
111
def inbetween(min, val, max):
    """Return the median of the three arguments.

    The result equals ``val`` exactly when ``val`` lies between ``min`` and
    ``max`` (bounds included), which is how callers use it as a range test.

    Note: the parameter names shadow the ``min``/``max`` builtins inside this
    function; they are kept unchanged for compatibility with callers.
    """
    _, middle, _ = sorted((min, val, max))
    return middle
113
114
115
#extract text for each bounding box
116
def ocr(gray, crop_top, crop_bottom, cropped_boxs):
    """Extract the text of every bounding box lying between the cut lines.

    A box is kept when it starts strictly below ``crop_top`` and ends at or
    above ``crop_bottom``. Kept boxes that are empty or more than 95% pure
    white (pixel value 255) are skipped; the others are OCR'd in French.

    Args:
        gray: 2-D grayscale image array the boxes refer to.
        crop_top: Row of the top cut line.
        crop_bottom: Row of the bottom cut line.
        cropped_boxs: Iterable of boxes indexed as ``[x, y, w, h]``.

    Returns:
        The OCR output of all retained boxes, concatenated in iteration order.
    """
    chunks = []

    for box in cropped_boxs:
        x, y, w, h = box[0], box[1], box[2], box[3]

        # Keep only boxes located entirely between the two cut lines.
        if not (y > crop_top and y + h <= crop_bottom):
            continue

        region = np.array(gray[y:y + h, x:x + w])

        # An empty slice has nothing to read (and would make the ratio 0/0).
        if region.size == 0:
            continue

        # Skip boxes that are more than 95% white: there is no text to read.
        if np.count_nonzero(region == 255) / region.size > 0.95:
            continue

        data = pytesseract.image_to_string(region, lang='fra')

        # Strip the trailing blank-line / form-feed sequences from the OCR
        # output (the form feed is presumably Tesseract's page separator) so
        # consecutive blocks are concatenated without extra terminators.
        data = data.replace(' \n\n', '')
        data = data.replace(' \n\x0c', '')
        data = data.replace('\x0c', '')

        chunks.append(data)

    return "".join(chunks)