b/acabs.py

import cv2
import pytesseract
import numpy as np
import os


# tesseract dependencies
pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
tessdata_dir_config = '--tessdata-dir "C:\\Program Files\\Tesseract-OCR\\tessdata"'

# the window must exist before cv2.resizeWindow is called, otherwise OpenCV raises a null-window error
cv2.namedWindow('output', cv2.WINDOW_NORMAL)
cv2.resizeWindow('output', 400, 400)


# OCR the image, one text block at a time
def ACABS(imgg):
    img = cv2.imread(imgg)

    output = img.copy()
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)

    # crop 10 columns on the right and on the left of the image to
    # avoid bugs with the segmentation algorithm on the border of the image
    h, w = gray.shape
    gray = np.array(gray[0:0 + h, 10: w - 10])

    # clean the image using Otsu's method on the inverted binarized image
    ret1, th1 = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    boxs = create_bounding_box(th1, output)
    crop_top, crop_bottom = crop(boxs, gray, 0.10)
    text = ocr(gray, crop_top, crop_bottom, boxs)

    return text


# apply dilation and erosion, then
# create the bounding boxes of the text blocks
def create_bounding_box(thresh, output):
    boxs = []

    # assign a rectangular kernel size
    k1 = np.ones((15, 15), 'uint8')

    par_img = cv2.dilate(thresh, k1, iterations=3)
    par_img2 = cv2.erode(par_img, kernel=np.ones((5, 5), 'uint8'), iterations=3)

    (contours, _) = cv2.findContours(par_img2.copy(), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)

    # create the bounding boxes
    for cnt in reversed(contours):
        x, y, w, h = cv2.boundingRect(cnt)
        boxs.append([x, y, w, h])

        # draw the bounding box on the image
        # cv2.rectangle(output, (x, y), (x + w, y + h), (0, 255, 0), 1)

    # cv2.imwrite("../segmentation/adaptative_reding_exemple/repport_9_box.jpg", output)
    return boxs
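
# Shape of the list returned by create_bounding_box (the numbers below are purely illustrative):
#   boxs == [[x, y, w, h], ...]   e.g. [[12, 34, 250, 40], [12, 90, 500, 120]]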


# define the cropping lines
def crop(boxs, gray, percent):

    h, w = gray.shape

    # define the percentage of vertical (Y) crop on the image
    top_crop_percent = percent
    bottom_crop_percent = percent
    crop_top = h * top_crop_percent
    crop_bottom = h * bottom_crop_percent

    # debug copy of the image used by the drawing calls below
    output = gray.copy()

    # Adapt the crop to avoid cutting through the middle of a bounding box:
    # if the crop line falls between y and y + h of a box, the crop becomes y + h
    # box[0] : x
    # box[1] : y
    # box[2] : w
    # box[3] : h
    change_bottom = False
    for box in boxs:

        if inbetween(box[1], crop_top, box[1] + box[3]) == crop_top:
            crop_top = box[1] + box[3]

        if inbetween(box[1], h - crop_bottom, box[1] + box[3]) == h - crop_bottom:
            crop_bottom = box[1] + box[3]
            change_bottom = True

        cv2.rectangle(output, (box[0], box[1]), (box[0] + box[2], box[1] + box[3]), (0, 255, 0), 10)

    # round to avoid errors with non-integer coordinates
    crop_top = round(crop_top)
    crop_bottom = round(crop_bottom)

    # if the bottom crop hasn't changed, convert it from a bottom offset to an absolute Y coordinate
    if not change_bottom:
        crop_bottom = h - crop_bottom

    # draw the bottom and top threshold lines on the image
    # bottom
    # output = cv2.line(output, (0, crop_bottom), (w, crop_bottom), (0, 0, 255), 5)
    # top
    # output = cv2.line(output, (0, crop_top), (w, crop_top), (0, 0, 255), 5)

    # cv2.imwrite("../segmentation/adaptative_reding_exemple/repport_9_box.jpg", output)

    return crop_top, crop_bottom


# return the middle value of (min, val, max); it equals val exactly when val lies between min and max
def inbetween(min, val, max):
    return sorted([min, val, max])[1]
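
# Quick illustration of how crop() uses inbetween() (the numbers are purely illustrative):
#   inbetween(40, 120, 300) -> 120   the crop line (120) falls inside the box [40, 300], so it is moved to y + h
#   inbetween(40, 20, 300)  -> 40    the crop line (20) falls outside the box, so it is left unchanged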


# extract the text of each bounding box
def ocr(gray, crop_top, crop_bottom, cropped_boxs):

    text = ""

    for box in cropped_boxs:
        # box[0] : x
        # box[1] : y
        # box[2] : w
        # box[3] : h

        if (box[1] > crop_top) and (box[1] + box[3] <= crop_bottom):
            ocr_box = np.array(gray[box[1]:box[1] + box[3], box[0]:box[0] + box[2]])

            # check if the array contains only 255
            # if all(all(p == 255 for p in lines) for lines in ocr_box):

            # skip the box if more than 95% of the array is 255 (an almost blank box)
            flattened = np.ravel(ocr_box)
            if not (np.sum(flattened == 255) / len(flattened) > 0.95):

                # cv2.imshow('ok', ocr_box)
                # cv2.waitKey(0)
                # pass the tessdata directory configuration defined at the top of the file
                data = pytesseract.image_to_string(ocr_box, lang='fra', config=tessdata_dir_config)

                # \jump is used as a delimiter of each text block
                # in the NER annotator software
                data = data.replace(' \n\n', '')
                data = data.replace(' \n\x0c', '')
                data = data.replace('\x0c', '')

                text += data

    return text
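

# Minimal usage sketch: "report.jpg" is a hypothetical example path, assuming a
# scanned report image is available locally; replace it with a real file before running.
if __name__ == "__main__":
    sample_path = "report.jpg"  # hypothetical example path
    if os.path.exists(sample_path):
        print(ACABS(sample_path))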