[036ed5]: / acabs.py

Download this file

150 lines (105 with data), 4.5 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
import cv2
import pytesseract
import numpy as np
import os
# tesseract dependencies (absolute paths to the Windows Tesseract install)
pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
tessdata_dir_config = '--tessdata-dir "C:\\Program Files\\Tesseract-OCR\\tessdata"'
# BUG FIX: the original called cv2.resizeWindow('output', 400, 400) here at
# import time, but no window named 'output' is ever created with
# cv2.namedWindow (and nothing in this file ever shows an 'output' window),
# so the call raised cv2.error before any function could run. Removed.
# OCR in text block
def ACABS(imgg):
    """Run the segmentation + OCR pipeline on the image file at path *imgg*.

    Steps: read the image, grayscale it, trim 10 columns from each side
    (avoids segmentation artifacts at the border), binarize with inverted
    Otsu thresholding, find text-block bounding boxes, compute adaptive
    top/bottom crop lines, then OCR every kept block.

    Parameters:
        imgg: path of the image file to read.
    Returns:
        str: concatenated OCR text of all kept text blocks.
    Raises:
        FileNotFoundError: if the image cannot be read.
    """
    img = cv2.imread(imgg)
    if img is None:
        # cv2.imread returns None instead of raising on a missing/unreadable
        # file; fail early with a clear error instead of an AttributeError
        # on the .copy() below.
        raise FileNotFoundError("could not read image: {}".format(imgg))
    output = img.copy()
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # crop 10 columns on the right and left of the image to
    # avoid bugs with the segmentation algorithm on the border of the image
    h, w = gray.shape
    gray = np.array(gray[0:0 + h, 10: w - 10])
    # clean the image using Otsu's method on the inverted binarized image
    # (text pixels become white on black)
    ret1, th1 = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
    boxs = create_bounding_box(th1, output)
    crop_top, crop_bottom = crop(boxs, gray, 0.10)
    text = ocr(gray, crop_top, crop_bottom, boxs)
    return text
# apply dilation and erosion, then take one bounding box per contour
def create_bounding_box(thresh, output):
    """Merge nearby text pixels into blobs and return their bounding boxes.

    Parameters:
        thresh: inverted binary image (text pixels are white).
        output: colour copy of the source image, kept only for the optional
                debug drawing below (currently disabled).
    Returns:
        list of [x, y, w, h] boxes, one per detected blob, in reversed
        contour order.
    """
    # a generous rectangular kernel glues the characters of a block together
    dilated = cv2.dilate(thresh, np.ones((15, 15), 'uint8'), iterations=3)
    # a smaller erosion shrinks the blobs back without splitting them
    eroded = cv2.erode(dilated, kernel=np.ones((5, 5), 'uint8'), iterations=3)
    (contours, _) = cv2.findContours(eroded.copy(), cv2.RETR_CCOMP, cv2.CHAIN_APPROX_SIMPLE)
    # one [x, y, w, h] entry per contour, reversed to keep reading order
    boxs = [list(cv2.boundingRect(cnt)) for cnt in reversed(contours)]
    # debug drawing kept from the original, disabled:
    # for x, y, w, h in boxs:
    #     cv2.rectangle(output, (x, y), (x + w, y + h), (0, 255, 0), 1)
    # cv2.imwrite("../segmentation/adaptative_reding_exemple/repport_9_box.jpg", output)
    return boxs
# define line of cropping
def crop(boxs, gray, percent):
    """Compute the top and bottom Y cut lines for the page.

    Starts with a cut at ``percent`` of the height from the top and from the
    bottom, then moves each cut line down to the bottom edge of any bounding
    box it would otherwise slice through.

    Parameters:
        boxs: list of [x, y, w, h] bounding boxes.
        gray: grayscale page image (used for its height/width).
        percent: fraction of the page height to trim (e.g. 0.10).
    Returns:
        (crop_top, crop_bottom): absolute Y coordinates (ints) of the top
        and bottom cut lines.
    """
    h, w = gray.shape
    # define percent of Y crop on the image
    top_crop_percent = percent
    bottom_crop_percent = percent
    crop_top = h * top_crop_percent
    # NOTE: until the end of the loop, crop_bottom is a margin measured from
    # the bottom of the page, not an absolute Y coordinate
    crop_bottom = h * bottom_crop_percent
    # debug copy used only for the (disabled) rectangle/line drawing below
    output = gray.copy()
    # Adapt crop to avoid cropping in the middle of a bounding box:
    # if the cut falls between Y and Y + H of a box, the cut becomes Y + H
    # box[0] : x
    # box[1] : y
    # box[2] : w
    # box[3] : h
    change_bottom = False
    for box in boxs:
        # inbetween() returns the median of its three arguments, so it equals
        # crop_top exactly when the cut line lies inside [y, y + h] of this box
        if (inbetween(box[1], crop_top, box[1] + box[3]) == crop_top):
            crop_top = box[1] + box[3]
        # same test against the bottom cut line, converted to an absolute
        # Y coordinate (h - margin) for the comparison
        if (inbetween(box[1], h-crop_bottom, box[1] + box[3]) == h-crop_bottom):
            # from here on crop_bottom IS an absolute Y coordinate
            crop_bottom = box[1] + box[3]
            change_bottom = True
        cv2.rectangle(output, (box[0], box[1]), (box[0] + box[2], box[1] + box[3]), (0, 255, 0), 10)
    # round to avoid float coordinates downstream
    crop_top = round(crop_top)
    crop_bottom = round(crop_bottom)
    # if the bottom crop was never adjusted, it is still a margin:
    # convert it to an absolute Y coordinate now
    if not change_bottom:
        crop_bottom = h-crop_bottom
    # print bottom and top threshold lines on the image (debug, disabled)
    # bottom
    #output = cv2.line(output, (0, crop_bottom), (w, crop_bottom), (0, 0, 255), 5)
    # top
    #output = cv2.line(output, (0, crop_top), (w, crop_top), (0, 0, 255), 5)
    #cv2.imwrite("../segmentation/adaptative_reding_exemple/repport_9_box.jpg", output)
    return crop_top, crop_bottom
# find if val is between min and max
def inbetween(min, val, max):
    """Return the median of the three arguments.

    When min <= max this clamps *val* into [min, max]; callers use the fact
    that the result equals *val* exactly when val lies between the bounds.
    (Parameter names shadow the builtins, so no builtin min/max here.)
    """
    lo, mid, hi = min, val, max
    # three compare-and-swap steps sort the triple; the middle slot is the median
    if lo > mid:
        lo, mid = mid, lo
    if mid > hi:
        mid, hi = hi, mid
    if lo > mid:
        lo, mid = mid, lo
    return mid
# extract text for each bounding box
def ocr(gray, crop_top, crop_bottom, cropped_boxs):
    """OCR every bounding box lying strictly inside the crop lines.

    Parameters:
        gray: grayscale page image.
        crop_top: absolute Y of the top cut line (boxes must start below it).
        crop_bottom: absolute Y of the bottom cut line (boxes must end at or
                     above it).
        cropped_boxs: list of [x, y, w, h] boxes.
    Returns:
        str: concatenation of the cleaned OCR text of every kept box.
    """
    pieces = []
    for x, y, w, h in cropped_boxs:
        # skip boxes that fall outside the kept vertical band
        if not (y > crop_top and y + h <= crop_bottom):
            continue
        ocr_box = np.array(gray[y:y + h, x:x + w])
        # skip boxes that are more than 95% white — effectively empty,
        # not worth an OCR pass
        flat = np.ravel(ocr_box)
        if np.sum(flat == 255) / len(flat) > 0.95:
            continue
        #cv2.imshow('ok', ocr_box)
        #cv2.waitKey(0)
        data = pytesseract.image_to_string(ocr_box, lang='fra')
        # strip block separators and the form-feed pytesseract appends;
        # the block delimiter is used in the NER annotator software
        data = data.replace(' \n\n', '').replace(' \n\x0c', '').replace('\x0c', '')
        pieces.append(data)
    return "".join(pieces)