# EAST_text_recognition.py
1
# USAGE
# python EAST_text_recognition.py --east frozen_east_text_detection.pb --image images/example_01.jpg
# python EAST_text_recognition.py --east frozen_east_text_detection.pb --image images/example_04.jpg --padding 0.05
4
5
# import the necessary packages
6
from imutils.object_detection import non_max_suppression
7
import numpy as np
8
import pytesseract
9
import argparse
10
import cv2
11
12
def decode_predictions(scores, geometry, min_confidence=None):
    """Convert the detector's score/geometry volumes into candidate boxes.

    Args:
        scores: score volume, read as ``scores[0, 0, y, x]``.
        geometry: geometry volume, read as ``geometry[0, c, y, x]``.
            Channels 0 and 2 are summed to give the box height, channels
            1 and 3 are summed to give the box width, and channel 4 is the
            rotation angle passed to ``np.cos`` / ``np.sin``.
        min_confidence: cells whose score is strictly below this value are
            skipped. When left as ``None`` the module-level
            ``args["min_confidence"]`` is used, so existing two-argument
            callers keep their behaviour.

    Returns:
        A ``(rects, confidences)`` tuple: ``rects`` is a list of
        ``(startX, startY, endX, endY)`` integer tuples and ``confidences``
        is the list of the matching score values, in row-major scan order.
    """
    # fall back to the parsed command line arguments only when the caller
    # did not supply an explicit threshold
    if min_confidence is None:
        min_confidence = args["min_confidence"]

    # grab the number of rows and columns from the scores volume, then
    # initialize our set of bounding box rectangles and corresponding
    # confidence scores
    (numRows, numCols) = scores.shape[2:4]
    rects = []
    confidences = []

    # loop over the number of rows
    for y in range(0, numRows):
        # extract the scores (probabilities), followed by the
        # geometrical data used to derive potential bounding box
        # coordinates that surround text
        scoresData = scores[0, 0, y]
        xData0 = geometry[0, 0, y]
        xData1 = geometry[0, 1, y]
        xData2 = geometry[0, 2, y]
        xData3 = geometry[0, 3, y]
        anglesData = geometry[0, 4, y]

        # loop over the number of columns
        for x in range(0, numCols):
            # if our score does not have sufficient probability,
            # ignore it
            if scoresData[x] < min_confidence:
                continue

            # compute the offset factor as our resulting feature
            # maps will be 4x smaller than the input image
            (offsetX, offsetY) = (x * 4.0, y * 4.0)

            # extract the rotation angle for the prediction and
            # then compute the sin and cosine
            angle = anglesData[x]
            cos = np.cos(angle)
            sin = np.sin(angle)

            # use the geometry volume to derive the width and height
            # of the bounding box
            h = xData0[x] + xData2[x]
            w = xData1[x] + xData3[x]

            # compute both the starting and ending (x, y)-coordinates
            # for the text prediction bounding box
            endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
            endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
            startX = int(endX - w)
            startY = int(endY - h)

            # add the bounding box coordinates and probability score
            # to our respective lists
            rects.append((startX, startY, endX, endY))
            confidences.append(scoresData[x])

    # return a tuple of the bounding boxes and associated confidences
    return (rects, confidences)
68
69
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
# the default image path is only a placeholder; pass --image to point the
# script at a real file
ap.add_argument("-i", "--image", type=str, default=r"PATH TO INPUT IMAGE",
    help="path to input image")
ap.add_argument("-east", "--east", type=str,
    default=r'\EAST-Text-Detection-and-Extraction\frozen_east_text_detection.pb',
    help="path to input EAST text detector")
ap.add_argument("-c", "--min-confidence", type=float, default=0.5,
    help="minimum probability required to inspect a region")
ap.add_argument("-w", "--width", type=int, default=320,
    help="nearest multiple of 32 for resized width")
ap.add_argument("-e", "--height", type=int, default=320,
    help="nearest multiple of 32 for resized height")
ap.add_argument("-p", "--padding", type=float, default=0.0,
    help="amount of padding to add to each border of ROI")
args = vars(ap.parse_args())

# load the input image and grab the image dimensions
image = cv2.imread(args["image"])
if image is None:
    # cv2.imread reports a missing or unreadable file by returning None
    # rather than raising, so fail here with a clear message
    ap.error("could not read input image: {}".format(args["image"]))
orig = image.copy()
(origH, origW) = image.shape[:2]

# set the new width and height and then determine the ratio in change
# for both the width and height
(newW, newH) = (args["width"], args["height"])
rW = origW / float(newW)
rH = origH / float(newH)

# resize the image and grab the new image dimensions
image = cv2.resize(image, (newW, newH))
(H, W) = image.shape[:2]

# define the two output layer names for the EAST detector model that
# we are interested in -- the first is the output probabilities and the
# second can be used to derive the bounding box coordinates of text
layerNames = [
    "feature_fusion/Conv_7/Sigmoid",
    "feature_fusion/concat_3"]

# load the pre-trained EAST text detector
print("[INFO] loading EAST text detector...")
net = cv2.dnn.readNet(args["east"])

# construct a blob from the image and then perform a forward pass of
# the model to obtain the two output layer sets
blob = cv2.dnn.blobFromImage(image, 1.0, (W, H),
    (123.68, 116.78, 103.94), swapRB=True, crop=False)
net.setInput(blob)
(scores, geometry) = net.forward(layerNames)

# decode the predictions, then apply non-maxima suppression to
# suppress weak, overlapping bounding boxes
(rects, confidences) = decode_predictions(scores, geometry)
boxes = non_max_suppression(np.array(rects), probs=confidences)
124
125
# initialize the list of results
results = []

# in order to apply Tesseract v4 to OCR text we must supply
# (1) a language, (2) an OEM flag of 1, indicating that we
# wish to use the LSTM neural net model for OCR, and finally
# (3) a PSM value, in this case, 7 which implies that we are
# treating the ROI as a single line of text; the string is the same
# for every box, so it is built once outside the loop
config = ("-l eng --oem 1 --psm 7")

# loop over the bounding boxes
for (startX, startY, endX, endY) in boxes:
    # scale the bounding box coordinates based on the respective
    # ratios
    startX = int(startX * rW)
    startY = int(startY * rH)
    endX = int(endX * rW)
    endY = int(endY * rH)

    # in order to obtain a better OCR of the text we can potentially
    # apply a bit of padding surrounding the bounding box -- here we
    # are computing the deltas in both the x and y directions
    dX = int((endX - startX) * args["padding"])
    dY = int((endY - startY) * args["padding"])

    # apply padding to the bounding box, clamped to the image bounds
    # NOTE(review): the end edges grow by twice the delta while the start
    # edges move by one delta -- this looks asymmetric relative to the
    # "each border" wording of --padding; left unchanged, please confirm
    startX = max(0, startX - dX)
    startY = max(0, startY - dY)
    endX = min(origW, endX + (dX * 2))
    endY = min(origH, endY + (dY * 2))

    # skip degenerate boxes: a non-positive extent would either give an
    # empty ROI or, for a negative end index, make the slice below wrap
    # around and select the wrong region
    if endX <= startX or endY <= startY:
        continue

    # extract the actual padded ROI
    roi = orig[startY:endY, startX:endX]

    # run OCR on the ROI
    text = pytesseract.image_to_string(roi, config=config)

    # add the bounding box coordinates and OCR'd text to the list
    # of results
    results.append(((startX, startY, endX, endY), text))
163
164
# sort the results bounding box coordinates from top to bottom
# (ordering key is the startY of each box)
results.sort(key=lambda r: r[0][1])

# loop over the results, showing one annotated window per OCR'd region
for ((startX, startY, endX, endY), text) in results:
    # display the text OCR'd by Tesseract
    print("OCR TEXT")
    print("========")
    print("{}\n".format(text))

    # strip out non-ASCII text so we can draw the text on the image
    # using OpenCV, then draw the text and a bounding box surrounding
    # the text region of the input image
    text = "".join(c for c in text if ord(c) < 128).strip()
    output = orig.copy()
    cv2.rectangle(output, (startX, startY), (endX, endY),
        (0, 0, 255), 2)
    cv2.putText(output, text, (startX, startY - 20),
        cv2.FONT_HERSHEY_SIMPLEX, 1.2, (0, 0, 255), 3)

    # show the output image and block until a key is pressed
    cv2.imshow("Text Detection", output)
    cv2.waitKey(0)