# USAGE
# python text_detection_video.py --east frozen_east_text_detection.pb

# import the necessary packages
from imutils.video import VideoStream
from imutils.video import FPS
from imutils.object_detection import non_max_suppression
import numpy as np
import argparse
import imutils
import time
import cv2
def decode_predictions(scores, geometry, min_confidence=None):
    """Decode raw EAST output volumes into boxes and confidences.

    Parameters
    ----------
    scores : np.ndarray
        Text probability map of shape (1, 1, numRows, numCols).
    geometry : np.ndarray
        Geometry map of shape (1, 5, numRows, numCols): four edge
        distances followed by the rotation angle for each cell.
    min_confidence : float, optional
        Score threshold below which a cell is ignored.  Defaults to the
        command-line ``--min-confidence`` value so existing callers that
        pass only two arguments keep working.

    Returns
    -------
    tuple
        ``(rects, confidences)`` where each rect is an axis-aligned
        ``(startX, startY, endX, endY)`` box in feature-map pixel space.
    """
    # fall back to the global CLI arguments for backward compatibility
    if min_confidence is None:
        min_confidence = args["min_confidence"]

    # grab the number of rows and columns from the scores volume, then
    # initialize our set of bounding box rectangles and corresponding
    # confidence scores
    (numRows, numCols) = scores.shape[2:4]
    rects = []
    confidences = []

    # loop over the number of rows
    for y in range(0, numRows):
        # extract the scores (probabilities), followed by the
        # geometrical data used to derive potential bounding box
        # coordinates that surround text
        scoresData = scores[0, 0, y]
        xData0 = geometry[0, 0, y]
        xData1 = geometry[0, 1, y]
        xData2 = geometry[0, 2, y]
        xData3 = geometry[0, 3, y]
        anglesData = geometry[0, 4, y]

        # loop over the number of columns
        for x in range(0, numCols):
            # if our score does not have sufficient probability,
            # ignore it
            if scoresData[x] < min_confidence:
                continue

            # compute the offset factor as our resulting feature
            # maps will be 4x smaller than the input image
            (offsetX, offsetY) = (x * 4.0, y * 4.0)

            # extract the rotation angle for the prediction and
            # then compute the sin and cosine
            angle = anglesData[x]
            cos = np.cos(angle)
            sin = np.sin(angle)

            # use the geometry volume to derive the width and height
            # of the bounding box
            h = xData0[x] + xData2[x]
            w = xData1[x] + xData3[x]

            # compute both the starting and ending (x, y)-coordinates
            # for the text prediction bounding box
            endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
            endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
            startX = int(endX - w)
            startY = int(endY - h)

            # add the bounding box coordinates and probability score
            # to our respective lists
            rects.append((startX, startY, endX, endY))
            confidences.append(scoresData[x])

    # return a tuple of the bounding boxes and associated confidences
    return (rects, confidences)
70
71
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-east", "--east", type=str, required=True,
    help="path to input EAST text detector")
ap.add_argument("-v", "--video", type=str,
    help="path to optional input video file")
ap.add_argument("-c", "--min-confidence", type=float, default=0.5,
    help="minimum probability required to inspect a region")
ap.add_argument("-w", "--width", type=int, default=320,
    help="resized image width (should be multiple of 32)")
ap.add_argument("-e", "--height", type=int, default=320,
    help="resized image height (should be multiple of 32)")
# argparse converts the hyphen in --min-confidence to an underscore,
# so the value is read as args["min_confidence"] elsewhere
args = vars(ap.parse_args())
84
85
# initialize the original frame dimensions, new frame dimensions,
86
# and ratio between the dimensions
87
(W, H) = (None, None)
88
(newW, newH) = (args["width"], args["height"])
89
(rW, rH) = (None, None)
90
91
# define the two output layer names for the EAST detector model that
92
# we are interested -- the first is the output probabilities and the
93
# second can be used to derive the bounding box coordinates of text
94
layerNames = [
95
    "feature_fusion/Conv_7/Sigmoid",
96
    "feature_fusion/concat_3"]
97
98
# load the pre-trained EAST text detector
99
print("[INFO] loading EAST text detector...")
100
net = cv2.dnn.readNet(args["east"])
101
102
# if a video path was not supplied, grab the reference to the web cam
103
if not args.get("video", False):
104
    print("[INFO] starting video stream...")
105
    vs = VideoStream(src=0).start()
106
    time.sleep(1.0)
107
108
# otherwise, grab a reference to the video file
109
else:
110
    vs = cv2.VideoCapture(args["video"])
111
112
# start the FPS throughput estimator
113
fps = FPS().start()
114
115
# loop over frames from the video stream
while True:
    # grab the current frame, then handle if we are using a
    # VideoStream or VideoCapture object (VideoCapture.read returns a
    # (grabbed, frame) tuple, VideoStream.read returns the frame)
    frame = vs.read()
    frame = frame[1] if args.get("video", False) else frame

    # check to see if we have reached the end of the stream
    if frame is None:
        break

    # resize the frame, maintaining the aspect ratio
    frame = imutils.resize(frame, width=1000)
    orig = frame.copy()

    # if our frame dimensions are None, we still need to compute the
    # ratio of old frame dimensions to new frame dimensions
    if W is None or H is None:
        (H, W) = frame.shape[:2]
        rW = W / float(newW)
        rH = H / float(newH)

    # resize the frame, this time ignoring aspect ratio (EAST requires
    # dimensions that are multiples of 32)
    frame = cv2.resize(frame, (newW, newH))

    # construct a blob from the frame and then perform a forward pass
    # of the model to obtain the two output layer sets
    blob = cv2.dnn.blobFromImage(frame, 1.0, (newW, newH),
        (123.68, 116.78, 103.94), swapRB=True, crop=False)
    net.setInput(blob)
    (scores, geometry) = net.forward(layerNames)

    # decode the predictions, then apply non-maxima suppression to
    # suppress weak, overlapping bounding boxes
    (rects, confidences) = decode_predictions(scores, geometry)
    boxes = non_max_suppression(np.array(rects), probs=confidences)

    # loop over the bounding boxes
    for (startX, startY, endX, endY) in boxes:
        # scale the bounding box coordinates based on the respective
        # ratios
        startX = int(startX * rW)
        startY = int(startY * rH)
        endX = int(endX * rW)
        endY = int(endY * rH)

        # draw the bounding box on the frame
        cv2.rectangle(orig, (startX, startY), (endX, endY), (0, 255, 0), 2)

    # update the FPS counter
    fps.update()

    # show the output frame
    cv2.imshow("Text Detection", orig)
    key = cv2.waitKey(1) & 0xFF

    # if the `q` key was pressed, break from the loop
    if key == ord("q"):
        break
174
175
# stop the timer and display FPS information
fps.stop()
print("[INFO] elapsed time: {:.2f}".format(fps.elapsed()))
print("[INFO] approx. FPS: {:.2f}".format(fps.fps()))

# if we are using a webcam, release the pointer
if not args.get("video", False):
    vs.stop()

# otherwise, release the file pointer
else:
    vs.release()

# close all windows
cv2.destroyAllWindows()