|
a |
|
b/EAST_text_detection_video.py |
|
|
1 |
# USAGE |
|
|
2 |
# python text_detection_video.py --east frozen_east_text_detection.pb |
|
|
3 |
|
|
|
4 |
# import the necessary packages |
|
|
5 |
from imutils.video import VideoStream |
|
|
6 |
from imutils.video import FPS |
|
|
7 |
from imutils.object_detection import non_max_suppression |
|
|
8 |
import numpy as np |
|
|
9 |
import argparse |
|
|
10 |
import imutils |
|
|
11 |
import time |
|
|
12 |
import cv2 |
|
|
13 |
|
|
|
14 |
def decode_predictions(scores, geometry, min_confidence=None):
	"""Decode raw EAST detector output into bounding boxes and confidences.

	Parameters
	----------
	scores : ndarray, shape (1, 1, numRows, numCols)
		Per-cell probability that the region contains text.
	geometry : ndarray, shape (1, 5, numRows, numCols)
		Channels 0-3 are the distances from the cell to the top, right,
		bottom, and left edges of the box; channel 4 is the rotation angle.
	min_confidence : float, optional
		Minimum score for a detection to be kept.  When omitted, falls
		back to the command-line ``--min-confidence`` value so existing
		two-argument callers keep working.

	Returns
	-------
	tuple of (list, list)
		Bounding boxes as ``(startX, startY, endX, endY)`` int tuples and
		the matching confidence scores.
	"""
	# backward-compatible fallback to the CLI threshold
	if min_confidence is None:
		min_confidence = args["min_confidence"]

	# grab the number of rows and columns from the scores volume, then
	# initialize our set of bounding box rectangles and corresponding
	# confidence scores
	(numRows, numCols) = scores.shape[2:4]
	rects = []
	confidences = []

	# loop over the number of rows
	for y in range(0, numRows):
		# extract the scores (probabilities), followed by the
		# geometrical data used to derive potential bounding box
		# coordinates that surround text
		scoresData = scores[0, 0, y]
		xData0 = geometry[0, 0, y]
		xData1 = geometry[0, 1, y]
		xData2 = geometry[0, 2, y]
		xData3 = geometry[0, 3, y]
		anglesData = geometry[0, 4, y]

		# loop over the number of columns
		for x in range(0, numCols):
			# if our score does not have sufficient probability,
			# ignore it
			if scoresData[x] < min_confidence:
				continue

			# compute the offset factor as our resulting feature
			# maps will be 4x smaller than the input image
			(offsetX, offsetY) = (x * 4.0, y * 4.0)

			# extract the rotation angle for the prediction and
			# then compute the sin and cosine
			angle = anglesData[x]
			cos = np.cos(angle)
			sin = np.sin(angle)

			# use the geometry volume to derive the width and height
			# of the bounding box
			h = xData0[x] + xData2[x]
			w = xData1[x] + xData3[x]

			# compute both the starting and ending (x, y)-coordinates
			# for the text prediction bounding box
			endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
			endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
			startX = int(endX - w)
			startY = int(endY - h)

			# add the bounding box coordinates and probability score
			# to our respective lists
			rects.append((startX, startY, endX, endY))
			confidences.append(scoresData[x])

	# return a tuple of the bounding boxes and associated confidences
	return (rects, confidences)
|
|
70 |
|
|
|
71 |
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-east", "--east", type=str, required=True,
	help="path to input EAST text detector")
ap.add_argument("-v", "--video", type=str,
	help="path to optional input video file")
ap.add_argument("-c", "--min-confidence", type=float, default=0.5,
	help="minimum probability required to inspect a region")
ap.add_argument("-w", "--width", type=int, default=320,
	help="resized image width (should be multiple of 32)")
ap.add_argument("-e", "--height", type=int, default=320,
	help="resized image height (should be multiple of 32)")
args = vars(ap.parse_args())

# initialize the original frame dimensions, new frame dimensions,
# and ratio between the dimensions
(W, H) = (None, None)
(newW, newH) = (args["width"], args["height"])
(rW, rH) = (None, None)

# define the two output layer names for the EAST detector model that
# we are interested in -- the first is the output probabilities and the
# second can be used to derive the bounding box coordinates of text
layerNames = [
	"feature_fusion/Conv_7/Sigmoid",
	"feature_fusion/concat_3"]

# load the pre-trained EAST text detector
print("[INFO] loading EAST text detector...")
net = cv2.dnn.readNet(args["east"])

# if a video path was not supplied, grab the reference to the web cam
if not args.get("video", False):
	print("[INFO] starting video stream...")
	vs = VideoStream(src=0).start()
	# give the camera sensor a moment to warm up
	time.sleep(1.0)

# otherwise, grab a reference to the video file
else:
	vs = cv2.VideoCapture(args["video"])

# start the FPS throughput estimator
fps = FPS().start()
|
|
114 |
|
|
|
115 |
# loop over frames from the video stream
while True:
	# grab the current frame, then handle if we are using a
	# VideoStream or VideoCapture object (VideoCapture.read() returns
	# a (grabbed, frame) tuple, VideoStream.read() the frame itself)
	frame = vs.read()
	frame = frame[1] if args.get("video", False) else frame

	# check to see if we have reached the end of the stream
	if frame is None:
		break

	# resize the frame, maintaining the aspect ratio
	frame = imutils.resize(frame, width=1000)
	orig = frame.copy()

	# if our frame dimensions are None, we still need to compute the
	# ratio of old frame dimensions to new frame dimensions
	if W is None or H is None:
		(H, W) = frame.shape[:2]
		rW = W / float(newW)
		rH = H / float(newH)

	# resize the frame, this time ignoring aspect ratio
	frame = cv2.resize(frame, (newW, newH))

	# construct a blob from the frame and then perform a forward pass
	# of the model to obtain the two output layer sets
	blob = cv2.dnn.blobFromImage(frame, 1.0, (newW, newH),
		(123.68, 116.78, 103.94), swapRB=True, crop=False)
	net.setInput(blob)
	(scores, geometry) = net.forward(layerNames)

	# decode the predictions, then apply non-maxima suppression to
	# suppress weak, overlapping bounding boxes
	(rects, confidences) = decode_predictions(scores, geometry)
	boxes = non_max_suppression(np.array(rects), probs=confidences)

	# loop over the bounding boxes
	for (startX, startY, endX, endY) in boxes:
		# scale the bounding box coordinates based on the respective
		# ratios
		startX = int(startX * rW)
		startY = int(startY * rH)
		endX = int(endX * rW)
		endY = int(endY * rH)

		# draw the bounding box on the frame
		cv2.rectangle(orig, (startX, startY), (endX, endY), (0, 255, 0), 2)

	# update the FPS counter
	fps.update()

	# show the output frame
	cv2.imshow("Text Detection", orig)
	key = cv2.waitKey(1) & 0xFF

	# if the `q` key was pressed, break from the loop
	if key == ord("q"):
		break

# stop the timer and display FPS information
fps.stop()
print("[INFO] elapsed time: {:.2f}".format(fps.elapsed()))
print("[INFO] approx. FPS: {:.2f}".format(fps.fps()))

# if we are using a webcam, release the pointer
if not args.get("video", False):
	vs.stop()

# otherwise, release the file pointer
else:
	vs.release()

# close all windows
cv2.destroyAllWindows()