|
a |
|
b/Projects/NCS1/Trainer.py |
|
|
1 |
from __future__ import print_function |
|
|
2 |
############################################################################################ |
|
|
3 |
# |
|
|
4 |
# Project: Peter Moss Acute Myeloid & Lymphoblastic Leukemia AI Research Project |
|
|
5 |
# Repository: ALL Detection System 2019 |
|
|
6 |
# Project: Facial Authentication Server |
|
|
7 |
# |
|
|
8 |
# Author: Adam Milton-Barker (AdamMiltonBarker.com) |
|
|
9 |
# Contributors: |
|
|
10 |
# Title: Trainer Class |
|
|
11 |
# Description: Trainer class for the ALL Detection System 2019 NCS1 Classifier. |
|
|
12 |
# License: MIT License |
|
|
13 |
# Last Modified: 2020-07-16 |
|
|
14 |
# |
|
|
15 |
############################################################################################ |
|
|
16 |
|
|
|
17 |
import glob, json, math, os, random, sys, time |
|
|
18 |
|
|
|
19 |
import numpy as np |
|
|
20 |
import tensorflow as tf |
|
|
21 |
|
|
|
22 |
import Classes.inception_preprocessing |
|
|
23 |
|
|
|
24 |
from tensorflow.contrib.framework.python.ops.variables import get_or_create_global_step |
|
|
25 |
from tensorflow.python.platform import tf_logging as logging |
|
|
26 |
from tensorflow.python.framework import graph_util |
|
|
27 |
|
|
|
28 |
from sys import argv |
|
|
29 |
from datetime import datetime |
|
|
30 |
from builtins import range |
|
|
31 |
|
|
|
32 |
from Classes.Helpers import Helpers |
|
|
33 |
from Classes.Data import Data |
|
|
34 |
from Classes.inception_v3 import inception_v3, inception_v3_arg_scope |
|
|
35 |
|
|
|
36 |
slim = tf.contrib.slim |
|
|
37 |
|
|
|
38 |
|
|
|
39 |
class Trainer():
    """ Trainer Class

    Data-loading helper for the ALL Detection System 2019 NCS1 Classifier
    trainer. Wraps the project configuration and builds the TF-Slim dataset
    and input batches consumed by the training graph.
    """

    def __init__(self):
        """ Initializes Trainer Class """

        self.Helpers = Helpers("Trainer")
        self.confs = self.Helpers.confs

        self.Helpers.logger.info(
            "Trainer class initialization complete.")

        # Maps integer class ids to their string names; filled in by the
        # runner from the labels file before getSplit() is called.
        self.labelsToName = {}

        # Descriptions handed to slim.dataset.Dataset by getSplit(). The
        # runner overwrites this; initialising it here means getSplit() does
        # not raise AttributeError when the runner has not assigned it yet.
        self.items_to_descriptions = {}

    def getSplit(self, split_name):
        """ Gets the training/validation split

        Args:
            split_name: Either 'train' or 'validation'.

        Returns:
            A slim.dataset.Dataset describing the TFRecord shards of the
            requested split, including the number of records they contain.

        Raises:
            ValueError: If split_name is not 'train' or 'validation'.
        """

        # Validate first, before touching the configuration or the disk.
        if split_name not in ['train', 'validation']:
            raise ValueError(
                'The split_name %s is not recognized. Please input either train or validation as the split_name' % (split_name))

        conf = self.confs["Classifier"]
        dataset_dir = conf["DatasetDir"]

        # Full path pattern used by the Slim dataset to locate the shards.
        FilePattern_path = os.path.join(
            dataset_dir, conf["FilePattern"] % (split_name))

        # Shards of this split are recognised by their 'ALL_<split>' prefix.
        shard_prefix = 'ALL_' + split_name
        tfrecords_to_count = [
            os.path.join(dataset_dir, shard_name)
            for shard_name in os.listdir(dataset_dir)
            if shard_name.startswith(shard_prefix)]

        # Count every record in every shard; Slim needs the total up front.
        num_samples = sum(
            1
            for tfrecord_file in tfrecords_to_count
            for _ in tf.python_io.tf_record_iterator(tfrecord_file))

        # The reader must be a TFRecord reader for these files. The class
        # itself (not an instance) is what slim.dataset.Dataset expects.
        reader = tf.TFRecordReader

        # Raw features stored in each tf.Example.
        keys_to_features = {
            'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
            'image/format': tf.FixedLenFeature((), tf.string, default_value='jpg'),
            'image/class/label': tf.FixedLenFeature(
                [], tf.int64, default_value=tf.zeros([], dtype=tf.int64)),
        }

        # How the raw features are turned into the 'image' and 'label' items.
        items_to_handlers = {
            'image': slim.tfexample_decoder.Image(),
            'label': slim.tfexample_decoder.Tensor('image/class/label'),
        }

        decoder = slim.tfexample_decoder.TFExampleDecoder(
            keys_to_features, items_to_handlers)

        return slim.dataset.Dataset(
            data_sources=FilePattern_path,
            decoder=decoder,
            reader=reader,
            num_readers=4,
            num_samples=num_samples,
            num_classes=conf["NumClasses"],
            labels_to_name=self.labelsToName,
            items_to_descriptions=self.items_to_descriptions)

    def loadBatch(self, dataset, is_training=True):
        """ Loads a batch for training

        Args:
            dataset: The slim.dataset.Dataset returned by getSplit().
            is_training: Passed through to the Inception preprocessing to
                select training or evaluation preprocessing.

        Returns:
            A tuple (images, raw_images, labels) of batched tensors:
            preprocessed images, crop/pad-resized raw images and labels.
        """

        conf = self.confs["Classifier"]
        batch_size = conf["BatchSize"]
        image_size = conf["ImageSize"]

        # Provider that reads and decodes single examples from the dataset.
        data_provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            common_queue_capacity=24 + 3 * batch_size,
            common_queue_min=24)

        raw_image, label = data_provider.get(['image', 'label'])

        # Inception preprocessing for the image that is fed to the network.
        image = Classes.inception_preprocessing.preprocess_image(
            raw_image, image_size, image_size, is_training)

        # The raw image only needs a fixed size so that it can be batched.
        raw_image = tf.image.resize_image_with_crop_or_pad(
            raw_image, image_size, image_size)

        # Queue the single examples and dequeue them batch_size at a time.
        images, raw_images, labels = tf.train.batch(
            [image, raw_image, label],
            batch_size=batch_size,
            num_threads=4,
            capacity=4 * batch_size,
            allow_smaller_final_batch=True)

        return images, raw_images, labels
|
|
146 |
|
|
|
147 |
|
|
|
148 |
# Module-level instance used by run(). NOTE: this rebinds the name `Trainer`
# from the class to an instance, so after this line the class itself is no
# longer reachable under that name in this module.
Trainer = Trainer()
|
|
149 |
|
|
|
150 |
|
|
|
151 |
def run():
    """ Trainer Runner

    Runs the ALL Detection System 2019 NCS1 Classifier Trainer.

    Steps performed:
      1. Reads the labels file into Trainer.labelsToName.
      2. Builds the Inception V3 training graph on the 'train' split.
      3. Trains for the configured number of epochs in a Supervisor
         managed session and writes a final checkpoint.
      4. Rebuilds the network for inference, restores the latest
         checkpoint and writes a frozen graph to the configured path.
    """

    humanStart, clockStart = Trainer.Helpers.timerStart()

    Trainer.Helpers.logger.info(
        "ALL Detection System 2019 NCS1 Trainer started.")

    conf = Trainer.confs["Classifier"]

    # Build the id -> name dictionary from the labels file. Each line has the
    # form '<int id>:<name>'. The context manager closes the handle; the
    # attribute is still assigned for backward compatibility.
    labels_path = conf["DatasetDir"] + "/" + conf["Labels"]
    with open(labels_path, 'r') as labels_file:
        Trainer.labels = labels_file
        for line in labels_file:
            # Strip only the line terminator, so a final line without a
            # trailing newline does not lose its last character.
            line = line.rstrip('\r\n')
            if not line.strip():
                continue
            # Split on the first ':' only so names may contain ':'.
            label, string_name = line.split(':', 1)
            Trainer.labelsToName[int(label)] = string_name

    # Create a dictionary that will help people understand your dataset better. This is required by the Dataset class later.
    Trainer.items_to_descriptions = {
        'image': 'A 3-channel RGB coloured image that is ex: office, people',
        'label': 'A label that ,start from zero'
    }

    # Create the log directory here. Must be done here otherwise import will
    # activate this unneededly. makedirs also copes with a nested LogDir.
    if not os.path.exists(conf["LogDir"]):
        os.makedirs(conf["LogDir"])

    # Now we start to construct the graph and build our model
    with tf.Graph().as_default():
        tf.logging.set_verbosity(tf.logging.INFO)

        # First create the dataset and load one batch
        dataset = Trainer.getSplit('train')
        images, _, labels = Trainer.loadBatch(dataset)

        # One step is one batch processed. Clamp to at least one so that a
        # dataset smaller than the batch size still trains instead of
        # silently running zero steps with a zero decay interval.
        num_batches_per_epoch = max(
            1, dataset.num_samples // conf["BatchSize"])
        num_steps_per_epoch = num_batches_per_epoch
        decay_steps = max(
            1, int(conf["EpochsBeforeDecay"] * num_steps_per_epoch))

        # Create the model inference
        with slim.arg_scope(inception_v3_arg_scope()):
            logits, end_points = inception_v3(
                images, num_classes=dataset.num_classes, is_training=True)

        one_hot_labels = slim.one_hot_encoding(labels, dataset.num_classes)

        # Called for its side effect: registers the cross-entropy loss so
        # that get_total_loss() below includes it with the regularizers.
        tf.losses.softmax_cross_entropy(
            onehot_labels=one_hot_labels, logits=logits)
        total_loss = tf.losses.get_total_loss()

        # Create the global step for monitoring the learning_rate and training.
        global_step = get_or_create_global_step()

        # Exponentially decaying learning rate
        lr = tf.train.exponential_decay(
            learning_rate=conf["LearningRate"],
            global_step=global_step,
            decay_steps=decay_steps,
            decay_rate=conf["LearningRateDecay"],
            staircase=True)

        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        train_op = slim.learning.create_train_op(total_loss, optimizer)

        # Metrics: predictions are class indices (not one-hot encoded).
        predictions = tf.argmax(end_points['Predictions'], 1)
        probabilities = end_points['Predictions']
        accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(
            predictions, labels)
        metrics_op = tf.group(accuracy_update, probabilities)

        # Summaries to monitor, grouped into one summary op.
        tf.summary.scalar('losses/Total_Loss', total_loss)
        tf.summary.scalar('accuracy', accuracy)
        tf.summary.scalar('learning_rate', lr)
        my_summary_op = tf.summary.merge_all()

        def train_step(sess, train_op, global_step, epochCount):
            '''
            Runs train_op, global_step and metrics_op in one session call and
            logs the loss and the time taken. Returns (loss, global step).
            '''
            start_time = time.time()
            total_loss, global_step_count, _ = sess.run(
                [train_op, global_step, metrics_op])
            time_elapsed = time.time() - start_time

            logging.info(' Epch %.2f Glb Stp %s: Loss: %.4f (%.2f sec/step)',
                         epochCount, global_step_count, total_loss, time_elapsed)

            return total_loss, global_step_count

        # Define your supervisor for running a managed session. Do not run the summary_op automatically or else it will consume too much memory
        sv = tf.train.Supervisor(
            logdir=conf["LogDir"], summary_op=None)

        final_loss = None

        # Run the managed session
        with sv.managed_session() as sess:
            for step in range(num_steps_per_epoch * conf["Epochs"]):
                # At the start of every epoch, show the vital information:
                if step % num_batches_per_epoch == 0:
                    # Floor division keeps the epoch number an integer on
                    # both Python 2 and Python 3.
                    logging.info('Epoch %s/%s', step // num_batches_per_epoch + 1,
                                 conf["Epochs"])
                    learning_rate_value, accuracy_value = sess.run(
                        [lr, accuracy])
                    logging.info('Current Learning Rate: %s',
                                 learning_rate_value)
                    logging.info('Current Streaming Accuracy: %s',
                                 accuracy_value)

                    # Sanity check: print logits and predictions for one batch.
                    logits_value, probabilities_value, predictions_value, labels_value = sess.run(
                        [logits, probabilities, predictions, labels])
                    print('logits: \n', logits_value[:5])
                    print('Probabilities: \n', probabilities_value[:5])
                    print('predictions: \n', predictions_value[:100])
                    print('Labels:\n:', labels_value[:100])

                # Fractional epoch shown by train_step's '%.2f' format.
                epoch_count = float(step) / num_batches_per_epoch + 1
                final_loss, _ = train_step(
                    sess, train_op, sv.global_step, epoch_count)

                # Log the summaries every 10 steps.
                if step % 10 == 0:
                    summaries = sess.run(my_summary_op)
                    sv.summary_computed(sess, summaries)

            # We log the final training loss and accuracy
            logging.info('Final Loss: %s', final_loss)
            logging.info('Final Accuracy: %s', sess.run(accuracy))

            # Write a checkpoint explicitly. The Supervisor only saves on a
            # timer, so without this the export below could restore weights
            # from an earlier point in training.
            logging.info('Finished training! Saving model to disk now.')
            sv.saver.save(sess, sv.save_path, global_step=sv.global_step)

    checkpoint_file = tf.train.latest_checkpoint(conf["LogDir"])

    # Rebuild the network for inference and freeze it into a single graph.
    with tf.Graph().as_default() as export_graph:

        # Fixed single-image input named 'input'.
        images = tf.placeholder("float", [1, conf["ImageSize"],
                                          conf["ImageSize"], 3], name="input")

        with slim.arg_scope(inception_v3_arg_scope()):
            logits, end_points = inception_v3(
                images, num_classes=conf["NumClasses"], is_training=False)

        # Result is not used directly, but the op is kept in the graph in
        # case the configured OutputNode refers to it.
        probabilities = tf.nn.softmax(logits)
        saver = tf.train.Saver(slim.get_variables_to_restore())

        # Setup graph def
        input_graph_def = export_graph.as_graph_def()
        output_node_names = conf["OutputNode"]
        output_graph_name = conf["ALLGraph"]

        with tf.Session() as sess:
            saver.restore(sess, checkpoint_file)

            # Exporting the graph
            print("Exporting graph...")
            output_graph_def = graph_util.convert_variables_to_constants(
                sess,
                input_graph_def,
                output_node_names.split(","))

            with tf.gfile.GFile(output_graph_name, "wb") as f:
                f.write(output_graph_def.SerializeToString())

    clockEnd, difference, humanEnd = Trainer.Helpers.timerEnd(clockStart)

    Trainer.Helpers.logger.info(
        "ALL Detection System 2019 NCS1 Trainer ended in " + str(difference))
|
|
341 |
|
|
|
342 |
# Start training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    run()