# Projects/NCS1/Trainer.py
from __future__ import print_function
2
############################################################################################
#
# Project:       Peter Moss Acute Myeloid & Lymphoblastic Leukemia AI Research Project
# Repository:    ALL Detection System 2019
# Project:       Facial Authentication Server
#
# Author:        Adam Milton-Barker (AdamMiltonBarker.com)
# Contributors:
# Title:         Trainer Class
# Description:   Trainer class for the ALL Detection System 2019 NCS1 Classifier.
# License:       MIT License
# Last Modified: 2020-07-16
#
############################################################################################
16
17
import glob, json, math, os, random, sys, time
18
19
import numpy as np
20
import tensorflow as tf
21
22
import Classes.inception_preprocessing
23
24
from tensorflow.contrib.framework.python.ops.variables import get_or_create_global_step
25
from tensorflow.python.platform import tf_logging as logging
26
from tensorflow.python.framework import graph_util
27
28
from sys import argv
29
from datetime import datetime
30
from builtins import range
31
32
from Classes.Helpers import Helpers
33
from Classes.Data import Data
34
from Classes.inception_v3 import inception_v3, inception_v3_arg_scope
35
36
slim = tf.contrib.slim
37
38
39
class Trainer():
    """ Trainer Class

    Builds the TF-Slim dataset and the input batches used to train the
    ALL Detection System 2019 NCS1 Classifier.
    """

    def __init__(self):
        """ Initializes Trainer Class

        Loads the helpers and their configuration, and creates the empty
        label map and item descriptions that getSplit() hands to the dataset.
        """

        self.Helpers = Helpers("Trainer")
        self.confs = self.Helpers.confs

        # Both are filled in by run(); they are defined here so getSplit()
        # never fails on a missing attribute when it is called on its own.
        self.labelsToName = {}
        self.items_to_descriptions = {}

        self.Helpers.logger.info(
            "Trainer class initialization complete.")

    def getSplit(self, split_name):
        """ Gets the training/validation split

        Args:
            split_name: Either 'train' or 'validation'.

        Returns:
            A slim.dataset.Dataset reading the TFRecord files matched by the
            configured FilePattern for the requested split.

        Raises:
            ValueError: If split_name is neither 'train' nor 'validation'.
        """

        # Reject unknown splits before touching the file system
        if split_name not in ['train', 'validation']:
            raise ValueError(
                'The split_name %s is not recognized. Please input either train or validation as the split_name' % (split_name))

        conf = self.confs["Classifier"]
        dataset_dir = conf["DatasetDir"]

        # Full path pattern used by the dataset to locate the TFRecord files
        file_pattern_path = os.path.join(
            dataset_dir, conf["FilePattern"] % (split_name))

        # Count the examples across every shard of this split.
        # NOTE(review): counting uses the hard-coded 'ALL_' prefix while reading
        # uses the configured FilePattern - confirm both select the same shards.
        count_prefix = 'ALL_' + split_name
        num_samples = 0
        for file_name in os.listdir(dataset_dir):
            if not file_name.startswith(count_prefix):
                continue
            shard_path = os.path.join(dataset_dir, file_name)
            num_samples += sum(
                1 for _ in tf.python_io.tf_record_iterator(shard_path))

        # The reader class (not an instance) is what slim expects here
        reader = tf.TFRecordReader

        # Features stored in each serialized example
        keys_to_features = {
            'image/encoded': tf.FixedLenFeature((), tf.string, default_value=''),
            'image/format': tf.FixedLenFeature((), tf.string, default_value='jpg'),
            'image/class/label': tf.FixedLenFeature(
                [], tf.int64, default_value=tf.zeros([], dtype=tf.int64)),
        }

        # How the decoded features are exposed as dataset items
        items_to_handlers = {
            'image': slim.tfexample_decoder.Image(),
            'label': slim.tfexample_decoder.Tensor('image/class/label'),
        }

        decoder = slim.tfexample_decoder.TFExampleDecoder(
            keys_to_features, items_to_handlers)

        return slim.dataset.Dataset(
            data_sources=file_pattern_path,
            decoder=decoder,
            reader=reader,
            num_readers=4,
            num_samples=num_samples,
            num_classes=conf["NumClasses"],
            labels_to_name=self.labelsToName,
            items_to_descriptions=self.items_to_descriptions)

    def loadBatch(self, dataset, is_training=True):
        """ Loads a batch for training

        Args:
            dataset: The slim dataset returned by getSplit().
            is_training: Passed to the Inception preprocessing to select the
                training or evaluation pipeline.

        Returns:
            A tuple (images, raw_images, labels) of batched tensors, where
            images are preprocessed and raw_images are only cropped/padded
            to ImageSize x ImageSize.
        """

        conf = self.confs["Classifier"]
        batch_size = conf["BatchSize"]
        image_size = conf["ImageSize"]

        # Provider that reads and decodes examples from the dataset
        data_provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset,
            common_queue_capacity=24 + 3 * batch_size,
            common_queue_min=24)

        raw_image, label = data_provider.get(['image', 'label'])

        # Preprocess according to whether we are training or evaluating
        image = Classes.inception_preprocessing.preprocess_image(
            raw_image, image_size, image_size, is_training)

        # The raw image is only cropped or padded to a fixed size so that it
        # can be batched alongside the preprocessed one
        raw_image = tf.image.resize_image_with_crop_or_pad(
            raw_image, image_size, image_size)

        # tf.train.batch enqueues the tensors and dequeues them in batches
        images, raw_images, labels = tf.train.batch(
            [image, raw_image, label],
            batch_size=batch_size,
            num_threads=4,
            capacity=4 * batch_size,
            allow_smaller_final_batch=True)

        return images, raw_images, labels
146
147
148
# Module-level instance used by run(). This rebinds the name `Trainer` from the
# class to an instance, and the instance is created as soon as the module is imported.
Trainer = Trainer()
149
150
151
def run():
    """ Trainer Runner

    Runs the ALL Detection System 2019 NCS1 Classifier Trainer.

    Reads the labels file, trains Inception V3 on the 'train' split under a
    tf.train.Supervisor writing to LogDir, then rebuilds the network for
    inference, restores the latest checkpoint from LogDir and writes the
    frozen graph to the configured ALLGraph path.
    """

    humanStart, clockStart = Trainer.Helpers.timerStart()

    Trainer.Helpers.logger.info(
        "ALL Detection System 2019 NCS1 Trainer started.")

    conf = Trainer.confs["Classifier"]

    # Map each numeric label to its string name. The file is closed as soon
    # as it has been read, and only the line terminator is stripped so a last
    # line without a trailing newline keeps its final character.
    labels_path = os.path.join(conf["DatasetDir"], conf["Labels"])
    with open(labels_path, 'r') as labels_file:
        for line in labels_file:
            line = line.rstrip('\r\n')
            if not line.strip():
                continue
            # Split on the first ':' only so names may themselves contain ':'
            label, string_name = line.split(':', 1)
            Trainer.labelsToName[int(label)] = string_name

    # Descriptions of the dataset items; required by the Dataset class later.
    Trainer.items_to_descriptions = {
        'image': 'A 3-channel RGB coloured  image that is ex: office, people',
        'label': 'A label that ,start from zero'
    }

    # Create the log directory here rather than at import time.
    # makedirs also creates any missing parent directories.
    if not os.path.exists(conf["LogDir"]):
        os.makedirs(conf["LogDir"])

    # Construct the training graph and build the model
    with tf.Graph().as_default() as graph:
        tf.logging.set_verbosity(tf.logging.INFO)

        # Create the dataset and the batched input tensors
        dataset = Trainer.getSplit('train')
        images, _, labels = Trainer.loadBatch(dataset)

        # One step processes one batch, so steps per epoch == batches per epoch
        num_batches_per_epoch = dataset.num_samples // conf["BatchSize"]
        num_steps_per_epoch = num_batches_per_epoch
        decay_steps = int(conf["EpochsBeforeDecay"] * num_steps_per_epoch)

        # Create the model inference
        with slim.arg_scope(inception_v3_arg_scope()):
            logits, end_points = inception_v3(
                images, num_classes=dataset.num_classes, is_training=True)

        # Softmax cross entropy against one-hot encoded labels
        one_hot_labels = slim.one_hot_encoding(labels, dataset.num_classes)
        loss = tf.losses.softmax_cross_entropy(
            onehot_labels=one_hot_labels, logits=logits)
        # Total loss also includes the regularization losses
        total_loss = tf.losses.get_total_loss()

        # Global step drives the learning rate schedule and training
        global_step = get_or_create_global_step()

        # Exponentially decaying learning rate
        lr = tf.train.exponential_decay(
            learning_rate=conf["LearningRate"],
            global_step=global_step,
            decay_steps=decay_steps,
            decay_rate=conf["LearningRateDecay"],
            staircase=True)

        optimizer = tf.train.AdamOptimizer(learning_rate=lr)
        train_op = slim.learning.create_train_op(total_loss, optimizer)

        # Metrics: predictions are class indices, not one-hot encoded
        probabilities = end_points['Predictions']
        predictions = tf.argmax(probabilities, 1)
        accuracy, accuracy_update = tf.contrib.metrics.streaming_accuracy(
            predictions, labels)
        metrics_op = tf.group(accuracy_update, probabilities)

        # Summaries, merged into a single op
        tf.summary.scalar('losses/Total_Loss', total_loss)
        tf.summary.scalar('accuracy', accuracy)
        tf.summary.scalar('learning_rate', lr)
        my_summary_op = tf.summary.merge_all()

        def train_step(sess, train_op, global_step, epochCount):
            '''
            Runs train_op, global_step and metrics_op in one session call and
            logs the loss and the time taken for the step.

            Returns the total loss and the global step count.
            '''
            start_time = time.time()
            total_loss, global_step_count, _ = sess.run(
                [train_op, global_step, metrics_op])
            time_elapsed = time.time() - start_time

            logging.info(' Epch %.2f Glb Stp %s: Loss: %.4f (%.2f sec/step)',
                         epochCount, global_step_count, total_loss, time_elapsed)

            return total_loss, global_step_count

        # The supervisor manages the session. summary_op=None stops it from
        # running the summaries automatically, which would consume too much memory.
        sv = tf.train.Supervisor(logdir=conf["LogDir"], summary_op=None)

        with sv.managed_session() as sess:
            # Kept separate from the `loss` tensor above; stays None if no step runs
            final_loss = None

            for step in range(num_steps_per_epoch * conf["Epochs"]):
                # At the start of every epoch, show the vital information
                if step % num_batches_per_epoch == 0:
                    # Integer division so the epoch prints as 1, 2, ... on Python 3 too
                    logging.info('Epoch %s/%s',
                                 step // num_batches_per_epoch + 1,
                                 conf["Epochs"])
                    learning_rate_value, accuracy_value = sess.run(
                        [lr, accuracy])
                    logging.info('Current Learning Rate: %s',
                                 learning_rate_value)
                    logging.info('Current Streaming Accuracy: %s',
                                 accuracy_value)

                    # Print logits and predictions as a sanity check
                    logits_value, probabilities_value, predictions_value, labels_value = sess.run(
                        [logits, probabilities, predictions, labels])
                    print('logits: \n', logits_value[:5])
                    print('Probabilities: \n', probabilities_value[:5])
                    print('predictions: \n', predictions_value[:100])
                    print('Labels:\n:', labels_value[:100])

                # Fractional epoch position, logged with two decimals by train_step
                epoch_progress = float(step) / num_batches_per_epoch + 1
                final_loss, _ = train_step(
                    sess, train_op, sv.global_step, epoch_progress)

                # Write the summaries every 10 steps
                if step % 10 == 0:
                    summaries = sess.run(my_summary_op)
                    sv.summary_computed(sess, summaries)

            # Log the final training loss and accuracy
            logging.info('Final Loss: %s', final_loss)
            logging.info('Final Accuracy: %s', sess.run(accuracy))

            logging.info('Finished training! Saving model to disk now.')

    checkpoint_file = tf.train.latest_checkpoint(conf["LogDir"])

    # Rebuild the network for inference with a fixed single-image input and
    # freeze the restored variables into constants
    with tf.Graph().as_default() as graph:

        images = tf.placeholder(
            "float", [1, conf["ImageSize"], conf["ImageSize"], 3], name="input")

        with slim.arg_scope(inception_v3_arg_scope()):
            logits, end_points = inception_v3(
                images, num_classes=conf["NumClasses"], is_training=False)
        probabilities = tf.nn.softmax(logits)
        saver = tf.train.Saver(slim.get_variables_to_restore())

        # Setup graph def
        input_graph_def = graph.as_graph_def()
        output_node_names = conf["OutputNode"]
        output_graph_name = conf["ALLGraph"]

        with tf.Session() as sess:
            saver.restore(sess, checkpoint_file)

            # Exporting the graph
            print("Exporting graph...")
            output_graph_def = graph_util.convert_variables_to_constants(
                sess,
                input_graph_def,
                output_node_names.split(","))

            with tf.gfile.GFile(output_graph_name, "wb") as f:
                f.write(output_graph_def.SerializeToString())

    clockEnd, difference, humanEnd = Trainer.Helpers.timerEnd(clockStart)

    Trainer.Helpers.logger.info(
        "ALL Detection System 2019 NCS1 Trainer ended in " + str(difference))
341
342
# Start training only when this file is executed as a script
if __name__ == "__main__":
    run()