Switch to side-by-side view

--- a
+++ b/Notebooks/Training the CNN.ipynb
@@ -0,0 +1,441 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python [conda env:datasci] *",
+      "language": "python",
+      "name": "conda-env-datasci-py"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.7.5"
+    },
+    "colab": {
+      "name": "Training the CNN.ipynb",
+      "provenance": [],
+      "collapsed_sections": []
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "efCvb66QStgP",
+        "colab_type": "text"
+      },
+      "source": [
+        "*To run this notebook, please provide the following four file paths:*"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "Iukqj1IzStyn",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "path_to_train = '/path/to/train/images'\n",
+        "path_to_test = '/path/to/test/images'\n",
+        "path_to_labels = '/path/to/labels.csv'\n",
+        "\n",
+        "path_to_save_model = '/path/to/save/model/to/cnn-checkpoint-{epoch:02d}-{val_accuracy:.2f}.hdf5'"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "XqBOX3Y4MWs3",
+        "colab_type": "text"
+      },
+      "source": [
+        "## **Installing & Importing Dependencies**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "jnq7FwiHKZwY",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "!pip install keras"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "OTnWqAwiKZxb",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "import tensorflow as tf\n",
+        "import pandas as pd\n",
+        "\n",
+        "from keras import applications\n",
+        "from keras import optimizers\n",
+        "from keras import backend as k \n",
+        "\n",
+        "from keras.models import Sequential, Model\n",
+        "from keras.layers import Dropout, Flatten, Dense, GlobalAveragePooling2D\n",
+        "from keras.callbacks import ModelCheckpoint, LearningRateScheduler, TensorBoard, EarlyStopping\n",
+        "from keras.preprocessing.image import ImageDataGenerator"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "N60M6zljKZxs",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "# Make sure that GPU is available on the machine\n",
+        "assert tf.test.is_gpu_available()\n",
+        "assert tf.test.is_built_with_cuda()"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "PIErVpE6NHx6",
+        "colab_type": "text"
+      },
+      "source": [
+        "## **Reading in Data Labels**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "SGUEeB-cKZx3",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "labels_df = pd.read_csv(path_to_labels)\n",
+        "\n",
+        "# For flow_from_dataframe to function, string datatype is required\n",
+        "labels_df = labels_df.astype(str)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "scrolled": true,
+        "id": "qShGxNkPKZx7",
+        "colab_type": "code",
+        "colab": {},
+        "outputId": "f6cf0aa8-a383-440a-e4ed-13499e3669af"
+      },
+      "source": [
+        "# Inspect the DataFrame containing the labels\n",
+        "labels_df.head(3)"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/html": [
+              "<div>\n",
+              "<style scoped>\n",
+              "    .dataframe tbody tr th:only-of-type {\n",
+              "        vertical-align: middle;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe tbody tr th {\n",
+              "        vertical-align: top;\n",
+              "    }\n",
+              "\n",
+              "    .dataframe thead th {\n",
+              "        text-align: right;\n",
+              "    }\n",
+              "</style>\n",
+              "<table border=\"1\" class=\"dataframe\">\n",
+              "  <thead>\n",
+              "    <tr style=\"text-align: right;\">\n",
+              "      <th></th>\n",
+              "      <th>Unnamed: 0</th>\n",
+              "      <th>ID</th>\n",
+              "      <th>any</th>\n",
+              "    </tr>\n",
+              "  </thead>\n",
+              "  <tbody>\n",
+              "    <tr>\n",
+              "      <th>0</th>\n",
+              "      <td>0</td>\n",
+              "      <td>ID_000039fa0.png</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>1</th>\n",
+              "      <td>1</td>\n",
+              "      <td>ID_00005679d.png</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "    <tr>\n",
+              "      <th>2</th>\n",
+              "      <td>2</td>\n",
+              "      <td>ID_00008ce3c.png</td>\n",
+              "      <td>0</td>\n",
+              "    </tr>\n",
+              "  </tbody>\n",
+              "</table>\n",
+              "</div>"
+            ],
+            "text/plain": [
+              "  Unnamed: 0                ID any\n",
+              "0          0  ID_000039fa0.png   0\n",
+              "1          1  ID_00005679d.png   0\n",
+              "2          2  ID_00008ce3c.png   0"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          },
+          "execution_count": 19
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ElzPsVvaODOk",
+        "colab_type": "text"
+      },
+      "source": [
+        "## **Building Transfer-Learning Model**"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "qFQ7sFZjx_ba",
+        "colab_type": "text"
+      },
+      "source": [
+        "*As our pretrained model, we choose VGG19 with ImageNet weights. Note that include_top = False (otherwise, we would be including VGG19's final 1000-node dense softmax prediction layer)*"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "vh0b4EQtKZyN",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "model = applications.VGG19(weights = \"imagenet\", include_top=False, input_shape = (128, 128, 3))"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "tszxYuOwyURR",
+        "colab_type": "text"
+      },
+      "source": [
+        "*Freeze the first three convolutional blocks, leave the last two unfrozen. Thereby, we transfer the model's knowledge of low-level features (like edges and angles) while allowing for it to learn new high-level features (like hemorrhages).*"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "BruLVtwJKZ0g",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "for layer in model.layers[0:12]:\n",
+        "    layer.trainable = False"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "g_Kb-J_Qylgz",
+        "colab_type": "text"
+      },
+      "source": [
+        "*We append our own custom layers to the end of VGG19. Note that a single sigmoid final prediction node is equivalent to two softmax final prediction nodes.*"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "Z7uPdX-8c5eU",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "x = model.output\n",
+        "x = Flatten()(x)\n",
+        "\n",
+        "x = Dense(1000, activation = 'relu')(x)\n",
+        "x = Dropout(0.5)(x)\n",
+        "x = Dense(1000, activation = 'relu')(x)\n",
+        "output = Dense(1, activation = 'sigmoid')(x)\n",
+        "\n",
+        "custom_model = Model(inputs = model.input, outputs = output)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "X49CwEDJy7tN",
+        "colab_type": "text"
+      },
+      "source": [
+        "*Compiling the model. The primary metric we care about is recall (that is, the CNN's ability to correctly detect intracranial hemorrhages).*"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "xECXyzILnWTa",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "custom_model.compile(loss = 'binary_crossentropy',\n",
+        "                     optimizer = optimizers.Adam(lr=0.0001),\n",
+        "                     metrics=['accuracy',\n",
+        "                              tf.keras.metrics.Recall(),\n",
+        "                              tf.keras.metrics.AUC(),\n",
+        "                              tf.keras.metrics.Precision()])"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "u7BWA55koyNp",
+        "colab_type": "text"
+      },
+      "source": [
+        "# **Creating Train & Test Generators**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "fgTeFSx5olWu",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "# Initializing train & test generators to flow train and test images straight from the folders\n",
+        "# that they are stored in\n",
+        "train_datagen = ImageDataGenerator(rescale = 1./255,\n",
+        "                                   horizontal_flip = True,\n",
+        "                                   fill_mode = \"nearest\",\n",
+        "                                   zoom_range = 0.3,\n",
+        "                                   width_shift_range = 0.3,\n",
+        "                                   height_shift_range=0.3,\n",
+        "                                   rotation_range=30)\n",
+        "\n",
+        "test_datagen = ImageDataGenerator(rescale = 1./255)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "7-zf69uioyiE",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "# We flow from DataFrames, that is, our images are not stored in class-specific folders---instead,\n",
+        "# their labels are stored in separate files (specifically, in DataFrames)\n",
+        "train_generator = train_datagen.flow_from_dataframe(dataframe=labels_df,\n",
+        "                                                    directory= path_to_train,\n",
+        "                                                    x_col='ID',\n",
+        "                                                    y_col='any',\n",
+        "                                                    target_size=(128, 128),\n",
+        "                                                    class_mode='binary')\n",
+        "\n",
+        "test_generator = test_datagen.flow_from_dataframe(dataframe=labels_df,\n",
+        "                                                  directory=path_to_test,\n",
+        "                                                  x_col='ID',\n",
+        "                                                  y_col='any',\n",
+        "                                                  target_size=(128, 128),\n",
+        "                                                  class_mode='binary')"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "_Xh6lg1tszMo",
+        "colab_type": "text"
+      },
+      "source": [
+        "# **Fitting the Model**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "HpcKttO0qdEe",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "# Callbacks\n",
+        "checkpoint = ModelCheckpoint(path_to_save_model, monitor='val_acc', verbose=1, save_best_only=False, save_weights_only=False, mode='auto', period=1)\n",
+        "early_stopper = EarlyStopping(monitor='val_acc', min_delta=0, patience=3, verbose=1, mode='auto')"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "5e5xPMF2KZ1h",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "# Fitting the model \n",
+        "custom_model.fit_generator(train_generator,\n",
+        "                           epochs = 50,\n",
+        "                           validation_data = test_generator,\n",
+        "                           callbacks = [checkpoint, early_stopper])"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    }
+  ]
+}