--- a
+++ b/Notebooks/Feature Extraction.ipynb
@@ -0,0 +1,407 @@
+{
+  "nbformat": 4,
+  "nbformat_minor": 0,
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python [conda env:work] *",
+      "language": "python",
+      "name": "conda-env-work-py"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.7.3"
+    },
+    "colab": {
+      "name": "Feature Extraction.ipynb",
+      "provenance": [],
+      "collapsed_sections": []
+    }
+  },
+  "cells": [
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ns9Vglq6FAA2",
+        "colab_type": "text"
+      },
+      "source": [
+        "# **Importing Dependencies**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "HuDMOXP3E9_v",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "import pickle\n",
+        "import os\n",
+        "\n",
+        "import imageio\n",
+        "import tqdm\n",
+        "\n",
+        "import numpy as np\n",
+        "import pandas as pd\n",
+        "import matplotlib.pyplot as plt\n",
+        "\n",
+        "import tensorflow as tf\n",
+        "from tensorflow.compat.v1 import ConfigProto\n",
+        "from tensorflow.compat.v1 import InteractiveSession\n",
+        "\n",
+        "tf.compat.v1.disable_eager_execution() \n",
+        "tfK = tf.keras"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "xmtQnMCNF62P",
+        "colab_type": "text"
+      },
+      "source": [
+        "# **Allowing for Parallelized Model Training**"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "fY_Fkhq_F_rP",
+        "colab_type": "text"
+      },
+      "source": [
+        "By default, TensorFlow allocates all available GPU memory to the current training process. By enabling memory growth, however, we can train multiple models in parallel."
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "M4o5zDQ4E9_-",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "# Let TensorFlow grow GPU memory on demand rather than reserving all of it up front.\n",
+        "gpus = tf.config.experimental.list_physical_devices('GPU')\n",
+        "try:\n",
+        "    for device in gpus:\n",
+        "        tf.config.experimental.set_memory_growth(device, True)\n",
+        "except RuntimeError as err:\n",
+        "    # Report the problem and continue with whatever memory settings are in effect.\n",
+        "    print(err)\n",
+        "\n",
+        "# Request the same on-demand allocation for the TF1-compat session.\n",
+        "config = ConfigProto()\n",
+        "config.gpu_options.allow_growth = True\n",
+        "session = InteractiveSession(config=config)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "nCMPWnwmG3Aj",
+        "colab_type": "text"
+      },
+      "source": [
+        "# **Loading the Feature Extractor**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "9H39UGuqE-AL",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "# Load the previously trained CNN from its saved HDF5 file.\n",
+        "model_path = \"./models/trained_cnn_2.h5\"\n",
+        "model = tfK.models.load_model(filepath=model_path)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "ezSYj5mDHrQ_",
+        "colab_type": "text"
+      },
+      "source": [
+        "We use the trained CNN as a **feature extractor**. To do this, we simply \"chop off\" the dense and dropout layers following the CNN's last convolutional block, resulting in 8192 features being extracted per image fed to the CNN:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "xZ6k6PXAE-Ax",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "# Truncate the CNN at its \"flatten_4\" layer so that predictions are the flattened features.\n",
+        "feature_layer = model.get_layer(\"flatten_4\")\n",
+        "intermediate_layer_model = tfK.models.Model(inputs=model.input, outputs=feature_layer.output)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "w3_4qyKbHzn3",
+        "colab_type": "text"
+      },
+      "source": [
+        "# **Loading Data**"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "4oQ-ZiV1E-BB",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "# NOTE: pickle.load can execute arbitrary code; only load this file from a trusted source.\n",
+        "with open(\"ordered_slices_by_patient_randsubset.pkl\", \"rb\") as f:\n",
+        "    patients_pkl = pickle.load(f)\n",
+        "\n",
+        "label_df = pd.read_csv(\"labels_cleaned.csv\")\n",
+        "# regex=False makes \".png\" a literal match; interpreted as a regular expression,\n",
+        "# the \".\" would match any character (the default on older pandas versions).\n",
+        "label_df[\"ID_nopng\"] = label_df[\"ID\"].str.replace(\".png\", \"\", regex=False)\n",
+        "ID_list = label_df[\"ID_nopng\"].tolist()"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Gw1kcXpyPeuS",
+        "colab_type": "text"
+      },
+      "source": [
+        "# **Preparing the Data for Feature Extraction**"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "sd27DM7kFG5z",
+        "colab_type": "text"
+      },
+      "source": [
+        "For some files present in the data, the *actual image data* (the PNG) is missing. Here, we remove these files:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "DgfyNkyWE-C7",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "# Keep, per patient, only the slice IDs whose PNG file actually exists on disk.\n",
+        "patients_pkl_clean = {\n",
+        "    patient_id: [\n",
+        "        slice_id\n",
+        "        for slice_id in slice_ids\n",
+        "        if os.path.isfile(\"./Windowed-PNGs-FINAL-comb/\" + slice_id + \".png\")\n",
+        "    ]\n",
+        "    for patient_id, slice_ids in patients_pkl.items()\n",
+        "}"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "Pp48Z8r3FVEG",
+        "colab_type": "text"
+      },
+      "source": [
+        "Next, we determine how many brain slices each patient's CT scan contains (and what the smallest number of slices in any CT scan is):"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "ganaa_a6E-C3",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "# Number of slices listed for each patient.\n",
+        "# NOTE(review): counts come from patients_pkl, i.e. before slices with missing PNGs\n",
+        "# were removed; confirm that patients_pkl_clean is not what was intended here.\n",
+        "lens = [len(slice_ids) for slice_ids in patients_pkl.values()]\n",
+        "\n",
+        "# Smallest slice count of any patient (inf when there are no patients). Stored as\n",
+        "# min_len rather than min so the built-in min() stays usable in later cells.\n",
+        "min_len = min(lens, default=float(\"inf\"))"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "u2W-60ZpOUBx",
+        "colab_type": "text"
+      },
+      "source": [
+        "We find that some CT scans do not contain enough slices to lend themselves well to our **sequential approach**. We ensure that only patients with a sufficient number of slices are considered:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "h1IcGOoTE-DC",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "n_slices = 24\n",
+        "\n",
+        "# For every patient with at least n_slices usable slices, keep the n_slices\n",
+        "# slices centred on the middle of the scan; shorter scans are dropped.\n",
+        "patients_long_enough = dict()\n",
+        "for key, item in patients_pkl_clean.items():\n",
+        "    if len(item) < n_slices:\n",
+        "        continue\n",
+        "    # Ending the window at start + n_slices yields exactly n_slices entries, also\n",
+        "    # for odd n_slices (two n_slices//2 halves would give one entry fewer).\n",
+        "    # Slicing already returns a new list, so no explicit copy is needed.\n",
+        "    start = len(item)//2 - n_slices//2\n",
+        "    patients_long_enough[key] = item[start:start + n_slices]"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "8bVDQCyiFzQk",
+        "colab_type": "text"
+      },
+      "source": [
+        "Finally, we verify that we still have enough patients left to adequately train our sequential-convolutional model (indeed, 2418 patients remain):"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "oEFa8nbQE-DI",
+        "colab_type": "code",
+        "colab": {},
+        "outputId": "ef1190be-aeb0-4a3b-a2c7-a01cdf455e2a"
+      },
+      "source": [
+        "# Number of values in the CNN's flattened output, i.e. features per slice.\n",
+        "n_features = 8192\n",
+        "# Patients that passed the minimum-slice-count filter.\n",
+        "n_patients = len(patients_long_enough)\n",
+        "\n",
+        "n_patients"
+      ],
+      "execution_count": 0,
+      "outputs": [
+        {
+          "output_type": "execute_result",
+          "data": {
+            "text/plain": [
+              "2418"
+            ]
+          },
+          "metadata": {
+            "tags": []
+          },
+          "execution_count": 17
+        }
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "r-7UghrGQBMc",
+        "colab_type": "text"
+      },
+      "source": [
+        "# **Performing the Feature Extraction**"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "6fHSRY7nGJ0g",
+        "colab_type": "text"
+      },
+      "source": [
+        "We extract features for the training of our **bidirectional LSTM** by feeding all training PNGs to our previously-trained CNN, letting it run its inference, and then—for each PNG—grabbing the 8192 values from the last convolutional block:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "scrolled": true,
+        "id": "QjOn8YqrE-Dg",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "# Map every slice ID to its \"any\" label once, instead of filtering the whole label\n",
+        "# table for each slice. keep=\"first\" mirrors taking the first matching row.\n",
+        "label_by_slice = (\n",
+        "    label_df.drop_duplicates(subset=\"ID_nopng\", keep=\"first\")\n",
+        "    .set_index(\"ID_nopng\")[\"any\"]\n",
+        "    .to_dict()\n",
+        ")\n",
+        "\n",
+        "# This list will contain the extracted features for all training PNGs\n",
+        "data_list = []\n",
+        "# List of corresponding labels for the extracted features\n",
+        "label_list = []\n",
+        "\n",
+        "for patient_ID, slice_IDs in tqdm.tqdm(patients_long_enough.items()):\n",
+        "    data_patient_list = []\n",
+        "    label_patient_list = []\n",
+        "    for slice_ID in slice_IDs:\n",
+        "        # Load respective PNG and add a leading batch axis\n",
+        "        png_array = np.expand_dims(imageio.imread(\"./Windowed-PNGs-FINAL-comb/\" + slice_ID + \".png\"), 0)\n",
+        "        # Extract features as one flat vector for this slice\n",
+        "        layer_features = intermediate_layer_model.predict(png_array).flatten()\n",
+        "\n",
+        "        data_patient_list.append(layer_features)\n",
+        "        # Raises KeyError if the slice has no row in the label table\n",
+        "        label_patient_list.append(label_by_slice[slice_ID])\n",
+        "\n",
+        "    data_list.append(data_patient_list)\n",
+        "    label_list.append(label_patient_list)\n",
+        "\n",
+        "# Stack the per-patient lists into arrays (one entry per patient, then per slice)\n",
+        "data_array = np.array(data_list)\n",
+        "label_array = np.array(label_list)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {
+        "id": "-URtcQq0F-sG",
+        "colab_type": "text"
+      },
+      "source": [
+        "Writing the extracted features and corresponding labels to files:"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "metadata": {
+        "id": "zdXxmoS0E-Dr",
+        "colab_type": "code",
+        "colab": {}
+      },
+      "source": [
+        "# Persist features and labels to disk; np.save appends the \".npy\" extension.\n",
+        "np.save(file=\"rcnn-data-array\", arr=data_array)\n",
+        "np.save(file=\"rcnn-label-array\", arr=label_array)"
+      ],
+      "execution_count": 0,
+      "outputs": []
+    }
+  ]
+}
\ No newline at end of file