# --- Imports & TensorFlow setup ---
import pickle
import os

import imageio
import tqdm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

# Run in TF1-style graph mode: the notebook creates a compat-v1
# InteractiveSession below, which requires eager execution to be off.
tf.compat.v1.disable_eager_execution()
# Shorthand for the Keras API used throughout the notebook.
tfK = tf.keras
# Allow TensorFlow to allocate GPU memory on demand instead of reserving
# all of it up front, so several training processes can share one GPU.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Memory growth must be configured before any GPU is initialized;
        # if that already happened, report it but keep going.
        print(e)

# TF1-compat equivalent of the setting above, needed because eager
# execution is disabled and a v1 session is used.
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)

# Load the previously trained CNN that will serve as the feature extractor.
model_path = "./models/trained_cnn_2.h5"
model = tfK.models.load_model(model_path)
# Build a second model that shares the CNN's weights but outputs the
# activations of the last flatten layer ("flatten_4") -- 8192 features per
# input image -- instead of the classification head.
# NOTE(review): the layer name "flatten_4" is tied to this specific saved
# model; confirm it still matches if the CNN is retrained.
intermediate_layer_model = tfK.models.Model(inputs=model.input,
                                            outputs=model.get_layer("flatten_4").output)

# Per-patient, depth-ordered slice IDs (random subset of patients).
with open("ordered_slices_by_patient_randsubset.pkl", "rb") as f:
    patients_pkl = pickle.load(f)

# Slice-level labels; "ID" carries a ".png" suffix that the slice IDs in
# the pickle do not, so strip it to allow matching.
label_df = pd.read_csv("labels_cleaned.csv")
# FIX: pass regex=False. With the (older-pandas) default regex=True the
# pattern ".png" treats "." as a wildcard, so e.g. "apng" anywhere in an
# ID would also be stripped -- we want only the literal ".png" removed.
label_df["ID_nopng"] = label_df["ID"].str.replace(".png", "", regex=False)
ID_list = label_df["ID_nopng"].tolist()
# Drop slice IDs whose PNG file is missing on disk; patients_pkl_clean
# maps patient -> list of slice IDs that actually have image data.
patients_pkl_clean = dict()

for key, item in patients_pkl.items():
    tmp = []
    for slice_id in item:
        if os.path.isfile("./Windowed-PNGs-FINAL-comb/" + slice_id + ".png"):
            tmp.append(slice_id)

    patients_pkl_clean[key] = tmp

# Per-patient slice counts, plus the smallest scan length seen.
# FIX: renamed the accumulator from `min` to `min_len` -- the original
# shadowed the `min` builtin for the rest of the notebook session.
# NOTE(review): this iterates patients_pkl (pre-cleaning), not
# patients_pkl_clean -- confirm that is intended.
min_len = float("inf")
lens = []

for key, item in patients_pkl.items():
    if len(item) < min_len:
        min_len = len(item)
    lens.append(len(item))
# Keep a fixed-length window of n_slices slices centred on the middle of
# each scan; patients with fewer than n_slices usable slices are dropped
# so every retained patient yields a sequence of identical length.
n_slices = 24

patients_long_enough = dict()
for key, item in patients_pkl_clean.items():
    if len(item) >= n_slices:
        mid_slice = len(item) // 2
        # FIX: slicing already returns a new list, so the intermediate
        # item.copy() was redundant and has been removed.
        truncated_slice_IDs = item[mid_slice - n_slices//2:mid_slice + n_slices//2]
        patients_long_enough[key] = truncated_slice_IDs

# Sanity check: enough patients must remain to train the sequence model.
n_patients = len(patients_long_enough)
# Feature-vector size produced by the CNN's flatten layer.
n_features = 8192

# Bare expression: displayed as the cell output (2418 at last run).
len(patients_long_enough)
# Extract an 8192-dim feature vector for every retained slice of every
# patient (in slice order), together with the per-slice "any" label, then
# persist both arrays for the downstream bidirectional-LSTM notebook.

# FIX: precompute an O(1) slice-ID -> label mapping instead of scanning
# label_df once per slice (the original boolean-mask lookup inside the
# inner loop was O(len(label_df)) per slice). drop_duplicates keeps the
# first occurrence, matching the original `.iloc[0]` behaviour.
label_map = label_df.drop_duplicates("ID_nopng").set_index("ID_nopng")["any"]

# This list will contain the extracted features for all training PNGs
data_list = []
# List of corresponding labels for the extracted features
label_list = []

for patient_ID, slice_IDs in tqdm.tqdm(patients_long_enough.items()):
    data_patient_list = []
    label_patient_list = []
    for slice_ID in slice_IDs:
        # Load respective PNG, adding a leading batch dimension of size 1
        png_array = np.expand_dims(imageio.imread("./Windowed-PNGs-FINAL-comb/" + slice_ID + ".png"), 0)
        # Extract features from the CNN's last flatten layer
        layer_features = intermediate_layer_model.predict(png_array).flatten()

        data_patient_list.append(layer_features)

        label_patient_list.append(label_map[slice_ID])

    data_list.append(data_patient_list)
    label_list.append(label_patient_list)

# Shapes: (n_patients, n_slices, 8192) features, (n_patients, n_slices) labels
data_array = np.array(data_list)
label_array = np.array(label_list)

# Write the extracted features and corresponding labels to .npy files.
np.save("rcnn-data-array", data_array)
np.save("rcnn-label-array", label_array)