Switch to unified view

a b/Notebooks/Feature Extraction.ipynb
1
{
2
  "nbformat": 4,
3
  "nbformat_minor": 0,
4
  "metadata": {
5
    "kernelspec": {
6
      "display_name": "Python [conda env:work] *",
7
      "language": "python",
8
      "name": "conda-env-work-py"
9
    },
10
    "language_info": {
11
      "codemirror_mode": {
12
        "name": "ipython",
13
        "version": 3
14
      },
15
      "file_extension": ".py",
16
      "mimetype": "text/x-python",
17
      "name": "python",
18
      "nbconvert_exporter": "python",
19
      "pygments_lexer": "ipython3",
20
      "version": "3.7.3"
21
    },
22
    "colab": {
23
      "name": "Feature Extraction.ipynb",
24
      "provenance": [],
25
      "collapsed_sections": []
26
    }
27
  },
28
  "cells": [
29
    {
30
      "cell_type": "markdown",
31
      "metadata": {
32
        "id": "ns9Vglq6FAA2",
33
        "colab_type": "text"
34
      },
35
      "source": [
36
        "# **Importing Dependencies**"
37
      ]
38
    },
39
    {
40
      "cell_type": "code",
41
      "metadata": {
42
        "id": "HuDMOXP3E9_v",
43
        "colab_type": "code",
44
        "colab": {}
45
      },
46
      "source": [
        "# Standard library\n",
        "import os\n",
        "import pickle\n",
        "\n",
        "# Third-party: image I/O, progress bars, numerics, plotting\n",
        "import imageio\n",
        "import tqdm\n",
        "import numpy as np\n",
        "import pandas as pd\n",
        "import matplotlib.pyplot as plt\n",
        "\n",
        "# TensorFlow in graph mode; v1-style session config is applied below\n",
        "import tensorflow as tf\n",
        "from tensorflow.compat.v1 import ConfigProto\n",
        "from tensorflow.compat.v1 import InteractiveSession\n",
        "\n",
        "tf.compat.v1.disable_eager_execution()\n",
        "tfK = tf.keras  # shorthand for the Keras API"
      ],
64
      "execution_count": 0,
65
      "outputs": []
66
    },
67
    {
68
      "cell_type": "markdown",
69
      "metadata": {
70
        "id": "xmtQnMCNF62P",
71
        "colab_type": "text"
72
      },
73
      "source": [
74
        "# **Allowing for Parallelized Model Training**"
75
      ]
76
    },
77
    {
78
      "cell_type": "markdown",
79
      "metadata": {
80
        "id": "fY_Fkhq_F_rP",
81
        "colab_type": "text"
82
      },
83
      "source": [
84
        "By default, TensorFlow allocates all available GPU memory to the current training process. By enabling memory growth, however, we can train multiple models in parallel."
85
      ]
86
    },
87
    {
88
      "cell_type": "code",
89
      "metadata": {
90
        "id": "M4o5zDQ4E9_-",
91
        "colab_type": "code",
92
        "colab": {}
93
      },
94
      "source": [
        "# Enable memory growth so TensorFlow claims GPU memory on demand instead\n",
        "# of grabbing it all up front; this is what allows parallel training.\n",
        "physical_gpus = tf.config.experimental.list_physical_devices('GPU')\n",
        "try:\n",
        "    for device in physical_gpus:\n",
        "        tf.config.experimental.set_memory_growth(device, True)\n",
        "except RuntimeError as err:\n",
        "    # Memory growth must be set before the GPUs have been initialized.\n",
        "    print(err)\n",
        "\n",
        "# Mirror the same allow-growth policy for the v1-style session used in\n",
        "# graph mode (eager execution is disabled above).\n",
        "config = ConfigProto()\n",
        "config.gpu_options.allow_growth = True\n",
        "session = InteractiveSession(config=config)"
      ],
107
      "execution_count": 0,
108
      "outputs": []
109
    },
110
    {
111
      "cell_type": "markdown",
112
      "metadata": {
113
        "id": "nCMPWnwmG3Aj",
114
        "colab_type": "text"
115
      },
116
      "source": [
117
        "# **Loading the Feature Extractor**"
118
      ]
119
    },
120
    {
121
      "cell_type": "code",
122
      "metadata": {
123
        "id": "9H39UGuqE-AL",
124
        "colab_type": "code",
125
        "colab": {}
126
      },
127
      "source": [
        "# Load the CNN trained previously; it serves here purely as a fixed\n",
        "# feature extractor (no further training in this notebook).\n",
        "model_path = \"./models/trained_cnn_2.h5\"\n",
        "model = tfK.models.load_model(model_path)"
      ],
131
      "execution_count": 0,
132
      "outputs": []
133
    },
134
    {
135
      "cell_type": "markdown",
136
      "metadata": {
137
        "id": "ezSYj5mDHrQ_",
138
        "colab_type": "text"
139
      },
140
      "source": [
141
        "We use the trained CNN as a **feature extractor**. To do this, we simply \"chop off\" the dense and dropout layers following the CNN's last convolutional block, resulting in 8192 features being extracted per image fed to the CNN:"
142
      ]
143
    },
144
    {
145
      "cell_type": "code",
146
      "metadata": {
147
        "id": "xZ6k6PXAE-Ax",
148
        "colab_type": "code",
149
        "colab": {}
150
      },
151
      "source": [
        "# \"Chop off\" the classifier head: expose the flattened output of the\n",
        "# last convolutional block (8192 features per image).\n",
        "try:\n",
        "    flatten_layer = model.get_layer(\"flatten_4\")\n",
        "except ValueError:\n",
        "    # Keras auto-generates layer names (flatten_1, flatten_2, ...), so the\n",
        "    # exact suffix depends on how often models were built in the training\n",
        "    # session. Fall back to locating the Flatten layer by name prefix.\n",
        "    flatten_layer = next(l for l in model.layers if l.name.startswith(\"flatten\"))\n",
        "\n",
        "intermediate_layer_model = tfK.models.Model(inputs=model.input,\n",
        "                                            outputs=flatten_layer.output)"
      ],
155
      "execution_count": 0,
156
      "outputs": []
157
    },
158
    {
159
      "cell_type": "markdown",
160
      "metadata": {
161
        "id": "w3_4qyKbHzn3",
162
        "colab_type": "text"
163
      },
164
      "source": [
165
        "# **Loading Data**"
166
      ]
167
    },
168
    {
169
      "cell_type": "code",
170
      "metadata": {
171
        "id": "4oQ-ZiV1E-BB",
172
        "colab_type": "code",
173
        "colab": {}
174
      },
175
      "source": [
        "# Slice IDs per patient (random subset of the full data set).\n",
        "with open(\"ordered_slices_by_patient_randsubset.pkl\", \"rb\") as f:\n",
        "    patients_pkl = pickle.load(f)\n",
        "\n",
        "label_df = pd.read_csv(\"labels_cleaned.csv\")\n",
        "# Strip the '.png' suffix literally. Without regex=False pandas treats the\n",
        "# pattern as a regular expression, where '.' matches ANY character, so\n",
        "# e.g. 'xpng' anywhere in an ID would also be removed.\n",
        "label_df[\"ID_nopng\"] = label_df[\"ID\"].str.replace(\".png\", \"\", regex=False)\n",
        "ID_list = label_df[\"ID_nopng\"].tolist()"
      ],
183
      "execution_count": 0,
184
      "outputs": []
185
    },
186
    {
187
      "cell_type": "markdown",
188
      "metadata": {
189
        "id": "Gw1kcXpyPeuS",
190
        "colab_type": "text"
191
      },
192
      "source": [
193
        "# **Preparing the Data for Feature Extraction**"
194
      ]
195
    },
196
    {
197
      "cell_type": "markdown",
198
      "metadata": {
199
        "id": "sd27DM7kFG5z",
200
        "colab_type": "text"
201
      },
202
      "source": [
203
        "For some files present in the data, the *actual image data* (the PNG) is missing. Here, we remove these files:"
204
      ]
205
    },
206
    {
207
      "cell_type": "code",
208
      "metadata": {
209
        "id": "DgfyNkyWE-C7",
210
        "colab_type": "code",
211
        "colab": {}
212
      },
213
      "source": [
        "# Keep only those slice IDs whose PNG actually exists on disk.\n",
        "png_dir = \"./Windowed-PNGs-FINAL-comb/\"\n",
        "patients_pkl_clean = {\n",
        "    patient: [s for s in slice_ids if os.path.isfile(png_dir + s + \".png\")]\n",
        "    for patient, slice_ids in patients_pkl.items()\n",
        "}"
      ],
224
      "execution_count": 0,
225
      "outputs": []
226
    },
227
    {
228
      "cell_type": "markdown",
229
      "metadata": {
230
        "id": "Pp48Z8r3FVEG",
231
        "colab_type": "text"
232
      },
233
      "source": [
234
        "Next, we determine how many brain slices each patient's CT scan contains (and what the smallest number of slices in any CT scan is):"
235
      ]
236
    },
237
    {
238
      "cell_type": "code",
239
      "metadata": {
240
        "id": "ganaa_a6E-C3",
241
        "colab_type": "code",
242
        "colab": {}
243
      },
244
      "source": [
        "# Distribution of slices per CT scan, and the smallest count observed.\n",
        "# NOTE: the variable holding the minimum was previously named `min`,\n",
        "# which shadowed the Python builtin; renamed to `min_len`.\n",
        "lens = [len(slice_ids) for slice_ids in patients_pkl.values()]\n",
        "min_len = min(lens) if lens else float(\"inf\")"
      ],
253
      "execution_count": 0,
254
      "outputs": []
255
    },
256
    {
257
      "cell_type": "markdown",
258
      "metadata": {
259
        "id": "u2W-60ZpOUBx",
260
        "colab_type": "text"
261
      },
262
      "source": [
263
        "We find that some CT scans do not contain enough slices to lend themselves well to our **sequential approach**. We ensure that only patients with a sufficient number of slices are considered:"
264
      ]
265
    },
266
    {
267
      "cell_type": "code",
268
      "metadata": {
269
        "id": "h1IcGOoTE-DC",
270
        "colab_type": "code",
271
        "colab": {}
272
      },
273
      "source": [
        "# Keep only patients with at least n_slices usable slices, and truncate\n",
        "# each scan to its central n_slices (fixed-length input for the LSTM).\n",
        "n_slices = 24\n",
        "\n",
        "patients_long_enough = dict()\n",
        "for key, slice_ids in patients_pkl_clean.items():\n",
        "    if len(slice_ids) >= n_slices:\n",
        "        mid = len(slice_ids) // 2\n",
        "        half = n_slices // 2\n",
        "        # Slicing already returns a new list, so the previous .copy() before\n",
        "        # the slice was redundant and has been dropped.\n",
        "        patients_long_enough[key] = slice_ids[mid - half:mid + half]"
      ],
283
      "execution_count": 0,
284
      "outputs": []
285
    },
286
    {
287
      "cell_type": "markdown",
288
      "metadata": {
289
        "id": "8bVDQCyiFzQk",
290
        "colab_type": "text"
291
      },
292
      "source": [
293
        "Finally, we verify that we still have enough patients left to adequately train our sequential-convolutional model (indeed, 2418 patients remain):"
294
      ]
295
    },
296
    {
297
      "cell_type": "code",
298
      "metadata": {
299
        "id": "oEFa8nbQE-DI",
300
        "colab_type": "code",
301
        "colab": {},
302
        "outputId": "ef1190be-aeb0-4a3b-a2c7-a01cdf455e2a"
303
      },
304
      "source": [
        "n_patients = len(patients_long_enough)\n",
        "n_features = 8192  # width of the CNN's flatten-layer output per slice\n",
        "\n",
        "# Display the remaining patient count (expected: 2418).\n",
        "n_patients"
      ],
310
      "execution_count": 0,
311
      "outputs": [
312
        {
313
          "output_type": "execute_result",
314
          "data": {
315
            "text/plain": [
316
              "2418"
317
            ]
318
          },
319
          "metadata": {
320
            "tags": []
321
          },
322
          "execution_count": 17
323
        }
324
      ]
325
    },
326
    {
327
      "cell_type": "markdown",
328
      "metadata": {
329
        "id": "r-7UghrGQBMc",
330
        "colab_type": "text"
331
      },
332
      "source": [
333
        "# **Performing the Feature Extraction**"
334
      ]
335
    },
336
    {
337
      "cell_type": "markdown",
338
      "metadata": {
339
        "id": "6fHSRY7nGJ0g",
340
        "colab_type": "text"
341
      },
342
      "source": [
343
        "We extract features for the training of our **bidirectional LSTM** by feeding all training PNGs to our previously-trained CNN, letting it run its inference, and then---for each PNG---grabbing the 8192 values from the last convolutional block:"
344
      ]
345
    },
346
    {
347
      "cell_type": "code",
348
      "metadata": {
349
        "scrolled": true,
350
        "id": "QjOn8YqrE-Dg",
351
        "colab_type": "code",
352
        "colab": {}
353
      },
354
      "source": [
        "# Extracted features, one inner list per patient -> (n_patients, n_slices, n_features)\n",
        "data_list = []\n",
        "# Per-slice 'any haemorrhage' labels, aligned with data_list\n",
        "label_list = []\n",
        "\n",
        "# Hoist the label lookup out of the loop: filtering label_df per slice is\n",
        "# O(len(label_df)) for every slice, while an index lookup is O(1).\n",
        "# drop_duplicates keeps the first row per ID, matching the old .iloc[0].\n",
        "labels_by_id = label_df.drop_duplicates(\"ID_nopng\").set_index(\"ID_nopng\")[\"any\"]\n",
        "\n",
        "for patient_ID, slice_IDs in tqdm.tqdm(patients_long_enough.items()):\n",
        "    data_patient_list = []\n",
        "    label_patient_list = []\n",
        "    for slice_ID in slice_IDs:\n",
        "        # Load the slice PNG and add a leading batch dimension\n",
        "        png_array = np.expand_dims(imageio.imread(\"./Windowed-PNGs-FINAL-comb/\" + slice_ID + \".png\"), 0)\n",
        "        # Run the CNN and grab the 8192 flatten-layer activations\n",
        "        layer_features = intermediate_layer_model.predict(png_array).flatten()\n",
        "\n",
        "        data_patient_list.append(layer_features)\n",
        "        label_patient_list.append(labels_by_id[slice_ID])\n",
        "\n",
        "    data_list.append(data_patient_list)\n",
        "    label_list.append(label_patient_list)\n",
        "\n",
        "data_array = np.array(data_list)\n",
        "label_array = np.array(label_list)"
      ],
379
      "execution_count": 0,
380
      "outputs": []
381
    },
382
    {
383
      "cell_type": "markdown",
384
      "metadata": {
385
        "id": "-URtcQq0F-sG",
386
        "colab_type": "text"
387
      },
388
      "source": [
389
        "Writing the extracted features and corresponding labels to files:"
390
      ]
391
    },
392
    {
393
      "cell_type": "code",
394
      "metadata": {
395
        "id": "zdXxmoS0E-Dr",
396
        "colab_type": "code",
397
        "colab": {}
398
      },
399
      "source": [
        "# Persist features and labels as .npy files for the downstream\n",
        "# bidirectional-LSTM training notebook.\n",
        "np.save(\"rcnn-data-array\", data_array)\n",
        "np.save(\"rcnn-label-array\", label_array)"
      ],
403
      "execution_count": 0,
404
      "outputs": []
405
    }
406
  ]
407
}