a b/Random Forest.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "markdown",
5
   "metadata": {},
6
   "source": [
7
    "<h1 align=\"center\">Machine learning-based prediction of early recurrence in glioblastoma patients: a glance towards precision medicine <br><br>[Random Forest]</h1>"
8
   ]
9
  },
10
  {
11
   "cell_type": "markdown",
12
   "metadata": {},
13
   "source": [
14
    "<h2>[1] Library</h2>"
15
   ]
16
  },
17
  {
18
   "cell_type": "code",
19
   "execution_count": null,
20
   "metadata": {},
21
   "outputs": [],
22
   "source": [
23
    "# OS library\n",
24
    "import os\n",
25
    "import sys\n",
26
    "import argparse\n",
27
    "import itertools\n",
28
    "import random\n",
29
    "\n",
30
    "# Analysis\n",
31
    "import numpy as np\n",
32
    "import pandas as pd\n",
33
    "import seaborn as sns\n",
34
    "import matplotlib.pyplot as plt\n",
35
    "\n",
36
    "# Sklearn\n",
37
    "from boruta import BorutaPy\n",
38
    "from sklearn.preprocessing import LabelEncoder\n",
39
    "from sklearn.model_selection import train_test_split\n",
40
    "from sklearn.ensemble import RandomForestClassifier\n",
41
    "from sklearn.metrics import confusion_matrix, f1_score, recall_score, classification_report, accuracy_score, auc, roc_curve\n",
42
    "from sklearn.model_selection import RandomizedSearchCV\n",
43
    "\n",
44
    "import scikitplot as skplt\n",
45
    "from imblearn.over_sampling import RandomOverSampler, SMOTENC, SMOTE"
46
   ]
47
  },
48
  {
49
   "cell_type": "markdown",
50
   "metadata": {},
51
   "source": [
52
    "<h2>[2] Exploratory data analysis and Data Preprocessing</h2>"
53
   ]
54
  },
55
  {
56
   "cell_type": "markdown",
57
   "metadata": {},
58
   "source": [
59
    "<h4>[-] Load the database</h4>"
60
   ]
61
  },
62
  {
63
   "cell_type": "code",
64
   "execution_count": null,
65
   "metadata": {},
66
   "outputs": [],
67
   "source": [
68
    "file = os.path.join(sys.path[0], \"db.xlsx\")\n",
69
    "db = pd.read_excel(file)\n",
70
    "\n",
71
    "print(\"N° of patients: {}\".format(len(db)))\n",
72
    "print(\"N° of columns: {}\".format(db.shape[1]))\n",
73
    "db.head()"
74
   ]
75
  },
76
  {
77
   "cell_type": "markdown",
78
   "metadata": {},
79
   "source": [
80
    "<h4>[-] Drop unwanted columns + create <i>'results'</i> column</h4>"
81
   ]
82
  },
83
  {
84
   "cell_type": "code",
85
   "execution_count": null,
86
   "metadata": {},
87
   "outputs": [],
88
   "source": [
89
    "df = db.drop(['Name_Surname','...'], axis = 'columns')\n",
90
    "\n",
91
    "print(\"Effective features to consider: {} \".format(len(df.columns)-1))\n",
92
    "print(\"Creating 'result' column...\")\n",
93
    "\n",
94
    "# 0 = No relapse\n",
95
    "df.loc[df['PFS'] > 6, 'result'] = 0\n",
96
    "\n",
97
    "# 1 = Early relapse (within 6 months)\n",
98
    "df.loc[df['PFS'] <= 6, 'result'] = 1\n",
99
    "\n",
100
    "df.head()"
101
   ]
102
  },
103
  {
104
   "cell_type": "markdown",
105
   "metadata": {},
106
   "source": [
107
    "<h4>[-] Check for class imbalance in the <i>'results'</i> column </h4>"
108
   ]
109
  },
110
  {
111
   "cell_type": "code",
112
   "execution_count": null,
113
   "metadata": {},
114
   "outputs": [],
115
   "source": [
116
    "print(\"PFS Overview\")\n",
117
    "print(df.result.value_counts())\n",
118
    "\n",
119
    "df.result.value_counts().plot(kind='pie', autopct='%1.0f%%', colors=['skyblue', 'orange'], explode=(0.05, 0.05))"
120
   ]
121
  },
122
  {
123
   "cell_type": "markdown",
124
   "metadata": {},
125
   "source": [
126
    "<h4>[-] Label encoding of the categorical variables </h4>"
127
   ]
128
  },
129
  {
130
   "cell_type": "code",
131
   "execution_count": null,
132
   "metadata": {},
133
   "outputs": [],
134
   "source": [
135
    "df['sex'] =df['sex'].astype('category')\n",
136
    "\n",
137
    "#df['Ki67'] =df['Ki67'].astype(int)\n",
138
    "df['MGMT'] =df['MGMT'].astype('category')\n",
139
    "df['IDH1'] =df['IDH1'].astype('category')\n",
140
    "\n",
141
    "df['side'] =df['side'].astype('category')\n",
142
    "df['ependima'] =df['ependima'].astype('category')\n",
143
    "df['cc'] =df['cc'].astype('category')\n",
144
    "df['necrotico_cistico'] =df['necrotico_cistico'].astype('category')\n",
145
    "df['shift'] =df['shift'].astype('category')\n",
146
    "\n",
147
    "## VARIABLE TO ONE-HOT-ENCODE\n",
148
    "df['localization'] =df['localization'].astype(int)\n",
149
    "df['clinica_esordio'] =df['clinica_esordio'].astype(int)\n",
150
    "df['immediate_p_o'] =df['immediate_p_o'].astype(int)\n",
151
    "df['onco_Protocol'] =df['onco_Protocol'].astype(int)\n",
152
    "\n",
153
    "df['result'] =df['result'].astype(int)\n",
154
    "\n",
155
    "dummy_v = ['localization', 'clinica_esordio', 'onco_Protocol', 'immediate_p_o']\n",
156
    "\n",
157
    "df = pd.get_dummies(df, columns = dummy_v, prefix = dummy_v)"
158
   ]
159
  },
160
  {
161
   "cell_type": "markdown",
162
   "metadata": {},
163
   "source": [
164
    "<h4>[-] Evaluate variables' correlation with <u>'PFS'</u> columns </h4>"
165
   ]
166
  },
167
  {
168
   "cell_type": "code",
169
   "execution_count": null,
170
   "metadata": {},
171
   "outputs": [],
172
   "source": [
173
    "corr = df.corr()\n",
174
    "ax = sns.heatmap(\n",
175
    "    corr, \n",
176
    "    vmin=-1, vmax=1, center=0,\n",
177
    "    cmap=sns.diverging_palette(20, 220, n=200),\n",
178
    "    square=True\n",
179
    ")\n",
180
    "ax.set_xticklabels(\n",
181
    "    ax.get_xticklabels(),\n",
182
    "    rotation=60,\n",
183
    "    horizontalalignment='right'\n",
184
    ");"
185
   ]
186
  },
187
  {
188
   "cell_type": "markdown",
189
   "metadata": {},
190
   "source": [
191
    "<h2>[3] Prediction Models</h2>"
192
   ]
193
  },
194
  {
195
   "cell_type": "markdown",
196
   "metadata": {},
197
   "source": [
198
    "<h4> [-] Training and testing set splitting</h4>"
199
   ]
200
  },
201
  {
202
   "cell_type": "code",
203
   "execution_count": 5,
204
   "metadata": {
205
    "collapsed": true
206
   },
207
   "outputs": [
208
    {
209
     "ename": "NameError",
210
     "evalue": "name 'df' is not defined",
211
     "output_type": "error",
212
     "traceback": [
213
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
214
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
215
      "\u001b[0;32m<ipython-input-5-48cdcc32916c>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtarget\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'result'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m      2\u001b[0m \u001b[0minputs\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdrop\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'result'\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m'PFS'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0maxis\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0;34m'columns'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
216
      "\u001b[0;31mNameError\u001b[0m: name 'df' is not defined"
217
     ]
218
    }
219
   ],
220
   "source": [
221
    "target = df['result']\n",
222
    "inputs = df.drop(['result', 'PFS'], axis = 'columns')\n",
223
    "x_train, x_test, y_train, y_test = train_test_split(inputs['...'],target,test_size=0.20, random_state=10)"
224
   ]
225
  },
226
  {
227
   "cell_type": "markdown",
228
   "metadata": {},
229
   "source": [
230
    "<h4> [-] BORUTA Features Selection</h4>"
231
   ]
232
  },
233
  {
234
   "cell_type": "code",
235
   "execution_count": null,
236
   "metadata": {},
237
   "outputs": [],
238
   "source": [
239
    "x = x_train.values\n",
240
    "y = y_train.values\n",
241
    "y = y.ravel()\n",
242
    "\n",
243
    "rf_boruta = RandomForestClassifier(n_jobs=-1, class_weight={0:1, 1:3}, max_depth=3)\n",
244
    "feat_selector = BorutaPy(rf_boruta, n_estimators='auto', verbose=0, random_state=42, perc='...')\n",
245
    "feat_selector.fit(x, y)\n",
246
    "\n",
247
    "cols = inputs.columns[feat_selector.support_]\n",
248
    "print(\"N° of selected features: {}\".format(len(cols)))\n",
249
    "print(cols) "
250
   ]
251
  },
252
  {
253
   "cell_type": "markdown",
254
   "metadata": {},
255
   "source": [
256
    "<h4> [-] Random Grid Search Hyperparameter tuning</h4>"
257
   ]
258
  },
259
  {
260
   "cell_type": "code",
261
   "execution_count": null,
262
   "metadata": {},
263
   "outputs": [],
264
   "source": [
265
    "# The function to measure the quality of a split\n",
266
    "criterion = ['gini', 'entropy']\n",
267
    "\n",
268
    "# Number of trees in random forest\n",
269
    "n_estimators = [int(x) for x in np.linspace(start = 20, stop = 50, num = 5)]\n",
270
    "\n",
271
    "# Number of features to consider at every split\n",
272
    "max_features = ['auto', 'sqrt']\n",
273
    "\n",
274
    "# Maximum number of levels in tree\n",
275
    "max_depth = [int(x) for x in np.linspace(14, 30, num = 2)]\n",
276
    "max_depth.append(None)\n",
277
    "\n",
278
    "# Minimum number of samples required to split a node\n",
279
    "min_samples_split = [ 2, 3, 4, 5, 8]\n",
280
    "\n",
281
    "# Minimum number of samples required at each leaf node\n",
282
    "min_samples_leaf = [1, 2, 3, 4, 5, 6]\n",
283
    "\n",
284
    "max_leaf_nodes = [None, 2, 3, 4, 5, 6]\n",
285
    "# Method of selecting samples for training each tree\n",
286
    "bootstrap = [True, False]\n",
287
    "\n",
288
    "random_grid = {'criterion': criterion,\n",
289
    "               'n_estimators': n_estimators,\n",
290
    "               'max_features': max_features,\n",
291
    "               'max_depth': max_depth,\n",
292
    "               'min_samples_split': min_samples_split,\n",
293
    "               'min_samples_leaf': min_samples_leaf,\n",
294
    "               'max_leaf_nodes': max_leaf_nodes,\n",
295
    "               'bootstrap':bootstrap\n",
296
    "              }\n",
297
    "\n",
298
    "# First create the base model to tune\n",
299
    "rf = RandomForestClassifier(random_state=42,\n",
300
    "                            n_jobs = -1, \n",
301
    "                            class_weight=class_weight)\n",
302
    "\n",
303
    "# Random search of parameters, using 5 fold cross validation, different combinations, and use all available cores\n",
304
    "rf_random = RandomizedSearchCV(estimator = rf, \n",
305
    "                               param_distributions = random_grid, \n",
306
    "                               n_iter = 500, \n",
307
    "                               cv = 5)\n",
308
    "# Fit the random search model\n",
309
    "rf_random.fit(x_train, y_train)\n",
310
    "rf_random.best_params_"
311
   ]
312
  },
313
  {
314
   "cell_type": "markdown",
315
   "metadata": {},
316
   "source": [
317
    "<h4>[-] SMOTE-NC</h4>"
318
   ]
319
  },
320
  {
321
   "cell_type": "code",
322
   "execution_count": null,
323
   "metadata": {},
324
   "outputs": [],
325
   "source": [
326
    "smote_nc = SMOTENC(categorical_features=[3,4,10,11], k_neighbors= 3, random_state=42)\n",
327
    "x_smote_train, y_smote_train = smote_nc.fit_resample(x_train, y_train)"
328
   ]
329
  },
330
  {
331
   "cell_type": "markdown",
332
   "metadata": {},
333
   "source": [
334
    "<h4>[-] Random Forest Model</h4>"
335
   ]
336
  },
337
  {
338
   "cell_type": "code",
339
   "execution_count": null,
340
   "metadata": {},
341
   "outputs": [],
342
   "source": [
343
    "rm_smote = RandomForestClassifier(random_state = 42,\n",
344
    "                                       criterion= '...',\n",
345
    "                                       n_estimators = '...',\n",
346
    "                                       min_samples_split = '...',\n",
347
    "                                       min_samples_leaf = '...',\n",
348
    "                                       max_leaf_nodes = '...',\n",
349
    "                                       max_features = '...',\n",
350
    "                                       max_depth = '...',\n",
351
    "                                       bootstrap = '...')\n",
352
    "\n",
353
    "rm_smote.fit(x_smote_train, y_smote_train)\n",
354
    "print(\"Trained \\n\")\n",
355
    "\n",
356
    "score_rf_smote = rm_smote.score(x_test, y_test)\n",
357
    "print(\"Random Forest accuracy: \", round(score_rf_smote*100,2), \"% \\n\")\n",
358
    "\n",
359
    "y_smote_predicted = rm_smote.predict(x_test)\n",
360
    "cm_rf_smote = confusion_matrix(y_test, y_smote_predicted)\n",
361
    "print(cm_rf_smote, \"\\n\")\n",
362
    "\n",
363
    "false_positive_rate, true_positive_rate, thresholds = roc_curve(y_test, y_smote_predicted)\n",
364
    "roc_auc = auc(false_positive_rate, true_positive_rate)\n",
365
    "\n",
366
    "print('1. The F-1 Score of the model {} \\n '.format(round(f1_score(y_test, y_smote_predicted, average = 'macro'), 2)))\n",
367
    "print('2. The Recall Score of the model {} \\n '.format(round(recall_score(y_test, y_smote_predicted, average = 'macro'), 2)))\n",
368
    "print('3. Classification report \\n {} \\n'.format(classification_report(y_test, y_smote_predicted)))\n",
369
    "print('3. AUC: \\n {} \\n'.format(roc_auc))\n",
370
    "\n",
371
    "tn, fp, fn, tp = cm_rf_smote.ravel()\n",
372
    "\n",
373
    "# Sensitivity, hit rate, Recall, or true positive rate\n",
374
    "tpr = tp/(tp+fn)\n",
375
    "print(\"Sensitivity (TPR): {}\".format(tpr))\n",
376
    "\n",
377
    "# Specificity or true negative rate\n",
378
    "tnr = tn/(tn+fp)\n",
379
    "print(\"Specificity (TNR): {}\".format(tnr))\n",
380
    "\n",
381
    "# Precision or positive predictive value\n",
382
    "ppv = tp/(tp+fp)\n",
383
    "print(\"Precision (PPV): {}\".format(ppv))\n",
384
    "\n",
385
    "# Negative predictive value\n",
386
    "npv = tn/(tn+fn)\n",
387
    "print(\"Negative Predictive Value (NPV): {}\".format(npv))\n",
388
    "\n",
389
    "# False positive rate\n",
390
    "fpr = fp / (fp + tn)\n",
391
    "print(\"False Positive Rate (FPV): {}\".format(fpr))"
392
   ]
393
  },
394
  {
395
   "cell_type": "markdown",
396
   "metadata": {},
397
   "source": [
398
    "<h4> [-] Features Importance Plot </h4>"
399
   ]
400
  },
401
  {
402
   "cell_type": "code",
403
   "execution_count": null,
404
   "metadata": {},
405
   "outputs": [],
406
   "source": [
407
    "features = x_train.columns.values\n",
408
    "\n",
409
    "features[0] = 'Age'\n",
410
    "features[6] = 'Tumor volume T1'\n",
411
    "features[7] = 'edema volume'\n",
412
    "features[8] = 'Residual tumor'\n",
413
    "features[9] = 'Pre-operative KPS'\n",
414
    "features[10] = 'Post-operative KPS'\n",
415
    "features[11] = 'Onset neurological symptoms = 1'\n",
416
    "features[12] = 'Oncological protocol = 0'\n",
417
    "features[13] = 'Oncological protocol = 1'\n",
418
    "features[14] = 'Oncological protocol = 2'\n",
419
    "\n",
420
    "indices = np.argsort(importances)\n",
421
    "\n",
422
    "plt.title('Random Forest Classifier Features Importance')\n",
423
    "plt.barh(range(len(indices)), importances[indices], color='g', align='center')\n",
424
    "plt.yticks(range(len(indices)), [features[i] for i in indices])\n",
425
    "plt.xlabel('Relative Importance')\n",
426
    "\n",
427
    "plt.savefig(\"RF Features importance.jpg\", dpi = 400, facecolor='w', edgecolor='w',\n",
428
    "        orientation='landscape', papertype=None, format=None,\n",
429
    "        transparent=False, bbox_inches='tight', pad_inches=0.3,\n",
430
    "        frameon=None)\n",
431
    "\n",
432
    "plt.show()"
433
   ]
434
  }
435
 ],
436
 "metadata": {
437
  "kernelspec": {
438
   "display_name": "Python 3",
439
   "language": "python",
440
   "name": "python3"
441
  },
442
  "language_info": {
443
   "codemirror_mode": {
444
    "name": "ipython",
445
    "version": 3
446
   },
447
   "file_extension": ".py",
448
   "mimetype": "text/x-python",
449
   "name": "python",
450
   "nbconvert_exporter": "python",
451
   "pygments_lexer": "ipython3",
452
   "version": "3.7.4"
453
  }
454
 },
455
 "nbformat": 4,
456
 "nbformat_minor": 2
457
}