a b/Statistical Analysis.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "markdown",
5
   "metadata": {},
6
   "source": [
7
    "<h1 align=\"center\"> Machine learning-based prediction of early recurrence in glioblastoma patients: a glance towards precision medicine <br><br> [Statistical Analysis]</h1>"
8
   ]
9
  },
10
  {
11
   "cell_type": "markdown",
12
   "metadata": {},
13
   "source": [
14
    "<h2>[1] Library</h2>"
15
   ]
16
  },
17
  {
18
   "cell_type": "code",
19
   "execution_count": null,
20
   "metadata": {},
21
   "outputs": [],
22
   "source": [
23
    "# OS library\n",
24
    "import os\n",
25
    "import sys\n",
26
    "import argparse\n",
27
    "import random\n",
28
    "from math import sqrt\n",
29
    "\n",
30
    "# Analysis\n",
31
    "import numpy as np\n",
32
    "import pandas as pd\n",
33
    "import seaborn as sns\n",
34
    "import matplotlib.pyplot as plt\n",
35
    "\n",
36
    "from sklearn.linear_model import LogisticRegression\n",
37
    "from scipy import stats\n",
38
    "import statsmodels.api as sm\n",
39
    "from statsmodels.stats.proportion import proportion_confint\n",
40
    "\n",
41
    "import pingouin as pg\n",
42
    "%matplotlib inline"
43
   ]
44
  },
45
  {
46
   "cell_type": "markdown",
47
   "metadata": {},
48
   "source": [
49
    "<h2>[2] Data Preprocessing</h2>"
50
   ]
51
  },
52
  {
53
   "cell_type": "markdown",
54
   "metadata": {},
55
   "source": [
56
    "<h4>[-] Load the database</h4>"
57
   ]
58
  },
59
  {
60
   "cell_type": "code",
61
   "execution_count": null,
62
   "metadata": {},
63
   "outputs": [],
64
   "source": [
65
    "file = os.path.join(sys.path[0], \"db.xlsx\")\n",
66
    "db = pd.read_excel(file)\n",
67
    "\n",
68
    "print(\"N° of patients: {}\".format(len(db)))\n",
69
    "print(\"N° of columns: {}\".format(db.shape[1]))\n",
70
    "db.head()"
71
   ]
72
  },
73
  {
74
   "cell_type": "markdown",
75
   "metadata": {},
76
   "source": [
77
    "<h4>[-] Drop unwanted columns + create <i>'results'</i> column</h4>"
78
   ]
79
  },
80
  {
81
   "cell_type": "code",
82
   "execution_count": null,
83
   "metadata": {},
84
   "outputs": [],
85
   "source": [
86
    "df = db.drop(['Name_Surname','SURVIVAL', 'OS', '...'], axis = 'columns')\n",
87
    "\n",
88
    "print(\"Effective features to consider: {} \".format(len(df.columns)-1))\n",
89
    "print(\"Creating 'result' column...\")\n",
90
    "\n",
91
    "# 0 = No relapse\n",
92
    "df.loc[df['PFS'] > 6, 'outcome'] = 0\n",
93
    "\n",
94
    "# 1 = Early relapse (within 6 months)\n",
95
    "df.loc[df['PFS'] <= 6, 'outcome'] = 1"
96
   ]
97
  },
98
  {
99
   "cell_type": "markdown",
100
   "metadata": {},
101
   "source": [
102
    "<h2>[3] Count and Frequency</h2>"
103
   ]
104
  },
105
  {
106
   "cell_type": "code",
107
   "execution_count": null,
108
   "metadata": {},
109
   "outputs": [],
110
   "source": [
111
    "df.groupby(['outcome', '...']).count()"
112
   ]
113
  },
114
  {
115
   "cell_type": "code",
116
   "execution_count": null,
117
   "metadata": {},
118
   "outputs": [],
119
   "source": [
120
    "df['...'].describe()"
121
   ]
122
  },
123
  {
124
   "cell_type": "markdown",
125
   "metadata": {},
126
   "source": [
127
    "<h2>[4] Statistical Association</h2>\n",
128
    "<ul>\n",
129
    "    <li>Levene's test is an inferential statistic used to assess the equality of variances for a variable calculated for two or more groups. If p-value >> 0.05, no difference in variances between the groups</li>\n",
130
    "    <li>F-one way ANOVA test is performed if the variance is the same</li>\n",
131
    "</ul>"
132
   ]
133
  },
134
  {
135
   "cell_type": "code",
136
   "execution_count": null,
137
   "metadata": {},
138
   "outputs": [],
139
   "source": [
140
    "non_early = df[df['outcome'] == 0]['...']\n",
141
    "early_relapse = df[df['outcome'] == 1]['...']\n",
142
    "\n",
143
    "print(non_early.shape)\n",
144
    "print(stats.levene(non_early, early_relapse))\n",
145
    "print(stats.f_oneway(non_early, early_relapse))\n",
146
    "\n",
147
    "## Change equal_var to False if Levene p-value is below 0.05\n",
148
    "print(stats.ttest_ind(non_early, early_relapse, equal_var=True))"
149
   ]
150
  },
151
  {
152
   "cell_type": "code",
153
   "execution_count": null,
154
   "metadata": {},
155
   "outputs": [],
156
   "source": [
157
    "sex_ct = pd.crosstab(df['...'], df['outcome'])\n",
158
    "print(\"--- *** Contingency Table *** --- \\n\",sex_ct)\n",
159
    "\n",
160
    "print(\"\\n--- *** Chi-Square *** ---\")\n",
161
    "stat, p, dof, expected = stats.chi2_contingency(sex_ct, correction = False)\n",
162
    "print(\"DOF=%d\" % dof)\n",
163
    "print(\"Expected values = \", expected)\n",
164
    "print(\"p-value = \", p)\n",
165
    "print(\"stat = \", stat)\n",
166
    "\n",
167
    "prob = 0.95\n",
168
    "critical = stats.chi2.ppf(prob, dof)\n",
169
    "if abs(stat) >= critical:\n",
170
    "    print('\\nDependent (reject H0), [Critical: {}]'.format(critical))\n",
171
    "else:\n",
172
    "    print('\\nIndependent (fail to reject H0), [Critical: {}]'.format(critical))"
173
   ]
174
  },
175
  {
176
   "cell_type": "markdown",
177
   "metadata": {},
178
   "source": [
179
    "<h4>[-] Holm-Bonferroni correction</h4>"
180
   ]
181
  },
182
  {
183
   "cell_type": "code",
184
   "execution_count": null,
185
   "metadata": {},
186
   "outputs": [],
187
   "source": [
188
    "pvals = [...]\n",
189
    "significant, adjusted = pg.multicomp(pvals, alpha=0.05, method='holm')\n",
190
    "tab = {'Uncorrected':pvals, 'Adjusted':adjusted, 'Significant':significant}\n",
191
    "df = pd.DataFrame(tab)\n",
192
    "df"
193
   ]
194
  },
195
  {
196
   "cell_type": "markdown",
197
   "metadata": {},
198
   "source": [
199
    "<h2>[5] Multivariable Analysis</h2>"
200
   ]
201
  },
202
  {
203
   "cell_type": "markdown",
204
   "metadata": {},
205
   "source": [
206
    "<h4>[-] Label encoding</h4>"
207
   ]
208
  },
209
  {
210
   "cell_type": "code",
211
   "execution_count": null,
212
   "metadata": {},
213
   "outputs": [],
214
   "source": [
215
    "dummy_v = ['localization', '...']\n",
216
    "df = pd.get_dummies(df, columns = dummy_v, prefix = dummy_v)\n",
217
    "df[['..']].astype(float)\n",
218
    "df.head(5)"
219
   ]
220
  },
221
  {
222
   "cell_type": "code",
223
   "execution_count": null,
224
   "metadata": {},
225
   "outputs": [],
226
   "source": [
227
    "cols_to_keep = ['...']\n",
228
    "data = df[cols_to_keep]\n",
229
    "\n",
230
    "# manually add the intercept\n",
231
    "data['intercept'] = 1.0\n",
232
    "data.head()\n",
233
    "data.columns"
234
   ]
235
  },
236
  {
237
   "cell_type": "code",
238
   "execution_count": null,
239
   "metadata": {},
240
   "outputs": [],
241
   "source": [
242
    "train_cols = ['...']\n",
243
    "logit = sm.Logit(data['outcome'], data[train_cols], missing = 'drop')\n",
244
    "result = logit.fit()"
245
   ]
246
  },
247
  {
248
   "cell_type": "code",
249
   "execution_count": null,
250
   "metadata": {},
251
   "outputs": [],
252
   "source": [
253
    "result.summary(alpha = 0.05)"
254
   ]
255
  },
256
  {
257
   "cell_type": "code",
258
   "execution_count": null,
259
   "metadata": {},
260
   "outputs": [],
261
   "source": [
262
    "coef = result.params\n",
263
    "p = result.pvalues\n",
264
    "conf = result.conf_int(alpha = 0.05)\n",
265
    "\n",
266
    "conf['OR'] = coef\n",
267
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
268
    "\n",
269
    "conf = np.exp(conf)\n",
270
    "conf['p-value'] = p"
271
   ]
272
  },
273
  {
274
   "cell_type": "markdown",
275
   "metadata": {},
276
   "source": [
277
    "<h4>[-] Export Multivariable as Excel file</h4>"
278
   ]
279
  },
280
  {
281
   "cell_type": "code",
282
   "execution_count": null,
283
   "metadata": {},
284
   "outputs": [],
285
   "source": [
286
    "conf.to_excel(\"multivariable.xlsx\")"
287
   ]
288
  },
289
  {
290
   "cell_type": "code",
291
   "execution_count": null,
292
   "metadata": {},
293
   "outputs": [],
294
   "source": []
295
  },
296
  {
297
   "cell_type": "code",
298
   "execution_count": null,
299
   "metadata": {},
300
   "outputs": [],
301
   "source": []
302
  }
303
 ],
304
 "metadata": {
305
  "kernelspec": {
306
   "display_name": "Python 3",
307
   "language": "python",
308
   "name": "python3"
309
  },
310
  "language_info": {
311
   "codemirror_mode": {
312
    "name": "ipython",
313
    "version": 3
314
   },
315
   "file_extension": ".py",
316
   "mimetype": "text/x-python",
317
   "name": "python",
318
   "nbconvert_exporter": "python",
319
   "pygments_lexer": "ipython3",
320
   "version": "3.7.4"
321
  }
322
 },
323
 "nbformat": 4,
324
 "nbformat_minor": 2
325
}