326 lines (325 with data), 7.6 kB
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h1 align=\"center\"> Machine learning-based prediction of early recurrence in glioblastoma patients: a glance towards precision medicine <br><br> [Statistical Analysis]</h1>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h2>[1] Library</h2>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# OS library\n",
"import os\n",
"import sys\n",
"import argparse\n",
"import random\n",
"from math import sqrt\n",
"\n",
"# Analysis\n",
"import numpy as np\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"\n",
"from sklearn.linear_model import LogisticRegression\n",
"from scipy import stats\n",
"import statsmodels.api as sm\n",
"from statsmodels.stats.proportion import proportion_confint\n",
"\n",
"import pingouin as pg\n",
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h2>[2] Data Preprocessing</h2>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h4>[-] Load the database</h4>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Load the patient workbook that sits next to the notebook.\n",
"# The path is resolved against the working directory: sys.path[0] is an\n",
"# import-search entry and is not guaranteed to be the notebook folder.\n",
"file = os.path.join(os.getcwd(), \"db.xlsx\")\n",
"db = pd.read_excel(file)\n",
"\n",
"# Quick sanity check of the table size before any preprocessing.\n",
"print(\"N° of patients: {}\".format(len(db)))\n",
"print(\"N° of columns: {}\".format(db.shape[1]))\n",
"db.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h4>[-] Drop unwanted columns + create <i>'outcome'</i> column</h4>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Work on a copy of the raw table without the columns that are not used as features.\n",
"df = db.drop(['Name_Surname','SURVIVAL', 'OS', '...'], axis = 'columns')\n",
"\n",
"print(\"Effective features to consider: {} \".format(len(df.columns)-1))\n",
"print(\"Creating 'outcome' column...\")\n",
"\n",
"# Months of progression-free survival up to which a relapse counts as early.\n",
"EARLY_RELAPSE_MONTHS = 6\n",
"\n",
"# Rows with a missing PFS match neither mask below and keep a missing outcome.\n",
"# 0 = No early relapse (PFS above the cut-off)\n",
"df.loc[df['PFS'] > EARLY_RELAPSE_MONTHS, 'outcome'] = 0\n",
"\n",
"# 1 = Early relapse (PFS within the cut-off)\n",
"df.loc[df['PFS'] <= EARLY_RELAPSE_MONTHS, 'outcome'] = 1"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h2>[3] Count and Frequency</h2>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Non-null entries of every column for each outcome / category combination.\n",
"group_keys = ['outcome', '...']\n",
"df.groupby(group_keys).count()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Summary statistics of the selected column.\n",
"df.loc[:, '...'].describe()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h2>[4] Statistical Association</h2>\n",
"<ul>\n",
" <li>Levene's test is an inferential statistic used to assess the equality of variances for a variable calculated for two or more groups. If the p-value is greater than 0.05, the hypothesis of equal variances between the groups is not rejected</li>\n",
" <li>The one-way ANOVA F-test is performed when the variances are equal</li>\n",
"</ul>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Split the variable of interest by outcome. Missing values are removed first,\n",
"# because the scipy tests below propagate NaN into their statistic and p-value.\n",
"non_early = df.loc[df['outcome'] == 0, '...'].dropna()\n",
"early_relapse = df.loc[df['outcome'] == 1, '...'].dropna()\n",
"\n",
"print(non_early.shape)\n",
"\n",
"# Levene's test decides which t-test variant is appropriate.\n",
"levene_res = stats.levene(non_early, early_relapse)\n",
"print(levene_res)\n",
"print(stats.f_oneway(non_early, early_relapse))\n",
"\n",
"# Student's t-test when Levene does not reject equal variances (p >= 0.05),\n",
"# Welch's t-test otherwise.\n",
"equal_var = levene_res.pvalue >= 0.05\n",
"print(stats.ttest_ind(non_early, early_relapse, equal_var=equal_var))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Contingency table of the categorical variable against the outcome.\n",
"sex_ct = pd.crosstab(index = df['...'], columns = df['outcome'])\n",
"print(\"--- *** Contingency Table *** --- \\n\",sex_ct)\n",
"\n",
"# Chi-square test of independence, continuity correction switched off.\n",
"print(\"\\n--- *** Chi-Square *** ---\")\n",
"stat, p, dof, expected = stats.chi2_contingency(sex_ct, correction = False)\n",
"print(\"DOF=%d\" % dof)\n",
"print(\"Expected values = \", expected)\n",
"print(\"p-value = \", p)\n",
"print(\"stat = \", stat)\n",
"\n",
"# Compare the statistic with the chi-square quantile at the chosen probability.\n",
"prob = 0.95\n",
"critical = stats.chi2.ppf(prob, dof)\n",
"h0_rejected = abs(stat) >= critical\n",
"verdict = '\\nDependent (reject H0), [Critical: {}]' if h0_rejected else '\\nIndependent (fail to reject H0), [Critical: {}]'\n",
"print(verdict.format(critical))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h4>[-] Holm-Bonferroni correction</h4>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Placeholder: list here the uncorrected p-values that should be adjusted.\n",
"pvals = [...]\n",
"\n",
"# Holm step-down adjustment; multicomp returns the reject flags and the adjusted p-values.\n",
"significant, adjusted = pg.multicomp(pvals, alpha=0.05, method='holm')\n",
"tab = {'Uncorrected':pvals, 'Adjusted':adjusted, 'Significant':significant}\n",
"\n",
"# Kept under its own name: rebinding `df` here would replace the patient table\n",
"# that the encoding and regression cells further down still read.\n",
"holm_table = pd.DataFrame(tab)\n",
"holm_table"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h2>[5] Multivariable Analysis</h2>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h4>[-] One-hot (dummy) encoding</h4>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Replace each categorical predictor with one indicator column per category.\n",
"dummy_v = ['localization', '...']\n",
"df = pd.get_dummies(df, columns = dummy_v, prefix = dummy_v)\n",
"\n",
"# astype returns a new object, so the converted columns have to be assigned back;\n",
"# without the assignment the conversion is silently discarded.\n",
"df[['..']] = df[['..']].astype(float)\n",
"df.head(5)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Columns that enter the multivariable model (outcome included).\n",
"cols_to_keep = ['...']\n",
"\n",
"# Take an explicit copy: adding a column to a selection of df would otherwise\n",
"# raise SettingWithCopyWarning.\n",
"data = df[cols_to_keep].copy()\n",
"\n",
"# manually add the intercept\n",
"data['intercept'] = 1.0\n",
"\n",
"# Show the final column set (a cell displays only its last expression).\n",
"data.columns"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Predictors of the logistic model; the response is the binary outcome column.\n",
"train_cols = ['...']\n",
"\n",
"# missing = 'drop' makes statsmodels discard rows that contain missing values.\n",
"logit = sm.Logit(endog = data['outcome'], exog = data[train_cols], missing = 'drop')\n",
"result = logit.fit()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Regression table; alpha = 0.05 sets the level of the reported confidence intervals.\n",
"result.summary(alpha=0.05)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"coef = result.params\n",
"p = result.pvalues\n",
"\n",
"# Confidence bounds and point estimates, still on the coefficient (log-odds) scale.\n",
"conf = result.conf_int(alpha = 0.05)\n",
"conf.columns = ['2.5%', '97.5%']\n",
"conf = conf.assign(OR = coef)\n",
"\n",
"# Exponentiate all three columns, then attach the untransformed p-values.\n",
"conf = np.exp(conf)\n",
"conf = conf.assign(**{'p-value': p})"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<h4>[-] Export Multivariable as Excel file</h4>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Save the odds-ratio table as an Excel workbook (relative path).\n",
"conf.to_excel(excel_writer = \"multivariable.xlsx\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}