--- a +++ b/Statistical Analysis.ipynb @@ -0,0 +1,325 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<h1 align=\"center\"> Machine learning-based prediction of early recurrence in glioblastoma patients: a glance towards precision medicine <br><br> [Statistical Analysis]</h1>" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<h2>[1] Library</h2>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# OS library\n", + "import os\n", + "import sys\n", + "import argparse\n", + "import random\n", + "from math import sqrt\n", + "\n", + "# Analysis\n", + "import numpy as np\n", + "import pandas as pd\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "from sklearn.linear_model import LogisticRegression\n", + "from scipy import stats\n", + "import statsmodels.api as sm\n", + "from statsmodels.stats.proportion import proportion_confint\n", + "\n", + "import pingouin as pg\n", + "%matplotlib inline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<h2>[2] Data Preprocessing</h2>" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<h4>[-] Load the database</h4>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "file = os.path.join(sys.path[0], \"db.xlsx\")\n", + "db = pd.read_excel(file)\n", + "\n", + "print(\"N° of patients: {}\".format(len(db)))\n", + "print(\"N° of columns: {}\".format(db.shape[1]))\n", + "db.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<h4>[-] Drop unwanted columns + create <i>'results'</i> column</h4>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df = db.drop(['Name_Surname','SURVIVAL', 'OS', '...'], axis = 'columns')\n", + "\n", + "print(\"Effective features to consider: {} \".format(len(df.columns)-1))\n", + "print(\"Creating 'result' column...\")\n", + "\n", + "# 0 = No relapse\n", + "df.loc[df['PFS'] > 6, 'outcome'] = 0\n", + "\n", + "# 1 = Early relapse (within 6 months)\n", + "df.loc[df['PFS'] <= 6, 'outcome'] = 1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<h2>[3] Count and Frequency</h2>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df.groupby(['outcome', '...']).count()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "df['...'].describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<h2>[4] Statistical Association</h2>\n", + "<ul>\n", + " <li>Levene's test is an inferential statistic used to assess the equality of variances for a variable calculated for two or more groups. If p-value >> 0.05, no difference in variances between the groups</li>\n", + " <li>F-one way ANOVA test is performed if the variance is the same</li>\n", + "</ul>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "non_early = df[df['outcome'] == 0]['...']\n", + "early_relapse = df[df['outcome'] == 1]['...']\n", + "\n", + "print(non_early.shape)\n", + "print(stats.levene(non_early, early_relapse))\n", + "print(stats.f_oneway(non_early, early_relapse))\n", + "\n", + "## Change equal_var to False if Levene p-value is below 0.05\n", + "print(stats.ttest_ind(non_early, early_relapse, equal_var=True))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "sex_ct = pd.crosstab(df['...'], df['outcome'])\n", + "print(\"--- *** Contingency Table *** --- \\n\",sex_ct)\n", + "\n", + "print(\"\\n--- *** Chi-Square *** ---\")\n", + "stat, p, dof, expected = stats.chi2_contingency(sex_ct, correction = False)\n", + "print(\"DOF=%d\" % dof)\n", + "print(\"Expected values = \", expected)\n", + "print(\"p-value = \", p)\n", + "print(\"stat = \", stat)\n", + "\n", + "prob = 0.95\n", + "critical = stats.chi2.ppf(prob, dof)\n", + "if abs(stat) >= critical:\n", + " print('\\nDependent (reject H0), [Critical: {}]'.format(critical))\n", + "else:\n", + " print('\\nIndependent (fail to reject H0), [Critical: {}]'.format(critical))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<h4>[-] Holm-Bonferroni correction</h4>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "pvals = [...]\n", + "significant, adjusted = pg.multicomp(pvals, alpha=0.05, method='holm')\n", + "tab = {'Uncorrected':pvals, 'Adjusted':adjusted, 'Significant':significant}\n", + "df = pd.DataFrame(tab)\n", + "df" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<h2>[5] Multivariable Analysis</h2>" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<h4>[-] Label encoding</h4>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dummy_v = ['localization', '...']\n", + "df = pd.get_dummies(df, columns = dummy_v, prefix = dummy_v)\n", + "df[['..']].astype(float)\n", + "df.head(5)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "cols_to_keep = ['...']\n", + "data = df[cols_to_keep]\n", + "\n", + "# manually add the intercept\n", + "data['intercept'] = 1.0\n", + "data.head()\n", + "data.columns" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "train_cols = ['...']\n", + "logit = sm.Logit(data['outcome'], data[train_cols], missing = 'drop')\n", + "result = logit.fit()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "result.summary(alpha = 0.05)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "coef = result.params\n", + "p = result.pvalues\n", + "conf = result.conf_int(alpha = 0.05)\n", + "\n", + "conf['OR'] = coef\n", + "conf.columns = ['2.5%', '97.5%', 'OR']\n", + "\n", + "conf = np.exp(conf)\n", + "conf['p-value'] = p" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "<h4>[-] Export Multivariable as Excel file</h4>" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "conf.to_excel(\"multivariable.xlsx\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}