ML-6months-PFS-GBM / Git / Diff of /Statistical Analysis.ipynb

Models:
MarcoTheBlack/
ML-6months-PFS-GBM
Downloads: 1
Diff of /Statistical Analysis.ipynb [000000] .. [505a55]
Switch to side-by-side view

--- a
+++ b/Statistical Analysis.ipynb
@@ -0,0 +1,325 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h1 align=\"center\"> Machine learning-based prediction of early recurrence in glioblastoma patients: a glance towards precision medicine <br><br> [Statistical Analysis]</h1>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h2>[1] Library</h2>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# OS library\n",
+    "import os\n",
+    "import sys\n",
+    "import argparse\n",
+    "import random\n",
+    "from math import sqrt\n",
+    "\n",
+    "# Analysis\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "from scipy import stats\n",
+    "import statsmodels.api as sm\n",
+    "from statsmodels.stats.proportion import proportion_confint\n",
+    "\n",
+    "import pingouin as pg\n",
+    "%matplotlib inline"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h2>[2] Data Preprocessing</h2>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h4>[-] Load the database</h4>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "file = os.path.join(sys.path[0], \"db.xlsx\")\n",
+    "db = pd.read_excel(file)\n",
+    "\n",
+    "print(\"N° of patients: {}\".format(len(db)))\n",
+    "print(\"N° of columns: {}\".format(db.shape[1]))\n",
+    "db.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h4>[-] Drop unwanted columns + create <i>'results'</i> column</h4>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = db.drop(['Name_Surname','SURVIVAL', 'OS', '...'], axis = 'columns')\n",
+    "\n",
+    "print(\"Effective features to consider: {} \".format(len(df.columns)-1))\n",
+    "print(\"Creating 'result' column...\")\n",
+    "\n",
+    "# 0 = No relapse\n",
+    "df.loc[df['PFS'] > 6, 'outcome'] = 0\n",
+    "\n",
+    "# 1 = Early relapse (within 6 months)\n",
+    "df.loc[df['PFS'] <= 6, 'outcome'] = 1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h2>[3] Count and Frequency</h2>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df.groupby(['outcome', '...']).count()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df['...'].describe()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h2>[4] Statistical Association</h2>\n",
+    "<ul>\n",
+    "    <li>Levene's test is an inferential statistic used to assess the equality of variances for a variable calculated for two or more groups. If p-value >> 0.05, no difference in variances between the groups</li>\n",
+    "    <li>F-one way ANOVA test is performed if the variance is the same</li>\n",
+    "</ul>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "non_early = df[df['outcome'] == 0]['...']\n",
+    "early_relapse = df[df['outcome'] == 1]['...']\n",
+    "\n",
+    "print(non_early.shape)\n",
+    "print(stats.levene(non_early, early_relapse))\n",
+    "print(stats.f_oneway(non_early, early_relapse))\n",
+    "\n",
+    "## Change equal_var to False if Levene p-value is below 0.05\n",
+    "print(stats.ttest_ind(non_early, early_relapse, equal_var=True))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "sex_ct = pd.crosstab(df['...'], df['outcome'])\n",
+    "print(\"--- *** Contingency Table *** --- \\n\",sex_ct)\n",
+    "\n",
+    "print(\"\\n--- *** Chi-Square *** ---\")\n",
+    "stat, p, dof, expected = stats.chi2_contingency(sex_ct, correction = False)\n",
+    "print(\"DOF=%d\" % dof)\n",
+    "print(\"Expected values = \", expected)\n",
+    "print(\"p-value = \", p)\n",
+    "print(\"stat = \", stat)\n",
+    "\n",
+    "prob = 0.95\n",
+    "critical = stats.chi2.ppf(prob, dof)\n",
+    "if abs(stat) >= critical:\n",
+    "    print('\\nDependent (reject H0), [Critical: {}]'.format(critical))\n",
+    "else:\n",
+    "    print('\\nIndependent (fail to reject H0), [Critical: {}]'.format(critical))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h4>[-] Holm-Bonferroni correction</h4>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "pvals = [...]\n",
+    "significant, adjusted = pg.multicomp(pvals, alpha=0.05, method='holm')\n",
+    "tab = {'Uncorrected':pvals, 'Adjusted':adjusted, 'Significant':significant}\n",
+    "df = pd.DataFrame(tab)\n",
+    "df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h2>[5] Multivariable Analysis</h2>"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h4>[-] Label encoding</h4>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "dummy_v = ['localization', '...']\n",
+    "df = pd.get_dummies(df, columns = dummy_v, prefix = dummy_v)\n",
+    "df[['..']].astype(float)\n",
+    "df.head(5)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "cols_to_keep = ['...']\n",
+    "data = df[cols_to_keep]\n",
+    "\n",
+    "# manually add the intercept\n",
+    "data['intercept'] = 1.0\n",
+    "data.head()\n",
+    "data.columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_cols = ['...']\n",
+    "logit = sm.Logit(data['outcome'], data[train_cols], missing = 'drop')\n",
+    "result = logit.fit()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "result.summary(alpha = 0.05)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "coef = result.params\n",
+    "p = result.pvalues\n",
+    "conf = result.conf_int(alpha = 0.05)\n",
+    "\n",
+    "conf['OR'] = coef\n",
+    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
+    "\n",
+    "conf = np.exp(conf)\n",
+    "conf['p-value'] = p"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<h4>[-] Export Multivariable as Excel file</h4>"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "conf.to_excel(\"multivariable.xlsx\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.7.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}