[bda6f5]: / Statistical Analysis.ipynb

Download this file

326 lines (325 with data), 7.6 kB

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h1 align=\"center\"> Machine learning-based prediction of early recurrence in glioblastoma patients: a glance towards precision medicine <br><br> [Statistical Analysis]</h1>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h2>[1] Library</h2>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# OS library\n",
    "import os\n",
    "import sys\n",
    "import argparse\n",
    "import random\n",
    "from math import sqrt\n",
    "\n",
    "# Analysis\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from scipy import stats\n",
    "import statsmodels.api as sm\n",
    "from statsmodels.stats.proportion import proportion_confint\n",
    "\n",
    "import pingouin as pg\n",
    "%matplotlib inline"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h2>[2] Data Preprocessing</h2>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h4>[-] Load the database</h4>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "file = os.path.join(sys.path[0], \"db.xlsx\")\n",
    "db = pd.read_excel(file)\n",
    "\n",
    "print(\"N° of patients: {}\".format(len(db)))\n",
    "print(\"N° of columns: {}\".format(db.shape[1]))\n",
    "db.head()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h4>[-] Drop unwanted columns + create <i>'results'</i> column</h4>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = db.drop(['Name_Surname','SURVIVAL', 'OS', '...'], axis = 'columns')\n",
    "\n",
    "print(\"Effective features to consider: {} \".format(len(df.columns)-1))\n",
    "print(\"Creating 'result' column...\")\n",
    "\n",
    "# 0 = No relapse\n",
    "df.loc[df['PFS'] > 6, 'outcome'] = 0\n",
    "\n",
    "# 1 = Early relapse (within 6 months)\n",
    "df.loc[df['PFS'] <= 6, 'outcome'] = 1"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h2>[3] Count and Frequency</h2>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df.groupby(['outcome', '...']).count()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['...'].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h2>[4] Statistical Association</h2>\n",
    "<ul>\n",
    "    <li>Levene's test is an inferential statistic used to assess the equality of variances for a variable calculated for two or more groups. If p-value >> 0.05, no difference in variances between the groups</li>\n",
    "    <li>F-one way ANOVA test is performed if the variance is the same</li>\n",
    "</ul>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "non_early = df[df['outcome'] == 0]['...']\n",
    "early_relapse = df[df['outcome'] == 1]['...']\n",
    "\n",
    "print(non_early.shape)\n",
    "print(stats.levene(non_early, early_relapse))\n",
    "print(stats.f_oneway(non_early, early_relapse))\n",
    "\n",
    "## Change equal_var to False if Levene p-value is below 0.05\n",
    "print(stats.ttest_ind(non_early, early_relapse, equal_var=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "sex_ct = pd.crosstab(df['...'], df['outcome'])\n",
    "print(\"--- *** Contingency Table *** --- \\n\",sex_ct)\n",
    "\n",
    "print(\"\\n--- *** Chi-Square *** ---\")\n",
    "stat, p, dof, expected = stats.chi2_contingency(sex_ct, correction = False)\n",
    "print(\"DOF=%d\" % dof)\n",
    "print(\"Expected values = \", expected)\n",
    "print(\"p-value = \", p)\n",
    "print(\"stat = \", stat)\n",
    "\n",
    "prob = 0.95\n",
    "critical = stats.chi2.ppf(prob, dof)\n",
    "if abs(stat) >= critical:\n",
    "    print('\\nDependent (reject H0), [Critical: {}]'.format(critical))\n",
    "else:\n",
    "    print('\\nIndependent (fail to reject H0), [Critical: {}]'.format(critical))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h4>[-] Holm-Bonferroni correction</h4>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pvals = [...]\n",
    "significant, adjusted = pg.multicomp(pvals, alpha=0.05, method='holm')\n",
    "tab = {'Uncorrected':pvals, 'Adjusted':adjusted, 'Significant':significant}\n",
    "df = pd.DataFrame(tab)\n",
    "df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h2>[5] Multivariable Analysis</h2>"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h4>[-] Label encoding</h4>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "dummy_v = ['localization', '...']\n",
    "df = pd.get_dummies(df, columns = dummy_v, prefix = dummy_v)\n",
    "df[['..']].astype(float)\n",
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "cols_to_keep = ['...']\n",
    "data = df[cols_to_keep]\n",
    "\n",
    "# manually add the intercept\n",
    "data['intercept'] = 1.0\n",
    "data.head()\n",
    "data.columns"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_cols = ['...']\n",
    "logit = sm.Logit(data['outcome'], data[train_cols], missing = 'drop')\n",
    "result = logit.fit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "result.summary(alpha = 0.05)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "coef = result.params\n",
    "p = result.pvalues\n",
    "conf = result.conf_int(alpha = 0.05)\n",
    "\n",
    "conf['OR'] = coef\n",
    "conf.columns = ['2.5%', '97.5%', 'OR']\n",
    "\n",
    "conf = np.exp(conf)\n",
    "conf['p-value'] = p"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "<h4>[-] Export Multivariable as Excel file</h4>"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "conf.to_excel(\"multivariable.xlsx\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}