--- /dev/null
+++ b/evaluation/Evaluation_Primary.ipynb
@@ -0,0 +1,236 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import scipy.sparse\n",
+    "import joblib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the ensemble of models\n",
+    "\n",
+    "models_dict = joblib.load('models_dict.joblib')\n",
+    "models_dict.keys()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load data: data should be formatted like the sample cohort; see the README for an example\n",
+    "\n",
+    "df_cohort = pd.read_csv('sample_cohort.csv')\n",
+    "test_hosp, test_window, test_y = df_cohort['hosp_id'], df_cohort['window_id'], df_cohort['y']\n",
+    "\n",
+    "# Empty frame indexed by ID; used below to align the feature rows to the cohort order\n",
+    "cohort_IDs = df_cohort.set_index('ID')[[]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_cohort"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Functions to assist with evaluation\n",
+    "\n",
+    "from sklearn import metrics, utils\n",
+    "from joblib import Parallel, delayed\n",
+    "\n",
+    "\n",
+    "def bootstrap_func(i, y_true, y_score):\n",
+    "    # Bootstrap resample to calculate AUROC\n",
+    "    yte_true_b, yte_pred_b = utils.resample(y_true, y_score, replace=True, random_state=i)\n",
+    "    return metrics.roc_curve(yte_true_b, yte_pred_b), metrics.roc_auc_score(yte_true_b, yte_pred_b)\n",
+    "\n",
+    "def get_roc_CI(y_true, y_score):\n",
+    "    # Bootstrap confidence intervals\n",
+    "    roc_curves, auc_scores = zip(*Parallel(n_jobs=4)(delayed(bootstrap_func)(i, y_true, y_score) for i in range(1000)))\n",
+    "    print('Test AUC: ({:.3f}, {:.3f}) percentile 95% CI'.format(np.percentile(auc_scores, 2.5), np.percentile(auc_scores, 97.5)))\n",
+    "\n",
+    "    tprs = []\n",
+    "    aucs = []\n",
+    "    mean_fpr = np.linspace(0, 1, 100)\n",
+    "    for fpr, tpr, _ in roc_curves:\n",
+    "        tprs.append(np.interp(mean_fpr, fpr, tpr))\n",
+    "        tprs[-1][0] = 0.0\n",
+    "        aucs.append(metrics.auc(fpr, tpr))\n",
+    "\n",
+    "    mean_tpr = np.mean(tprs, axis=0)\n",
+    "    std_tpr = np.std(tprs, axis=0)\n",
+    "    tprs_upper = np.minimum(mean_tpr + 1.96 * std_tpr, 1)\n",
+    "    tprs_lower = np.maximum(mean_tpr - 1.96 * std_tpr, 0)\n",
+    "    return roc_curves, auc_scores, mean_fpr, tprs_lower, tprs_upper\n",
+    "\n",
+    "def eval3(df_Yte_all):\n",
+    "    # Aggregate to the hospital-admission level: keep complete windows, take the max over each admission\n",
+    "    df_Yte = df_Yte_all.copy()\n",
+    "    df_Yte = df_Yte[df_Yte['window_id'] >= 1]\n",
+    "    df_Yte_agg = df_Yte.groupby(['hosp_id']).max()\n",
+    "    return df_Yte_agg"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## M-CURES model performance and scores"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Load the M-CURES model ensemble and the patient features. Features should be generated by the preprocessing script.\n",
+    "\n",
+    "mcures_clfs = models_dict['M-CURES']\n",
+    "df_mcures = pd.read_csv('../preprocessing/sample_output/mcures.csv').set_index('ID')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df_mcures"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "### AUROC on the given dataset\n",
+    "\n",
+    "# Calculate model outputs for all patients, averaged over all models in the ensemble\n",
+    "eval_matrix = scipy.sparse.csr_matrix(cohort_IDs.join(df_mcures).values.astype(float))\n",
+    "all_y = np.array([clf.predict_proba(eval_matrix)[:,1] for clf in mcures_clfs])\n",
+    "y_scores = all_y.mean(0)\n",
+    "\n",
+    "# To evaluate the models, take the maximum over all windows\n",
+    "df_Yte_all = pd.DataFrame({'hosp_id': test_hosp, 'window_id': test_window, 'y': test_y, 'y_score': y_scores})\n",
+    "df_Yte_agg = eval3(df_Yte_all)\n",
+    "y_score = df_Yte_agg['y_score']\n",
+    "y_true = df_Yte_agg['y']\n",
+    "fpr, tpr, thresholds = metrics.roc_curve(y_true, y_score)\n",
+    "print('Test AUC: {:.3f}'.format(metrics.roc_auc_score(y_true, y_score)))\n",
+    "\n",
+    "# # Optionally: generate the 95% CI\n",
+    "# try:\n",
+    "#     roc_curves, auc_scores, mean_fpr, tprs_lower, tprs_upper = get_roc_CI(y_true, y_score)\n",
+    "# except Exception:\n",
+    "#     pass"
+   ]
+  },
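+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional: plot the ROC curve with the bootstrapped 95% band returned\n",
+    "# by get_roc_CI above. A minimal sketch; it assumes matplotlib is\n",
+    "# installed, since it is not imported anywhere else in this notebook.\n",
+    "import matplotlib.pyplot as plt\n",
+    "\n",
+    "roc_curves, auc_scores, mean_fpr, tprs_lower, tprs_upper = get_roc_CI(y_true, y_score)\n",
+    "\n",
+    "plt.figure(figsize=(5, 5))\n",
+    "plt.plot(fpr, tpr, color='C0', label='AUROC = {:.3f}'.format(metrics.roc_auc_score(y_true, y_score)))\n",
+    "plt.fill_between(mean_fpr, tprs_lower, tprs_upper, color='C0', alpha=0.2, label='bootstrap 95% CI')\n",
+    "plt.plot([0, 1], [0, 1], linestyle='--', color='gray')\n",
+    "plt.xlabel('False positive rate')\n",
+    "plt.ylabel('True positive rate')\n",
+    "plt.legend(loc='lower right')\n",
+    "plt.show()"
+   ]
+  },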
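+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Optional: sensitivity and specificity at a decision threshold.\n",
+    "# Illustrative sketch only: 0.5 is a placeholder, not a validated\n",
+    "# M-CURES operating point.\n",
+    "threshold = 0.5\n",
+    "y_pred = (y_score >= threshold).astype(int)\n",
+    "tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred).ravel()\n",
+    "print('Sensitivity: {:.3f}'.format(tp / (tp + fn)))\n",
+    "print('Specificity: {:.3f}'.format(tn / (tn + fp)))"
+   ]
+  },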
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Generate the list of per-window scores for each hospital admission\n",
+    "\n",
+    "y_score_lst = df_Yte_all.groupby(['hosp_id'])['y_score'].apply(list)\n",
+    "df1 = pd.DataFrame({'y_scores_mcures_lst': y_score_lst})\n",
+    "df2 = pd.DataFrame({'y_scores_mcures': y_score})  # y_score is indexed by hosp_id, like df1\n",
+    "\n",
+    "outcome_outputs = df1.merge(df2, left_index=True, right_index=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "outcome_outputs"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}