{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/anaconda3/lib/python3.7/site-packages/psycopg2/__init__.py:144: UserWarning: The psycopg2 wheel package will be renamed from release 2.8; in order to keep installing from binary please use \"pip install psycopg2-binary\" instead. For details see: <http://initd.org/psycopg/docs/install.html#binary-install-from-pypi>.\n",
" \"\"\")\n"
]
},
{
"data": {
"text/html": [
" <script type=\"text/javascript\">\n",
" window.PlotlyConfig = {MathJaxConfig: 'local'};\n",
" if (window.MathJax) {MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}\n",
" if (typeof require !== 'undefined') {\n",
" require.undef(\"plotly\");\n",
" requirejs.config({\n",
" paths: {\n",
" 'plotly': ['https://cdn.plot.ly/plotly-latest.min']\n",
" }\n",
" });\n",
" require(['plotly'], function(Plotly) {\n",
" window._Plotly = Plotly;\n",
" });\n",
" }\n",
" </script>\n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Import libraries\n",
"import numpy as np\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import psycopg2\n",
"import getpass\n",
"import pdvega\n",
"import plotly.graph_objs as go\n",
"\n",
"from plotly.offline import iplot, init_notebook_mode\n",
"import plotly.io as pio\n",
"from plotly.graph_objs import *\n",
"\n",
"# for configuring connection \n",
"from configobj import ConfigObj\n",
"import os\n",
"\n",
"%matplotlib inline\n",
"\n",
"\n",
"import os\n",
"\n",
"\n",
"from sklearn import linear_model\n",
"from sklearn import metrics\n",
"from sklearn.model_selection import train_test_split\n",
"\n",
"#configure the notebook for use in offline mode\n",
"init_notebook_mode(connected=True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>Unnamed: 0</th>\n",
" <th>hospitalid</th>\n",
" <th>sodium</th>\n",
" <th>electivesurgery</th>\n",
" <th>vent</th>\n",
" <th>dialysis</th>\n",
" <th>gcs</th>\n",
" <th>urine</th>\n",
" <th>wbc</th>\n",
" <th>temperature</th>\n",
" <th>...</th>\n",
" <th>m11_True</th>\n",
" <th>m12_True</th>\n",
" <th>m13_True</th>\n",
" <th>m14_True</th>\n",
" <th>m15_True</th>\n",
" <th>m16_True</th>\n",
" <th>m17_True</th>\n",
" <th>m18_True</th>\n",
" <th>m19_True</th>\n",
" <th>m20_True</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>59.0</td>\n",
" <td>139.0</td>\n",
" <td>-1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>15.0</td>\n",
" <td>-1.0</td>\n",
" <td>14.7</td>\n",
" <td>36.1</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>73.0</td>\n",
" <td>134.0</td>\n",
" <td>-1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>13.0</td>\n",
" <td>-1.0</td>\n",
" <td>14.1</td>\n",
" <td>39.3</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>73.0</td>\n",
" <td>-1.0</td>\n",
" <td>1.0</td>\n",
" <td>1.0</td>\n",
" <td>0.0</td>\n",
" <td>15.0</td>\n",
" <td>-1.0</td>\n",
" <td>8.0</td>\n",
" <td>34.8</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>63.0</td>\n",
" <td>137.0</td>\n",
" <td>-1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>15.0</td>\n",
" <td>-1.0</td>\n",
" <td>10.9</td>\n",
" <td>36.6</td>\n",
" <td>...</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>63.0</td>\n",
" <td>135.0</td>\n",
" <td>-1.0</td>\n",
" <td>0.0</td>\n",
" <td>0.0</td>\n",
" <td>15.0</td>\n",
" <td>-1.0</td>\n",
" <td>5.9</td>\n",
" <td>35.0</td>\n",
" <td>...</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" <td>1</td>\n",
" <td>0</td>\n",
" <td>0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<p>5 rows × 85 columns</p>\n",
"</div>"
],
"text/plain": [
" Unnamed: 0 hospitalid sodium electivesurgery vent dialysis gcs \\\n",
"0 0 59.0 139.0 -1.0 0.0 0.0 15.0 \n",
"1 1 73.0 134.0 -1.0 0.0 0.0 13.0 \n",
"2 2 73.0 -1.0 1.0 1.0 0.0 15.0 \n",
"3 3 63.0 137.0 -1.0 0.0 0.0 15.0 \n",
"4 4 63.0 135.0 -1.0 0.0 0.0 15.0 \n",
"\n",
" urine wbc temperature ... m11_True m12_True m13_True m14_True \\\n",
"0 -1.0 14.7 36.1 ... 1 0 0 1 \n",
"1 -1.0 14.1 39.3 ... 1 0 0 1 \n",
"2 -1.0 8.0 34.8 ... 0 0 1 0 \n",
"3 -1.0 10.9 36.6 ... 1 0 1 1 \n",
"4 -1.0 5.9 35.0 ... 0 0 1 0 \n",
"\n",
" m15_True m16_True m17_True m18_True m19_True m20_True \n",
"0 1 0 0 0 1 0 \n",
"1 1 0 0 0 1 0 \n",
"2 0 1 0 1 0 0 \n",
"3 1 0 0 1 1 0 \n",
"4 0 0 0 1 0 0 \n",
"\n",
"[5 rows x 85 columns]"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2= pd.read_csv(\"Analysis.csv\")\n",
"df2.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(95148, 85)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.shape"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"del df2['hospitalid']\n",
"\n",
"df2 = df2.drop(df2.columns[[0]], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"cols_to_norm=['gcs', 'urine', 'wbc', 'sodium',\n",
" 'temperature', 'respiratoryrate', 'heartrate', 'meanbp', 'creatinine',\n",
" 'ph', 'hematocrit', 'albumin', 'pao2', 'pco2', 'bun', 'glucose',\n",
" 'bilirubin', 'fio2', 'age', 'offset']\n",
"\n",
"X=df2.drop('destcopy', 1)\n",
"y=df2['destcopy']\n",
"df_cols = list(X) #fancy impute removes column names."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**We moved all the pre-processing including splitting>imputation>Standardization to the CV iterations**"
]
},
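{
"cell_type": "markdown",
"metadata": {},
"source": [
"*A minimal sketch of the same idea (not the exact code used below): a scikit-learn `Pipeline` keeps imputation and scaling inside each CV fold so each step is fit on the training fold only. It substitutes scikit-learn's `IterativeImputer` (0.21+) for `fancyimpute` and runs on a small synthetic dataset, so the numbers are illustrative only.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sklearn.datasets import make_classification\n",
"from sklearn.experimental import enable_iterative_imputer  # noqa: F401\n",
"from sklearn.impute import IterativeImputer\n",
"from sklearn.linear_model import LogisticRegression\n",
"from sklearn.model_selection import StratifiedKFold, cross_val_score\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.preprocessing import StandardScaler\n",
"\n",
"# Synthetic stand-in for the real data, with ~5% of values knocked out\n",
"X_demo, y_demo = make_classification(n_samples=500, n_features=10, random_state=0)\n",
"rng = np.random.RandomState(0)\n",
"X_demo[rng.rand(*X_demo.shape) < 0.05] = np.nan\n",
"\n",
"# Impute -> standardize -> classify, each step fit on the training fold only\n",
"pipe = make_pipeline(IterativeImputer(sample_posterior=True),\n",
"                     StandardScaler(),\n",
"                     LogisticRegression(class_weight='balanced', max_iter=1000))\n",
"scores = cross_val_score(pipe, X_demo, y_demo, cv=StratifiedKFold(n_splits=5))\n",
"print(scores.mean())"
]
},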
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['sodium', 'electivesurgery', 'vent', 'dialysis', 'gcs', 'urine', 'wbc',\n",
" 'temperature', 'respiratoryrate', 'heartrate', 'meanbp', 'creatinine',\n",
" 'ph', 'hematocrit', 'albumin', 'pao2', 'pco2', 'bun', 'glucose',\n",
" 'bilirubin', 'fio2', 'age', 'thrombolytics', 'aids', 'hepaticfailure',\n",
" 'lymphoma', 'metastaticcancer', 'leukemia', 'immunosuppression',\n",
" 'cirrhosis', 'readmit', 'offset', 'destcopy', 'admitsource_1.0',\n",
" 'admitsource_2.0', 'admitsource_3.0', 'admitsource_4.0',\n",
" 'admitsource_5.0', 'admitsource_6.0', 'admitsource_7.0',\n",
" 'admitsource_8.0', 'diaggroup_ARF', 'diaggroup_Asthma-Emphys',\n",
" 'diaggroup_CABG', 'diaggroup_CHF', 'diaggroup_CVA', 'diaggroup_CVOther',\n",
" 'diaggroup_CardiacArrest', 'diaggroup_ChestPainUnknown',\n",
" 'diaggroup_Coma', 'diaggroup_DKA', 'diaggroup_GIBleed',\n",
" 'diaggroup_GIObstruction', 'diaggroup_Neuro', 'diaggroup_Other',\n",
" 'diaggroup_Overdose', 'diaggroup_PNA', 'diaggroup_RespMedOther',\n",
" 'diaggroup_Sepsis', 'diaggroup_Trauma', 'diaggroup_ValveDz',\n",
" 'gender_Male', 'gender_Other', 'm1_True', 'm2_True', 'm3_True',\n",
" 'm4_True', 'm5_True', 'm6_True', 'm7_True', 'm8_True', 'm9_True',\n",
" 'm10_True', 'm11_True', 'm12_True', 'm13_True', 'm14_True', 'm15_True',\n",
" 'm16_True', 'm17_True', 'm18_True', 'm19_True', 'm20_True'],\n",
" dtype='object')"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df2.columns"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"from sklearn import svm\n",
"from sklearn.decomposition import TruncatedSVD\n",
"from sklearn.metrics import classification_report\n",
"from sklearn.model_selection import train_test_split\n",
"from sklearn.pipeline import make_pipeline\n",
"\n",
"from sklearn_hierarchical_classification.classifier import HierarchicalClassifier\n",
"from sklearn_hierarchical_classification.constants import ROOT\n",
"from sklearn_hierarchical_classification.metrics import h_fbeta_score, multi_labeled"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Random Forest**"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"class_hierarchy = {\n",
" ROOT: [\"1\", \"A\"],\n",
" \"A\": [\"2\", \"B\"],\n",
" \"B\": [\"3\", \"4\"],\n",
" }"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Enabling notebook extension jupyter-js-widgets/extension...\n",
" - Validating: \u001b[32mOK\u001b[0m\n"
]
}
],
"source": [
"!jupyter nbextension enable --py --sys-prefix widgetsnbextension"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [],
"source": [
"from collections import Counter"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n",
"/anaconda3/lib/python3.7/site-packages/lightgbm/__init__.py:46: UserWarning:\n",
"\n",
"Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_9.4.1) compiler.\n",
"This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n",
"Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n",
"You can install the OpenMP library by the following command: ``brew install libomp``.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"[('1', 8852), ('2', 59596), ('3', 12940), ('4', 4244)]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
"\n",
"Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
"\n",
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
"\n",
"Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
"\n",
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning:\n",
"\n",
"Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"For fold 1:\n",
"Accuracy: 0.6075031525851198\n",
"f-score: 0.6075031525851198\n",
"[('1', 984), ('2', 6622), ('3', 1438), ('4', 472)]\n",
" pre rec spe f1 geo iba sup\n",
"\n",
" 1 0.33 0.77 0.82 0.46 0.79 0.62 984\n",
" 2 0.90 0.67 0.82 0.77 0.74 0.55 6622\n",
" 3 0.32 0.30 0.88 0.31 0.52 0.25 1438\n",
" 4 0.16 0.29 0.92 0.20 0.52 0.25 472\n",
"\n",
"avg / total 0.71 0.61 0.84 0.64 0.70 0.50 9516\n",
"\n",
"[('1', 8852), ('2', 59596), ('3', 12940), ('4', 4244)]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
"\n",
"Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
"\n",
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
"\n",
"Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
"\n",
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning:\n",
"\n",
"Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"For fold 2:\n",
"Accuracy: 0.5763976460697772\n",
"f-score: 0.5763976460697772\n",
"[('1', 984), ('2', 6622), ('3', 1438), ('4', 472)]\n",
" pre rec spe f1 geo iba sup\n",
"\n",
" 1 0.29 0.82 0.77 0.43 0.80 0.63 984\n",
" 2 0.89 0.65 0.82 0.75 0.73 0.52 6622\n",
" 3 0.26 0.21 0.90 0.23 0.43 0.17 1438\n",
" 4 0.12 0.21 0.92 0.15 0.43 0.18 472\n",
"\n",
"avg / total 0.70 0.58 0.83 0.61 0.68 0.46 9516\n",
"\n",
"[('1', 8852), ('2', 59596), ('3', 12940), ('4', 4244)]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
"\n",
"Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
"\n",
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
"\n",
"Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
"\n",
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning:\n",
"\n",
"Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"For fold 3:\n",
"Accuracy: 0.5468684321143338\n",
"f-score: 0.5468684321143338\n",
"[('1', 984), ('2', 6622), ('3', 1438), ('4', 472)]\n",
" pre rec spe f1 geo iba sup\n",
"\n",
" 1 0.30 0.83 0.78 0.45 0.81 0.65 984\n",
" 2 0.87 0.61 0.79 0.71 0.69 0.47 6622\n",
" 3 0.21 0.18 0.88 0.19 0.40 0.15 1438\n",
" 4 0.11 0.24 0.90 0.15 0.46 0.20 472\n",
"\n",
"avg / total 0.67 0.55 0.81 0.58 0.65 0.43 9516\n",
"\n",
"[('1', 8852), ('2', 59596), ('3', 12940), ('4', 4244)]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
"\n",
"Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
"\n",
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
"\n",
"Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
"\n",
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning:\n",
"\n",
"Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"For fold 4:\n",
"Accuracy: 0.5755569567044977\n",
"f-score: 0.5755569567044977\n",
"[('1', 984), ('2', 6622), ('3', 1438), ('4', 472)]\n",
" pre rec spe f1 geo iba sup\n",
"\n",
" 1 0.31 0.78 0.80 0.44 0.79 0.62 984\n",
" 2 0.87 0.66 0.77 0.75 0.71 0.50 6622\n",
" 3 0.23 0.14 0.91 0.18 0.36 0.12 1438\n",
" 4 0.10 0.23 0.90 0.14 0.45 0.19 472\n",
"\n",
"avg / total 0.67 0.58 0.80 0.60 0.66 0.44 9516\n",
"\n",
"[('1', 8852), ('2', 59596), ('3', 12940), ('4', 4244)]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
"\n",
"Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
"\n",
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
"\n",
"Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
"\n",
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning:\n",
"\n",
"Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"For fold 5:\n",
"Accuracy: 0.6144388398486759\n",
"f-score: 0.6144388398486759\n",
"[('1', 984), ('2', 6622), ('3', 1438), ('4', 472)]\n",
" pre rec spe f1 geo iba sup\n",
"\n",
" 1 0.35 0.82 0.83 0.49 0.82 0.67 984\n",
" 2 0.88 0.70 0.79 0.78 0.74 0.55 6622\n",
" 3 0.26 0.19 0.90 0.22 0.41 0.16 1438\n",
" 4 0.13 0.25 0.91 0.17 0.47 0.21 472\n",
"\n",
"avg / total 0.70 0.61 0.82 0.64 0.69 0.49 9516\n",
"\n",
"[('1', 8852), ('2', 59596), ('3', 12940), ('4', 4244)]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
"\n",
"Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
"\n",
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
"\n",
"Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
"\n",
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning:\n",
"\n",
"Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"For fold 6:\n",
"Accuracy: 0.6281000420344682\n",
"f-score: 0.6281000420344682\n",
"[('1', 984), ('2', 6622), ('3', 1438), ('4', 472)]\n",
" pre rec spe f1 geo iba sup\n",
"\n",
" 1 0.34 0.79 0.83 0.48 0.81 0.65 984\n",
" 2 0.88 0.73 0.78 0.80 0.76 0.57 6622\n",
" 3 0.24 0.18 0.90 0.21 0.40 0.15 1438\n",
" 4 0.15 0.21 0.94 0.17 0.45 0.18 472\n",
"\n",
"avg / total 0.69 0.63 0.81 0.65 0.69 0.49 9516\n",
"\n",
"[('1', 8853), ('2', 59596), ('3', 12940), ('4', 4245)]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
"\n",
"Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
"\n",
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
"\n",
"Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
"\n",
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning:\n",
"\n",
"Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"For fold 7:\n",
"Accuracy: 0.6210847172587766\n",
"f-score: 0.6210847172587766\n",
"[('1', 983), ('2', 6622), ('3', 1438), ('4', 471)]\n",
" pre rec spe f1 geo iba sup\n",
"\n",
" 1 0.35 0.77 0.83 0.48 0.80 0.64 983\n",
" 2 0.89 0.69 0.81 0.78 0.75 0.56 6622\n",
" 3 0.31 0.31 0.88 0.31 0.52 0.26 1438\n",
" 4 0.15 0.24 0.93 0.18 0.48 0.21 471\n",
"\n",
"avg / total 0.71 0.62 0.83 0.65 0.71 0.50 9514\n",
"\n",
"[('1', 8853), ('2', 59596), ('3', 12940), ('4', 4245)]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
"\n",
"Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
"\n",
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
"\n",
"Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
"\n",
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning:\n",
"\n",
"Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"For fold 8:\n",
"Accuracy: 0.6231868824889636\n",
"f-score: 0.6231868824889636\n",
"[('1', 983), ('2', 6622), ('3', 1438), ('4', 471)]\n",
" pre rec spe f1 geo iba sup\n",
"\n",
" 1 0.37 0.75 0.85 0.49 0.80 0.63 983\n",
" 2 0.88 0.70 0.79 0.78 0.74 0.55 6622\n",
" 3 0.30 0.29 0.88 0.30 0.51 0.24 1438\n",
" 4 0.16 0.29 0.92 0.21 0.52 0.25 471\n",
"\n",
"avg / total 0.71 0.62 0.81 0.65 0.70 0.49 9514\n",
"\n",
"[('1', 8853), ('2', 59597), ('3', 12941), ('4', 4245)]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
"\n",
"Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
"\n",
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
"\n",
"Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
"\n",
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning:\n",
"\n",
"Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"For fold 9:\n",
"Accuracy: 0.6289949537426409\n",
"f-score: 0.6289949537426409\n",
"[('1', 983), ('2', 6621), ('3', 1437), ('4', 471)]\n",
" pre rec spe f1 geo iba sup\n",
"\n",
" 1 0.37 0.80 0.84 0.51 0.82 0.68 983\n",
" 2 0.89 0.72 0.80 0.80 0.76 0.57 6621\n",
" 3 0.29 0.22 0.90 0.25 0.45 0.19 1437\n",
" 4 0.10 0.20 0.91 0.14 0.43 0.17 471\n",
"\n",
"avg / total 0.71 0.63 0.82 0.65 0.70 0.50 9512\n",
"\n",
"[('1', 8853), ('2', 59597), ('3', 12941), ('4', 4245)]\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
"\n",
"Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
"\n",
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
"\n",
"Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
"\n",
"/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning:\n",
"\n",
"Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"For fold 10:\n",
"Accuracy: 0.6135407905803196\n",
"f-score: 0.6135407905803196\n",
"[('1', 983), ('2', 6621), ('3', 1437), ('4', 471)]\n",
" pre rec spe f1 geo iba sup\n",
"\n",
" 1 0.32 0.82 0.80 0.46 0.81 0.66 983\n",
" 2 0.88 0.70 0.78 0.78 0.74 0.54 6621\n",
" 3 0.34 0.17 0.94 0.23 0.40 0.15 1437\n",
" 4 0.13 0.26 0.91 0.17 0.49 0.22 471\n",
"\n",
"avg / total 0.70 0.61 0.81 0.63 0.68 0.48 9512\n",
"\n"
]
},
{
"data": {
"text/plain": [
"<Figure size 576x396 with 0 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from sklearn.model_selection import KFold\n",
"from sklearn import preprocessing\n",
"from imblearn.over_sampling import SMOTE\n",
"from imblearn.over_sampling import SMOTENC\n",
"from sklearn.metrics import f1_score\n",
"from imblearn.metrics import classification_report_imbalanced\n",
"from fancyimpute import IterativeImputer\n",
"from yellowbrick.classifier import ROCAUC\n",
"from sklearn.linear_model import LogisticRegression\n",
"import numpy as np\n",
"import pandas as pd\n",
"from hyperopt import hp, tpe\n",
"from hyperopt.fmin import fmin\n",
"from sklearn.model_selection import cross_val_score, StratifiedKFold\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"from sklearn.metrics import make_scorer\n",
"import xgboost as xgb\n",
"import lightgbm as lgbm\n",
"from sklearn.model_selection import StratifiedKFold\n",
"from collections import Counter\n",
"import io \n",
"\n",
"classes=['Death','Home','Nursing Home','Rehabilitation']\n",
"\n",
"\n",
"kf = StratifiedKFold(n_splits=10)\n",
"y = y.astype(str)\n",
"\n",
"for fold, (train_index, test_index) in enumerate(kf.split(X,y), 1):\n",
" X_train = X.iloc[train_index]\n",
" y_train = y.iloc[train_index] # Based on your code, you might need a ravel call here, but I would look into how you're generating your y\n",
" X_test = X.iloc[test_index]\n",
" y_test = y.iloc[test_index] # See comment on ravel and y_train\n",
" \n",
" \n",
"#------------------------------IMPUTE Training Set------------------------------------\n",
" \n",
" # Use MICE to fill in each row's missing features\n",
" X_train = pd.DataFrame(IterativeImputer(verbose=False, sample_posterior=True).fit_transform(X_train))\n",
" X_train.columns = df_cols\n",
"\n",
"#------------------------------IMPUTE Testing Set------------------------------------ \n",
"\n",
" # Use MICE to fill in each row's missing features\n",
" X_test = pd.DataFrame(IterativeImputer(verbose=False, sample_posterior=True).fit_transform(X_test))\n",
" X_test.columns = df_cols\n",
" \n",
"#------------------------------Standardize Testing Set------------------------------------\n",
" \n",
" std_scale = preprocessing.StandardScaler().fit(X_train[cols_to_norm])\n",
" X_train[cols_to_norm] = std_scale.transform(X_train[cols_to_norm])\n",
" X_test[cols_to_norm] = std_scale.transform(X_test[cols_to_norm])\n",
"#------------------------------------------------------------------------------------------\n",
"\n",
" # Hyperparameters are optimized using hyperopt\n",
"\n",
" #sm = SMOTE()\n",
" \n",
" #sm = SMOTENC(random_state=50, categorical_features=[1,2,3,22,23,24,25,26,27,28,29,30,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61, 62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81])\n",
" #X_train_oversampled, y_train_oversampled = sm.fit_sample(X_train, y_train)\n",
" print(sorted(Counter(y_train).items()))\n",
" \n",
" model = linear_model.LogisticRegression(class_weight='balanced') \n",
" clf = HierarchicalClassifier(\n",
" base_estimator=model,\n",
" class_hierarchy=class_hierarchy,\n",
" )\n",
" clf.fit(X_train, y_train) \n",
" y_pred = clf.predict(X_test)\n",
" visualizer = ROCAUC(model, classes=classes)\n",
" visualizer.fit(X_train, y_train) # Fit the training data to the visualizer\n",
" visualizer.score(X_test, y_test) # Evaluate the model on the test data\n",
" visualizer.poof(\"LR_Hierarchy_{}.pdf\".format(fold), clear_figure=True) \n",
" print(f'For fold {fold}:')\n",
" print(f'Accuracy: {clf.score(X_test, y_test)}')\n",
" f1=f1_score(y_test, y_pred, average='micro')\n",
" print(f'f-score: {f1}')\n",
" print(sorted(Counter(y_test).items()))\n",
" print(classification_report_imbalanced(y_test, y_pred))\n",
" K= classification_report_imbalanced(y_test, y_pred)\n",
" df = pd.read_fwf(io.StringIO(K))\n",
" df.loc[\"1\":\"1\",\"pre\":\"sup\"].to_csv(\"LR-Hierarchy-D.csv\" , sep=',', encoding='utf-8', doublequote=False, index=False, mode=\"a\", header=False)\n",
" df.loc[\"2\":\"2\",\"pre\":\"sup\"].to_csv(\"LR-Hierarchy-H.csv\" , sep=',', encoding='utf-8', doublequote=False, index=False, mode=\"a\", header=False)\n",
" df.loc[\"3\":\"3\",\"pre\":\"sup\"].to_csv(\"LR-Hierarchy-N.csv\" , sep=',', encoding='utf-8', doublequote=False, index=False, mode=\"a\", header=False)\n",
" df.loc[\"4\":\"4\",\"pre\":\"sup\"].to_csv(\"LR-Hierarchy-R.csv\" , sep=',', encoding='utf-8', doublequote=False, index=False, mode=\"a\", header=False)\n",
" df.iloc[6:7,:].to_csv(\"LR-Hierarchy-avg.csv\" , sep=',', encoding='utf-8', doublequote=False, index=False, mode=\"a\", header=False)\n",
"\n",
" #\n",
" \n",
" #\n",
"\n",
" \n",
" "
]
},
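{
"cell_type": "markdown",
"metadata": {},
"source": [
"*Optional follow-up sketch (assumption: the `LR-Hierarchy-*.csv` files written by the loop above exist in the working directory and hold one row of the seven report columns per fold): read them back and report the mean of each metric across folds.*"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Summarise the per-fold rows appended above (assumes those CSVs exist)\n",
"metric_cols = ['pre', 'rec', 'spe', 'f1', 'geo', 'iba', 'sup']\n",
"for suffix, label in zip(['D', 'H', 'N', 'R'], classes):\n",
"    per_fold = pd.read_csv('LR-Hierarchy-{}.csv'.format(suffix), header=None, names=metric_cols)\n",
"    print(label, per_fold.mean().round(3).to_dict())"
]
},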
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"X.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"y.shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.1"
}
},
"nbformat": 4,
"nbformat_minor": 2
}