a b/Hierarchical/R2-LR-Hierarchical.ipynb
1
{
2
 "cells": [
3
  {
4
   "cell_type": "code",
5
   "execution_count": 1,
6
   "metadata": {},
7
   "outputs": [
8
    {
9
     "name": "stderr",
10
     "output_type": "stream",
11
     "text": [
12
      "/anaconda3/lib/python3.7/site-packages/psycopg2/__init__.py:144: UserWarning: The psycopg2 wheel package will be renamed from release 2.8; in order to keep installing from binary please use \"pip install psycopg2-binary\" instead. For details see: <http://initd.org/psycopg/docs/install.html#binary-install-from-pypi>.\n",
13
      "  \"\"\")\n"
14
     ]
15
    },
16
    {
17
     "data": {
18
      "text/html": [
19
       "        <script type=\"text/javascript\">\n",
20
       "        window.PlotlyConfig = {MathJaxConfig: 'local'};\n",
21
       "        if (window.MathJax) {MathJax.Hub.Config({SVG: {font: \"STIX-Web\"}});}\n",
22
       "        if (typeof require !== 'undefined') {\n",
23
       "        require.undef(\"plotly\");\n",
24
       "        requirejs.config({\n",
25
       "            paths: {\n",
26
       "                'plotly': ['https://cdn.plot.ly/plotly-latest.min']\n",
27
       "            }\n",
28
       "        });\n",
29
       "        require(['plotly'], function(Plotly) {\n",
30
       "            window._Plotly = Plotly;\n",
31
       "        });\n",
32
       "        }\n",
33
       "        </script>\n",
34
       "        "
35
      ]
36
     },
37
     "metadata": {},
38
     "output_type": "display_data"
39
    }
40
   ],
41
   "source": [
42
    "# Import libraries\n",
43
    "import numpy as np\n",
44
    "import pandas as pd\n",
45
    "import matplotlib.pyplot as plt\n",
46
    "import psycopg2\n",
47
    "import getpass\n",
48
    "import pdvega\n",
49
    "import plotly.graph_objs as go\n",
50
    "\n",
51
    "from plotly.offline import iplot, init_notebook_mode\n",
52
    "import plotly.io as pio\n",
53
    "from plotly.graph_objs import *\n",
54
    "\n",
55
    "# for configuring connection \n",
56
    "from configobj import ConfigObj\n",
57
    "import os\n",
58
    "\n",
59
    "%matplotlib inline\n",
60
    "\n",
61
    "\n",
62
    "import os\n",
63
    "\n",
64
    "\n",
65
    "from sklearn import linear_model\n",
66
    "from sklearn import metrics\n",
67
    "from sklearn.model_selection import train_test_split\n",
68
    "\n",
69
    "#configure the notebook for use in offline mode\n",
70
    "init_notebook_mode(connected=True)"
71
   ]
72
  },
73
  {
74
   "cell_type": "code",
75
   "execution_count": 3,
76
   "metadata": {},
77
   "outputs": [
78
    {
79
     "data": {
80
      "text/html": [
81
       "<div>\n",
82
       "<style scoped>\n",
83
       "    .dataframe tbody tr th:only-of-type {\n",
84
       "        vertical-align: middle;\n",
85
       "    }\n",
86
       "\n",
87
       "    .dataframe tbody tr th {\n",
88
       "        vertical-align: top;\n",
89
       "    }\n",
90
       "\n",
91
       "    .dataframe thead th {\n",
92
       "        text-align: right;\n",
93
       "    }\n",
94
       "</style>\n",
95
       "<table border=\"1\" class=\"dataframe\">\n",
96
       "  <thead>\n",
97
       "    <tr style=\"text-align: right;\">\n",
98
       "      <th></th>\n",
99
       "      <th>Unnamed: 0</th>\n",
100
       "      <th>hospitalid</th>\n",
101
       "      <th>sodium</th>\n",
102
       "      <th>electivesurgery</th>\n",
103
       "      <th>vent</th>\n",
104
       "      <th>dialysis</th>\n",
105
       "      <th>gcs</th>\n",
106
       "      <th>urine</th>\n",
107
       "      <th>wbc</th>\n",
108
       "      <th>temperature</th>\n",
109
       "      <th>...</th>\n",
110
       "      <th>m11_True</th>\n",
111
       "      <th>m12_True</th>\n",
112
       "      <th>m13_True</th>\n",
113
       "      <th>m14_True</th>\n",
114
       "      <th>m15_True</th>\n",
115
       "      <th>m16_True</th>\n",
116
       "      <th>m17_True</th>\n",
117
       "      <th>m18_True</th>\n",
118
       "      <th>m19_True</th>\n",
119
       "      <th>m20_True</th>\n",
120
       "    </tr>\n",
121
       "  </thead>\n",
122
       "  <tbody>\n",
123
       "    <tr>\n",
124
       "      <th>0</th>\n",
125
       "      <td>0</td>\n",
126
       "      <td>59.0</td>\n",
127
       "      <td>139.0</td>\n",
128
       "      <td>-1.0</td>\n",
129
       "      <td>0.0</td>\n",
130
       "      <td>0.0</td>\n",
131
       "      <td>15.0</td>\n",
132
       "      <td>-1.0</td>\n",
133
       "      <td>14.7</td>\n",
134
       "      <td>36.1</td>\n",
135
       "      <td>...</td>\n",
136
       "      <td>1</td>\n",
137
       "      <td>0</td>\n",
138
       "      <td>0</td>\n",
139
       "      <td>1</td>\n",
140
       "      <td>1</td>\n",
141
       "      <td>0</td>\n",
142
       "      <td>0</td>\n",
143
       "      <td>0</td>\n",
144
       "      <td>1</td>\n",
145
       "      <td>0</td>\n",
146
       "    </tr>\n",
147
       "    <tr>\n",
148
       "      <th>1</th>\n",
149
       "      <td>1</td>\n",
150
       "      <td>73.0</td>\n",
151
       "      <td>134.0</td>\n",
152
       "      <td>-1.0</td>\n",
153
       "      <td>0.0</td>\n",
154
       "      <td>0.0</td>\n",
155
       "      <td>13.0</td>\n",
156
       "      <td>-1.0</td>\n",
157
       "      <td>14.1</td>\n",
158
       "      <td>39.3</td>\n",
159
       "      <td>...</td>\n",
160
       "      <td>1</td>\n",
161
       "      <td>0</td>\n",
162
       "      <td>0</td>\n",
163
       "      <td>1</td>\n",
164
       "      <td>1</td>\n",
165
       "      <td>0</td>\n",
166
       "      <td>0</td>\n",
167
       "      <td>0</td>\n",
168
       "      <td>1</td>\n",
169
       "      <td>0</td>\n",
170
       "    </tr>\n",
171
       "    <tr>\n",
172
       "      <th>2</th>\n",
173
       "      <td>2</td>\n",
174
       "      <td>73.0</td>\n",
175
       "      <td>-1.0</td>\n",
176
       "      <td>1.0</td>\n",
177
       "      <td>1.0</td>\n",
178
       "      <td>0.0</td>\n",
179
       "      <td>15.0</td>\n",
180
       "      <td>-1.0</td>\n",
181
       "      <td>8.0</td>\n",
182
       "      <td>34.8</td>\n",
183
       "      <td>...</td>\n",
184
       "      <td>0</td>\n",
185
       "      <td>0</td>\n",
186
       "      <td>1</td>\n",
187
       "      <td>0</td>\n",
188
       "      <td>0</td>\n",
189
       "      <td>1</td>\n",
190
       "      <td>0</td>\n",
191
       "      <td>1</td>\n",
192
       "      <td>0</td>\n",
193
       "      <td>0</td>\n",
194
       "    </tr>\n",
195
       "    <tr>\n",
196
       "      <th>3</th>\n",
197
       "      <td>3</td>\n",
198
       "      <td>63.0</td>\n",
199
       "      <td>137.0</td>\n",
200
       "      <td>-1.0</td>\n",
201
       "      <td>0.0</td>\n",
202
       "      <td>0.0</td>\n",
203
       "      <td>15.0</td>\n",
204
       "      <td>-1.0</td>\n",
205
       "      <td>10.9</td>\n",
206
       "      <td>36.6</td>\n",
207
       "      <td>...</td>\n",
208
       "      <td>1</td>\n",
209
       "      <td>0</td>\n",
210
       "      <td>1</td>\n",
211
       "      <td>1</td>\n",
212
       "      <td>1</td>\n",
213
       "      <td>0</td>\n",
214
       "      <td>0</td>\n",
215
       "      <td>1</td>\n",
216
       "      <td>1</td>\n",
217
       "      <td>0</td>\n",
218
       "    </tr>\n",
219
       "    <tr>\n",
220
       "      <th>4</th>\n",
221
       "      <td>4</td>\n",
222
       "      <td>63.0</td>\n",
223
       "      <td>135.0</td>\n",
224
       "      <td>-1.0</td>\n",
225
       "      <td>0.0</td>\n",
226
       "      <td>0.0</td>\n",
227
       "      <td>15.0</td>\n",
228
       "      <td>-1.0</td>\n",
229
       "      <td>5.9</td>\n",
230
       "      <td>35.0</td>\n",
231
       "      <td>...</td>\n",
232
       "      <td>0</td>\n",
233
       "      <td>0</td>\n",
234
       "      <td>1</td>\n",
235
       "      <td>0</td>\n",
236
       "      <td>0</td>\n",
237
       "      <td>0</td>\n",
238
       "      <td>0</td>\n",
239
       "      <td>1</td>\n",
240
       "      <td>0</td>\n",
241
       "      <td>0</td>\n",
242
       "    </tr>\n",
243
       "  </tbody>\n",
244
       "</table>\n",
245
       "<p>5 rows × 85 columns</p>\n",
246
       "</div>"
247
      ],
248
      "text/plain": [
249
       "   Unnamed: 0  hospitalid  sodium  electivesurgery  vent  dialysis   gcs  \\\n",
250
       "0           0        59.0   139.0             -1.0   0.0       0.0  15.0   \n",
251
       "1           1        73.0   134.0             -1.0   0.0       0.0  13.0   \n",
252
       "2           2        73.0    -1.0              1.0   1.0       0.0  15.0   \n",
253
       "3           3        63.0   137.0             -1.0   0.0       0.0  15.0   \n",
254
       "4           4        63.0   135.0             -1.0   0.0       0.0  15.0   \n",
255
       "\n",
256
       "   urine   wbc  temperature    ...     m11_True  m12_True  m13_True  m14_True  \\\n",
257
       "0   -1.0  14.7         36.1    ...            1         0         0         1   \n",
258
       "1   -1.0  14.1         39.3    ...            1         0         0         1   \n",
259
       "2   -1.0   8.0         34.8    ...            0         0         1         0   \n",
260
       "3   -1.0  10.9         36.6    ...            1         0         1         1   \n",
261
       "4   -1.0   5.9         35.0    ...            0         0         1         0   \n",
262
       "\n",
263
       "   m15_True  m16_True  m17_True  m18_True  m19_True  m20_True  \n",
264
       "0         1         0         0         0         1         0  \n",
265
       "1         1         0         0         0         1         0  \n",
266
       "2         0         1         0         1         0         0  \n",
267
       "3         1         0         0         1         1         0  \n",
268
       "4         0         0         0         1         0         0  \n",
269
       "\n",
270
       "[5 rows x 85 columns]"
271
      ]
272
     },
273
     "execution_count": 3,
274
     "metadata": {},
275
     "output_type": "execute_result"
276
    }
277
   ],
278
   "source": [
279
    "df2= pd.read_csv(\"Analysis.csv\")\n",
280
    "df2.head()"
281
   ]
282
  },
283
  {
284
   "cell_type": "code",
285
   "execution_count": 4,
286
   "metadata": {},
287
   "outputs": [
288
    {
289
     "data": {
290
      "text/plain": [
291
       "(95148, 85)"
292
      ]
293
     },
294
     "execution_count": 4,
295
     "metadata": {},
296
     "output_type": "execute_result"
297
    }
298
   ],
299
   "source": [
300
    "df2.shape"
301
   ]
302
  },
303
  {
304
   "cell_type": "code",
305
   "execution_count": 5,
306
   "metadata": {},
307
   "outputs": [],
308
   "source": [
309
    "del df2['hospitalid']\n",
310
    "\n",
311
    "df2 = df2.drop(df2.columns[[0]], axis=1)"
312
   ]
313
  },
314
  {
315
   "cell_type": "code",
316
   "execution_count": 6,
317
   "metadata": {},
318
   "outputs": [],
319
   "source": [
320
    "cols_to_norm=['gcs', 'urine', 'wbc', 'sodium',\n",
321
    "       'temperature', 'respiratoryrate', 'heartrate', 'meanbp', 'creatinine',\n",
322
    "       'ph', 'hematocrit', 'albumin', 'pao2', 'pco2', 'bun', 'glucose',\n",
323
    "       'bilirubin', 'fio2', 'age', 'offset']\n",
324
    "\n",
325
    "X=df2.drop('destcopy', 1)\n",
326
    "y=df2['destcopy']\n",
327
    "df_cols = list(X)     #fancy impute removes column names."
328
   ]
329
  },
330
  {
331
   "cell_type": "markdown",
332
   "metadata": {},
333
   "source": [
334
    "**We moved all the pre-processing including splitting>imputation>Standardization to the CV iterations**"
335
   ]
336
  },
337
  {
338
   "cell_type": "code",
339
   "execution_count": 7,
340
   "metadata": {},
341
   "outputs": [
342
    {
343
     "data": {
344
      "text/plain": [
345
       "Index(['sodium', 'electivesurgery', 'vent', 'dialysis', 'gcs', 'urine', 'wbc',\n",
346
       "       'temperature', 'respiratoryrate', 'heartrate', 'meanbp', 'creatinine',\n",
347
       "       'ph', 'hematocrit', 'albumin', 'pao2', 'pco2', 'bun', 'glucose',\n",
348
       "       'bilirubin', 'fio2', 'age', 'thrombolytics', 'aids', 'hepaticfailure',\n",
349
       "       'lymphoma', 'metastaticcancer', 'leukemia', 'immunosuppression',\n",
350
       "       'cirrhosis', 'readmit', 'offset', 'destcopy', 'admitsource_1.0',\n",
351
       "       'admitsource_2.0', 'admitsource_3.0', 'admitsource_4.0',\n",
352
       "       'admitsource_5.0', 'admitsource_6.0', 'admitsource_7.0',\n",
353
       "       'admitsource_8.0', 'diaggroup_ARF', 'diaggroup_Asthma-Emphys',\n",
354
       "       'diaggroup_CABG', 'diaggroup_CHF', 'diaggroup_CVA', 'diaggroup_CVOther',\n",
355
       "       'diaggroup_CardiacArrest', 'diaggroup_ChestPainUnknown',\n",
356
       "       'diaggroup_Coma', 'diaggroup_DKA', 'diaggroup_GIBleed',\n",
357
       "       'diaggroup_GIObstruction', 'diaggroup_Neuro', 'diaggroup_Other',\n",
358
       "       'diaggroup_Overdose', 'diaggroup_PNA', 'diaggroup_RespMedOther',\n",
359
       "       'diaggroup_Sepsis', 'diaggroup_Trauma', 'diaggroup_ValveDz',\n",
360
       "       'gender_Male', 'gender_Other', 'm1_True', 'm2_True', 'm3_True',\n",
361
       "       'm4_True', 'm5_True', 'm6_True', 'm7_True', 'm8_True', 'm9_True',\n",
362
       "       'm10_True', 'm11_True', 'm12_True', 'm13_True', 'm14_True', 'm15_True',\n",
363
       "       'm16_True', 'm17_True', 'm18_True', 'm19_True', 'm20_True'],\n",
364
       "      dtype='object')"
365
      ]
366
     },
367
     "execution_count": 7,
368
     "metadata": {},
369
     "output_type": "execute_result"
370
    }
371
   ],
372
   "source": [
373
    "df2.columns"
374
   ]
375
  },
376
  {
377
   "cell_type": "code",
378
   "execution_count": 8,
379
   "metadata": {},
380
   "outputs": [],
381
   "source": [
382
    "from sklearn import svm\n",
383
    "from sklearn.decomposition import TruncatedSVD\n",
384
    "from sklearn.metrics import classification_report\n",
385
    "from sklearn.model_selection import train_test_split\n",
386
    "from sklearn.pipeline import make_pipeline\n",
387
    "\n",
388
    "from sklearn_hierarchical_classification.classifier import HierarchicalClassifier\n",
389
    "from sklearn_hierarchical_classification.constants import ROOT\n",
390
    "from sklearn_hierarchical_classification.metrics import h_fbeta_score, multi_labeled"
391
   ]
392
  },
393
  {
394
   "cell_type": "markdown",
395
   "metadata": {},
396
   "source": [
397
    "**Random Forest**"
398
   ]
399
  },
400
  {
401
   "cell_type": "code",
402
   "execution_count": 9,
403
   "metadata": {},
404
   "outputs": [],
405
   "source": [
406
    "class_hierarchy = {\n",
407
    "        ROOT: [\"1\", \"A\"],\n",
408
    "        \"A\": [\"2\", \"B\"],\n",
409
    "        \"B\": [\"3\", \"4\"],\n",
410
    "    }"
411
   ]
412
  },
413
  {
414
   "cell_type": "code",
415
   "execution_count": 10,
416
   "metadata": {},
417
   "outputs": [
418
    {
419
     "name": "stdout",
420
     "output_type": "stream",
421
     "text": [
422
      "Enabling notebook extension jupyter-js-widgets/extension...\n",
423
      "      - Validating: \u001b[32mOK\u001b[0m\n"
424
     ]
425
    }
426
   ],
427
   "source": [
428
    "!jupyter nbextension enable --py --sys-prefix widgetsnbextension"
429
   ]
430
  },
431
  {
432
   "cell_type": "code",
433
   "execution_count": 11,
434
   "metadata": {},
435
   "outputs": [],
436
   "source": [
437
    "from collections import Counter"
438
   ]
439
  },
440
  {
441
   "cell_type": "code",
442
   "execution_count": 12,
443
   "metadata": {},
444
   "outputs": [
445
    {
446
     "name": "stderr",
447
     "output_type": "stream",
448
     "text": [
449
      "Using TensorFlow backend.\n",
450
      "/anaconda3/lib/python3.7/site-packages/lightgbm/__init__.py:46: UserWarning:\n",
451
      "\n",
452
      "Starting from version 2.2.1, the library file in distribution wheels for macOS is built by the Apple Clang (Xcode_9.4.1) compiler.\n",
453
      "This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.\n",
454
      "Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.\n",
455
      "You can install the OpenMP library by the following command: ``brew install libomp``.\n",
456
      "\n"
457
     ]
458
    },
459
    {
460
     "name": "stdout",
461
     "output_type": "stream",
462
     "text": [
463
      "[('1', 8852), ('2', 59596), ('3', 12940), ('4', 4244)]\n"
464
     ]
465
    },
466
    {
467
     "name": "stderr",
468
     "output_type": "stream",
469
     "text": [
470
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
471
      "\n",
472
      "Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
473
      "\n",
474
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
475
      "\n",
476
      "Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
477
      "\n",
478
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning:\n",
479
      "\n",
480
      "Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
481
      "\n"
482
     ]
483
    },
484
    {
485
     "name": "stdout",
486
     "output_type": "stream",
487
     "text": [
488
      "For fold 1:\n",
489
      "Accuracy: 0.6075031525851198\n",
490
      "f-score: 0.6075031525851198\n",
491
      "[('1', 984), ('2', 6622), ('3', 1438), ('4', 472)]\n",
492
      "                   pre       rec       spe        f1       geo       iba       sup\n",
493
      "\n",
494
      "          1       0.33      0.77      0.82      0.46      0.79      0.62       984\n",
495
      "          2       0.90      0.67      0.82      0.77      0.74      0.55      6622\n",
496
      "          3       0.32      0.30      0.88      0.31      0.52      0.25      1438\n",
497
      "          4       0.16      0.29      0.92      0.20      0.52      0.25       472\n",
498
      "\n",
499
      "avg / total       0.71      0.61      0.84      0.64      0.70      0.50      9516\n",
500
      "\n",
501
      "[('1', 8852), ('2', 59596), ('3', 12940), ('4', 4244)]\n"
502
     ]
503
    },
504
    {
505
     "name": "stderr",
506
     "output_type": "stream",
507
     "text": [
508
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
509
      "\n",
510
      "Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
511
      "\n",
512
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
513
      "\n",
514
      "Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
515
      "\n",
516
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning:\n",
517
      "\n",
518
      "Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
519
      "\n"
520
     ]
521
    },
522
    {
523
     "name": "stdout",
524
     "output_type": "stream",
525
     "text": [
526
      "For fold 2:\n",
527
      "Accuracy: 0.5763976460697772\n",
528
      "f-score: 0.5763976460697772\n",
529
      "[('1', 984), ('2', 6622), ('3', 1438), ('4', 472)]\n",
530
      "                   pre       rec       spe        f1       geo       iba       sup\n",
531
      "\n",
532
      "          1       0.29      0.82      0.77      0.43      0.80      0.63       984\n",
533
      "          2       0.89      0.65      0.82      0.75      0.73      0.52      6622\n",
534
      "          3       0.26      0.21      0.90      0.23      0.43      0.17      1438\n",
535
      "          4       0.12      0.21      0.92      0.15      0.43      0.18       472\n",
536
      "\n",
537
      "avg / total       0.70      0.58      0.83      0.61      0.68      0.46      9516\n",
538
      "\n",
539
      "[('1', 8852), ('2', 59596), ('3', 12940), ('4', 4244)]\n"
540
     ]
541
    },
542
    {
543
     "name": "stderr",
544
     "output_type": "stream",
545
     "text": [
546
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
547
      "\n",
548
      "Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
549
      "\n",
550
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
551
      "\n",
552
      "Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
553
      "\n",
554
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning:\n",
555
      "\n",
556
      "Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
557
      "\n"
558
     ]
559
    },
560
    {
561
     "name": "stdout",
562
     "output_type": "stream",
563
     "text": [
564
      "For fold 3:\n",
565
      "Accuracy: 0.5468684321143338\n",
566
      "f-score: 0.5468684321143338\n",
567
      "[('1', 984), ('2', 6622), ('3', 1438), ('4', 472)]\n",
568
      "                   pre       rec       spe        f1       geo       iba       sup\n",
569
      "\n",
570
      "          1       0.30      0.83      0.78      0.45      0.81      0.65       984\n",
571
      "          2       0.87      0.61      0.79      0.71      0.69      0.47      6622\n",
572
      "          3       0.21      0.18      0.88      0.19      0.40      0.15      1438\n",
573
      "          4       0.11      0.24      0.90      0.15      0.46      0.20       472\n",
574
      "\n",
575
      "avg / total       0.67      0.55      0.81      0.58      0.65      0.43      9516\n",
576
      "\n",
577
      "[('1', 8852), ('2', 59596), ('3', 12940), ('4', 4244)]\n"
578
     ]
579
    },
580
    {
581
     "name": "stderr",
582
     "output_type": "stream",
583
     "text": [
584
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
585
      "\n",
586
      "Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
587
      "\n",
588
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
589
      "\n",
590
      "Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
591
      "\n",
592
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning:\n",
593
      "\n",
594
      "Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
595
      "\n"
596
     ]
597
    },
598
    {
599
     "name": "stdout",
600
     "output_type": "stream",
601
     "text": [
602
      "For fold 4:\n",
603
      "Accuracy: 0.5755569567044977\n",
604
      "f-score: 0.5755569567044977\n",
605
      "[('1', 984), ('2', 6622), ('3', 1438), ('4', 472)]\n",
606
      "                   pre       rec       spe        f1       geo       iba       sup\n",
607
      "\n",
608
      "          1       0.31      0.78      0.80      0.44      0.79      0.62       984\n",
609
      "          2       0.87      0.66      0.77      0.75      0.71      0.50      6622\n",
610
      "          3       0.23      0.14      0.91      0.18      0.36      0.12      1438\n",
611
      "          4       0.10      0.23      0.90      0.14      0.45      0.19       472\n",
612
      "\n",
613
      "avg / total       0.67      0.58      0.80      0.60      0.66      0.44      9516\n",
614
      "\n",
615
      "[('1', 8852), ('2', 59596), ('3', 12940), ('4', 4244)]\n"
616
     ]
617
    },
618
    {
619
     "name": "stderr",
620
     "output_type": "stream",
621
     "text": [
622
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
623
      "\n",
624
      "Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
625
      "\n",
626
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
627
      "\n",
628
      "Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
629
      "\n",
630
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning:\n",
631
      "\n",
632
      "Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
633
      "\n"
634
     ]
635
    },
636
    {
637
     "name": "stdout",
638
     "output_type": "stream",
639
     "text": [
640
      "For fold 5:\n",
641
      "Accuracy: 0.6144388398486759\n",
642
      "f-score: 0.6144388398486759\n",
643
      "[('1', 984), ('2', 6622), ('3', 1438), ('4', 472)]\n",
644
      "                   pre       rec       spe        f1       geo       iba       sup\n",
645
      "\n",
646
      "          1       0.35      0.82      0.83      0.49      0.82      0.67       984\n",
647
      "          2       0.88      0.70      0.79      0.78      0.74      0.55      6622\n",
648
      "          3       0.26      0.19      0.90      0.22      0.41      0.16      1438\n",
649
      "          4       0.13      0.25      0.91      0.17      0.47      0.21       472\n",
650
      "\n",
651
      "avg / total       0.70      0.61      0.82      0.64      0.69      0.49      9516\n",
652
      "\n",
653
      "[('1', 8852), ('2', 59596), ('3', 12940), ('4', 4244)]\n"
654
     ]
655
    },
656
    {
657
     "name": "stderr",
658
     "output_type": "stream",
659
     "text": [
660
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
661
      "\n",
662
      "Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
663
      "\n",
664
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
665
      "\n",
666
      "Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
667
      "\n",
668
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning:\n",
669
      "\n",
670
      "Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
671
      "\n"
672
     ]
673
    },
674
    {
675
     "name": "stdout",
676
     "output_type": "stream",
677
     "text": [
678
      "For fold 6:\n",
679
      "Accuracy: 0.6281000420344682\n",
680
      "f-score: 0.6281000420344682\n",
681
      "[('1', 984), ('2', 6622), ('3', 1438), ('4', 472)]\n",
682
      "                   pre       rec       spe        f1       geo       iba       sup\n",
683
      "\n",
684
      "          1       0.34      0.79      0.83      0.48      0.81      0.65       984\n",
685
      "          2       0.88      0.73      0.78      0.80      0.76      0.57      6622\n",
686
      "          3       0.24      0.18      0.90      0.21      0.40      0.15      1438\n",
687
      "          4       0.15      0.21      0.94      0.17      0.45      0.18       472\n",
688
      "\n",
689
      "avg / total       0.69      0.63      0.81      0.65      0.69      0.49      9516\n",
690
      "\n",
691
      "[('1', 8853), ('2', 59596), ('3', 12940), ('4', 4245)]\n"
692
     ]
693
    },
694
    {
695
     "name": "stderr",
696
     "output_type": "stream",
697
     "text": [
698
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
699
      "\n",
700
      "Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
701
      "\n",
702
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
703
      "\n",
704
      "Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
705
      "\n",
706
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning:\n",
707
      "\n",
708
      "Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
709
      "\n"
710
     ]
711
    },
712
    {
713
     "name": "stdout",
714
     "output_type": "stream",
715
     "text": [
716
      "For fold 7:\n",
717
      "Accuracy: 0.6210847172587766\n",
718
      "f-score: 0.6210847172587766\n",
719
      "[('1', 983), ('2', 6622), ('3', 1438), ('4', 471)]\n",
720
      "                   pre       rec       spe        f1       geo       iba       sup\n",
721
      "\n",
722
      "          1       0.35      0.77      0.83      0.48      0.80      0.64       983\n",
723
      "          2       0.89      0.69      0.81      0.78      0.75      0.56      6622\n",
724
      "          3       0.31      0.31      0.88      0.31      0.52      0.26      1438\n",
725
      "          4       0.15      0.24      0.93      0.18      0.48      0.21       471\n",
726
      "\n",
727
      "avg / total       0.71      0.62      0.83      0.65      0.71      0.50      9514\n",
728
      "\n",
729
      "[('1', 8853), ('2', 59596), ('3', 12940), ('4', 4245)]\n"
730
     ]
731
    },
732
    {
733
     "name": "stderr",
734
     "output_type": "stream",
735
     "text": [
736
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
737
      "\n",
738
      "Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
739
      "\n",
740
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
741
      "\n",
742
      "Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
743
      "\n",
744
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning:\n",
745
      "\n",
746
      "Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
747
      "\n"
748
     ]
749
    },
750
    {
751
     "name": "stdout",
752
     "output_type": "stream",
753
     "text": [
754
      "For fold 8:\n",
755
      "Accuracy: 0.6231868824889636\n",
756
      "f-score: 0.6231868824889636\n",
757
      "[('1', 983), ('2', 6622), ('3', 1438), ('4', 471)]\n",
758
      "                   pre       rec       spe        f1       geo       iba       sup\n",
759
      "\n",
760
      "          1       0.37      0.75      0.85      0.49      0.80      0.63       983\n",
761
      "          2       0.88      0.70      0.79      0.78      0.74      0.55      6622\n",
762
      "          3       0.30      0.29      0.88      0.30      0.51      0.24      1438\n",
763
      "          4       0.16      0.29      0.92      0.21      0.52      0.25       471\n",
764
      "\n",
765
      "avg / total       0.71      0.62      0.81      0.65      0.70      0.49      9514\n",
766
      "\n",
767
      "[('1', 8853), ('2', 59597), ('3', 12941), ('4', 4245)]\n"
768
     ]
769
    },
770
    {
771
     "name": "stderr",
772
     "output_type": "stream",
773
     "text": [
774
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
775
      "\n",
776
      "Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
777
      "\n",
778
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
779
      "\n",
780
      "Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
781
      "\n",
782
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning:\n",
783
      "\n",
784
      "Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
785
      "\n"
786
     ]
787
    },
788
    {
789
     "name": "stdout",
790
     "output_type": "stream",
791
     "text": [
792
      "For fold 9:\n",
793
      "Accuracy: 0.6289949537426409\n",
794
      "f-score: 0.6289949537426409\n",
795
      "[('1', 983), ('2', 6621), ('3', 1437), ('4', 471)]\n",
796
      "                   pre       rec       spe        f1       geo       iba       sup\n",
797
      "\n",
798
      "          1       0.37      0.80      0.84      0.51      0.82      0.68       983\n",
799
      "          2       0.89      0.72      0.80      0.80      0.76      0.57      6621\n",
800
      "          3       0.29      0.22      0.90      0.25      0.45      0.19      1437\n",
801
      "          4       0.10      0.20      0.91      0.14      0.43      0.17       471\n",
802
      "\n",
803
      "avg / total       0.71      0.63      0.82      0.65      0.70      0.50      9512\n",
804
      "\n",
805
      "[('1', 8853), ('2', 59597), ('3', 12941), ('4', 4245)]\n"
806
     ]
807
    },
808
    {
809
     "name": "stderr",
810
     "output_type": "stream",
811
     "text": [
812
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
813
      "\n",
814
      "Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
815
      "\n",
816
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:433: FutureWarning:\n",
817
      "\n",
818
      "Default solver will be changed to 'lbfgs' in 0.22. Specify a solver to silence this warning.\n",
819
      "\n",
820
      "/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py:460: FutureWarning:\n",
821
      "\n",
822
      "Default multi_class will be changed to 'auto' in 0.22. Specify the multi_class option to silence this warning.\n",
823
      "\n"
824
     ]
825
    },
826
    {
827
     "name": "stdout",
828
     "output_type": "stream",
829
     "text": [
830
      "For fold 10:\n",
831
      "Accuracy: 0.6135407905803196\n",
832
      "f-score: 0.6135407905803196\n",
833
      "[('1', 983), ('2', 6621), ('3', 1437), ('4', 471)]\n",
834
      "                   pre       rec       spe        f1       geo       iba       sup\n",
835
      "\n",
836
      "          1       0.32      0.82      0.80      0.46      0.81      0.66       983\n",
837
      "          2       0.88      0.70      0.78      0.78      0.74      0.54      6621\n",
838
      "          3       0.34      0.17      0.94      0.23      0.40      0.15      1437\n",
839
      "          4       0.13      0.26      0.91      0.17      0.49      0.22       471\n",
840
      "\n",
841
      "avg / total       0.70      0.61      0.81      0.63      0.68      0.48      9512\n",
842
      "\n"
843
     ]
844
    },
845
    {
846
     "data": {
847
      "text/plain": [
848
       "<Figure size 576x396 with 0 Axes>"
849
      ]
850
     },
851
     "metadata": {},
852
     "output_type": "display_data"
853
    }
854
   ],
855
   "source": [
856
    "from sklearn.model_selection import KFold\n",
857
    "from sklearn import preprocessing\n",
858
    "from imblearn.over_sampling import SMOTE\n",
859
    "from imblearn.over_sampling import SMOTENC\n",
860
    "from sklearn.metrics import f1_score\n",
861
    "from imblearn.metrics import classification_report_imbalanced\n",
862
    "from fancyimpute import IterativeImputer\n",
863
    "from yellowbrick.classifier import ROCAUC\n",
864
    "from sklearn.linear_model import LogisticRegression\n",
865
    "import numpy as np\n",
866
    "import pandas as pd\n",
867
    "from hyperopt import hp, tpe\n",
868
    "from hyperopt.fmin import fmin\n",
869
    "from sklearn.model_selection import cross_val_score, StratifiedKFold\n",
870
    "from sklearn.ensemble import RandomForestClassifier\n",
871
    "from sklearn.metrics import make_scorer\n",
872
    "import xgboost as xgb\n",
873
    "import lightgbm as lgbm\n",
874
    "from sklearn.model_selection import StratifiedKFold\n",
875
    "from collections import Counter\n",
876
    "import io \n",
877
    "\n",
878
    "classes=['Death','Home','Nursing Home','Rehabilitation']\n",
879
    "\n",
880
    "\n",
881
    "kf = StratifiedKFold(n_splits=10)\n",
882
    "y = y.astype(str)\n",
883
    "\n",
884
    "for fold, (train_index, test_index) in enumerate(kf.split(X,y), 1):\n",
885
    "    X_train = X.iloc[train_index]\n",
886
    "    y_train = y.iloc[train_index]  # Based on your code, you might need a ravel call here, but I would look into how you're generating your y\n",
887
    "    X_test = X.iloc[test_index]\n",
888
    "    y_test = y.iloc[test_index]  # See comment on ravel and  y_train\n",
889
    "    \n",
890
    "    \n",
891
    "#------------------------------IMPUTE Training Set------------------------------------\n",
892
    "    \n",
893
    "    # Use MICE to fill in each row's missing features\n",
894
    "    X_train = pd.DataFrame(IterativeImputer(verbose=False, sample_posterior=True).fit_transform(X_train))\n",
895
    "    X_train.columns = df_cols\n",
896
    "\n",
897
    "#------------------------------IMPUTE Testing Set------------------------------------ \n",
898
    "\n",
899
    "    # Use MICE to fill in each row's missing features\n",
900
    "    X_test = pd.DataFrame(IterativeImputer(verbose=False, sample_posterior=True).fit_transform(X_test))\n",
901
    "    X_test.columns = df_cols\n",
902
    "    \n",
903
    "#------------------------------Standardize Testing Set------------------------------------\n",
904
    "    \n",
905
    "    std_scale = preprocessing.StandardScaler().fit(X_train[cols_to_norm])\n",
906
    "    X_train[cols_to_norm] = std_scale.transform(X_train[cols_to_norm])\n",
907
    "    X_test[cols_to_norm] = std_scale.transform(X_test[cols_to_norm])\n",
908
    "#------------------------------------------------------------------------------------------\n",
909
    "\n",
910
    " # Hyperparameters are optimized using hyperopt\n",
911
    "\n",
912
    "    #sm = SMOTE()\n",
913
    "    \n",
914
    "    #sm = SMOTENC(random_state=50, categorical_features=[1,2,3,22,23,24,25,26,27,28,29,30,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61, 62,63,64,65,66,67,68,69,70,71,72,73,74,75,76,77,78,79,80,81])\n",
915
    "    #X_train_oversampled, y_train_oversampled = sm.fit_sample(X_train, y_train)\n",
916
    "    print(sorted(Counter(y_train).items()))\n",
917
    "    \n",
918
    "    model = linear_model.LogisticRegression(class_weight='balanced') \n",
919
    "    clf = HierarchicalClassifier(\n",
920
    "        base_estimator=model,\n",
921
    "        class_hierarchy=class_hierarchy,\n",
922
    "    )\n",
923
    "    clf.fit(X_train, y_train)  \n",
924
    "    y_pred = clf.predict(X_test)\n",
925
    "    visualizer = ROCAUC(model, classes=classes)\n",
926
    "    visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer\n",
927
    "    visualizer.score(X_test, y_test)  # Evaluate the model on the test data\n",
928
    "    visualizer.poof(\"LR_Hierarchy_{}.pdf\".format(fold), clear_figure=True) \n",
929
    "    print(f'For fold {fold}:')\n",
930
    "    print(f'Accuracy: {clf.score(X_test, y_test)}')\n",
931
    "    f1=f1_score(y_test, y_pred, average='micro')\n",
932
    "    print(f'f-score: {f1}')\n",
933
    "    print(sorted(Counter(y_test).items()))\n",
934
    "    print(classification_report_imbalanced(y_test, y_pred))\n",
935
    "    K= classification_report_imbalanced(y_test, y_pred)\n",
936
    "    df = pd.read_fwf(io.StringIO(K))\n",
937
    "    df.loc[\"1\":\"1\",\"pre\":\"sup\"].to_csv(\"LR-Hierarchy-D.csv\" , sep=',', encoding='utf-8', doublequote=False, index=False, mode=\"a\", header=False)\n",
938
    "    df.loc[\"2\":\"2\",\"pre\":\"sup\"].to_csv(\"LR-Hierarchy-H.csv\" , sep=',', encoding='utf-8', doublequote=False, index=False, mode=\"a\", header=False)\n",
939
    "    df.loc[\"3\":\"3\",\"pre\":\"sup\"].to_csv(\"LR-Hierarchy-N.csv\" , sep=',', encoding='utf-8', doublequote=False, index=False, mode=\"a\", header=False)\n",
940
    "    df.loc[\"4\":\"4\",\"pre\":\"sup\"].to_csv(\"LR-Hierarchy-R.csv\" , sep=',', encoding='utf-8', doublequote=False, index=False, mode=\"a\", header=False)\n",
941
    "    df.iloc[6:7,:].to_csv(\"LR-Hierarchy-avg.csv\" , sep=',', encoding='utf-8', doublequote=False, index=False, mode=\"a\", header=False)\n",
942
    "\n",
943
    "        #\n",
944
    "    \n",
945
    "    #\n",
946
    "\n",
947
    "    \n",
948
    "    "
949
   ]
950
  },
951
  {
952
   "cell_type": "code",
953
   "execution_count": null,
954
   "metadata": {},
955
   "outputs": [],
956
   "source": [
957
    "X.shape"
958
   ]
959
  },
960
  {
961
   "cell_type": "code",
962
   "execution_count": null,
963
   "metadata": {},
964
   "outputs": [],
965
   "source": [
966
    "y.shape"
967
   ]
968
  },
969
  {
970
   "cell_type": "code",
971
   "execution_count": null,
972
   "metadata": {},
973
   "outputs": [],
974
   "source": []
975
  }
976
 ],
977
 "metadata": {
978
  "kernelspec": {
979
   "display_name": "Python 3",
980
   "language": "python",
981
   "name": "python3"
982
  },
983
  "language_info": {
984
   "codemirror_mode": {
985
    "name": "ipython",
986
    "version": 3
987
   },
988
   "file_extension": ".py",
989
   "mimetype": "text/x-python",
990
   "name": "python",
991
   "nbconvert_exporter": "python",
992
   "pygments_lexer": "ipython3",
993
   "version": "3.7.1"
994
  }
995
 },
996
 "nbformat": 4,
997
 "nbformat_minor": 2
998
}