{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* [Baseline models](#Baseline-models)\n",
    "* [Load and prepare data](#Load-and-prepare-data)\n",
    "    * [Load and prepare the text](#Load-and-prepare-the-text)\n",
    "    * [Compute LACE features](#Compute-LACE-features)\n",
    "* [Train or load Word2Vec](#Train-or-load-Word2Vec)\n",
    "* [Model](#Model)\n",
    "    * [Neural network with LACE features](#Neural-network-with-LACE-features)\n",
    "    * [Random forest with TF-IDF matrix](#Random-forest-with-TF-IDF-matrix)\n",
    "    * [2-layer feed-forward neural network](#2-layer-feed-forward-neural-network)\n",
    "    * [Logistic regression](#Logistic-regression)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Baseline models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Data prep\n",
35
    "import numpy as np\n",
36
    "import pandas as pd\n",
37
    "from   sklearn.model_selection import train_test_split\n",
38
    "\n",
39
    "# Word2Vec\n",
40
    "import os\n",
41
    "import logging\n",
42
    "import string\n",
43
    "from   gensim.models import word2vec\n",
44
    "import gensim\n",
45
    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n",
46
    "\n",
47
    "# Neural networks \n",
48
    "import keras\n",
49
    "from   keras.models import Model\n",
50
    "from   keras.preprocessing.text import Tokenizer\n",
51
    "from   keras.preprocessing.sequence import pad_sequences\n",
52
    "from   keras.layers import Embedding, Input, Conv1D, Dense, GlobalMaxPooling1D\n",
53
    "from   keras.optimizers import RMSprop\n",
54
    "import keras.backend as K\n",
55
    "\n",
56
    "# Random forest\n",
57
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
58
    "from sklearn.ensemble import RandomForestClassifier\n",
59
    "\n",
60
    "# Logistic regression\n",
61
    "import statsmodels.api as sm"
62
   ]
63
  },
64
  {
65
   "cell_type": "code",
66
   "execution_count": null,
67
   "metadata": {
68
    "collapsed": true
69
   },
70
   "outputs": [],
71
   "source": [
72
    "# Data frame created by TextSections/TextPrep\n",
73
    "TRAIN_TEXT_LOC = \"\"\n",
74
    "TEST_TEXT_LOC  = \"\"\n",
75
    "\n",
76
    "# Data frame containing LACE features.\n",
77
    "# Assumes presence of:\n",
78
    "# - LengthOfStay\n",
79
    "# - Charlson\n",
80
    "# - PrevERVisits\n",
81
    "# - AdmittedViaER\n",
82
    "TRAIN_AUX_LOC  = \"\"\n",
83
    "TEST_AUX_LOC   = \"\"\n",
84
    "\n",
85
    "# Unique visit identifier to merge the train/test text with LACE data\n",
86
    "MERGE_ON       = \"\"\n",
87
    "\n",
88
    "# Other column names\n",
89
    "VISITID        = \"\"\n",
90
    "OUTCOME        = \"\" # e.g. ReadmissionInLessThan30Days"
91
   ]
92
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load and prepare data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and prepare the text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Read train and test text data.\n",
    "trainTXT = pd.read_csv(TRAIN_TEXT_LOC)\n",
    "testTXT  = pd.read_csv(TEST_TEXT_LOC)\n",
    "\n",
    "# Read train and test LACE data.\n",
    "trainLACE = pd.read_csv(TRAIN_AUX_LOC)\n",
    "testLACE  = pd.read_csv(TEST_AUX_LOC)\n",
    "\n",
    "# Combine data\n",
    "train = pd.merge(trainTXT, trainLACE, on = MERGE_ON)\n",
    "test  = pd.merge(testTXT,  testLACE,  on = MERGE_ON)\n",
    "\n",
    "# Split the train data into a train and validation set.\n",
    "train, valid = train_test_split(train,\n",
    "                                stratify     = train[OUTCOME],\n",
    "                                train_size   = .9,\n",
    "                                random_state = 1234)\n",
    "\n",
    "# Prepare the sections.\n",
    "# If `sectiontext` is present, then include \"SECTIONNAME sectiontext\".\n",
    "# If not present, include only \"SECTIONNAME\".\n",
    "SECTIONNAMES = [x for x in trainTXT.columns if VISITID not in x and OUTCOME not in x]\n",
    "for x in SECTIONNAMES:\n",
    "    rep      = x.replace(\" \", \"_\").upper()\n",
    "    train[x] = [\" \".join([rep, t]) if not pd.isnull(t) else rep for t in train[x]]\n",
    "    valid[x] = [\" \".join([rep, t]) if not pd.isnull(t) else rep for t in valid[x]]\n",
    "    test[x]  = [\" \".join([rep, t]) if not pd.isnull(t) else rep for t in test[x]]"
   ]
  },
144
  {
145
   "cell_type": "markdown",
146
   "metadata": {},
147
   "source": [
148
    "## Compute LACE features"
149
   ]
150
  },
151
  {
152
   "cell_type": "markdown",
153
   "metadata": {},
154
   "source": [
155
    "This code assumes that, for each hospital visit, you have computed:\n",
156
    " * the Charlson index\n",
157
    " * the number of ER visits in the last 6 months\n",
158
    " * whether the patient was admitted through the ER\n",
159
    " * the length of stay, in days\n",
160
    "\n",
161
    "We then using these data to compute LACE."
162
   ]
163
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def LOS(los):\n",
173
    "    if los <= 3:\n",
174
    "        return(los)\n",
175
    "    elif los <= 6:\n",
176
    "        return(4)\n",
177
    "    elif los <= 13:\n",
178
    "        return(5)\n",
179
    "    else:\n",
180
    "        return(7)\n",
181
    "    \n",
182
    "def ACUITY(erboolean):\n",
183
    "    if erboolean:\n",
184
    "        return(3)\n",
185
    "    else:\n",
186
    "        return(0)\n",
187
    "    \n",
188
    "def LACE(data):\n",
189
    "    return(LOS(data.LengthOfStay) + ACUITY(data.AdmittedViaER) + data.Charlson + data.PrevERVisits)\n",
190
    "\n",
191
    "train[\"LACE\"] = train.apply(LACE, axis=1)\n",
192
    "valid[\"LACE\"] = valid.apply(LACE, axis=1)\n",
193
    "test[\"LACE\"]  = test.apply(LACE,  axis=1)"
194
   ]
195
  },
196
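  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick sanity check, a minimal sketch of scoring one hypothetical visit (the values below are made up, not drawn from our data):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Hypothetical visit: 5-day stay, admitted via the ER,\n",
    "# Charlson index 2, one ER visit in the prior 6 months.\n",
    "example = pd.Series({\"LengthOfStay\": 5, \"AdmittedViaER\": True,\n",
    "                     \"Charlson\": 2, \"PrevERVisits\": 1})\n",
    "LACE(example)  # LOS 4 + acuity 3 + Charlson 2 + ER visits 1 = 10"
   ]
  },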
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For use in modeling, we also center each LACE component by subtracting its mean on the train data:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# We transform \"length of stay\" following the precedent set by LACE.\n",
    "train[\"LOS_Quantized\"]            = train.LengthOfStay.apply(LOS)\n",
    "test[\"LOS_Quantized\"]             = test.LengthOfStay.apply(LOS)\n",
    "valid[\"LOS_Quantized\"]            = valid.LengthOfStay.apply(LOS)\n",
    "\n",
    "# Center every split with the *train* means, so no information leaks\n",
    "# from the validation or test sets.\n",
    "train[\"Charlson_Transformed\"]     = train.Charlson - train.Charlson.mean()\n",
    "train[\"LOS_Transformed\"]          = train.LOS_Quantized - train.LOS_Quantized.mean()\n",
    "train[\"PrevERVisits_Transformed\"] = train.PrevERVisits - train.PrevERVisits.mean()\n",
    "\n",
    "test[\"Charlson_Transformed\"]      = test.Charlson - train.Charlson.mean()\n",
    "test[\"LOS_Transformed\"]           = test.LOS_Quantized - train.LOS_Quantized.mean()\n",
    "test[\"PrevERVisits_Transformed\"]  = test.PrevERVisits - train.PrevERVisits.mean()\n",
    "\n",
    "valid[\"Charlson_Transformed\"]     = valid.Charlson - train.Charlson.mean()\n",
    "valid[\"LOS_Transformed\"]          = valid.LOS_Quantized - train.LOS_Quantized.mean()\n",
    "valid[\"PrevERVisits_Transformed\"] = valid.PrevERVisits - train.PrevERVisits.mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train or load Word2Vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Word2Vec hyperparameters\n",
245
    "window    = 2\n",
246
    "dimension = 1000\n",
247
    "min_count = 5\n",
248
    "sg        = 1  \n",
249
    "hs        = 0  \n",
250
    "\n",
251
    "# Where to save the model:\n",
252
    "modelFile = './word2vec/w2v_dims_' + str(dimension) + \"_window_\" + str(window) + '.bin'\n",
253
    "\n",
254
    "# We will remove digits and punctuation:\n",
255
    "remove_digits_punc = str.maketrans('', '', string.digits + ''.join([x for x in string.punctuation if '_' not in x]))\n",
256
    "remove_digits_punc = {a:\" \" for a in remove_digits_punc.keys()}\n",
257
    "\n",
258
    "# (If the model already exists, don't recompute.)\n",
259
    "if not os.path.isfile(modelFile):\n",
260
    "    # Use only training data to train word2vec:\n",
261
    "    notes = train[SECTIONNAMES].apply(lambda x: \" \".join(x), axis=1).values  \n",
262
    "    stop  = set([x for x in string.ascii_lowercase]) \n",
263
    "    for i in range(len(notes)):\n",
264
    "        notes[i] = [w for w in notes[i].translate(remove_digits_punc).split() if (w not in stop)]\n",
265
    "    \n",
266
    "    w2v = word2vec.Word2Vec(notes, \n",
267
    "                            size=dimension, \n",
268
    "                            window=window, \n",
269
    "                            sg=sg, \n",
270
    "                            hs=hs, \n",
271
    "                            min_count=min_count, \n",
272
    "                            workers=50)\n",
273
    "    w2v.wv.save_word2vec_format(modelFile, binary=True)\n",
274
    "else:\n",
275
    "    w2v = gensim.models.KeyedVectors.load_word2vec_format(modelFile, binary=True)"
276
   ]
277
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Make the embedding matrix.\n",
    "# We include one extra word, `PADDING`, at index 0. This is the word that will right-pad short notes.\n",
    "# For `PADDING`'s vector representation, we choose the zero vector.\n",
    "vocab = [\"PADDING\"] + sorted(w2v.vocab.keys())\n",
    "vset  = set(vocab)\n",
    "\n",
    "embeddings_index         = {w: i for i, w in enumerate(vocab)}\n",
    "reverse_embeddings_index = {i: w for w, i in embeddings_index.items()}\n",
    "embeddings_matrix        = np.vstack([np.zeros((1, dimension)),\n",
    "                                      np.array([w2v[x] for x in vocab[1:]])])"
   ]
  },
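  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick sanity check on the index, as a minimal sketch: `PADDING` must sit at row 0 with the zero vector, and the two lookup tables must invert each other."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Index 0 must be PADDING, and its embedding row must be all zeros.\n",
    "assert reverse_embeddings_index[0] == \"PADDING\"\n",
    "assert not embeddings_matrix[0].any()\n",
    "\n",
    "# Round-trip an arbitrary vocabulary word through the two indexes.\n",
    "word = vocab[1]\n",
    "assert reverse_embeddings_index[embeddings_index[word]] == word"
   ]
  },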
300
  {
301
   "cell_type": "markdown",
302
   "metadata": {},
303
   "source": [
304
    "# Model"
305
   ]
306
  },
307
  {
308
   "cell_type": "markdown",
309
   "metadata": {},
310
   "source": [
311
    "## Neural network with LACE features"
312
   ]
313
  },
314
  {
315
   "cell_type": "markdown",
316
   "metadata": {},
317
   "source": [
318
    "Prepare text using our embeddings index:"
319
   ]
320
  },
321
  {
322
   "cell_type": "code",
323
   "execution_count": null,
324
   "metadata": {
325
    "collapsed": true
326
   },
327
   "outputs": [],
328
   "source": [
329
    "train_x = train[SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values  \n",
330
    "test_x  = test[ SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values  \n",
331
    "valid_x = valid[SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values  \n",
332
    "\n",
333
    "train_x = [[embeddings_index[x] for x in note.split() if x in vset] for note in train_x]\n",
334
    "valid_x = [[embeddings_index[x] for x in note.split() if x in vset] for note in valid_x]\n",
335
    "test_x  = [[embeddings_index[x] for x in note.split() if x in vset] for note in test_x]\n",
336
    "\n",
337
    "train_y = train[OUTCOME]\n",
338
    "valid_y = valid[OUTCOME]\n",
339
    "test_y  = test[OUTCOME]"
340
   ]
341
  },
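  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick shape check before modeling, as a sketch: both inputs must agree on the number of visits."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# One padded note and one 4-vector of LACE components per visit.\n",
    "assert train_x.shape == (len(train_y), maxlen)\n",
    "assert train_lace.shape == (len(train_y), 4)"
   ]
  },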
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And the model:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "UNITS      = 500\n",
358
    "FILTERSIZE = 3\n",
359
    "embedding_layer = Embedding(embeddings_matrix.shape[0],\n",
360
    "                            embeddings_matrix.shape[1],\n",
361
    "                            weights=[embeddings_matrix],\n",
362
    "                            input_length=maxlen,\n",
363
    "                            trainable=True)\n",
364
    "\n",
365
    "sequence_input     = Input(shape=(maxlen,), dtype='int32')\n",
366
    "embedded_sequences = embedding_layer(sequence_input)\n",
367
    "\n",
368
    "lace_in            = Input(shape=(4,))\n",
369
    "lace               = keras.layers.Reshape((1,4,))(lace_in)\n",
370
    "lace               = keras.layers.UpSampling1D(700)(lace)\n",
371
    "\n",
372
    "combined           = keras.layers.concatenate([embedded_sequences, lace])\n",
373
    "\n",
374
    "conv               = Conv1D(UNITS, FILTERSIZE, activation=\"tanh\", use_bias=True)(combined)\n",
375
    "pool               = GlobalMaxPooling1D()(conv)\n",
376
    "\n",
377
    "\n",
378
    "out                = Dense(1, \n",
379
    "                           activation='sigmoid', \n",
380
    "                           activity_regularizer=keras.regularizers.l1(l=.05)\n",
381
    "                        )(pool)\n",
382
    "\n",
383
    "optimizer = keras.optimizers.RMSprop(lr = .0001)\n",
384
    "model=Model(inputs=[sequence_input, lace_in], outputs=out)\n",
385
    "model.compile(loss='binary_crossentropy', optimizer=optimizer)\n",
386
    "\n",
387
    "model.fit(train_x, train_y, batch_size=100, epochs=4, validation_data=(valid_x, valid_y), verbose=1)"
388
   ]
389
  },
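  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To compare the baselines on a common footing, a minimal sketch of scoring the held-out test set with AUC (using sklearn's roc_auc_score; any threshold-free metric would do):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "# predict() returns one sigmoid probability per visit.\n",
    "cnn_probs = model.predict([test_x, test_lace]).ravel()\n",
    "print(\"CNN + LACE test AUC:\", roc_auc_score(test_y, cnn_probs))"
   ]
  },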
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Random forest with TF-IDF matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Prepare the text for sklearn's tfidf vectorizer:\n",
    "train_x = train[SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values\n",
    "test_x  = test[ SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values\n",
    "valid_x = valid[SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values\n",
    "\n",
    "train_y = train[OUTCOME]\n",
    "valid_y = valid[OUTCOME]\n",
    "test_y  = test[OUTCOME]\n",
    "\n",
    "tfidf = TfidfVectorizer()\n",
    "tr_x  = tfidf.fit_transform(train_x)\n",
    "te_x  = tfidf.transform(test_x)\n",
    "va_x  = tfidf.transform(valid_x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Model:\n",
    "rfc = RandomForestClassifier(n_estimators=1000, max_depth=100, n_jobs=-1)\n",
    "rfc.fit(tr_x, train_y)"
   ]
  },
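  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And the same AUC check for the forest, as a sketch, scoring the TF-IDF test matrix built above:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "# Column 1 of predict_proba is the probability of readmission.\n",
    "rf_probs = rfc.predict_proba(te_x)[:, 1]\n",
    "print(\"Random forest test AUC:\", roc_auc_score(test_y, rf_probs))"
   ]
  },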
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2-layer feed-forward neural network"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This model uses only the components of LACE, together with the LACE score itself:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Five inputs: the four LACE components plus the LACE score itself.\n",
    "lace  = Input(shape=(5,))\n",
    "dense = Dense(50, activation='tanh')(lace)\n",
    "out   = Dense(1, activation='sigmoid')(dense)\n",
    "\n",
    "model = Model(inputs=lace, outputs=out)\n",
    "model.compile(loss='binary_crossentropy', optimizer=\"nadam\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model.fit(train[[\"LOS_Transformed\", \"AdmittedViaER\", \"Charlson_Transformed\", \"PrevERVisits_Transformed\", \"LACE\"]].values,\n",
    "          train_y,\n",
    "          class_weight={0: 1, 1: 10},\n",
    "          epochs=1)"
   ]
  },
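  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The matching AUC check, as a sketch; the network sees the same five columns at test time:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "ff_probs = model.predict(test[[\"LOS_Transformed\", \"AdmittedViaER\", \"Charlson_Transformed\", \"PrevERVisits_Transformed\", \"LACE\"]].values).ravel()\n",
    "print(\"Feed-forward test AUC:\", roc_auc_score(test_y, ff_probs))"
   ]
  },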
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Logistic regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model = logit(formula = OUTCOME + \" ~ (LOS_Transformed + AdmittedViaER + Charlson_Transformed + PrevERVisits_Transformed + LACE)\", \n",
493
    "              data = train\n",
494
    "        ).fit(maxiter = 1000, method = 'lbfgs')"
495
   ]
496
  }
497
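  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, the fitted coefficients and the same test-set AUC, as a sketch; predict on a formula-based fit accepts the raw data frame:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "print(model.summary())\n",
    "\n",
    "# predict() evaluates the formula against the supplied data frame.\n",
    "lr_probs = model.predict(test)\n",
    "print(\"Logistic regression test AUC:\", roc_auc_score(test_y, lr_probs))"
   ]
  }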
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}