--- a
+++ b/04_TrainBaselineModels.ipynb
@@ -0,0 +1,519 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* [Baseline models](#Baseline-models)\n",
+    "* [Load and prepare data](#Load-and-prepare-data)\n",
+    "  * [Load and prepare the text](#Load-and-prepare-the-text)\n",
+    "  * [Compute LACE features](#Compute-LACE-features)\n",
+    "* [Train or load Word2Vec](#Train-or-load-Word2Vec)\n",
+    "* [Model](#Model)\n",
+    "  * [Neural network with LACE features](#Neural-network-with-LACE-features)\n",
+    "  * [Random forest with TF-IDF matrix](#Random-forest-with-TF-IDF-matrix)\n",
+    "  * [2-layer feed forward neural network](#2-layer-feed-forward-neural-network)\n",
+    "  * [Logistic regression](#Logistic-regression)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Baseline models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Data prep\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "# Word2Vec\n",
+    "import os\n",
+    "import logging\n",
+    "import string\n",
+    "from gensim.models import word2vec\n",
+    "import gensim\n",
+    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n",
+    "\n",
+    "# Neural networks\n",
+    "import keras\n",
+    "from keras.models import Model\n",
+    "from keras.preprocessing.text import Tokenizer\n",
+    "from keras.preprocessing.sequence import pad_sequences\n",
+    "from keras.layers import Embedding, Input, Conv1D, Dense, GlobalMaxPooling1D\n",
+    "from keras.optimizers import RMSprop\n",
+    "import keras.backend as K\n",
+    "\n",
+    "# Random forest\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "\n",
+    "# Logistic regression\n",
+    "import statsmodels.api as sm"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Data frame created by TextSections/TextPrep\n",
+    "TRAIN_TEXT_LOC = \"\"\n",
+    "TEST_TEXT_LOC = \"\"\n",
+    "\n",
+    "# Data frame containing LACE features.\n",
+    "# Assumes presence of:\n",
+    "# - LengthOfStay\n",
+    "# - Charlson\n",
+    "# - PrevERVisits\n",
+    "# - AdmittedViaER\n",
+    "TRAIN_AUX_LOC = \"\"\n",
+    "TEST_AUX_LOC = \"\"\n",
+    "\n",
+    "# Unique visit identifier to merge the train/test text with LACE data\n",
+    "MERGE_ON = \"\"\n",
+    "\n",
+    "# Other column names\n",
+    "VISITID = \"\"\n",
+    "OUTCOME = \"\" # e.g. ReadmissionInLessThan30Days"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Load and prepare data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load and prepare the text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Read train and test text data.\n",
+    "trainTXT = pd.read_csv(TRAIN_TEXT_LOC)\n",
+    "testTXT = pd.read_csv(TEST_TEXT_LOC)\n",
+    "\n",
+    "# Read train and test LACE data.\n",
+    "trainLACE = pd.read_csv(TRAIN_AUX_LOC)\n",
+    "testLACE = pd.read_csv(TEST_AUX_LOC)\n",
+    "\n",
+    "# Combine data\n",
+    "train = pd.merge(trainTXT, trainLACE, on = MERGE_ON)\n",
+    "test = pd.merge(testTXT, testLACE, on = MERGE_ON)\n",
+    "\n",
+    "# Split the train data into a train and validation set.\n",
+    "train, valid = train_test_split(train, \n",
+    "                                stratify = train[OUTCOME], \n",
+    "                                train_size = .9, \n",
+    "                                random_state = 1234)\n",
+    "\n",
+    "# Prepare the sections.\n",
+    "# If `sectiontext` is present, then include \"SECTIONNAME sectiontext\".\n",
+    "# If not present, include only \"SECTIONNAME\".\n",
+    "SECTIONNAMES = [x for x in trainTXT.columns if VISITID not in x and OUTCOME not in x]\n",
+    "for x in SECTIONNAMES:\n",
+    "    rep = x.replace(\" \", \"_\").upper()\n",
+    "    train[x] = [\" \".join([rep, t]) if not pd.isnull(t) else rep for t in train[x]]\n",
+    "    valid[x] = [\" \".join([rep, t]) if not pd.isnull(t) else rep for t in valid[x]]\n",
+    "    test[x] = [\" \".join([rep, t]) if not pd.isnull(t) else rep for t in test[x]]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Compute LACE features"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This code assumes that, for each hospital visit, you have computed:\n",
+    " * the Charlson index\n",
+    " * the number of ER visits in the last 6 months\n",
+    " * whether the patient was admitted through the ER\n",
+    " * the length of stay, in days\n",
+    "\n",
+    "We then use these data to compute LACE."
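+    ,
+    "\n",
+    "\n",
+    "As a quick check of the scoring implemented in the next cell (the example values here are made up purely for illustration): a 5-day stay contributes 4 points, admission through the ER contributes 3, a Charlson index of 2 contributes 2, and one ER visit in the prior six months contributes 1, giving a LACE score of 4 + 3 + 2 + 1 = 10. Equivalently, once the next cell has been run:\n",
+    "\n",
+    "```python\n",
+    "LACE(pd.Series({\"LengthOfStay\": 5, \"AdmittedViaER\": True, \"Charlson\": 2, \"PrevERVisits\": 1}))  # -> 10\n",
+    "```"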
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def LOS(los):\n",
+    "    if los <= 3:\n",
+    "        return(los)\n",
+    "    elif los <= 6:\n",
+    "        return(4)\n",
+    "    elif los <= 13:\n",
+    "        return(5)\n",
+    "    else:\n",
+    "        return(7)\n",
+    "\n",
+    "def ACUITY(erboolean):\n",
+    "    if erboolean:\n",
+    "        return(3)\n",
+    "    else:\n",
+    "        return(0)\n",
+    "\n",
+    "def LACE(data):\n",
+    "    return(LOS(data.LengthOfStay) + ACUITY(data.AdmittedViaER) + data.Charlson + data.PrevERVisits)\n",
+    "\n",
+    "train[\"LACE\"] = train.apply(LACE, axis=1)\n",
+    "valid[\"LACE\"] = valid.apply(LACE, axis=1)\n",
+    "test[\"LACE\"] = test.apply(LACE, axis=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For use in modeling, we also transform the LACE components by subtracting the mean computed on the training data:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# We transform \"length of stay\" following the precedent set by LACE.\n",
+    "train[\"LOS_Quantized\"] = train.LengthOfStay.apply(LOS)\n",
+    "test[\"LOS_Quantized\"] = test.LengthOfStay.apply(LOS)\n",
+    "valid[\"LOS_Quantized\"] = valid.LengthOfStay.apply(LOS)\n",
+    "\n",
+    "train[\"Charlson_Transformed\"] = train.Charlson - train.Charlson.mean()\n",
+    "train[\"LOS_Transformed\"] = train.LOS_Quantized - train.LOS_Quantized.mean()\n",
+    "train[\"PrevERVisits_Transformed\"] = train.PrevERVisits - train.PrevERVisits.mean()\n",
+    "\n",
+    "test[\"Charlson_Transformed\"] = test.Charlson - train.Charlson.mean()\n",
+    "test[\"LOS_Transformed\"] = test.LOS_Quantized - train.LOS_Quantized.mean()\n",
+    "test[\"PrevERVisits_Transformed\"] = test.PrevERVisits - train.PrevERVisits.mean()\n",
+    "\n",
+    "valid[\"Charlson_Transformed\"] = valid.Charlson - train.Charlson.mean()\n",
+    "valid[\"LOS_Transformed\"] = valid.LOS_Quantized - train.LOS_Quantized.mean()\n",
+    "valid[\"PrevERVisits_Transformed\"] = valid.PrevERVisits - train.PrevERVisits.mean()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Train or load Word2Vec"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Word2Vec hyperparameters\n",
+    "window = 2\n",
+    "dimension = 1000\n",
+    "min_count = 5\n",
+    "sg = 1\n",
+    "hs = 0\n",
+    "\n",
+    "# Where to save the model:\n",
+    "modelFile = './word2vec/w2v_dims_' + str(dimension) + \"_window_\" + str(window) + '.bin'\n",
+    "\n",
+    "# We will remove digits and punctuation:\n",
+    "remove_digits_punc = str.maketrans('', '', string.digits + ''.join([x for x in string.punctuation if '_' not in x]))\n",
+    "remove_digits_punc = {a:\" \" for a in remove_digits_punc.keys()}\n",
+    "\n",
+    "# (If the model already exists, don't recompute.)\n",
+    "if not os.path.isfile(modelFile):\n",
+    "    # Use only training data to train word2vec:\n",
+    "    notes = train[SECTIONNAMES].apply(lambda x: \" \".join(x), axis=1).values\n",
+    "    stop = set([x for x in string.ascii_lowercase])\n",
+    "    for i in range(len(notes)):\n",
+    "        notes[i] = [w for w in notes[i].translate(remove_digits_punc).split() if (w not in stop)]\n",
+    "\n",
+    "    w2v = word2vec.Word2Vec(notes, \n",
+    "                            size=dimension, \n",
+    "                            window=window, \n",
+    "                            sg=sg, \n",
+    "                            hs=hs, \n",
+    "                            min_count=min_count, \n",
+    "                            workers=50)\n",
+    "    # Make sure the output directory exists before saving.\n",
+    "    os.makedirs(os.path.dirname(modelFile), exist_ok=True)\n",
+    "    w2v.wv.save_word2vec_format(modelFile, binary=True)\n",
+    "else:\n",
+    "    w2v = gensim.models.KeyedVectors.load_word2vec_format(modelFile, binary=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Make the embedding matrix.\n",
+    "# We include one extra word, `PADDING`. This is the word that will right-pad short notes.\n",
+    "# For `PADDING`'s vector representation, we choose the zero vector.\n",
+    "vocab = [\"PADDING\"] + sorted(list(w2v.wv.vocab.keys()))\n",
+    "vset = set(vocab)\n",
+    "\n",
+    "embeddings_index = {}\n",
+    "for i in range(len(vocab)):\n",
+    "    embeddings_index[vocab[i]] = i\n",
+    "\n",
+    "reverse_embeddings_index = {b:a for a,b in embeddings_index.items()}\n",
+    "embeddings_matrix = np.matrix(np.concatenate(([[0.]*dimension], [w2v[x] for x in vocab[1:]])))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Neural network with LACE features"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Prepare text using our embeddings index:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "train_x = train[SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values\n",
+    "test_x = test[SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values\n",
+    "valid_x = valid[SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values\n",
+    "\n",
+    "train_x = [[embeddings_index[x] for x in note.split() if x in vset] for note in train_x]\n",
+    "valid_x = [[embeddings_index[x] for x in note.split() if x in vset] for note in valid_x]\n",
+    "test_x = [[embeddings_index[x] for x in note.split() if x in vset] for note in test_x]\n",
+    "\n",
+    "train_y = train[OUTCOME]\n",
+    "valid_y = valid[OUTCOME]\n",
+    "test_y = test[OUTCOME]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And model:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "UNITS = 500\n",
+    "FILTERSIZE = 3\n",
+    "\n",
+    "# Pad (or truncate) every note to a fixed length.\n",
+    "# PADDING has index 0, so short notes are right-padded with zeros.\n",
+    "# maxlen must equal the UpSampling1D factor used for the LACE branch below.\n",
+    "maxlen = 700\n",
+    "train_x = pad_sequences(train_x, maxlen=maxlen, padding=\"post\", truncating=\"post\", value=0)\n",
+    "valid_x = pad_sequences(valid_x, maxlen=maxlen, padding=\"post\", truncating=\"post\", value=0)\n",
+    "test_x = pad_sequences(test_x, maxlen=maxlen, padding=\"post\", truncating=\"post\", value=0)\n",
+    "\n",
+    "# LACE components fed to the second input of the network.\n",
+    "# (Assumed column set -- adjust if your component columns are named differently.)\n",
+    "LACE_COLS = [\"LOS_Transformed\", \"AdmittedViaER\", \"Charlson_Transformed\", \"PrevERVisits_Transformed\"]\n",
+    "\n",
+    "embedding_layer = Embedding(embeddings_matrix.shape[0],\n",
+    "                            embeddings_matrix.shape[1],\n",
+    "                            weights=[embeddings_matrix],\n",
+    "                            input_length=maxlen,\n",
+    "                            trainable=True)\n",
+    "\n",
+    "sequence_input = Input(shape=(maxlen,), dtype='int32')\n",
+    "embedded_sequences = embedding_layer(sequence_input)\n",
+    "\n",
+    "lace_in = Input(shape=(4,))\n",
+    "lace = keras.layers.Reshape((1,4,))(lace_in)\n",
+    "lace = keras.layers.UpSampling1D(700)(lace)\n",
+    "\n",
+    "combined = keras.layers.concatenate([embedded_sequences, lace])\n",
+    "\n",
+    "conv = Conv1D(UNITS, FILTERSIZE, activation=\"tanh\", use_bias=True)(combined)\n",
+    "pool = GlobalMaxPooling1D()(conv)\n",
+    "\n",
+    "out = Dense(1, \n",
+    "            activation='sigmoid', \n",
+    "            activity_regularizer=keras.regularizers.l1(l=.05)\n",
+    "           )(pool)\n",
+    "\n",
+    "optimizer = keras.optimizers.RMSprop(lr = .0001)\n",
+    "model = Model(inputs=[sequence_input, lace_in], outputs=out)\n",
+    "model.compile(loss='binary_crossentropy', optimizer=optimizer)\n",
+    "\n",
+    "# The model has two inputs, so both the padded text and the LACE components are passed to fit.\n",
+    "model.fit([train_x, train[LACE_COLS].values], train_y,\n",
+    "          batch_size=100, epochs=4,\n",
+    "          validation_data=([valid_x, valid[LACE_COLS].values], valid_y),\n",
+    "          verbose=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Random forest with TF-IDF matrix"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Prepare the text for sklearn's tfidf vectorizer:\n",
+    "train_x = train[SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values\n",
+    "test_x = test[SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values\n",
+    "valid_x = valid[SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values\n",
+    "\n",
+    "train_y = train[OUTCOME]\n",
+    "valid_y = valid[OUTCOME]\n",
+    "test_y = test[OUTCOME]\n",
+    "\n",
+    "tfidf = TfidfVectorizer()\n",
+    "tr_x = tfidf.fit_transform(train_x)\n",
+    "te_x = tfidf.transform(test_x)\n",
+    "va_x = tfidf.transform(valid_x)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Model:\n",
+    "rfc = RandomForestClassifier(n_estimators=1000, max_depth=100, n_jobs=-1)\n",
+    "rfc.fit(tr_x, train_y)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2-layer feed forward neural network"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This model uses only the components of LACE together with the LACE score:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "lace = Input(shape=(5,))\n",
+    "dense = Dense(50, activation='tanh')(lace)\n",
+    "out = Dense(1, activation='sigmoid')(dense)\n",
+    "\n",
+    "model = Model(inputs=lace, outputs=out)\n",
+    "model.compile(loss='binary_crossentropy', optimizer=\"nadam\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "model.fit(train[[\"LOS_Transformed\", \"AdmittedViaER\", \"Charlson_Transformed\", \"PrevERVisits_Transformed\", \"LACE\"]].values, \n",
+    "          train_y,\n",
+    "          class_weight={0:1, 1:10}, \n",
+    "          epochs=1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Logistic regression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Fit with the statsmodels formula interface.\n",
+    "# (A bare `logit` is not imported above, so we go through sm.Logit.from_formula.)\n",
+    "model = sm.Logit.from_formula(formula = OUTCOME + \" ~ (LOS_Transformed + AdmittedViaER + Charlson_Transformed + PrevERVisits_Transformed + LACE)\", \n",
+    "                              data = train\n",
+    "                              ).fit(maxiter = 1000, method = 'lbfgs')"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}