--- a
+++ b/04_TrainBaselineModels.ipynb
@@ -0,0 +1,519 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "* [Baseline models](#Baseline-models)\n",
+    "* [Load and prepare data](#Load-and-prepare-data)\n",
+    "    * [Load and prepare the text](#Load-and-prepare-the-text)\n",
+    "    * [Compute LACE features](#Compute-LACE-features)\n",
+    "* [Train or load Word2Vec](#Train-or-load-Word2Vec)\n",
+    "* [Model](#Model)\n",
+    "    * [Neural network with LACE features](#Neural-network-with-LACE-features)\n",
+    "    * [Random forest with TF-IDF matrix](#Random-forest-with-TF-IDF-matrix)\n",
+    "    * [2-layer feed-forward neural network](#2-layer-feed-forward-neural-network)\n",
+    "    * [Logistic regression](#Logistic-regression)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Baseline models"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Data prep\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from   sklearn.model_selection import train_test_split\n",
+    "\n",
+    "# Word2Vec\n",
+    "import os\n",
+    "import logging\n",
+    "import string\n",
+    "from   gensim.models import word2vec\n",
+    "import gensim\n",
+    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n",
+    "\n",
+    "# Neural networks \n",
+    "import keras\n",
+    "from   keras.models import Model\n",
+    "from   keras.preprocessing.text import Tokenizer\n",
+    "from   keras.preprocessing.sequence import pad_sequences\n",
+    "from   keras.layers import Embedding, Input, Conv1D, Dense, GlobalMaxPooling1D\n",
+    "from   keras.optimizers import RMSprop\n",
+    "import keras.backend as K\n",
+    "\n",
+    "# Random forest\n",
+    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
+    "from sklearn.ensemble import RandomForestClassifier\n",
+    "\n",
+    "# Logistic regression\n",
+    "import statsmodels.formula.api as smf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Data frame created by TextSections/TextPrep\n",
+    "TRAIN_TEXT_LOC = \"\"\n",
+    "TEST_TEXT_LOC  = \"\"\n",
+    "\n",
+    "# Data frame containing LACE features.\n",
+    "# Assumes presence of:\n",
+    "# - LengthOfStay\n",
+    "# - Charlson\n",
+    "# - PrevERVisits\n",
+    "# - AdmittedViaER\n",
+    "TRAIN_AUX_LOC  = \"\"\n",
+    "TEST_AUX_LOC   = \"\"\n",
+    "\n",
+    "# Unique visit identifier to merge the train/test text with LACE data\n",
+    "MERGE_ON       = \"\"\n",
+    "\n",
+    "# Other column names\n",
+    "VISITID        = \"\"\n",
+    "OUTCOME        = \"\" # e.g. ReadmissionInLessThan30Days"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Load and prepare data"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Load and prepare the text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Read train and test text data.\n",
+    "trainTXT = pd.read_csv(TRAIN_TEXT_LOC)\n",
+    "testTXT  = pd.read_csv(TEST_TEXT_LOC)\n",
+    "\n",
+    "# Read train and test LACE data.\n",
+    "trainLACE = pd.read_csv(TRAIN_AUX_LOC)\n",
+    "testLACE  = pd.read_csv(TEST_AUX_LOC)\n",
+    "\n",
+    "# Combine data\n",
+    "train = pd.merge(trainTXT, trainLACE, on = MERGE_ON)\n",
+    "test  = pd.merge(testTXT,  testLACE,  on = MERGE_ON)\n",
+    "\n",
+    "# Split the train data into a train and validation set.\n",
+    "train, valid = train_test_split(train, \n",
+    "                                stratify     = train[OUTCOME], \n",
+    "                                train_size   = .9, \n",
+    "                                random_state = 1234)\n",
+    "\n",
+    "# Prepare the sections.\n",
+    "# If `sectiontext` is present, then include \"SECTIONNAME sectiontext\".\n",
+    "# If not present, include only \"SECTIONNAME\".\n",
+    "SECTIONNAMES = [x for x in trainTXT.columns if x not in (VISITID, OUTCOME)]\n",
+    "for x in SECTIONNAMES:\n",
+    "    rep      = x.replace(\" \", \"_\").upper()\n",
+    "    train[x] = [\" \".join([rep, t]) if not pd.isnull(t) else rep for t in train[x]]\n",
+    "    valid[x] = [\" \".join([rep, t]) if not pd.isnull(t) else rep for t in valid[x]]\n",
+    "    test[x]  = [\" \".join([rep, t]) if not pd.isnull(t) else rep for t in test[x]]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Compute LACE features"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This code assumes that, for each hospital visit, you have computed:\n",
+    " * the Charlson index\n",
+    " * the number of ER visits in the last 6 months\n",
+    " * whether the patient was admitted through the ER\n",
+    " * the length of stay, in days\n",
+    "\n",
+    "We then use these data to compute the LACE score."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def LOS(los):\n",
+    "    if los <= 3:\n",
+    "        return(los)\n",
+    "    elif los <= 6:\n",
+    "        return(4)\n",
+    "    elif los <= 13:\n",
+    "        return(5)\n",
+    "    else:\n",
+    "        return(7)\n",
+    "    \n",
+    "def ACUITY(erboolean):\n",
+    "    if erboolean:\n",
+    "        return(3)\n",
+    "    else:\n",
+    "        return(0)\n",
+    "    \n",
+    "def LACE(data):\n",
+    "    # Note: the standard LACE definition caps prior ER visits (E) at 4 and maps\n",
+    "    # Charlson >= 4 to 5 points; we assume those caps were applied upstream.\n",
+    "    return(LOS(data.LengthOfStay) + ACUITY(data.AdmittedViaER) + data.Charlson + data.PrevERVisits)\n",
+    "\n",
+    "train[\"LACE\"] = train.apply(LACE, axis=1)\n",
+    "valid[\"LACE\"] = valid.apply(LACE, axis=1)\n",
+    "test[\"LACE\"]  = test.apply(LACE,  axis=1)"
+   ]
+  },
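+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For example, under this scoring a 5-day stay (L = 4) admitted through the ER (A = 3), with a Charlson index of 2 (C = 2) and one prior ER visit (E = 1), scores LACE = 4 + 3 + 2 + 1 = 10."
+   ]
+  },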
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "For use in modeling, we also center each LACE component, subtracting means computed on the training data (the same training means are applied to the validation and test sets):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Quantize \"length of stay\" using the LACE scoring above, then center it like the other components.\n",
+    "train[\"LOS_Quantized\"]            = train.LengthOfStay.apply(LOS)\n",
+    "test[\"LOS_Quantized\"]             = test.LengthOfStay.apply(LOS)\n",
+    "valid[\"LOS_Quantized\"]            = valid.LengthOfStay.apply(LOS)\n",
+    "\n",
+    "train[\"Charlson_Transformed\"]     = train.Charlson - train.Charlson.mean()\n",
+    "train[\"LOS_Transformed\"]          = train.LOS_Quantized - train.LOS_Quantized.mean()\n",
+    "train[\"PrevERVisits_Transformed\"] = train.PrevERVisits - train.PrevERVisits.mean()\n",
+    "\n",
+    "test[\"Charlson_Transformed\"]      = test.Charlson - train.Charlson.mean()\n",
+    "test[\"LOS_Transformed\"]           = test.LOS_Quantized - train.LOS_Quantized.mean()\n",
+    "test[\"PrevERVisits_Transformed\"]  = test.PrevERVisits - train.PrevERVisits.mean()\n",
+    "\n",
+    "valid[\"Charlson_Transformed\"]     = valid.Charlson - train.Charlson.mean()\n",
+    "valid[\"LOS_Transformed\"]          = valid.LOS_Quantized - train.LOS_Quantized.mean()\n",
+    "valid[\"PrevERVisits_Transformed\"] = valid.PrevERVisits - train.PrevERVisits.mean()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Train or load Word2Vec"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Word2Vec hyperparameters\n",
+    "window    = 2\n",
+    "dimension = 1000\n",
+    "min_count = 5\n",
+    "sg        = 1  # skip-gram (rather than CBOW)\n",
+    "hs        = 0  # negative sampling (rather than hierarchical softmax)\n",
+    "\n",
+    "# Where to save the model (create the directory if needed):\n",
+    "os.makedirs('./word2vec', exist_ok=True)\n",
+    "modelFile = './word2vec/w2v_dims_' + str(dimension) + \"_window_\" + str(window) + '.bin'\n",
+    "\n",
+    "# Replace digits and punctuation (except underscore, which marks section names) with spaces:\n",
+    "remove_digits_punc = {ord(c): \" \" for c in string.digits + string.punctuation.replace(\"_\", \"\")}\n",
+    "\n",
+    "# (If the model already exists, don't recompute.)\n",
+    "if not os.path.isfile(modelFile):\n",
+    "    # Use only training data to train word2vec:\n",
+    "    notes = train[SECTIONNAMES].apply(lambda x: \" \".join(x), axis=1).values\n",
+    "    stop  = set(string.ascii_lowercase)  # drop stray single letters\n",
+    "    for i in range(len(notes)):\n",
+    "        notes[i] = [w for w in notes[i].translate(remove_digits_punc).split() if w not in stop]\n",
+    "    \n",
+    "    w2v = word2vec.Word2Vec(notes, \n",
+    "                            size=dimension, \n",
+    "                            window=window, \n",
+    "                            sg=sg, \n",
+    "                            hs=hs, \n",
+    "                            min_count=min_count, \n",
+    "                            workers=50)\n",
+    "    w2v.wv.save_word2vec_format(modelFile, binary=True)\n",
+    "    w2v = w2v.wv  # keep only the KeyedVectors, matching the loading branch below\n",
+    "else:\n",
+    "    w2v = gensim.models.KeyedVectors.load_word2vec_format(modelFile, binary=True)"
+   ]
+  },
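+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "As a quick sanity check on the embeddings, we can inspect the nearest neighbors of a token we expect to see in clinical notes (`pain` below is only an example; substitute any word in your vocabulary):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Nearest neighbors by cosine similarity; the query token is an example and\n",
+    "# will raise a KeyError if it is not in your vocabulary.\n",
+    "w2v.most_similar(\"pain\", topn=10)"
+   ]
+  },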
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Make the embedding matrix.\n",
+    "# We include one extra word, `PADDING`. This is the word that will right-pad short notes.\n",
+    "# For `PADDING`'s vector representation, we choose the zero vector.\n",
+    "vocab = [\"PADDING\"] + sorted(w2v.vocab.keys())\n",
+    "vset  = set(vocab)\n",
+    "\n",
+    "embeddings_index = {}\n",
+    "for i in range(len(vocab)):\n",
+    "    embeddings_index[vocab[i]] = i\n",
+    "\n",
+    "reverse_embeddings_index = {b:a for a,b in embeddings_index.items()}\n",
+    "embeddings_matrix        = np.concatenate(([[0.] * dimension], [w2v[x] for x in vocab[1:]]))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Model"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Neural network with LACE features"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Convert each note into a sequence of indices into our embedding matrix, then right-pad (or truncate) each sequence to a fixed length:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "train_x = train[SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values\n",
+    "test_x  = test[ SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values\n",
+    "valid_x = valid[SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values\n",
+    "\n",
+    "train_x = [[embeddings_index[x] for x in note.split() if x in vset] for note in train_x]\n",
+    "valid_x = [[embeddings_index[x] for x in note.split() if x in vset] for note in valid_x]\n",
+    "test_x  = [[embeddings_index[x] for x in note.split() if x in vset] for note in test_x]\n",
+    "\n",
+    "# Right-pad (or truncate) every note to a fixed length with the PADDING index (0).\n",
+    "# maxlen must equal the UpSampling1D factor in the model below, so the text and\n",
+    "# LACE branches can be concatenated along the time axis.\n",
+    "maxlen  = 700\n",
+    "train_x = pad_sequences(train_x, maxlen=maxlen, padding=\"post\", truncating=\"post\", value=0)\n",
+    "valid_x = pad_sequences(valid_x, maxlen=maxlen, padding=\"post\", truncating=\"post\", value=0)\n",
+    "test_x  = pad_sequences(test_x,  maxlen=maxlen, padding=\"post\", truncating=\"post\", value=0)\n",
+    "\n",
+    "train_y = train[OUTCOME]\n",
+    "valid_y = valid[OUTCOME]\n",
+    "test_y  = test[OUTCOME]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And the model itself, which takes two inputs: the padded text sequences and the four LACE component features:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "UNITS      = 500\n",
+    "FILTERSIZE = 3\n",
+    "embedding_layer = Embedding(embeddings_matrix.shape[0],\n",
+    "                            embeddings_matrix.shape[1],\n",
+    "                            weights=[embeddings_matrix],\n",
+    "                            input_length=maxlen,\n",
+    "                            trainable=True)\n",
+    "\n",
+    "sequence_input     = Input(shape=(maxlen,), dtype='int32')\n",
+    "embedded_sequences = embedding_layer(sequence_input)\n",
+    "\n",
+    "lace_in            = Input(shape=(4,))\n",
+    "lace               = keras.layers.Reshape((1,4,))(lace_in)\n",
+    "lace               = keras.layers.UpSampling1D(700)(lace)\n",
+    "\n",
+    "combined           = keras.layers.concatenate([embedded_sequences, lace])\n",
+    "\n",
+    "conv               = Conv1D(UNITS, FILTERSIZE, activation=\"tanh\", use_bias=True)(combined)\n",
+    "pool               = GlobalMaxPooling1D()(conv)\n",
+    "\n",
+    "\n",
+    "out                = Dense(1, \n",
+    "                           activation='sigmoid', \n",
+    "                           activity_regularizer=keras.regularizers.l1(l=.05)\n",
+    "                        )(pool)\n",
+    "\n",
+    "optimizer = keras.optimizers.RMSprop(lr = .0001)\n",
+    "model     = Model(inputs=[sequence_input, lace_in], outputs=out)\n",
+    "model.compile(loss='binary_crossentropy', optimizer=optimizer)\n",
+    "\n",
+    "# The four LACE component columns fed to `lace_in` (assumed order; adjust to your data):\n",
+    "LACE_COLS = [\"LOS_Transformed\", \"AdmittedViaER\", \"Charlson_Transformed\", \"PrevERVisits_Transformed\"]\n",
+    "\n",
+    "model.fit([train_x, train[LACE_COLS].values], train_y, batch_size=100, epochs=4,\n",
+    "          validation_data=([valid_x, valid[LACE_COLS].values], valid_y), verbose=1)"
+   ]
+  },
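+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "A minimal evaluation sketch: score the validation set (both inputs) and compute the AUC with scikit-learn:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.metrics import roc_auc_score\n",
+    "\n",
+    "valid_pred = model.predict([valid_x, valid[LACE_COLS].values]).ravel()\n",
+    "print(\"Validation AUC:\", roc_auc_score(valid_y, valid_pred))"
+   ]
+  },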
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Random forest with TF-IDF matrix"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Prepare the text for sklearn's tfidf vectorizer:\n",
+    "train_x = train[SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values  \n",
+    "test_x  = test[ SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values  \n",
+    "valid_x = valid[SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values  \n",
+    "\n",
+    "train_y = train[OUTCOME]\n",
+    "valid_y = valid[OUTCOME]\n",
+    "test_y  = test[OUTCOME]\n",
+    "\n",
+    "tfidf = TfidfVectorizer()\n",
+    "tr_x  = tfidf.fit_transform(train_x)\n",
+    "te_x  = tfidf.transform(test_x)\n",
+    "va_x  = tfidf.transform(valid_x)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Model:\n",
+    "rfc = RandomForestClassifier(n_estimators=1000, max_depth=100, n_jobs=-1)\n",
+    "rfc.fit(tr_x, train_y)"
+   ]
+  },
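+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The same validation-AUC sketch for the random forest:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.metrics import roc_auc_score\n",
+    "\n",
+    "print(\"Validation AUC:\", roc_auc_score(valid_y, rfc.predict_proba(va_x)[:, 1]))"
+   ]
+  },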
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2-layer feed-forward neural network"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This model uses only the four LACE components together with the LACE score itself (five inputs in all):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "lace  = Input(shape=(5,))\n",
+    "dense = Dense(50, activation='tanh')(lace)\n",
+    "out   = Dense(1, activation='sigmoid')(dense)\n",
+    "\n",
+    "model = Model(inputs=lace, outputs=out)\n",
+    "model.compile(loss='binary_crossentropy', optimizer=\"nadam\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "model.fit(train[[\"LOS_Transformed\", \"AdmittedViaER\", \"Charlson_Transformed\", \"PrevERVisits_Transformed\", \"LACE\"]].values,\n",
+    "          train_y,\n",
+    "          class_weight={0: 1, 1: 10},\n",
+    "          epochs=1)"
+   ]
+  },
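+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "And the validation-AUC sketch for the feed-forward network:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.metrics import roc_auc_score\n",
+    "\n",
+    "valid_pred = model.predict(valid[[\"LOS_Transformed\", \"AdmittedViaER\", \"Charlson_Transformed\", \"PrevERVisits_Transformed\", \"LACE\"]].values).ravel()\n",
+    "print(\"Validation AUC:\", roc_auc_score(valid_y, valid_pred))"
+   ]
+  },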
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Logistic regression"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "model = smf.logit(formula = OUTCOME + \" ~ (LOS_Transformed + AdmittedViaER + Charlson_Transformed + PrevERVisits_Transformed + LACE)\",\n",
+    "                  data = train\n",
+    "        ).fit(maxiter = 1000, method = 'lbfgs')"
+   ]
+  },
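+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Inspect the fitted coefficients and their standard errors:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "print(model.summary())"
+   ]
+  }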
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.5.2"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}