
{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "* [Baseline models](#Baseline-models)\n",
    "* [Load and prepare data](#Load-and-prepare-data)\n",
    "    * [Load and prepare the text](#Load-and-prepare-the-text)\n",
    "    * [Compute LACE features](#Compute-LACE-features)\n",
    "* [Train or load Word2Vec](#Train-or-load-Word2Vec)\n",
    "* [Model](#Model)\n",
    "    * [Neural network with LACE features](#Neural-network-with-LACE-features)\n",
    "    * [Random forest with TF-IDF matrix](#Random-forest-with-TF-IDF-matrix)\n",
    "    * [2-layer feed forward neural network](#2-layer-feed-forward-neural-network)\n",
    "    * [Logistic regression](#Logistic-regression)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Baseline models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Data prep\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from   sklearn.model_selection import train_test_split\n",
    "\n",
    "# Word2Vec\n",
    "import os\n",
    "import logging\n",
    "import string\n",
    "from   gensim.models import word2vec\n",
    "import gensim\n",
    "logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\n",
    "\n",
    "# Neural networks \n",
    "import keras\n",
    "from   keras.models import Model\n",
    "from   keras.preprocessing.text import Tokenizer\n",
    "from   keras.preprocessing.sequence import pad_sequences\n",
    "from   keras.layers import Embedding, Input, Conv1D, Dense, GlobalMaxPooling1D\n",
    "from   keras.optimizers import RMSprop\n",
    "import keras.backend as K\n",
    "\n",
    "# Random forest\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "# Logistic regression\n",
    "import statsmodels.api as sm"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Data frame created by TextSections/TextPrep\n",
    "TRAIN_TEXT_LOC = \"\"\n",
    "TEST_TEXT_LOC  = \"\"\n",
    "\n",
    "# Data frame containing LACE features.\n",
    "# Assumes presence of:\n",
    "# - LengthOfStay\n",
    "# - Charlson\n",
    "# - PrevERVisits\n",
    "# - AdmittedViaER\n",
    "TRAIN_AUX_LOC  = \"\"\n",
    "TEST_AUX_LOC   = \"\"\n",
    "\n",
    "# Unique visit identifier to merge the train/test text with LACE data\n",
    "MERGE_ON       = \"\"\n",
    "\n",
    "# Other column names\n",
    "VISITID        = \"\"\n",
    "OUTCOME        = \"\" # e.g. ReadmissionInLessThan30Days"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Load and prepare data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Load and prepare the text"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Read train and test text data.\n",
    "trainTXT = pd.read_csv(TRAIN_TEXT_LOC)\n",
    "testTXT  = pd.read_csv(TEST_TEXT_LOC)\n",
    "\n",
    "# Read train and test LACE data.\n",
    "trainLACE = pd.read_csv(TRAIN_AUX_LOC)\n",
    "testLACE  = pd.read_csv(TEST_AUX_LOC)\n",
    "\n",
    "# Combine data\n",
    "train = pd.merge(trainTXT, trainLACE, on = MERGE_ON)\n",
    "test  = pd.merge(testTXT,  testLACE,  on = MERGE_ON)\n",
    "\n",
    "# Split the train data into a train and validation set.\n",
    "train, valid = train_test_split(train, \n",
    "                                stratify     = train[OUTCOME], \n",
    "                                train_size   = .9, \n",
    "                                random_state = 1234)\n",
    "\n",
    "# Prepare the sections.\n",
    "# If `sectiontext` is present, then include \"SECTIONNAME sectiontext\".\n",
    "# If not present, include only \"SECTIONNAME\".\n",
    "SECTIONNAMES = [x for x in trainTXT.columns if VISITID not in x and OUTCOME not in x]\n",
    "for x in SECTIONNAMES:\n",
    "    rep      = x.replace(\" \", \"_\").upper()\n",
    "    train[x] = [\" \".join([rep, t]) if not pd.isnull(t) else rep for t in train[x]]\n",
    "    valid[x] = [\" \".join([rep, t]) if not pd.isnull(t) else rep for t in valid[x]]\n",
    "    test[x]  = [\" \".join([rep, t]) if not pd.isnull(t) else rep for t in test[x]]"
   ]
  },
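  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "To illustrate the prefixing above on a hypothetical section named \"Chief Complaint\": a populated entry becomes `CHIEF_COMPLAINT chest pain`, while an empty one becomes just the tag `CHIEF_COMPLAINT`:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Hypothetical example of the section-prefixing transformation:\n",
    "rep = \"Chief Complaint\".replace(\" \", \"_\").upper()\n",
    "print(\" \".join([rep, \"chest pain\"]))  # CHIEF_COMPLAINT chest pain\n",
    "print(rep)                            # CHIEF_COMPLAINT (empty section)"
   ]
  },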
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Compute LACE features"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This code assumes that, for each hospital visit, you have computed:\n",
    " * the Charlson index\n",
    " * the number of ER visits in the last 6 months\n",
    " * whether the patient was admitted through the ER\n",
    " * the length of stay, in days\n",
    "\n",
    "We then using these data to compute LACE."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "def LOS(los):\n",
    "    if los <= 3:\n",
    "        return(los)\n",
    "    elif los <= 6:\n",
    "        return(4)\n",
    "    elif los <= 13:\n",
    "        return(5)\n",
    "    else:\n",
    "        return(7)\n",
    "    \n",
    "def ACUITY(erboolean):\n",
    "    if erboolean:\n",
    "        return(3)\n",
    "    else:\n",
    "        return(0)\n",
    "    \n",
    "def LACE(data):\n",
    "    return(LOS(data.LengthOfStay) + ACUITY(data.AdmittedViaER) + data.Charlson + data.PrevERVisits)\n",
    "\n",
    "train[\"LACE\"] = train.apply(LACE, axis=1)\n",
    "valid[\"LACE\"] = valid.apply(LACE, axis=1)\n",
    "test[\"LACE\"]  = test.apply(LACE,  axis=1)"
   ]
  },
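  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick sanity check, here is the score for a hypothetical visit (a 5-day stay admitted through the ER, Charlson index 2, one prior ER visit):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# L = 4 (5-day stay), A = 3 (via ER), C = 2, E = 1  ->  LACE = 10\n",
    "example = pd.Series({\"LengthOfStay\": 5, \"AdmittedViaER\": True, \"Charlson\": 2, \"PrevERVisits\": 1})\n",
    "print(LACE(example))  # 10"
   ]
  },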
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For their use in modeling, we also transform the LACE variables by subtracting the mean of the train data:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# We transform \"length of stay\" following the precedent set by LACE.\n",
    "train[\"LOS_Quantized\"]            = train.LengthOfStay.apply(LOS)\n",
    "test[\"LOS_Quantized\"]             = test.LengthOfStay.apply(LOS)\n",
    "valid[\"LOS_Quantized\"]            = valid.LengthOfStay.apply(LOS)\n",
    "\n",
    "train[\"Charlson_Transformed\"]     = train.Charlson - train.Charlson.mean()\n",
    "train[\"LOS_Transformed\"]          = train.LOS_Quantized - train.LOS_Quantized.mean()\n",
    "train[\"PrevERVisits_Transformed\"] = train.PrevERVisits - train.PrevERVisits.mean()\n",
    "\n",
    "test[\"Charlson_Transformed\"]      = test.Charlson - train.Charlson.mean()\n",
    "test[\"LOS_Transformed\"]           = test.LOS_Quantized - train.LOS_Quantized.mean()\n",
    "test[\"PrevERVisits_Transformed\"]  = test.PrevERVisits - train.PrevERVisits.mean()\n",
    "\n",
    "valid[\"Charlson_Transformed\"]     = valid.Charlson - train.Charlson.mean()\n",
    "valid[\"LOS_Transformed\"]          = valid.LOS_Quantized - train.LOS_Quantized.mean()\n",
    "valid[\"PrevERVisits_Transformed\"] = valid.PrevERVisits - train.PrevERVisits.mean()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Train or load Word2Vec"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Word2Vec hyperparameters\n",
    "window    = 2\n",
    "dimension = 1000\n",
    "min_count = 5\n",
    "sg        = 1  \n",
    "hs        = 0  \n",
    "\n",
    "# Where to save the model:\n",
    "modelFile = './word2vec/w2v_dims_' + str(dimension) + \"_window_\" + str(window) + '.bin'\n",
    "\n",
    "# We will remove digits and punctuation:\n",
    "remove_digits_punc = str.maketrans('', '', string.digits + ''.join([x for x in string.punctuation if '_' not in x]))\n",
    "remove_digits_punc = {a:\" \" for a in remove_digits_punc.keys()}\n",
    "\n",
    "# (If the model already exists, don't recompute.)\n",
    "if not os.path.isfile(modelFile):\n",
    "    # Use only training data to train word2vec:\n",
    "    notes = train[SECTIONNAMES].apply(lambda x: \" \".join(x), axis=1).values  \n",
    "    stop  = set([x for x in string.ascii_lowercase]) \n",
    "    for i in range(len(notes)):\n",
    "        notes[i] = [w for w in notes[i].translate(remove_digits_punc).split() if (w not in stop)]\n",
    "    \n",
    "    w2v = word2vec.Word2Vec(notes, \n",
    "                            size=dimension, \n",
    "                            window=window, \n",
    "                            sg=sg, \n",
    "                            hs=hs, \n",
    "                            min_count=min_count, \n",
    "                            workers=50)\n",
    "    w2v.wv.save_word2vec_format(modelFile, binary=True)\n",
    "else:\n",
    "    w2v = gensim.models.KeyedVectors.load_word2vec_format(modelFile, binary=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Make the embedding matrix.\n",
    "# We include one extra word, `PADDING`. This is the word that will right-pad short notes.\n",
    "# For `PADDING`'s vector representation, we choose the zero vector.\n",
    "vocab = [\"PADDING\"] + sorted(list(w2v.wv.vocab.keys()))\n",
    "vset  = set(vocab)\n",
    "\n",
    "embeddings_index = {}\n",
    "for i in range(len(vocab)):\n",
    "    embeddings_index[vocab[i]] = i\n",
    "\n",
    "reverse_embeddings_index = {b:a for a,b in embeddings_index.items()}\n",
    "embeddings_matrix        = np.matrix(np.concatenate(([[0.]*1000], [w2v[x] for x in vocab[1:]])))"
   ]
  },
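  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "An optional sanity check on the embeddings: inspect a token's nearest neighbors (the word \"pain\" is only a hypothetical example; substitute any word in the vocabulary):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Nearest neighbors by cosine similarity in the learned embedding space.\n",
    "if \"pain\" in vset:\n",
    "    print(w2v.most_similar(\"pain\", topn=5))"
   ]
  },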
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Model"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Neural network with LACE features"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Prepare text using our embeddings index:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "train_x = train[SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values  \n",
    "test_x  = test[ SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values  \n",
    "valid_x = valid[SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values  \n",
    "\n",
    "train_x = [[embeddings_index[x] for x in note.split() if x in vset] for note in train_x]\n",
    "valid_x = [[embeddings_index[x] for x in note.split() if x in vset] for note in valid_x]\n",
    "test_x  = [[embeddings_index[x] for x in note.split() if x in vset] for note in test_x]\n",
    "\n",
    "train_y = train[OUTCOME]\n",
    "valid_y = valid[OUTCOME]\n",
    "test_y  = test[OUTCOME]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And model:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "UNITS      = 500\n",
    "FILTERSIZE = 3\n",
    "embedding_layer = Embedding(embeddings_matrix.shape[0],\n",
    "                            embeddings_matrix.shape[1],\n",
    "                            weights=[embeddings_matrix],\n",
    "                            input_length=maxlen,\n",
    "                            trainable=True)\n",
    "\n",
    "sequence_input     = Input(shape=(maxlen,), dtype='int32')\n",
    "embedded_sequences = embedding_layer(sequence_input)\n",
    "\n",
    "lace_in            = Input(shape=(4,))\n",
    "lace               = keras.layers.Reshape((1,4,))(lace_in)\n",
    "lace               = keras.layers.UpSampling1D(700)(lace)\n",
    "\n",
    "combined           = keras.layers.concatenate([embedded_sequences, lace])\n",
    "\n",
    "conv               = Conv1D(UNITS, FILTERSIZE, activation=\"tanh\", use_bias=True)(combined)\n",
    "pool               = GlobalMaxPooling1D()(conv)\n",
    "\n",
    "\n",
    "out                = Dense(1, \n",
    "                           activation='sigmoid', \n",
    "                           activity_regularizer=keras.regularizers.l1(l=.05)\n",
    "                        )(pool)\n",
    "\n",
    "optimizer = keras.optimizers.RMSprop(lr = .0001)\n",
    "model=Model(inputs=[sequence_input, lace_in], outputs=out)\n",
    "model.compile(loss='binary_crossentropy', optimizer=optimizer)\n",
    "\n",
    "model.fit(train_x, train_y, batch_size=100, epochs=4, validation_data=(valid_x, valid_y), verbose=1)"
   ]
  },
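  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A minimal evaluation sketch: score the held-out test set with ROC AUC (`roc_auc_score` is imported here since it is not loaded above):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "# Predicted readmission probabilities on the test set -> ROC AUC.\n",
    "pred = model.predict([test_x, test_lace], batch_size=100)\n",
    "print(\"Test AUC:\", roc_auc_score(test_y, pred.ravel()))"
   ]
  },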
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Random forest with TF-IDF matrix"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Prepare the text for sklearn's tfidf vectorizer:\n",
    "train_x = train[SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values  \n",
    "test_x  = test[ SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values  \n",
    "valid_x = valid[SECTIONNAMES].apply(lambda x: (\" \".join(x)).translate(remove_digits_punc), axis=1).values  \n",
    "\n",
    "train_y = train[OUTCOME]\n",
    "valid_y = valid[OUTCOME]\n",
    "test_y  = test[OUTCOME]\n",
    "\n",
    "tfidf = TfidfVectorizer()\n",
    "tr_x  = tfidf.fit_transform(train_x)\n",
    "te_x  = tfidf.transform(test_x)\n",
    "va_x  = tfidf.transform(valid_x)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Model:\n",
    "rfc = RandomForestClassifier(n_estimators=1000, max_depth=100, n_jobs=-1)\n",
    "rfc.fit(tr_x, train_y)"
   ]
  },
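  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A quick check of the forest on the validation set, again via ROC AUC:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "# Class-1 probabilities from the forest -> validation ROC AUC.\n",
    "print(\"Validation AUC:\", roc_auc_score(valid_y, rfc.predict_proba(va_x)[:, 1]))"
   ]
  },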
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 2-layer feed forward neural network "
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "This model uses only the components of LACE together with the LACE score:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "lace  = Input(shape=(5,))\n",
    "dense = Dense(50, activation='tanh')(lace)\n",
    "out   = Dense(1, activation='sigmoid')(dense)\n",
    "\n",
    "model = Model(inputs=lace, outputs=out)\n",
    "model.compile(loss='binary_crossentropy', optimizer=\"nadam\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model.fit(train[[\"LOS_Transformed\", \"AdmittedViaER\", \"Charlson_Transformed\", \"PrevERVisits_Transformed\", \"LACE\"]].values, \n",
    "           train_y,\n",
    "           class_weight={0:1, 1:10}, \n",
    "           epochs=1)"
   ]
  },
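  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And a validation-set check for this model (same sketch as above):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "cols = [\"LOS_Transformed\", \"AdmittedViaER\", \"Charlson_Transformed\", \"PrevERVisits_Transformed\", \"LACE\"]\n",
    "pred = model.predict(valid[cols].values)\n",
    "print(\"Validation AUC:\", roc_auc_score(valid_y, pred.ravel()))"
   ]
  },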
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Logistic regression"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "model = logit(formula = OUTCOME + \" ~ (LOS_Transformed + AdmittedViaER + Charlson_Transformed + PrevERVisits_Transformed + LACE)\", \n",
    "              data = train\n",
    "        ).fit(maxiter = 1000, method = 'lbfgs')"
   ]
  }
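  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Finally, inspect the fitted coefficients and score the test set (a sketch; for a formula-fitted logit, `model.predict(test)` returns predicted probabilities):"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "from sklearn.metrics import roc_auc_score\n",
    "\n",
    "print(model.summary())\n",
    "print(\"Test AUC:\", roc_auc_score(test_y, model.predict(test)))"
   ]
  }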
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}