* [Baseline models](#Baseline-models)
* [Load and prepare data](#Load-and-prepare-data)
    * [Load and prepare the text](#Load-and-prepare-the-text)
    * [Compute LACE features](#Compute-LACE-features)
* [Train or load Word2Vec](#Train-or-load-Word2Vec)
* [Model](#Model)
    * [Neural network with LACE features](#Neural-network-with-LACE-features)
    * [Random forest with TF-IDF matrix](#Random-forest-with-TF-IDF-matrix)
    * [2-layer feed forward neural network](#2-layer-feed-forward-neural-network)
    * [Logistic regression](#Logistic-regression)

# Baseline models

In [None]:
# Data prep
import numpy as np
import pandas as pd
from   sklearn.model_selection import train_test_split

# Word2Vec
import os
import logging
import string
from   gensim.models import word2vec
import gensim
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# Neural networks 
import keras
from   keras.models import Model
from   keras.preprocessing.text import Tokenizer
from   keras.preprocessing.sequence import pad_sequences
from   keras.layers import Embedding, Input, Conv1D, Dense, GlobalMaxPooling1D
from   keras.optimizers import RMSprop
import keras.backend as K

# Random forest
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier

# Logistic regression
import statsmodels.api as sm

In [None]:
# ---------------------------------------------------------------------------
# Notebook configuration. Every value below is an empty placeholder and must
# be filled in before the following cells are run.
# ---------------------------------------------------------------------------

# CSV files with the note text (the data frames written by
# TextSections/TextPrep).
TRAIN_TEXT_LOC = ""
TEST_TEXT_LOC = ""

# CSV files with the LACE inputs. Later cells read these columns from them:
#   LengthOfStay, Charlson, PrevERVisits, AdmittedViaER
TRAIN_AUX_LOC = ""
TEST_AUX_LOC = ""

# Unique visit identifier present in both the text and the LACE files; it is
# the key used to merge them.
MERGE_ON = ""

# Remaining column names.
# NOTE: the section-name filter in the loading cell tests `VISITID not in x`
# and `OUTCOME not in x` as substring checks, so leaving either one as ""
# excludes every column (the empty string is a substring of any name).
VISITID = ""
OUTCOME = ""  # e.g. ReadmissionInLessThan30Days

# Load and prepare data

## Load and prepare the text

In [None]:
# Read train and test text data.
trainTXT = pd.read_csv(TRAIN_TEXT_LOC)
testTXT  = pd.read_csv(TEST_TEXT_LOC)

# Read train and test LACE data.
trainLACE = pd.read_csv(TRAIN_AUX_LOC)
testLACE  = pd.read_csv(TEST_AUX_LOC)

# Combine text and LACE data on the shared visit key. `pd.merge` defaults to
# an inner join, so visits missing from either file are dropped.
train = pd.merge(trainTXT, trainLACE, on=MERGE_ON)
test  = pd.merge(testTXT,  testLACE,  on=MERGE_ON)

# Split the train data into a train (90%) and validation (10%) set,
# stratified on the outcome so both parts keep the same class balance.
train, valid = train_test_split(train,
                                stratify     = train[OUTCOME],
                                train_size   = .9,
                                random_state = 1234)

# Columns are assigned on both frames below and in later cells. Take explicit
# copies so those assignments act on independent frames rather than on
# objects pandas may flag as slices of the merged frame
# (SettingWithCopyWarning).
train = train.copy()
valid = valid.copy()

# Prepare the sections.
# If `sectiontext` is present, then include "SECTIONNAME sectiontext".
# If not present, include only "SECTIONNAME".
# Section columns are all text-file columns whose name contains neither
# VISITID nor OUTCOME (substring test).
SECTIONNAMES = [x for x in trainTXT.columns if VISITID not in x and OUTCOME not in x]
for x in SECTIONNAMES:
    # Section marker token: the column name, upper-cased, spaces -> "_".
    rep = x.replace(" ", "_").upper()
    for frame in (train, valid, test):
        frame[x] = [" ".join([rep, t]) if not pd.isnull(t) else rep for t in frame[x]]

## Compute LACE features

This code assumes that, for each hospital visit, you have computed:
 * the Charlson index
 * the number of ER visits in the last 6 months
 * whether the patient was admitted through the ER
 * the length of stay, in days

We then use these data to compute the LACE score.

In [None]:
def LOS(los):
    """Convert a length of stay (in days) into its LACE length-of-stay points.

    Values up to and including 3 are returned unchanged; longer stays are
    bucketed: <= 6 -> 4, <= 13 -> 5, anything else -> 7.
    """
    if los <= 3:
        return los
    # (inclusive upper bound, points) pairs, checked in ascending order.
    for upper_bound, points in ((6, 4), (13, 5)):
        if los <= upper_bound:
            return points
    # Reached when no bound matched (stays longer than 13 days; also any
    # value for which every comparison is False, such as NaN).
    return 7
    
def ACUITY(erboolean):
    """Return the LACE acuity points: 3 if `erboolean` is truthy, else 0."""
    return 3 if erboolean else 0
    
def LACE(data):
    """Compute the LACE score for one visit.

    `data` must expose `LengthOfStay`, `AdmittedViaER`, `Charlson` and
    `PrevERVisits` as attributes (e.g. a data-frame row). Length of stay and
    ER admission are converted to points with `LOS` and `ACUITY`; `Charlson`
    and `PrevERVisits` are added as-is.
    """
    length_points = LOS(data.LengthOfStay)
    acuity_points = ACUITY(data.AdmittedViaER)
    return length_points + acuity_points + data.Charlson + data.PrevERVisits

# Add the row-wise LACE score as a new "LACE" column on every split.
for frame in (train, valid, test):
    frame["LACE"] = frame.apply(LACE, axis=1)

For use in modeling, we also center the numeric LACE components (Charlson index, quantized length of stay, and previous ER visits) by subtracting their means in the train data:

In [None]:
# Quantize "length of stay" with the same buckets the LACE score uses.
for frame in (train, test, valid):
    frame["LOS_Quantized"] = frame.LengthOfStay.apply(LOS)

# Center each numeric component on its mean in the *train* split. The same
# train means are subtracted from the test and validation splits, so no
# statistic is estimated from held-out data.
charlson_mean = train.Charlson.mean()
los_mean      = train.LOS_Quantized.mean()
er_mean       = train.PrevERVisits.mean()

for frame in (train, test, valid):
    frame["Charlson_Transformed"]     = frame.Charlson - charlson_mean
    frame["LOS_Transformed"]          = frame.LOS_Quantized - los_mean
    frame["PrevERVisits_Transformed"] = frame.PrevERVisits - er_mean

# Train or load Word2Vec

In [None]:
# Word2Vec hyperparameters (passed straight through to gensim's Word2Vec).
window    = 2     # context window size
dimension = 1000  # embedding vector length
min_count = 5     # minimum word count for a word to be kept
sg        = 1     # gensim: 1 = skip-gram (0 would be CBOW)
hs        = 0     # gensim: 0 = no hierarchical softmax

# Where to save the model:
modelFile = './word2vec/w2v_dims_' + str(dimension) + "_window_" + str(window) + '.bin'

# We will remove digits and punctuation: a `str.translate` table that maps
# each digit and each punctuation character to a space. The underscore is
# kept because the section marker tokens contain it.
remove_digits_punc = {ord(ch): " " for ch in string.digits + string.punctuation if ch != "_"}

# (If the model already exists, don't recompute.)
if not os.path.isfile(modelFile):
    # Make sure the output directory exists *before* the expensive training
    # step, so the save at the end cannot fail on a missing folder.
    os.makedirs(os.path.dirname(modelFile), exist_ok=True)

    # Use only training data to train word2vec. Each note is the
    # space-joined section text, tokenized on whitespace after cleaning;
    # single lowercase letters are dropped.
    stop  = set(string.ascii_lowercase)
    notes = [
        [w for w in note.translate(remove_digits_punc).split() if w not in stop]
        for note in train[SECTIONNAMES].apply(lambda x: " ".join(x), axis=1).values
    ]

    w2v = word2vec.Word2Vec(notes,
                            size=dimension,
                            window=window,
                            sg=sg,
                            hs=hs,
                            min_count=min_count,
                            workers=50)
    w2v.wv.save_word2vec_format(modelFile, binary=True)
else:
    w2v = gensim.models.KeyedVectors.load_word2vec_format(modelFile, binary=True)

In [None]:
# Make the embedding matrix.
# We include one extra word, `PADDING`. This is the word that will right-pad short notes.
# For `PADDING`'s vector representation, we choose the zero vector.
# `PADDING` sits at index 0; the remaining words follow in sorted order.
vocab = ["PADDING"] + sorted(w2v.wv.vocab.keys())
vset  = set(vocab)

# word -> row index, and the inverse lookup.
embeddings_index         = {word: i for i, word in enumerate(vocab)}
reverse_embeddings_index = {i: word for word, i in embeddings_index.items()}

# One row per vocabulary entry. The width comes from `dimension` (the
# Word2Vec hyperparameter) rather than a hard-coded 1000, so changing the
# hyperparameter cannot silently produce a mis-shaped padding row. A plain
# ndarray is used instead of the discouraged `np.matrix`.
embeddings_matrix = np.zeros((len(vocab), dimension))
for i, word in enumerate(vocab[1:], start=1):
    embeddings_matrix[i] = w2v.wv[word]

# Model

## Neural network with LACE features

Prepare text using our embeddings index:

In [None]:
def _notes_as_indices(frame):
    """Turn each row of `frame` into a list of embedding-row indices.

    The section columns are joined with spaces, digits/punctuation are
    replaced by spaces, the text is split on whitespace, and tokens that are
    not in the embedding vocabulary are dropped.
    """
    cleaned = frame[SECTIONNAMES].apply(
        lambda row: (" ".join(row)).translate(remove_digits_punc), axis=1
    ).values
    return [[embeddings_index[tok] for tok in note.split() if tok in vset]
            for note in cleaned]


# Index sequences (variable length) for each split.
train_x = _notes_as_indices(train)
valid_x = _notes_as_indices(valid)
test_x  = _notes_as_indices(test)

# Outcome labels for each split.
train_y, valid_y, test_y = train[OUTCOME], valid[OUTCOME], test[OUTCOME]

And model:

In [None]:
UNITS      = 500   # number of convolution filters
FILTERSIZE = 3     # convolution window, in tokens
# Fixed note length fed to the network. The LACE branch is repeated once per
# token position and concatenated with the embeddings, so both must have
# exactly this length (the original hard-coded 700 in the upsampling layer).
maxlen     = 700

# Columns fed to the 4-wide LACE input.
# NOTE(review): assumed to be the four LACE components prepared earlier in
# this notebook -- confirm this is the intended feature set.
LACE_COLS = ["LOS_Transformed", "AdmittedViaER", "Charlson_Transformed", "PrevERVisits_Transformed"]

# Embedding layer initialised from the Word2Vec matrix and fine-tuned.
embedding_layer = Embedding(embeddings_matrix.shape[0],
                            embeddings_matrix.shape[1],
                            weights=[embeddings_matrix],
                            input_length=maxlen,
                            trainable=True)

sequence_input     = Input(shape=(maxlen,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)

# Repeat the LACE vector at every token position so it can be concatenated
# with the embeddings along the feature axis.
lace_in            = Input(shape=(len(LACE_COLS),))
lace               = keras.layers.Reshape((1, len(LACE_COLS),))(lace_in)
lace               = keras.layers.UpSampling1D(maxlen)(lace)

combined           = keras.layers.concatenate([embedded_sequences, lace])

conv               = Conv1D(UNITS, FILTERSIZE, activation="tanh", use_bias=True)(combined)
pool               = GlobalMaxPooling1D()(conv)


out                = Dense(1,
                           activation='sigmoid',
                           activity_regularizer=keras.regularizers.l1(l=.05)
                        )(pool)

optimizer = keras.optimizers.RMSprop(lr = .0001)
model=Model(inputs=[sequence_input, lace_in], outputs=out)
model.compile(loss='binary_crossentropy', optimizer=optimizer)

# The index sequences have variable length; right-pad short notes with index
# 0 (the `PADDING` row of the embedding matrix) up to `maxlen`. Notes longer
# than `maxlen` are truncated using Keras' default truncation side.
train_seq = pad_sequences(train_x, maxlen=maxlen, padding='post', value=0)
valid_seq = pad_sequences(valid_x, maxlen=maxlen, padding='post', value=0)

# LACE inputs as a float array (AdmittedViaER may be stored as a boolean).
train_lace = train[LACE_COLS].values.astype('float32')
valid_lace = valid[LACE_COLS].values.astype('float32')

# The model has two inputs, so both must be supplied, in the order given to
# `Model(inputs=...)`.
model.fit([train_seq, train_lace], train_y,
          batch_size=100,
          epochs=4,
          validation_data=([valid_seq, valid_lace], valid_y),
          verbose=1)

## Random forest with TF-IDF matrix

In [None]:
# Prepare the text for sklearn's tfidf vectorizer: one string per visit, made
# of the space-joined sections with digits/punctuation replaced by spaces.
def _joined_clean_text(frame):
    """Return an array with one cleaned, space-joined note per row of `frame`."""
    return frame[SECTIONNAMES].apply(
        lambda row: (" ".join(row)).translate(remove_digits_punc), axis=1
    ).values


train_x = _joined_clean_text(train)
test_x  = _joined_clean_text(test)
valid_x = _joined_clean_text(valid)

# Outcome labels for each split.
train_y, valid_y, test_y = train[OUTCOME], valid[OUTCOME], test[OUTCOME]

# Fit the vocabulary and IDF weights on the train notes only, then reuse them
# to transform the test and validation notes.
tfidf = TfidfVectorizer()
tr_x  = tfidf.fit_transform(train_x)
te_x  = tfidf.transform(test_x)
va_x  = tfidf.transform(valid_x)

In [None]:
# Model: a random forest on the train TF-IDF matrix.
# `random_state` is fixed (same seed as the train/validation split) so that
# this baseline gives the same forest on every run.
rfc = RandomForestClassifier(n_estimators=1000,
                             max_depth=100,
                             n_jobs=-1,          # use all available cores
                             random_state=1234)
rfc.fit(tr_x, train_y)

## 2-layer feed forward neural network 

This model uses only the components of LACE together with the LACE score:

In [None]:
# Feed-forward network: 5 LACE inputs -> 50 tanh units -> 1 sigmoid output.
lace   = Input(shape=(5,))
hidden = Dense(50, activation='tanh')(lace)
out    = Dense(1, activation='sigmoid')(hidden)

# Binary classifier trained with cross-entropy and the Nadam optimizer.
model = Model(inputs=lace, outputs=out)
model.compile(optimizer="nadam", loss='binary_crossentropy')

In [None]:
# The five model inputs: the four LACE components plus the LACE score itself.
lace_inputs = train[["LOS_Transformed",
                     "AdmittedViaER",
                     "Charlson_Transformed",
                     "PrevERVisits_Transformed",
                     "LACE"]].values

# Train for a single epoch; samples of class 1 are weighted 10x relative to
# class 0.
model.fit(lace_inputs,
          train_y,
          class_weight={0: 1, 1: 10},
          epochs=1)

## Logistic regression

In [None]:
# Logistic regression of the outcome on the LACE components and the LACE
# score. The bare name `logit` is never imported in this notebook, so the
# model is built through the already-imported `statsmodels.api` (`sm`).
# NOTE(review): LACE is computed as the sum of the quantized length of stay,
# 3 * (admitted via ER), Charlson and PrevERVisits, and the *_Transformed
# columns differ from those inputs only by constants. If AdmittedViaER is
# coded 0/1, LACE is therefore an exact linear combination of the other
# terms plus the intercept -- confirm that including all five is intended.
model = sm.Logit.from_formula(
    formula = OUTCOME + " ~ (LOS_Transformed + AdmittedViaER + Charlson_Transformed + PrevERVisits_Transformed + LACE)",
    data = train
).fit(maxiter = 1000, method = 'lbfgs')