# WESAD EMI-LSTM

Adapted from Microsoft's notebooks, available at https://github.com/microsoft/EdgeML authored by Dennis et al.

## Imports

In [1]:
import pandas as pd
import numpy as np
from tabulate import tabulate
import os
import datetime as datetime
import pickle as pkl

## DataFrames from CSVs

In [12]:
filtered_train = pd.read_csv('/home/sf/data/allchest.csv')

## Preprocessing 

In [13]:
filtered_train.columns

Index(['ID', 'chestACCx', 'chestACCy', 'chestACCz', 'chestECG', 'chestEMG',
       'chestEDA', 'chestTemp', 'chestResp', 'label'],
      dtype='object')

In [14]:
filtered_train = filtered_train[filtered_train['label']<4]
filtered_train = filtered_train[filtered_train['label']!=0]
print(filtered_train['ID'].unique())
print(filtered_train['label'].unique())

[10 11 13 14 15 16 17  2  3  4  5  6  7  8  9]
[1 3 2]


## Split Ground Truth 

In [15]:
filtered_target = filtered_train['label']
filtered_train = filtered_train.drop(['ID', 'label'], axis=1)

In [16]:
print(filtered_target.shape)
print(filtered_train.shape)
filtered_target=filtered_target.replace({1:0,2:1,3:2})
filtered_train=filtered_train[0:23206400]
filtered_target=filtered_target[0:23206400]

(23206404,)
(23206404, 8)


In [17]:
print(filtered_target.shape)
print(filtered_train.shape)

(23206400,)
(23206400, 8)


In [18]:
filtered_target = filtered_target.values.reshape(132608, 175)   # 128 is the size of 1 bag , 
                                                # 85744 = (size of the entire set) / 128

## Convert to 3D - (Bags, Timesteps, Features)

In [19]:
len(filtered_train.columns)

8

In [20]:
filtered_train = filtered_train.values
print(filtered_train.shape)
filtered_train = filtered_train.reshape(int(len(filtered_train) / 175), 175, 8)
print(filtered_train.shape)

(23206400, 8)
(132608, 175, 8)


## Filter Overlapping Bags

In [21]:
# filtering bags that overlap with another class
bags_to_remove = []
for i in range(len(filtered_target)):
    if len(set(filtered_target[i])) > 1:
        bags_to_remove.append(i)
print(bags_to_remove)

[22740, 24268, 26924, 31644, 34344, 35832, 40532, 42020, 44764, 49484, 52176, 53648, 58372, 59860, 62752, 67328, 69788, 71236, 75796, 78356, 79856, 84488, 85976, 88516, 93308, 94804, 97384, 102104, 104704, 106192, 110936, 112424, 114984, 119660, 121140, 123820, 128540, 131120]


In [22]:
filtered_train = np.delete(filtered_train, bags_to_remove, axis=0)
filtered_target = np.delete(filtered_target, bags_to_remove, axis=0)

In [23]:
filtered_train.shape 

(132570, 175, 8)

In [24]:
filtered_target.shape

(132570, 175)

## Categorical Representation 

In [25]:
one_hot_list = []
for i in range(len(filtered_target)):
    one_hot_list.append(set(filtered_target[i]).pop())

In [26]:
categorical_y_ver = one_hot_list
categorical_y_ver = np.array(categorical_y_ver)

In [27]:
categorical_y_ver.shape

(132570,)

In [28]:
filtered_train.shape[1]

175

In [29]:
def one_hot(y, numOutput):
    y = np.reshape(y, [-1])
    ret = np.zeros([y.shape[0], numOutput])
    for i, label in enumerate(y):
        ret[i, label] = 1
    return ret

## Extract 3D Normalized Data with Validation Set

In [2]:
from sklearn.model_selection import train_test_split
import pathlib

In [31]:
x_train_val_combined, x_test, y_train_val_combined, y_test = train_test_split(filtered_train, categorical_y_ver, test_size=0.20, random_state=42)

In [32]:
y_test

array([1, 0, 0, ..., 1, 0, 1])

In [33]:
extractedDir = '/home/sf/data/WESAD'
# def generateData(extractedDir):
# x_train_val_combined, x_test, y_train_val_combined, y_test = readData(extractedDir)
timesteps = x_train_val_combined.shape[-2] #128/256
feats = x_train_val_combined.shape[-1]  #16

trainSize = int(x_train_val_combined.shape[0]*0.9) #6566
x_train, x_val = x_train_val_combined[:trainSize], x_train_val_combined[trainSize:] 
y_train, y_val = y_train_val_combined[:trainSize], y_train_val_combined[trainSize:]

# normalization
x_train = np.reshape(x_train, [-1, feats])
mean = np.mean(x_train, axis=0)
std = np.std(x_train, axis=0)

# normalize train
x_train = x_train - mean
x_train = x_train / std
x_train = np.reshape(x_train, [-1, timesteps, feats])

# normalize val
x_val = np.reshape(x_val, [-1, feats])
x_val = x_val - mean
x_val = x_val / std
x_val = np.reshape(x_val, [-1, timesteps, feats])

# normalize test
x_test = np.reshape(x_test, [-1, feats])
x_test = x_test - mean
x_test = x_test / std
x_test = np.reshape(x_test, [-1, timesteps, feats])

# shuffle test, as this was remaining
idx = np.arange(len(x_test))
np.random.shuffle(idx)
x_test = x_test[idx]
y_test = y_test[idx]

In [34]:
# one-hot encoding of labels
numOutput = 3
y_train = one_hot(y_train, numOutput)
y_val = one_hot(y_val, numOutput)
y_test = one_hot(y_test, numOutput)
extractedDir += '/'

pathlib.Path(extractedDir + 'RAW').mkdir(parents=True, exist_ok = True)

np.save(extractedDir + "RAW/x_train", x_train)
np.save(extractedDir + "RAW/y_train", y_train)
np.save(extractedDir + "RAW/x_test", x_test)
np.save(extractedDir + "RAW/y_test", y_test)
np.save(extractedDir + "RAW/x_val", x_val)
np.save(extractedDir + "RAW/y_val", y_val)

print(extractedDir)

/home/sf/data/WESAD/


In [35]:
ls "/home/sf/data/WESAD/RAW"

x_test.npy  x_train.npy  x_val.npy  y_test.npy  y_train.npy  y_val.npy


In [36]:
np.load('/home/sf/data/WESAD/RAW/x_train.npy').shape

(95450, 175, 8)

## Make 4D EMI Data (Bags, Subinstances, Subinstance Length, Features)

In [2]:
def loadData(dirname):
    x_train = np.load(dirname + '/' + 'x_train.npy')
    y_train = np.load(dirname + '/' + 'y_train.npy')
    x_test = np.load(dirname + '/' + 'x_test.npy')
    y_test = np.load(dirname + '/' + 'y_test.npy')
    x_val = np.load(dirname + '/' + 'x_val.npy')
    y_val = np.load(dirname + '/' + 'y_val.npy')
    return x_train, y_train, x_test, y_test, x_val, y_val

In [3]:
def bagData(X, Y, subinstanceLen, subinstanceStride):
    numClass = 3
    numSteps = 175
    numFeats = 8
    assert X.ndim == 3
    assert X.shape[1] == numSteps
    assert X.shape[2] == numFeats
    assert subinstanceLen <= numSteps
    assert subinstanceLen > 0
    assert subinstanceStride <= numSteps
    assert subinstanceStride >= 0
    assert len(X) == len(Y)
    assert Y.ndim == 2
    assert Y.shape[1] == numClass
    x_bagged = []
    y_bagged = []
    for i, point in enumerate(X[:, :, :]):
        instanceList = []
        start = 0
        end = subinstanceLen
        while True:
            x = point[start:end, :]
            if len(x) < subinstanceLen:
                x_ = np.zeros([subinstanceLen, x.shape[1]])
                x_[:len(x), :] = x[:, :]
                x = x_
            instanceList.append(x)
            if end >= numSteps:
                break
            start += subinstanceStride
            end += subinstanceStride
        bag = np.array(instanceList)
        numSubinstance = bag.shape[0]
        label = Y[i]
        label = np.argmax(label)
        labelBag = np.zeros([numSubinstance, numClass])
        labelBag[:, label] = 1
        x_bagged.append(bag)
        label = np.array(labelBag)
        y_bagged.append(label)
    return np.array(x_bagged), np.array(y_bagged)

In [4]:
def makeEMIData(subinstanceLen, subinstanceStride, sourceDir, outDir):
    x_train, y_train, x_test, y_test, x_val, y_val = loadData(sourceDir)
    x, y = bagData(x_train, y_train, subinstanceLen, subinstanceStride)
    np.save(outDir + '/x_train.npy', x)
    np.save(outDir + '/y_train.npy', y)
    print('Num train %d' % len(x))
    x, y = bagData(x_test, y_test, subinstanceLen, subinstanceStride)
    np.save(outDir + '/x_test.npy', x)
    np.save(outDir + '/y_test.npy', y)
    print('Num test %d' % len(x))
    x, y = bagData(x_val, y_val, subinstanceLen, subinstanceStride)
    np.save(outDir + '/x_val.npy', x)
    np.save(outDir + '/y_val.npy', y)
    print('Num val %d' % len(x))

In [5]:
subinstanceLen = 88
subinstanceStride = 30
extractedDir = '/home/sf/data/WESAD/'
from os import mkdir
#mkdir('/home/sf/data/WESAD/88_30')
rawDir = extractedDir + '/RAW'
sourceDir = rawDir
outDir = extractedDir + '/%d_%d/' % (subinstanceLen, subinstanceStride)
makeEMIData(subinstanceLen, subinstanceStride, sourceDir, outDir)

Num train 95450
Num test 26514
Num val 10606


In [6]:
np.load('//home/sf/data/WESAD/88_30/y_train.npy').shape

(95450, 4, 3)

In [7]:
from edgeml.graph.rnn import EMI_DataPipeline
from edgeml.graph.rnn import EMI_BasicLSTM, EMI_FastGRNN, EMI_FastRNN, EMI_GRU
from edgeml.trainer.emirnnTrainer import EMI_Trainer, EMI_Driver
import edgeml.utils

In [8]:
def lstm_experiment_generator(params, path = './DSAAR/64_16/'):
    """
        Function that will generate the experiments to be run.
        Inputs : 
        (1) Dictionary params, to set the network parameters.
        (2) Name of the Model to be run from [EMI-LSTM, EMI-FastGRNN, EMI-GRU]
        (3) Path to the dataset, where the csv files are present.
    """
    
    #Copy the contents of the params dictionary.
    lstm_dict = {**params}
    
    #---------------------------PARAM SETTING----------------------#
    
    # Network parameters for our LSTM + FC Layer
    NUM_HIDDEN = params["NUM_HIDDEN"]
    NUM_TIMESTEPS = params["NUM_TIMESTEPS"]
    ORIGINAL_NUM_TIMESTEPS = params["ORIGINAL_NUM_TIMESTEPS"]
    NUM_FEATS = params["NUM_FEATS"]
    FORGET_BIAS = params["FORGET_BIAS"]
    NUM_OUTPUT = params["NUM_OUTPUT"]
    USE_DROPOUT = True if (params["USE_DROPOUT"] == 1) else False
    KEEP_PROB = params["KEEP_PROB"]

    # For dataset API
    PREFETCH_NUM = params["PREFETCH_NUM"]
    BATCH_SIZE = params["BATCH_SIZE"]

    # Number of epochs in *one iteration*
    NUM_EPOCHS = params["NUM_EPOCHS"]
    # Number of iterations in *one round*. After each iteration,
    # the model is dumped to disk. At the end of the current
    # round, the best model among all the dumped models in the
    # current round is picked up..
    NUM_ITER = params["NUM_ITER"]
    # A round consists of multiple training iterations and a belief
    # update step using the best model from all of these iterations
    NUM_ROUNDS = params["NUM_ROUNDS"]
    LEARNING_RATE = params["LEARNING_RATE"]

    # A staging direcory to store models
    MODEL_PREFIX = params["MODEL_PREFIX"]
    
    #----------------------END OF PARAM SETTING----------------------#
    
    #----------------------DATA LOADING------------------------------#
    
    x_train, y_train = np.load(path + 'x_train.npy'), np.load(path + 'y_train.npy')
    x_test, y_test = np.load(path + 'x_test.npy'), np.load(path + 'y_test.npy')
    x_val, y_val = np.load(path + 'x_val.npy'), np.load(path + 'y_val.npy')

    # BAG_TEST, BAG_TRAIN, BAG_VAL represent bag_level labels. These are used for the label update
    # step of EMI/MI RNN
    BAG_TEST = np.argmax(y_test[:, 0, :], axis=1)
    BAG_TRAIN = np.argmax(y_train[:, 0, :], axis=1)
    BAG_VAL = np.argmax(y_val[:, 0, :], axis=1)
    NUM_SUBINSTANCE = x_train.shape[1]
    print("x_train shape is:", x_train.shape)
    print("y_train shape is:", y_train.shape)
    print("x_test shape is:", x_val.shape)
    print("y_test shape is:", y_val.shape)
    
    #----------------------END OF DATA LOADING------------------------------#    
    
    #----------------------COMPUTATION GRAPH--------------------------------#
    
    # Define the linear secondary classifier
    def createExtendedGraph(self, baseOutput, *args, **kwargs):
        W1 = tf.Variable(np.random.normal(size=[NUM_HIDDEN, NUM_OUTPUT]).astype('float32'), name='W1')
        B1 = tf.Variable(np.random.normal(size=[NUM_OUTPUT]).astype('float32'), name='B1')
        y_cap = tf.add(tf.tensordot(baseOutput, W1, axes=1), B1, name='y_cap_tata')
        self.output = y_cap
        self.graphCreated = True

    def restoreExtendedGraph(self, graph, *args, **kwargs):
        y_cap = graph.get_tensor_by_name('y_cap_tata:0')
        self.output = y_cap
        self.graphCreated = True

    def feedDictFunc(self, keep_prob=None, inference=False, **kwargs):
        if inference is False:
            feedDict = {self._emiGraph.keep_prob: keep_prob}
        else:
            feedDict = {self._emiGraph.keep_prob: 1.0}
        return feedDict

    EMI_BasicLSTM._createExtendedGraph = createExtendedGraph
    EMI_BasicLSTM._restoreExtendedGraph = restoreExtendedGraph

    if USE_DROPOUT is True:
        EMI_Driver.feedDictFunc = feedDictFunc
    
    inputPipeline = EMI_DataPipeline(NUM_SUBINSTANCE, NUM_TIMESTEPS, NUM_FEATS, NUM_OUTPUT)
    emiLSTM = EMI_BasicLSTM(NUM_SUBINSTANCE, NUM_HIDDEN, NUM_TIMESTEPS, NUM_FEATS,
                            forgetBias=FORGET_BIAS, useDropout=USE_DROPOUT)
    emiTrainer = EMI_Trainer(NUM_TIMESTEPS, NUM_OUTPUT, lossType='xentropy',
                             stepSize=LEARNING_RATE)
    
    tf.reset_default_graph()
    g1 = tf.Graph()    
    with g1.as_default():
        # Obtain the iterators to each batch of the data
        x_batch, y_batch = inputPipeline()
        # Create the forward computation graph based on the iterators
        y_cap = emiLSTM(x_batch)
        # Create loss graphs and training routines
        emiTrainer(y_cap, y_batch)
        
    #------------------------------END OF COMPUTATION GRAPH------------------------------#
    
    #-------------------------------------EMI DRIVER-------------------------------------#
        
    with g1.as_default():
        emiDriver = EMI_Driver(inputPipeline, emiLSTM, emiTrainer)

    emiDriver.initializeSession(g1)
    y_updated, modelStats = emiDriver.run(numClasses=NUM_OUTPUT, x_train=x_train,
                                          y_train=y_train, bag_train=BAG_TRAIN,
                                          x_val=x_val, y_val=y_val, bag_val=BAG_VAL,
                                          numIter=NUM_ITER, keep_prob=KEEP_PROB,
                                          numRounds=NUM_ROUNDS, batchSize=BATCH_SIZE,
                                          numEpochs=NUM_EPOCHS, modelPrefix=MODEL_PREFIX,
                                          fracEMI=0.5, updatePolicy='top-k', k=1)
    
    #-------------------------------END OF EMI DRIVER-------------------------------------#
    
    #-----------------------------------EARLY SAVINGS-------------------------------------#
    
    """
        Early Prediction Policy: We make an early prediction based on the predicted classes
        probability. If the predicted class probability > minProb at some step, we make
        a prediction at that step.
    """
    def earlyPolicy_minProb(instanceOut, minProb, **kwargs):
        assert instanceOut.ndim == 2
        classes = np.argmax(instanceOut, axis=1)
        prob = np.max(instanceOut, axis=1)
        index = np.where(prob >= minProb)[0]
        if len(index) == 0:
            assert (len(instanceOut) - 1) == (len(classes) - 1)
            return classes[-1], len(instanceOut) - 1
        index = index[0]
        return classes[index], index

    def getEarlySaving(predictionStep, numTimeSteps, returnTotal=False):
        predictionStep = predictionStep + 1
        predictionStep = np.reshape(predictionStep, -1)
        totalSteps = np.sum(predictionStep)
        maxSteps = len(predictionStep) * numTimeSteps
        savings = 1.0 - (totalSteps / maxSteps)
        if returnTotal:
            return savings, totalSteps
        return savings
    
    #--------------------------------END OF EARLY SAVINGS---------------------------------#
    
    #----------------------------------------BEST MODEL-----------------------------------#
    
    k = 2
    predictions, predictionStep = emiDriver.getInstancePredictions(x_test, y_test, earlyPolicy_minProb,
                                                                   minProb=0.99, keep_prob=1.0)
    bagPredictions = emiDriver.getBagPredictions(predictions, minSubsequenceLen=k, numClass=NUM_OUTPUT)
    print('Accuracy at k = %d: %f' % (k,  np.mean((bagPredictions == BAG_TEST).astype(int))))
    mi_savings = (1 - NUM_TIMESTEPS / ORIGINAL_NUM_TIMESTEPS)
    emi_savings = getEarlySaving(predictionStep, NUM_TIMESTEPS)
    total_savings = mi_savings + (1 - mi_savings) * emi_savings
    print('Savings due to MI-RNN : %f' % mi_savings)
    print('Savings due to Early prediction: %f' % emi_savings)
    print('Total Savings: %f' % (total_savings))
    
    #Store in the dictionary.
    lstm_dict["k"] = k
    lstm_dict["accuracy"] = np.mean((bagPredictions == BAG_TEST).astype(int))
    lstm_dict["total_savings"] = total_savings
    lstm_dict["y_test"] = BAG_TEST
    lstm_dict["y_pred"] = bagPredictions
    
    # A slightly more detailed analysis method is provided. 
    df = emiDriver.analyseModel(predictions, BAG_TEST, NUM_SUBINSTANCE, NUM_OUTPUT)
    print (tabulate(df, headers=list(df.columns), tablefmt='grid'))
    
    lstm_dict["detailed analysis"] = df
    #----------------------------------END OF BEST MODEL-----------------------------------#
    
    #----------------------------------PICKING THE BEST MODEL------------------------------#
    
    devnull = open(os.devnull, 'r')
    for val in modelStats:
        round_, acc, modelPrefix, globalStep = val
        emiDriver.loadSavedGraphToNewSession(modelPrefix, globalStep, redirFile=devnull)
        predictions, predictionStep = emiDriver.getInstancePredictions(x_test, y_test, earlyPolicy_minProb,
                                                                   minProb=0.99, keep_prob=1.0)

        bagPredictions = emiDriver.getBagPredictions(predictions, minSubsequenceLen=k, numClass=NUM_OUTPUT)
        print("Round: %2d, Validation accuracy: %.4f" % (round_, acc), end='')
        print(', Test Accuracy (k = %d): %f, ' % (k,  np.mean((bagPredictions == BAG_TEST).astype(int))), end='')
        print('Additional savings: %f' % getEarlySaving(predictionStep, NUM_TIMESTEPS)) 
        
    
    #-------------------------------END OF PICKING THE BEST MODEL--------------------------#

    return lstm_dict

In [9]:
def experiment_generator(params, path, model = 'lstm'):
    
    
    if (model == 'lstm'): return lstm_experiment_generator(params, path)
    elif (model == 'fastgrnn'): return fastgrnn_experiment_generator(params, path)
    elif (model == 'gru'): return gru_experiment_generator(params, path)
    elif (model == 'baseline'): return baseline_experiment_generator(params, path)
    
    return 

In [10]:
pwd

'/home/sf/data/EdgeML/tf/examples/EMI-RNN'

In [11]:
cd '/home/sf/data/WESAD/88_30/'

/home/sf/data/WESAD/88_30


In [12]:
import tensorflow as tf

In [14]:
## Baseline EMI-LSTM

dataset = 'WESAD'
path = '/home/sf/data/WESAD/88_30/'

#Choose model from among [lstm, fastgrnn, gru]
model = 'lstm'

# Dictionary to set the parameters.
params = {
    "NUM_HIDDEN" : 128,
    "NUM_TIMESTEPS" : 88, #subinstance length.
    "ORIGINAL_NUM_TIMESTEPS" : 175,
    "NUM_FEATS" : 8,
    "FORGET_BIAS" : 1.0,
    "NUM_OUTPUT" : 3,
    "USE_DROPOUT" : 1, # '1' -> True. '0' -> False
    "KEEP_PROB" : 0.75,
    "PREFETCH_NUM" : 5,
    "BATCH_SIZE" : 175,
    "NUM_EPOCHS" : 2,
    "NUM_ITER" : 4,
    "NUM_ROUNDS" : 6,
    "LEARNING_RATE" : 0.001,
    "FRAC_EMI" : 0.5,
    "MODEL_PREFIX" : dataset + '/model-' + str(model)
}

#Preprocess data, and load the train,test and validation splits.
lstm_dict = lstm_experiment_generator(params, path)

#Create the directory to store the results of this run.
dirname = "/home/sf/data/WESAD/lstm/"
pathlib.Path(dirname).mkdir(parents=True, exist_ok=True)
print ("Results for this run have been saved at" , dirname, ".")

now = datetime.datetime.now()
filename = list((str(now.year),"-",str(now.month),"-",str(now.day),"|",str(now.hour),"-",str(now.minute)))
filename = ''.join(filename)

#Save the dictionary containing the params and the results.
pkl.dump(lstm_dict,open(dirname + "/lstm_dict_" + filename + ".pkl",mode='wb'))

x_train shape is: (95450, 4, 88, 8)
y_train shape is: (95450, 4, 3)
x_test shape is: (10606, 4, 88, 8)
y_test shape is: (10606, 4, 3)
Update policy: top-k
Training with MI-RNN loss for 3 rounds
Round: 0
Epoch   1 Batch   534 ( 1080) Loss 0.00117 Acc 0.97000 | Val acc 0.97426 | Model saved to WESAD/model-lstm, global_step 1000
Epoch   1 Batch   534 ( 1080) Loss 0.00043 Acc 0.98857 | Val acc 0.98718 | Model saved to WESAD/model-lstm, global_step 1001
Epoch   1 Batch   534 ( 1080) Loss 0.00084 Acc 0.98000 | Val acc 0.99133 | Model saved to WESAD/model-lstm, global_step 1002
Epoch   1 Batch   534 ( 1080) Loss 0.00020 Acc 0.99429 | Val acc 0.98982 | Model saved to WESAD/model-lstm, global_step 1003
INFO:tensorflow:Restoring parameters from WESAD/model-lstm-1002
Round: 1
Epoch   1 Batch   534 ( 1080) Loss 0.00012 Acc 0.99714 | Val acc 0.99519 | Model saved to WESAD/model-lstm, global_step 1004
Epoch   1 Batch   534 ( 1080) Loss 0.00011 Acc 0.99571 | Val acc 0.99208 | Model saved to WESAD/mod

NameError: name 'pathlib' is not defined