In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
from cdtw import pydtw
import seaborn as sns
from tqdm import tqdm
import os
import json
import h5py

In [2]:
from keras import backend as K
from keras.regularizers import l2, activity_l2

from keras import backend as K
from keras.engine.topology import Layer
from keras.optimizers import RMSprop, SGD, Adam
from keras.layers.core import Dense, Dropout, Activation, Flatten, Lambda, Merge
from keras.layers.recurrent import LSTM, GRU
from keras.models import Sequential, Model, load_model
from keras.layers import Input, Bidirectional, merge
from keras.layers.convolutional import Convolution1D, AtrousConvolution1D
from keras.layers.pooling import MaxPooling1D, AveragePooling1D, GlobalMaxPooling1D

Using TensorFlow backend.


In [3]:
def read_train(fname):
    """Load the per-subject training arrays from an HDF5 file.

    Parameters
    ----------
    fname : str
        Path of the HDF5 file; every top-level entry is treated as one
        subject holding a ``"data"`` and a ``"labels"`` dataset.

    Returns
    -------
    dict
        Maps each subject name to a ``(data, labels)`` tuple, both read
        fully into memory.
    """
    loaded = {}
    with h5py.File(fname, "r") as h5:
        for name, group in h5.items():
            # Echo the subject name as a lightweight progress indicator.
            print(name)
            signals = group["data"][:]
            targets = group["labels"][:]
            loaded[name] = (signals, targets)
    return loaded

def read_test(fname):
    """Load every chunk of every subject from an HDF5 test file.

    Parameters
    ----------
    fname : str
        Path of the HDF5 file; every top-level entry is treated as one
        subject whose children are the individual chunks.

    Returns
    -------
    dict
        ``{subject_name: {chunk_id: array}}`` with each chunk read fully
        into memory, so the result stays usable after the file is closed.
    """
    with h5py.File(fname, "r") as data_file:
        return {
            subject: {chunk_id: chunk[:] for chunk_id, chunk in subject_data.items()}
            for subject, subject_data in data_file.items()
        }

def batch(ts, y, n=1):
    """Yield every full-length sliding window (stride 1) over ``ts`` and ``y``.

    Parameters
    ----------
    ts : sequence
        Time series, sliced along its first axis.
    y : sequence
        Per-sample labels aligned with ``ts``.
    n : int
        Window length in samples.

    Yields
    ------
    tuple
        ``(ts[start:start + n], y[start:start + n])`` for every ``start``
        from ``0`` to ``len(ts) - n`` inclusive. Nothing is yielded when
        the series is shorter than ``n``.
    """
    total = len(ts)
    # The last valid start is total - n (inclusive); stopping one earlier
    # would silently drop the final window.
    for start in range(total - n + 1):
        stop = start + n
        yield (ts[start:stop], y[start:stop])

def label_batch(batch):
    """Return the single class shared by every label of a window, else -1.

    ``batch`` is a ``(window, labels)`` pair; only ``batch[1]`` is read.
    Classes are tested in the order 1, 0, 2, so an empty label sequence
    is reported as class 1. Mixed labels, or labels outside {0, 1, 2},
    give -1.
    """
    labels = batch[1]
    for candidate in (1, 0, 2):
        if all([value == candidate for value in labels]):
            return candidate
    return -1
        
def get_data(train_fname="train.h5", test_fname="test.h5", window=1125):
    """Build the labelled training windows and load the raw test chunks.

    Parameters
    ----------
    train_fname : str
        HDF5 file passed to ``read_train``.
    test_fname : str
        HDF5 file passed to ``read_test``.
    window : int
        Length, in samples, of every training window.

    Returns
    -------
    tuple
        ``(X, y, test)`` where ``X`` is a list of windows, ``y`` the list of
        their class labels and ``test`` the nested dict from ``read_test``.
    """
    train = read_train(train_fname)
    test = read_test(test_fname)

    subject_datas = {}
    for subject, data in tqdm(train.items()):
        # Transpose so that slicing along axis 0 walks the recording
        # (assumes the stored data is channels x time -- TODO confirm).
        subject_ts = data[0].T
        # Only the first row of the label dataset is used.
        subject_y = data[1][0]

        kept = []
        for segment in batch(subject_ts, subject_y, n=window):
            # Guard against truncated windows regardless of how `batch`
            # treats the tail of the series.
            if len(segment[0]) != window:
                continue
            label = label_batch(segment)
            # -1 marks a window with mixed or unknown labels; drop it.
            if label != -1:
                kept.append((segment[0], label))
        subject_datas[subject] = kept

    X = []
    y = []
    for subj, subj_data in tqdm(subject_datas.items()):
        X.extend(segment for segment, _ in subj_data)
        y.extend(label for _, label in subj_data)
    return X, y, test

In [4]:
# Build the labelled training windows (X, y) and load the raw test chunks.
X, y, test = get_data()

subject_0
subject_1
subject_2
subject_3


100%|██████████| 4/4 [01:14<00:00, 19.86s/it]
100%|██████████| 4/4 [00:00<00:00, 145.58it/s]


In [5]:
# Stack the per-window lists into ndarrays so they can be shuffled in place
# and indexed with integer arrays later on.
X, y = np.array(X), np.array(y)

In [6]:
def shuffle_in_unison_scary(a, b):
    """Shuffle ``a`` and ``b`` in place with the same permutation.

    The global NumPy RNG state is captured once and restored before each
    shuffle, so both arrays see the same random draws. The permutations
    only line up when ``a`` and ``b`` have the same length along axis 0.
    Nothing is returned.
    """
    saved_state = np.random.get_state()
    for target in (a, b):
        # Rewind the RNG so every array receives the identical permutation.
        np.random.set_state(saved_state)
        np.random.shuffle(target)

In [7]:
# Seed NumPy's global RNG so the shuffle, and therefore the train/validation
# split taken from it, is the same on every Restart & Run All.
np.random.seed(0)
# NOTE(review): windows are cut with stride 1 and overlap heavily, so a random
# split places near-duplicates of training windows in the validation set.
shuffle_in_unison_scary(X, y)

In [8]:
# Hold out the last 30 shuffled windows for validation; the rest is training data.
validation_start = len(X) - 30
X_train, X_val = X[:validation_start], X[validation_start:]
y_train, y_val = y[:validation_start], y[validation_start:]

In [9]:
from scipy.signal import resample


def toarr(label, n_classes=3):
    """One-hot encode an integer class label.

    Parameters
    ----------
    label : int
        Index of the class to mark.
    n_classes : int
        Length of the returned vector (defaults to the three classes used
        by this notebook).

    Returns
    -------
    numpy.ndarray
        Float vector of zeros with a 1 at position ``label``.
    """
    arr = np.zeros(n_classes)
    arr[label] = 1
    return arr

def data_generator(X, Y, batch_size):
    """Endlessly yield random training batches at three temporal scales.

    Each batch draws ``batch_size`` indices with ``np.random.choice`` (which
    samples with replacement by default) and yields
    ``([x_256, x_500, x_full], targets)``: the selected windows resampled to
    256 and 500 points plus the unmodified windows, together with the
    one-hot encoded labels.
    """
    while True:
        picked = np.random.choice(len(X), batch_size)
        windows = X[picked]
        targets = np.vstack([toarr(label) for label in Y[picked]])
        # Resample window by window, shortest scale first.
        rescaled = [
            np.array([resample(window, n_points) for window in windows])
            for n_points in (256, 500)
        ]
        full_scale = np.array([window for window in windows])
        yield (rescaled + [full_scale], targets)

In [10]:
def multiscale(chunk):
    """Return ``chunk`` at three scales: 256 points, 500 points and as given.

    The two shorter versions are produced with ``scipy.signal.resample``
    along the first axis; the third list element is ``chunk`` itself.
    """
    downsampled = [resample(chunk, n_points) for n_points in (256, 500)]
    return downsampled + [chunk]

In [None]:
def get_base_model(input_len, fsize):
    """Build the feature-extraction sub-network for one input scale.

    The network takes sequences of shape ``(input_len, 24)`` and applies a
    150-filter 1-D convolution of width ``fsize``, global max pooling, and
    two 150-unit tanh dense layers with 30% dropout between them.

    Returns the resulting Keras ``Model``; its graph is created under the
    ``/gpu:1`` device scope.
    """
    n_filters = 150
    with K.tf.device('/gpu:1'):
        sequence_in = Input(shape=(input_len, 24))
        features = Convolution1D(n_filters, fsize, border_mode="same",
                                 activation="tanh")(sequence_in)
        # Collapse the time axis so the embedding size is independent of input_len.
        features = GlobalMaxPooling1D()(features)
        features = Dense(150, activation="tanh")(features)
        features = Dropout(0.3)(features)
        features = Dense(150, activation="tanh")(features)
        return Model(input=sequence_in, output=features)

In [None]:
# Assemble the three-branch multiscale classifier: one feature extractor per
# temporal scale, concatenated and fed to a 3-way softmax.
with K.tf.device('/gpu:1'):
    
    input256_seq = Input(shape=(256, 24))
    input500_seq = Input(shape=(500, 24))
    input1125_seq = Input(shape=(1125, 24))
    
    # Each scale gets its own network with a scale-specific filter width.
    base_network256 = get_base_model(256, 4)
    base_network500 = get_base_model(500, 7)
    base_network1125 = get_base_model(1125, 10)
    
    embedding_256 = base_network256(input256_seq)
    embedding_500 = base_network500(input500_seq)
    # Route the full-resolution input through its own branch; using
    # base_network256 here would leave base_network1125 unused.
    embedding_1125 = base_network1125(input1125_seq)
    
    merged = merge([embedding_256, embedding_500, embedding_1125], mode="concat")
    out = Dense(3, activation='softmax')(merged)
    
    model = Model(input=[input256_seq, input500_seq, input1125_seq], output=out)
    
    #opt = SGD(lr=0.001, momentum=0.9, nesterov=True, clipvalue=0.0001)
    opt = RMSprop(lr=0.005, clipvalue=10**6)
    #opt = Adam(lr=0.001)
    model.compile(loss="categorical_crossentropy", optimizer=opt)

In [13]:
# Rebind `model` to a previously saved network loaded from disk, with the
# load performed under the /gpu:2 device scope.
with K.tf.device('/gpu:2'):
    model = load_model("convnet-multiscale-true-022unk")

In [21]:
from keras.callbacks import EarlyStopping

nb_epoch = 100000
samples_per_epoch = 100000
earlyStopping = EarlyStopping(monitor='val_loss', patience=10, verbose=0, mode='auto')

# EarlyStopping monitors val_loss, which only exists when validation data is
# supplied. Build a fixed multiscale view of the held-out windows, in the same
# input order the training generator yields (256, 500, full length).
val_inputs = [np.array([resample(window, 256) for window in X_val]),
              np.array([resample(window, 500) for window in X_val]),
              X_val]
val_targets = np.vstack([toarr(label) for label in y_val])

with K.tf.device('/gpu:2'):
    model.fit_generator(data_generator(X_train, y_train, batch_size=50), samples_per_epoch, nb_epoch,
                        callbacks=[earlyStopping], verbose=1,
                        validation_data=(val_inputs, val_targets))

Epoch 1/100000
Epoch 2/100000

KeyboardInterrupt: 

In [26]:
# BLEND NNS
# Train for one more epoch, score every test chunk, and write one submission
# file per round so the snapshots can be blended afterwards.
for blend_id in range(25, 35):
    with K.tf.device('/gpu:2'):
        # No EarlyStopping here: with a single epoch and no validation data a
        # patience-10 callback on val_loss can never trigger.
        model.fit_generator(data_generator(X_train, y_train, batch_size=100), samples_per_epoch=30000, nb_epoch=1,
                            verbose=1)

        rows = []
        for subj in test:
            for chunk in tqdm(test[subj]):
                subject_id = int(subj.split("_")[-1])
                chunk_id = int(chunk.split("_")[-1])
                # Transpose the chunk before feeding it to the network
                # (assumes chunks are stored channels x time -- TODO confirm).
                arr = test[subj][chunk].T
                preds = model.predict([np.array([scale]) for scale in multiscale(arr)])[0]
                record = {
                    "subject_id": subject_id,
                    "chunk_id": chunk_id,
                    "class_0_score": preds[0],
                    "class_1_score": preds[1],
                    "class_2_score": preds[2],
                }
                # The chunk-level scores are repeated for each of the 1125 ticks.
                rows.extend(dict(record, tick=tick) for tick in range(0, 1125))
        df = pd.DataFrame(rows)
        df = df[["subject_id", "chunk_id", "tick", "class_0_score",
                 "class_1_score","class_2_score"]]

        df.to_csv('submit_blended_' + str(blend_id) + '.csv', index=False)

Epoch 1/1

 24%|██▍       | 12/49 [00:00<00:00, 119.99it/s]




100%|██████████| 49/49 [00:00<00:00, 135.88it/s]
100%|██████████| 89/89 [00:00<00:00, 215.36it/s]
100%|██████████| 49/49 [00:00<00:00, 248.00it/s]
100%|██████████| 89/89 [00:00<00:00, 219.17it/s]


Epoch 1/1

  0%|          | 0/49 [00:00<?, ?it/s]




100%|██████████| 49/49 [00:00<00:00, 126.00it/s]
100%|██████████| 89/89 [00:00<00:00, 190.92it/s]
100%|██████████| 49/49 [00:00<00:00, 161.84it/s]
100%|██████████| 89/89 [00:00<00:00, 190.56it/s]


Epoch 1/1

  0%|          | 0/49 [00:00<?, ?it/s]




100%|██████████| 49/49 [00:00<00:00, 175.59it/s]
100%|██████████| 89/89 [00:00<00:00, 188.29it/s]
100%|██████████| 49/49 [00:00<00:00, 170.44it/s]
100%|██████████| 89/89 [00:00<00:00, 202.76it/s]


Epoch 1/1

 29%|██▊       | 14/49 [00:00<00:00, 139.19it/s]




100%|██████████| 49/49 [00:00<00:00, 190.23it/s]
100%|██████████| 89/89 [00:00<00:00, 235.47it/s]
100%|██████████| 49/49 [00:00<00:00, 221.90it/s]
100%|██████████| 89/89 [00:00<00:00, 171.74it/s]


Epoch 1/1

  0%|          | 0/49 [00:00<?, ?it/s]




100%|██████████| 49/49 [00:00<00:00, 119.83it/s]
100%|██████████| 89/89 [00:00<00:00, 195.06it/s]
100%|██████████| 49/49 [00:00<00:00, 187.22it/s]
100%|██████████| 89/89 [00:00<00:00, 201.47it/s]


Epoch 1/1

 27%|██▋       | 13/49 [00:00<00:00, 119.17it/s]




100%|██████████| 49/49 [00:00<00:00, 142.53it/s]
100%|██████████| 89/89 [00:00<00:00, 203.55it/s]
100%|██████████| 49/49 [00:00<00:00, 192.02it/s]
100%|██████████| 89/89 [00:00<00:00, 193.69it/s]


Epoch 1/1

 31%|███       | 15/49 [00:00<00:00, 145.08it/s]




100%|██████████| 49/49 [00:00<00:00, 181.02it/s]
100%|██████████| 89/89 [00:00<00:00, 199.11it/s]
100%|██████████| 49/49 [00:00<00:00, 229.50it/s]
100%|██████████| 89/89 [00:00<00:00, 204.20it/s]


Epoch 1/1

 37%|███▋      | 18/49 [00:00<00:00, 174.26it/s]




100%|██████████| 49/49 [00:00<00:00, 130.09it/s]
100%|██████████| 89/89 [00:00<00:00, 208.28it/s]
100%|██████████| 49/49 [00:00<00:00, 189.43it/s]
100%|██████████| 89/89 [00:00<00:00, 181.52it/s]


Epoch 1/1

 41%|████      | 20/49 [00:00<00:00, 191.91it/s]




100%|██████████| 49/49 [00:00<00:00, 202.34it/s]
100%|██████████| 89/89 [00:00<00:00, 216.69it/s]
100%|██████████| 49/49 [00:00<00:00, 212.14it/s]
100%|██████████| 89/89 [00:00<00:00, 212.55it/s]


Epoch 1/1

 29%|██▊       | 14/49 [00:00<00:00, 135.21it/s]




100%|██████████| 49/49 [00:00<00:00, 188.83it/s]
100%|██████████| 89/89 [00:00<00:00, 169.04it/s]
100%|██████████| 49/49 [00:00<00:00, 178.81it/s]
100%|██████████| 89/89 [00:00<00:00, 182.79it/s]


In [15]:
# Persist the current `model` to disk under this file name.
model.save("convnet-multiscale-deep-021unk")

In [19]:
# Score every test chunk with the current model and expand each chunk-level
# prediction into one row per tick (0..1124).
rows = []
for subj in test:
    for chunk in tqdm(test[subj]):
        subject_id = int(subj.split("_")[-1])
        chunk_id = int(chunk.split("_")[-1])
        signal = test[subj][chunk].T
        scores = model.predict([np.array([scale]) for scale in multiscale(signal)])[0]
        record = {
            "subject_id": subject_id,
            "chunk_id": chunk_id,
            "class_0_score": scores[0],
            "class_1_score": scores[1],
            "class_2_score": scores[2],
        }
        rows.extend(dict(record, tick=tick) for tick in range(0, 1125))
df = pd.DataFrame(rows)
df = df[["subject_id", "chunk_id", "tick", "class_0_score",
         "class_1_score","class_2_score"]]

100%|██████████| 49/49 [00:00<00:00, 165.60it/s]
100%|██████████| 89/89 [00:00<00:00, 185.89it/s]
100%|██████████| 49/49 [00:00<00:00, 189.62it/s]
100%|██████████| 89/89 [00:00<00:00, 175.60it/s]


In [20]:
# Write the prediction table to CSV, omitting the DataFrame index column.
df.to_csv('submit_true_multiscale_016_large_batch.csv', index=False)