{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"%matplotlib inline\n",
"import pandas as pd\n",
"import numpy as np\n",
"from cdtw import pydtw\n",
"import seaborn as sns\n",
"from tqdm import tqdm\n",
"import os\n",
"import json\n",
"import h5py"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
}
],
"source": [
"from keras import backend as K\n",
"from keras.regularizers import l2, activity_l2\n",
"\n",
"from keras import backend as K\n",
"from keras.engine.topology import Layer\n",
"from keras.optimizers import RMSprop, SGD, Adam\n",
"from keras.layers.core import Dense, Dropout, Activation, Flatten, Lambda, Merge\n",
"from keras.layers.recurrent import LSTM, GRU\n",
"from keras.models import Sequential, Model, load_model\n",
"from keras.layers import Input, Bidirectional, merge\n",
" from keras.layers.convolutional import Convolution1D\n",
"from keras.layers.pooling import MaxPooling1D, AveragePooling1D, GlobalMaxPooling1D"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def read_train(fname):\n",
" subjects = {}\n",
" with h5py.File(fname, \"r\") as data_file:\n",
" for subject, subject_data in data_file.items():\n",
" print(subject)\n",
" X = subject_data[\"data\"][:]\n",
" y = subject_data[\"labels\"][:]\n",
" subjects[subject] = (X, y)\n",
" return subjects\n",
"\n",
"def read_test(fname):\n",
" subjects = {}\n",
" with h5py.File(fname, \"r\") as data_file:\n",
" X = {}\n",
" for subject, subject_data in data_file.items():\n",
" X[subject] = {}\n",
" for chunk_id, chunk in data_file[subject].items():\n",
" X[subject][chunk_id] = chunk[:]\n",
"\n",
" return X"
]
},
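{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Optional sanity check: peek at the HDF5 layout before loading everything.\n",
"# Assumes train.h5 sits in the working directory, as read_train expects;\n",
"# each subject group should contain \"data\" and \"labels\" datasets.\n",
"with h5py.File(\"train.h5\", \"r\") as f:\n",
"    for subject in f:\n",
"        print(subject)\n",
"        print(list(f[subject].keys()))\n",
"        break"
]
},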
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"train = read_train(\"train.h5\")\n",
"test = read_test(\"test.h5\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def batch(ts, y, n=1):\n",
" l = len(ts)\n",
" for ndx in range(0, l-n, 1):\n",
" yield (ts[ndx:min(ndx + n, l)], y[ndx:min(ndx + n, l)])"
]
},
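{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Illustrative check of the sliding-window generator on toy data: with n=4\n",
"# over 10 points it should yield 6 overlapping windows of length 4.\n",
"toy_ts = np.arange(10)\n",
"toy_y = np.zeros(10, dtype=int)\n",
"toy_windows = list(batch(toy_ts, toy_y, n=4))\n",
"print(len(toy_windows))\n",
"print(toy_windows[0])"
]
},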
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def label_batch(batch):\n",
" if all([i == 1 for i in batch[1]]):\n",
" return 1\n",
" elif all([i == 0 for i in batch[1]]):\n",
" return 0\n",
" elif all([i == 2 for i in batch[1]]):\n",
" return 2\n",
" return -1\n",
"\n",
"subject_datas = {}\n",
"for subject, data in tqdm(train.items()):\n",
" subject_ts = data[0].T\n",
" subject_y = data[1][0]\n",
" batches = [i for i in batch(subject_ts, subject_y, n=1125)]\n",
" batches = [(i[0], label_batch(i)) for i in batches]\n",
" batches = [i for i in batches if i[1] != -1]\n",
" batches = [i for i in batches if len(i[0]) == 1125]\n",
" subject_datas[subject] = batches"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"subject_datas[\"subject_1\"][0][0].shape"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"X = []\n",
"y = []\n",
"for subj, subj_data in tqdm(subject_datas.items()):\n",
" X.extend([i[0] for i in subj_data])\n",
" y.extend([i[1] for i in subj_data])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"X = np.array(X)\n",
"y = np.array(y)"
]
},
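{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Sanity check: X should be (n_windows, 1125, 24) to match the model's\n",
"# Input(shape=(1125, 24)) below, and y should be (n_windows,).\n",
"X.shape, y.shape"
]
},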
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def shuffle_in_unison_scary(a, b):\n",
" rng_state = np.random.get_state()\n",
" np.random.shuffle(a)\n",
" np.random.set_state(rng_state)\n",
" np.random.shuffle(b)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"shuffle_in_unison_scary(X, y)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"def toarr(label):\n",
" arr = np.zeros(3)\n",
" arr[label] = 1\n",
" return arr\n",
"y_arr = np.vstack([toarr(i) for i in y])"
]
},
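{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Cross-check (illustrative): the Keras one-hot utility should agree with\n",
"# the hand-rolled toarr above for labels in {0, 1, 2}.\n",
"from keras.utils.np_utils import to_categorical\n",
"assert (to_categorical(y, 3) == y_arr).all()"
]
},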
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"validation_start = len(X) - 30000\n",
"X_train = X[:validation_start]\n",
"y_train = y_arr[:validation_start]\n",
"X_val = X[validation_start:]\n",
"y_val = y_arr[validation_start:]"
]
},
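{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Quick class-balance check on the split (the windows were shuffled above,\n",
"# so train and validation should show similar label distributions).\n",
"print(np.unique(y_train.argmax(axis=1), return_counts=True))\n",
"print(np.unique(y_val.argmax(axis=1), return_counts=True))"
]
},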
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"def get_base_model():\n",
" '''Base network to be shared (eq. to feature extraction).\n",
" '''\n",
" with K.tf.device('/gpu:2'):\n",
" input_seq = Input(shape=(1125, 24))\n",
" \n",
" filter_sizes = [5, 7, 14]\n",
" nb_filters = 100\n",
" filter_size = 7\n",
" different_scales = []\n",
" for fsize in filter_sizes:\n",
" convolved = Convolution1D(nb_filters, fsize, border_mode=\"same\", activation=\"tanh\")(input_seq)\n",
" processed = GlobalMaxPooling1D()(convolved)\n",
" different_scales.append(processed)\n",
" \n",
" different_scales = merge(different_scales, mode='concat')\n",
" compressed = Dense(150, activation=\"tanh\")(different_scales)\n",
" compressed = Dropout(0.2)(compressed)\n",
" compressed = Dense(150, activation=\"tanh\")(compressed)\n",
" model = Model(input=input_seq, output=compressed)\n",
" #return model\n",
" #filter_size = 5\n",
" \n",
" #convolved = Convolution1D(nb_filters, filter_size, border_mode=\"same\", activation=\"tanh\")(input_seq)\n",
" #processed = GlobalMaxPooling1D()(convolved)\n",
" #compressed = Dense(300, activation=\"tanh\")(processed)\n",
" #compressed = Dropout(0.3)(compressed)\n",
" #compressed = Dense(300, activation=\"linear\")(compressed)\n",
" #model = Model(input=input_seq, output=compressed) \n",
" return model"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"with K.tf.device('/gpu:2'):\n",
" base_network = get_base_model()\n",
" input_seq = Input(shape=(1125, 24))\n",
"\n",
" embedding = base_network(input_seq)\n",
" out = Dense(3, activation='softmax')(embedding)\n",
" \n",
" model = Model(input=input_seq, output=out)\n",
" \n",
" #opt = SGD(lr=0.001, momentum=0.9, nesterov=True, clipvalue=0.0001)\n",
" #opt = RMSprop(lr=0.001, clipvalue=10**6)\n",
" opt = Adam(lr=0.01)\n",
" model.compile(loss=\"categorical_crossentropy\", optimizer=opt)"
]
},
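{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Inspect the compiled architecture; the shared base network appears here\n",
"# as a single nested Model layer.\n",
"model.summary()"
]
},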
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"<tf.Tensor 'Tanh_3:0' shape=(?, 150) dtype=float32>"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.layers[-2].layers[-3].get_output_at(0)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"collapsed": false
},
"outputs": [
{
"data": {
"text/plain": [
"[<tf.Tensor 'Tanh_4:0' shape=(?, 150) dtype=float32>]"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"model.layers[-2].outputs = [model.layers[-2].layers[-3].get_output_at(0)]"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"collapsed": false
},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'X_train' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m\u001b[0m",
"\u001b[0;31mNameError\u001b[0mTraceback (most recent call last)",
"\u001b[0;32m<ipython-input-10-e101f3d38d74>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mK\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'/gpu:2'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m model.fit(X_train, y_train, batch_size=60, callbacks=[earlyStopping],\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0mnb_epoch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalidation_split\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshuffle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m class_weight=None, sample_weight=None)\n",
"\u001b[0;31mNameError\u001b[0m: name 'X_train' is not defined"
]
}
],
"source": [
"from keras.callbacks import EarlyStopping\n",
"nb_epoch = 100000\n",
"earlyStopping = EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='auto')\n",
"#samples_per_epoch = 50000\n",
"\n",
"with K.tf.device('/gpu:2'):\n",
" model.fit(X_train, y_train, batch_size=60, callbacks=[earlyStopping],\n",
" nb_epoch=100, verbose=1, validation_split=0.2, shuffle=True,\n",
" class_weight=None, sample_weight=None)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"model.layers[-2].layers[-8]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"model.save(\"convnet-multiscale\")"
]
},
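{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# The saved model can be restored later with the load_model helper already\n",
"# imported above (illustrative; uncomment to reload):\n",
"# model = load_model(\"convnet-multiscale\")"
]
},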
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"preds = [np.argmax(i) for i in model.predict(X_val)]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"from sklearn.metrics import accuracy_score\n",
"accuracy_score([np.argmax(i) for i in y_val], preds)"
]
},
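{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Optional: per-class error breakdown on the validation windows.\n",
"from sklearn.metrics import confusion_matrix\n",
"confusion_matrix([np.argmax(i) for i in y_val], preds)"
]
},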
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# GENERATES SUBMISSION DF\n",
"df = []\n",
"for subj in test:\n",
" for chunk in tqdm(test[subj]):\n",
" data = {}\n",
" data[\"subject_id\"] = int(subj.split(\"_\")[-1])\n",
" data[\"chunk_id\"] = int(chunk.split(\"_\")[-1])\n",
" arr = test[subj][chunk].T\n",
" preds = model.predict(np.array([arr]))[0]\n",
" data[\"class_0_score\"] = preds[0]\n",
" data[\"class_1_score\"] = preds[1]\n",
" data[\"class_2_score\"] = preds[2]\n",
" for i in range(0, 1125):\n",
" data[\"tick\"] = i\n",
" df.append(data.copy())\n",
"df = pd.DataFrame(df)\n",
"df = df[[\"subject_id\", \"chunk_id\", \"tick\", \"class_0_score\",\n",
" \"class_1_score\",\"class_2_score\"]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"df.to_csv('submit_multiscale_untrained.csv', index=False)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 1
}