Diff of /ConvNet.ipynb [000000] .. [a8e9b4]

Switch to side-by-side view

--- a
+++ b/ConvNet.ipynb
@@ -0,0 +1,494 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "%matplotlib inline\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from cdtw import pydtw\n",
+    "import seaborn as sns\n",
+    "from tqdm import tqdm\n",
+    "import os\n",
+    "import json\n",
+    "import h5py"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "Using TensorFlow backend.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from keras import backend as K\n",
+    "from keras.regularizers import l2, activity_l2\n",
+    "\n",
+    "from keras import backend as K\n",
+    "from keras.engine.topology import Layer\n",
+    "from keras.optimizers import RMSprop, SGD, Adam\n",
+    "from keras.layers.core import Dense, Dropout, Activation, Flatten, Lambda, Merge\n",
+    "from keras.layers.recurrent import LSTM, GRU\n",
+    "from keras.models import Sequential, Model, load_model\n",
+    "from keras.layers import Input, Bidirectional, merge\n",
+    "    from keras.layers.convolutional import Convolution1D\n",
+    "from keras.layers.pooling import MaxPooling1D, AveragePooling1D, GlobalMaxPooling1D"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def read_train(fname):\n",
+    "    subjects = {}\n",
+    "    with h5py.File(fname, \"r\") as data_file:\n",
+    "        for subject, subject_data in data_file.items():\n",
+    "            print(subject)\n",
+    "            X = subject_data[\"data\"][:]\n",
+    "            y = subject_data[\"labels\"][:]\n",
+    "            subjects[subject] = (X, y)\n",
+    "    return subjects\n",
+    "\n",
+    "def read_test(fname):\n",
+    "    subjects = {}\n",
+    "    with h5py.File(fname, \"r\") as data_file:\n",
+    "        X = {}\n",
+    "        for subject, subject_data in data_file.items():\n",
+    "            X[subject] = {}\n",
+    "            for chunk_id, chunk in data_file[subject].items():\n",
+    "                X[subject][chunk_id] = chunk[:]\n",
+    "\n",
+    "    return X"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "train = read_train(\"train.h5\")\n",
+    "test = read_test(\"test.h5\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def batch(ts, y, n=1):\n",
+    "    l = len(ts)\n",
+    "    for ndx in range(0, l-n, 1):\n",
+    "        yield (ts[ndx:min(ndx + n, l)], y[ndx:min(ndx + n, l)])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "def label_batch(batch):\n",
+    "    if all([i == 1 for i in batch[1]]):\n",
+    "        return 1\n",
+    "    elif all([i == 0 for i in batch[1]]):\n",
+    "        return 0\n",
+    "    elif all([i == 2 for i in batch[1]]):\n",
+    "        return 2\n",
+    "    return -1\n",
+    "\n",
+    "subject_datas = {}\n",
+    "for subject, data in tqdm(train.items()):\n",
+    "    subject_ts = data[0].T\n",
+    "    subject_y = data[1][0]\n",
+    "    batches = [i for i in batch(subject_ts, subject_y, n=1125)]\n",
+    "    batches = [(i[0], label_batch(i)) for i in batches]\n",
+    "    batches = [i for i in batches if i[1] != -1]\n",
+    "    batches = [i for i in batches if len(i[0]) == 1125]\n",
+    "    subject_datas[subject] = batches"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "subject_datas[\"subject_1\"][0][0].shape"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "X = []\n",
+    "y = []\n",
+    "for subj, subj_data in tqdm(subject_datas.items()):\n",
+    "    X.extend([i[0] for i in subj_data])\n",
+    "    y.extend([i[1] for i in subj_data])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "X = np.array(X)\n",
+    "y = np.array(y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def shuffle_in_unison_scary(a, b):\n",
+    "    rng_state = np.random.get_state()\n",
+    "    np.random.shuffle(a)\n",
+    "    np.random.set_state(rng_state)\n",
+    "    np.random.shuffle(b)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "shuffle_in_unison_scary(X, y)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "def toarr(label):\n",
+    "    arr = np.zeros(3)\n",
+    "    arr[label] = 1\n",
+    "    return arr\n",
+    "y_arr = np.vstack([toarr(i) for i in y])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "validation_start = len(X) - 30000\n",
+    "X_train = X[:validation_start]\n",
+    "y_train = y_arr[:validation_start]\n",
+    "X_val = X[validation_start:]\n",
+    "y_val = y_arr[validation_start:]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "def get_base_model():\n",
+    "    '''Base network to be shared (eq. to feature extraction).\n",
+    "    '''\n",
+    "    with K.tf.device('/gpu:2'):\n",
+    "        input_seq = Input(shape=(1125, 24))\n",
+    "        \n",
+    "        filter_sizes = [5, 7, 14]\n",
+    "        nb_filters = 100\n",
+    "        filter_size = 7\n",
+    "        different_scales = []\n",
+    "        for fsize in filter_sizes:\n",
+    "            convolved = Convolution1D(nb_filters, fsize, border_mode=\"same\", activation=\"tanh\")(input_seq)\n",
+    "            processed = GlobalMaxPooling1D()(convolved)\n",
+    "            different_scales.append(processed)\n",
+    "            \n",
+    "        different_scales = merge(different_scales, mode='concat')\n",
+    "        compressed = Dense(150, activation=\"tanh\")(different_scales)\n",
+    "        compressed = Dropout(0.2)(compressed)\n",
+    "        compressed = Dense(150, activation=\"tanh\")(compressed)\n",
+    "        model = Model(input=input_seq, output=compressed)\n",
+    "        #return model\n",
+    "        #filter_size = 5\n",
+    "        \n",
+    "        #convolved = Convolution1D(nb_filters, filter_size, border_mode=\"same\", activation=\"tanh\")(input_seq)\n",
+    "        #processed = GlobalMaxPooling1D()(convolved)\n",
+    "        #compressed = Dense(300, activation=\"tanh\")(processed)\n",
+    "        #compressed = Dropout(0.3)(compressed)\n",
+    "        #compressed = Dense(300, activation=\"linear\")(compressed)\n",
+    "        #model = Model(input=input_seq, output=compressed)            \n",
+    "        return model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "with K.tf.device('/gpu:2'):\n",
+    "    base_network = get_base_model()\n",
+    "    input_seq = Input(shape=(1125, 24))\n",
+    "\n",
+    "    embedding = base_network(input_seq)\n",
+    "    out = Dense(3, activation='softmax')(embedding)\n",
+    "    \n",
+    "    model = Model(input=input_seq, output=out)\n",
+    "    \n",
+    "    #opt = SGD(lr=0.001, momentum=0.9, nesterov=True, clipvalue=0.0001)\n",
+    "    #opt = RMSprop(lr=0.001, clipvalue=10**6)\n",
+    "    opt = Adam(lr=0.01)\n",
+    "    model.compile(loss=\"categorical_crossentropy\", optimizer=opt)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "<tf.Tensor 'Tanh_3:0' shape=(?, 150) dtype=float32>"
+      ]
+     },
+     "execution_count": 13,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.layers[-2].layers[-3].get_output_at(0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[<tf.Tensor 'Tanh_4:0' shape=(?, 150) dtype=float32>]"
+      ]
+     },
+     "execution_count": 9,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "model.layers[-2].outputs = [model.layers[-2].layers[-3].get_output_at(0)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [
+    {
+     "ename": "NameError",
+     "evalue": "name 'X_train' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0mTraceback (most recent call last)",
+      "\u001b[0;32m<ipython-input-10-e101f3d38d74>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      5\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      6\u001b[0m \u001b[0;32mwith\u001b[0m \u001b[0mK\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mtf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mdevice\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m'/gpu:2'\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m     model.fit(X_train, y_train, batch_size=60, callbacks=[earlyStopping],\n\u001b[0m\u001b[1;32m      8\u001b[0m               \u001b[0mnb_epoch\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m100\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mverbose\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m1\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mvalidation_split\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m0.2\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mshuffle\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0mTrue\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      9\u001b[0m               class_weight=None, sample_weight=None)\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'X_train' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "from keras.callbacks import EarlyStopping\n",
+    "nb_epoch = 100000\n",
+    "earlyStopping = EarlyStopping(monitor='val_loss', patience=3, verbose=0, mode='auto')\n",
+    "#samples_per_epoch = 50000\n",
+    "\n",
+    "with K.tf.device('/gpu:2'):\n",
+    "    model.fit(X_train, y_train, batch_size=60, callbacks=[earlyStopping],\n",
+    "              nb_epoch=100, verbose=1, validation_split=0.2, shuffle=True,\n",
+    "              class_weight=None, sample_weight=None)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "model.layers[-2].layers[-8]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "model.save(\"convnet-multiscale\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "preds = [np.argmax(i) for i in model.predict(X_val)]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "from sklearn.metrics import accuracy_score\n",
+    "accuracy_score([np.argmax(i) for i in y_val], preds)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "# GENERATES SUBMISSION DF\n",
+    "df = []\n",
+    "for subj in test:\n",
+    "    for chunk in tqdm(test[subj]):\n",
+    "        data = {}\n",
+    "        data[\"subject_id\"] = int(subj.split(\"_\")[-1])\n",
+    "        data[\"chunk_id\"] = int(chunk.split(\"_\")[-1])\n",
+    "        arr = test[subj][chunk].T\n",
+    "        preds = model.predict(np.array([arr]))[0]\n",
+    "        data[\"class_0_score\"] = preds[0]\n",
+    "        data[\"class_1_score\"] = preds[1]\n",
+    "        data[\"class_2_score\"] = preds[2]\n",
+    "        for i in range(0, 1125):\n",
+    "            data[\"tick\"] = i\n",
+    "            df.append(data.copy())\n",
+    "df = pd.DataFrame(df)\n",
+    "df = df[[\"subject_id\", \"chunk_id\", \"tick\", \"class_0_score\",\n",
+    "         \"class_1_score\",\"class_2_score\"]]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": false
+   },
+   "outputs": [],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "df.to_csv('submit_multiscale_untrained.csv', index=False)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}