Diff of /PreprocessECG.ipynb [000000] .. [c49678]

--- a
+++ b/PreprocessECG.ipynb
@@ -0,0 +1,156 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Preprocessing for ECG Classification\n",
+    "\n",
+    "> Copyright 2019 Dave Fernandes. All Rights Reserved.\n",
+    "> \n",
+    "> Licensed under the Apache License, Version 2.0 (the \"License\");\n",
+    "> you may not use this file except in compliance with the License.\n",
+    "> You may obtain a copy of the License at\n",
+    ">\n",
+    "> http://www.apache.org/licenses/LICENSE-2.0\n",
+    ">  \n",
+    "> Unless required by applicable law or agreed to in writing, software\n",
+    "> distributed under the License is distributed on an \"AS IS\" BASIS,\n",
+    "> WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
+    "> See the License for the specific language governing permissions and\n",
+    "> limitations under the License."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Data can be downloaded from: https://www.kaggle.com/shayanfazeli/heartbeat\n",
+    "\n",
+    "- Randomly sample 100 time-series from each class for the test set. This is just over 10% of the samples in the smallest class.\n",
+    "- The remaining data is balanced for the training set by upsampling the under-represented classes."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "import pickle\n",
+    "\n",
+    "CSV_1 = './Data/mitbih_train.csv'\n",
+    "CSV_2 = './Data/mitbih_test.csv'\n",
+    "\n",
+    "TRAIN_SET = './Data/train_set.pickle'\n",
+    "TEST_SET = './Data/test_set.pickle'\n",
+    "\n",
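+    "# Combine the pre-split Kaggle CSVs into a single pool\n",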
+    "raw_1 = pd.read_csv(CSV_1, header=None)\n",
+    "raw_2 = pd.read_csv(CSV_2, header=None)\n",
+    "raw = pd.concat([raw_1, raw_2], axis=0)\n",
+    "\n",
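+    "# Shuffle all rows so the per-class test split below is effectively random\n",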
+    "shuffled = raw.sample(frac=1, axis=0)\n",
+    "del raw\n",
+    "del raw_1\n",
+    "del raw_2\n",
+    "\n",
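+    "# Last column is the integer class label; the remaining columns are the beat samples\n",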
+    "values = shuffled.values\n",
+    "x = values[:, :-1]\n",
+    "y = values[:, -1].astype(int)\n",
+    "del values\n",
+    "del shuffled"
+   ]
+  },
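+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Optional sanity check (a minimal sketch using the `y` labels loaded above): count how many beats fall into each of the five classes before splitting."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Count the beats in each of the 5 classes (labels 0-4)\n",
+    "labels, beats_per_class = np.unique(y, return_counts=True)\n",
+    "for label, count in zip(labels, beats_per_class):\n",
+    "    print('Class', label, ':', count, 'beats')"
+   ]
+  },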
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "TEST_CLASS_SIZE = 100\n",
+    "\n",
+    "class_x = []\n",
+    "class_count = []\n",
+    "\n",
+    "for label in range(5):\n",
+    "    x_i = x[y == label]\n",
+    "    \n",
+    "    # Take the first TEST_CLASS_SIZE elements for the test set\n",
+    "    if label == 0:\n",
+    "        x_test = x_i[:TEST_CLASS_SIZE, :]\n",
+    "        y_test = np.zeros((TEST_CLASS_SIZE)).astype(int)\n",
+    "    else:\n",
+    "        x_test = np.concatenate((x_test, x_i[:TEST_CLASS_SIZE, :]), axis=0)\n",
+    "        y_test = np.concatenate((y_test, np.zeros((TEST_CLASS_SIZE)).astype(int) + label))\n",
+    "        \n",
+    "    # Use the remainder of the elements for the training set\n",
+    "    x_i = x_i[TEST_CLASS_SIZE:, :]\n",
+    "    class_x.append(x_i)\n",
+    "    class_count.append(len(x_i))\n",
+    "\n",
+    "# Compute the multiple of class elements needed to balance the classes\n",
+    "counts = (np.floor(max(class_count) / np.array(class_count))).astype(int)\n",
+    "print('Multiples:', counts)\n",
+    "\n",
+    "# Append repeated values for under-represented classes\n",
+    "for label in range(5):\n",
+    "    count = counts[label]\n",
+    "    if label == 0:\n",
+    "        x_bal = class_x[label]\n",
+    "        y_bal = np.zeros((class_count[label])).astype(int)\n",
+    "        count -= 1\n",
+    "\n",
+    "    for j in range(count):\n",
+    "        x_bal = np.concatenate((x_bal, class_x[label]), axis=0)\n",
+    "        y_bal = np.concatenate((y_bal, np.zeros((class_count[label])).astype(int) + label))\n",
+    "\n",
+    "print('Training set shapes:', np.shape(x_bal), np.shape(y_bal))\n",
+    "print('Test set shapes:', np.shape(x_test), np.shape(y_test))\n",
+    "\n",
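+    "# Persist the test and training splits for the ClassifyECG notebook\n",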
+    "with open(TEST_SET, 'wb') as file:\n",
+    "    pickle.dump({'x': x_test, 'y': y_test}, file)\n",
+    "\n",
+    "with open(TRAIN_SET, 'wb') as file:\n",
+    "    pickle.dump({'x': x_bal, 'y': y_bal}, file)"
+   ]
+  },
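+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Optional check (a minimal sketch assuming the `TRAIN_SET` and `TEST_SET` paths above): reload the pickles and confirm the shapes and class balance of the saved splits."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Reload both pickles to verify the arrays round-trip correctly\n",
+    "with open(TRAIN_SET, 'rb') as file:\n",
+    "    train_set = pickle.load(file)\n",
+    "with open(TEST_SET, 'rb') as file:\n",
+    "    test_set = pickle.load(file)\n",
+    "\n",
+    "print('Training class counts:', np.unique(train_set['y'], return_counts=True)[1])\n",
+    "print('Test class counts:', np.unique(test_set['y'], return_counts=True)[1])"
+   ]
+  },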
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Next\n",
+    "Run the `ClassifyECG.ipynb` notebook on the saved training and test sets."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}