--- a +++ b/PreprocessECG.ipynb @@ -0,0 +1,156 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Preprocessing for ECG Classification\n", + "\n", + "> Copyright 2019 Dave Fernandes. All Rights Reserved.\n", + "> \n", + "> Licensed under the Apache License, Version 2.0 (the \"License\");\n", + "> you may not use this file except in compliance with the License.\n", + "> You may obtain a copy of the License at\n", + ">\n", + "> http://www.apache.org/licenses/LICENSE-2.0\n", + "> \n", + "> Unless required by applicable law or agreed to in writing, software\n", + "> distributed under the License is distributed on an \"AS IS\" BASIS,\n", + "> WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n", + "> See the License for the specific language governing permissions and\n", + "> limitations under the License." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data can be downloaded from: https://www.kaggle.com/shayanfazeli/heartbeat\n", + "\n", + "- Randomly sample 100 of each class of time-series for the test set. This is just over 10% of the samples in the smallest class.\n", + "- Remaining data is balanced for the training set by upsampling under-represented classes." 
# ---------------------------------------------------------------------------
# Imports and configuration
# ---------------------------------------------------------------------------
import pickle

import numpy as np
import pandas as pd

CSV_1 = './Data/mitbih_train.csv'
CSV_2 = './Data/mitbih_test.csv'

TRAIN_SET = './Data/train_set.pickle'
TEST_SET = './Data/test_set.pickle'

NUM_CLASSES = 5        # labels are the integers 0 .. NUM_CLASSES - 1
TEST_CLASS_SIZE = 100  # rows of each class held out for the test set
SEED = 42              # arbitrary, but fixed so the split is the same on every run


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def load_shuffled(csv_paths, seed=None):
    """Read header-less CSV files, stack them and shuffle the rows.

    Args:
        csv_paths: iterable of CSV file paths; every file is read with
            ``header=None`` and the frames are concatenated row-wise.
        seed: forwarded to ``DataFrame.sample(random_state=...)``. Pass an
            integer for a reproducible shuffle; ``None`` gives a different
            order on every call.

    Returns:
        ``(x, y)`` where ``x`` holds every column except the last and ``y`` is
        the last column cast to ``int`` (used as the class label).
    """
    frames = [pd.read_csv(path, header=None) for path in csv_paths]
    shuffled = pd.concat(frames, axis=0).sample(frac=1, axis=0, random_state=seed)
    values = shuffled.values
    return values[:, :-1], values[:, -1].astype(int)


def split_by_class(x, y, num_classes=NUM_CLASSES, test_class_size=TEST_CLASS_SIZE):
    """Hold out the first ``test_class_size`` rows of every class as a test set.

    Args:
        x: 2-D array of samples, one row per sample.
        y: 1-D integer label array aligned with the rows of ``x``.
        num_classes: labels ``0 .. num_classes - 1`` are processed, in order.
        test_class_size: number of rows taken from each class.

    Returns:
        ``(x_test, y_test, class_x)``. ``x_test`` / ``y_test`` contain
        ``test_class_size`` rows per class, ordered by label. ``class_x`` is a
        list with the remaining rows of each class, indexed by label.

    Raises:
        ValueError: if a class does not have more than ``test_class_size``
            rows. Slicing would otherwise silently return fewer rows than the
            number of labels generated, and leave nothing for training.
    """
    test_parts = []
    class_x = []
    for label in range(num_classes):
        x_label = x[y == label]
        if len(x_label) <= test_class_size:
            raise ValueError(
                'class %d has %d samples; more than %d are required'
                % (label, len(x_label), test_class_size))
        test_parts.append(x_label[:test_class_size])
        class_x.append(x_label[test_class_size:])

    x_test = np.concatenate(test_parts, axis=0)
    y_test = np.repeat(np.arange(num_classes, dtype=int), test_class_size)
    return x_test, y_test, class_x


def balance_by_upsampling(class_x):
    """Repeat each class a whole number of times to approach the largest class.

    Each class is repeated ``floor(largest_count / class_count)`` times, so the
    largest class appears once and the classes end up approximately (not
    exactly) balanced.

    Args:
        class_x: list of 2-D arrays; the list index is the class label.

    Returns:
        ``(x_bal, y_bal, counts)``: the stacked samples ordered by label, the
        matching integer labels, and the per-class repeat multiples.

    Raises:
        ValueError: if any class is empty (its multiple would be undefined).
    """
    class_count = np.array([len(x_label) for x_label in class_x], dtype=int)
    if (class_count == 0).any():
        raise ValueError('cannot balance: at least one class has no samples')

    counts = class_count.max() // class_count

    # Build every repeated block once and concatenate a single time, rather
    # than re-copying the growing array on each appended repetition.
    x_bal = np.concatenate(
        [np.tile(x_label, (multiple, 1)) for x_label, multiple in zip(class_x, counts)],
        axis=0)
    y_bal = np.repeat(np.arange(len(class_x), dtype=int), counts * class_count)
    return x_bal, y_bal, counts


# ---------------------------------------------------------------------------
# Build and save the data sets
# ---------------------------------------------------------------------------
# `__name__` is '__main__' inside a notebook kernel, so this section runs there
# as usual; the guard only keeps the helpers above importable without touching
# the data files.
if __name__ == '__main__':
    x, y = load_shuffled([CSV_1, CSV_2], seed=SEED)

    x_test, y_test, class_x = split_by_class(x, y)

    x_bal, y_bal, counts = balance_by_upsampling(class_x)
    print('Multiples:', counts)

    print('Training set shapes:', np.shape(x_bal), np.shape(y_bal))
    print('Test set shapes:', np.shape(x_test), np.shape(y_test))

    with open(TEST_SET, 'wb') as file:
        pickle.dump({'x': x_test, 'y': y_test}, file)

    with open(TRAIN_SET, 'wb') as file:
        pickle.dump({'x': x_bal, 'y': y_bal}, file)

# Next: run the `ClassifyECG.ipynb` notebook.