{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Preprocessing for ECG Classification\n",
"\n",
"> Copyright 2019 Dave Fernandes. All Rights Reserved.\n",
"> \n",
"> Licensed under the Apache License, Version 2.0 (the \"License\");\n",
"> you may not use this file except in compliance with the License.\n",
"> You may obtain a copy of the License at\n",
">\n",
"> http://www.apache.org/licenses/LICENSE-2.0\n",
"> \n",
"> Unless required by applicable law or agreed to in writing, software\n",
"> distributed under the License is distributed on an \"AS IS\" BASIS,\n",
"> WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
"> See the License for the specific language governing permissions and\n",
"> limitations under the License."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Data can be downloaded from: https://www.kaggle.com/shayanfazeli/heartbeat\n",
"\n",
"- Randomly sample 100 of each class of time-series for the test set. This is just over 10% of the samples in the smallest class.\n",
"- Remaining data is balanced for the training set by upsampling under-represented classes."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import pandas as pd\n",
"import pickle\n",
"\n",
"CSV_1 = './Data/mitbih_train.csv'\n",
"CSV_2 = './Data/mitbih_test.csv'\n",
"\n",
"TRAIN_SET = './Data/train_set.pickle'\n",
"TEST_SET = './Data/test_set.pickle'\n",
"\n",
"raw_1 = pd.read_csv(CSV_1, header=None)\n",
"raw_2 = pd.read_csv(CSV_2, header=None)\n",
"raw = pd.concat([raw_1, raw_2], axis=0)\n",
"\n",
"shuffled = raw.sample(frac=1, axis=0)\n",
"del raw\n",
"del raw_1\n",
"del raw_2\n",
"\n",
"values = shuffled.values\n",
"x = values[:, :-1]\n",
"y = values[:, -1].astype(int)\n",
"del values\n",
"del shuffled"
]
},
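{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optionally, the per-class sample counts can be checked before splitting. The short sketch below assumes only the `y` label array from the previous cell; it prints the number of beats in each of the five classes, which also shows how the 100-beat test split compares to the smallest class."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sanity check: number of samples in each class before splitting\n",
"labels, label_counts = np.unique(y, return_counts=True)\n",
"for label, count in zip(labels, label_counts):\n",
"    print('Class', label, ':', count, 'samples')\n",
"print('Smallest class:', label_counts.min(), 'samples')"
]
},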
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"TEST_CLASS_SIZE = 100\n",
"\n",
"class_x = []\n",
"class_count = []\n",
"\n",
"for label in range(5):\n",
" x_i = x[y == label]\n",
" \n",
" # Take the first TEST_CLASS_SIZE elements for the test set\n",
" if label == 0:\n",
" x_test = x_i[:TEST_CLASS_SIZE, :]\n",
" y_test = np.zeros((TEST_CLASS_SIZE)).astype(int)\n",
" else:\n",
" x_test = np.concatenate((x_test, x_i[:TEST_CLASS_SIZE, :]), axis=0)\n",
" y_test = np.concatenate((y_test, np.zeros((TEST_CLASS_SIZE)).astype(int) + label))\n",
" \n",
" # Use the remainder of the elements for the training set\n",
" x_i = x_i[TEST_CLASS_SIZE:, :]\n",
" class_x.append(x_i)\n",
" class_count.append(len(x_i))\n",
"\n",
"# Compute the multiple of class elements needed to balance the classes\n",
"counts = (np.floor(max(class_count) / np.array(class_count))).astype(int)\n",
"print('Multiples:', counts)\n",
"\n",
"# Append repeated values for under-represented classes\n",
"for label in range(5):\n",
" count = counts[label]\n",
" if label == 0:\n",
" x_bal = class_x[label]\n",
" y_bal = np.zeros((class_count[label])).astype(int)\n",
" count -= 1\n",
"\n",
" for j in range(count):\n",
" x_bal = np.concatenate((x_bal, class_x[label]), axis=0)\n",
" y_bal = np.concatenate((y_bal, np.zeros((class_count[label])).astype(int) + label))\n",
"\n",
"print('Training set shapes:', np.shape(x_bal), np.shape(y_bal))\n",
"print('Test set shapes:', np.shape(x_test), np.shape(y_test))\n",
"\n",
"with open(TEST_SET, 'wb') as file:\n",
" pickle.dump({'x': x_test, 'y': y_test}, file)\n",
"\n",
"with open(TRAIN_SET, 'wb') as file:\n",
" pickle.dump({'x': x_bal, 'y': y_bal}, file)"
]
},
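{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an optional round-trip check, the saved pickles can be reloaded and their shapes and per-class counts printed. This is a sketch only; it assumes the `TRAIN_SET`/`TEST_SET` paths and the `pickle`/`numpy` imports from the cells above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional: reload the saved pickles to confirm they round-trip correctly\n",
"with open(TRAIN_SET, 'rb') as file:\n",
"    train_set = pickle.load(file)\n",
"with open(TEST_SET, 'rb') as file:\n",
"    test_set = pickle.load(file)\n",
"\n",
"print('Reloaded training set:', train_set['x'].shape, train_set['y'].shape)\n",
"print('Reloaded test set:', test_set['x'].shape, test_set['y'].shape)\n",
"print('Training class counts:', np.bincount(train_set['y']))\n",
"print('Test class counts:', np.bincount(test_set['y']))"
]
},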
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Next\n",
"Run the `ClassifyECG.ipynb` notebook next..."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.7"
}
},
"nbformat": 4,
"nbformat_minor": 2
}