Diff of /PreprocessECG.ipynb [000000] .. [c49678]

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Preprocessing for ECG Classification\n",
    "\n",
    "> Copyright 2019 Dave Fernandes. All Rights Reserved.\n",
    "> \n",
    "> Licensed under the Apache License, Version 2.0 (the \"License\");\n",
    "> you may not use this file except in compliance with the License.\n",
    "> You may obtain a copy of the License at\n",
    ">\n",
    "> http://www.apache.org/licenses/LICENSE-2.0\n",
    ">  \n",
    "> Unless required by applicable law or agreed to in writing, software\n",
    "> distributed under the License is distributed on an \"AS IS\" BASIS,\n",
    "> WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n",
    "> See the License for the specific language governing permissions and\n",
    "> limitations under the License."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The data can be downloaded from: https://www.kaggle.com/shayanfazeli/heartbeat\n",
    "\n",
    "- Randomly sample 100 time series from each class for the test set. This is just over 10% of the samples in the smallest class.\n",
    "- The remaining data is balanced for the training set by upsampling (repeating) under-represented classes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "import pickle\n",
    "\n",
    "# Input CSVs from the Kaggle dataset\n",
    "CSV_1 = './Data/mitbih_train.csv'\n",
    "CSV_2 = './Data/mitbih_test.csv'\n",
    "\n",
    "# Output pickle files for the re-split train and test sets\n",
    "TRAIN_SET = './Data/train_set.pickle'\n",
    "TEST_SET = './Data/test_set.pickle'\n",
    "\n",
    "# Combine the original train and test CSVs so the data can be re-split below\n",
    "raw_1 = pd.read_csv(CSV_1, header=None)\n",
    "raw_2 = pd.read_csv(CSV_2, header=None)\n",
    "raw = pd.concat([raw_1, raw_2], axis=0)\n",
    "\n",
    "# Shuffle the rows; each row is one time series with the class label in the last column\n",
    "shuffled = raw.sample(frac=1, axis=0)\n",
    "del raw\n",
    "del raw_1\n",
    "del raw_2\n",
    "\n",
    "values = shuffled.values\n",
    "x = values[:, :-1]\n",
    "y = values[:, -1].astype(int)\n",
    "del values\n",
    "del shuffled"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "TEST_CLASS_SIZE = 100\n",
    "\n",
    "class_x = []\n",
    "class_count = []\n",
    "\n",
    "for label in range(5):\n",
    "    x_i = x[y == label]\n",
    "    \n",
    "    # Take the first TEST_CLASS_SIZE elements for the test set\n",
    "    if label == 0:\n",
    "        x_test = x_i[:TEST_CLASS_SIZE, :]\n",
    "        y_test = np.zeros((TEST_CLASS_SIZE)).astype(int)\n",
    "    else:\n",
    "        x_test = np.concatenate((x_test, x_i[:TEST_CLASS_SIZE, :]), axis=0)\n",
    "        y_test = np.concatenate((y_test, np.zeros((TEST_CLASS_SIZE)).astype(int) + label))\n",
    "        \n",
    "    # Use the remainder of the elements for the training set\n",
    "    x_i = x_i[TEST_CLASS_SIZE:, :]\n",
    "    class_x.append(x_i)\n",
    "    class_count.append(len(x_i))\n",
    "\n",
    "# Compute the multiple of class elements needed to balance the classes\n",
    "counts = (np.floor(max(class_count) / np.array(class_count))).astype(int)\n",
    "print('Multiples:', counts)\n",
    "\n",
    "# Append repeated values for under-represented classes\n",
    "for label in range(5):\n",
    "    count = counts[label]\n",
    "    if label == 0:\n",
    "        x_bal = class_x[label]\n",
    "        y_bal = np.zeros((class_count[label])).astype(int)\n",
    "        count -= 1\n",
    "\n",
    "    for j in range(count):\n",
    "        x_bal = np.concatenate((x_bal, class_x[label]), axis=0)\n",
    "        y_bal = np.concatenate((y_bal, np.zeros((class_count[label])).astype(int) + label))\n",
    "\n",
    "print('Training set shapes:', np.shape(x_bal), np.shape(y_bal))\n",
    "print('Test set shapes:', np.shape(x_test), np.shape(y_test))\n",
    "\n",
    "with open(TEST_SET, 'wb') as file:\n",
    "    pickle.dump({'x': x_test, 'y': y_test}, file)\n",
    "\n",
    "with open(TRAIN_SET, 'wb') as file:\n",
    "    pickle.dump({'x': x_bal, 'y': y_bal}, file)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Next\n",
    "Run the `ClassifyECG.ipynb` notebook next..."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
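
Note on the balancing step: the second code cell above balances the training classes by concatenating whole copies of each under-represented class, using floor(max_count / class_count) as the repetition factor. The sketch below expresses the same integer-multiple upsampling more compactly with `np.tile`; it is illustrative only, and the names `balance_by_repetition`, `x`, and `y` are assumptions (here `x` and `y` are taken to be the training rows left after the 100-per-class test split).

import numpy as np

def balance_by_repetition(x, y, n_classes=5):
    # Indices of each class in the (already test-split) training data
    class_idx = [np.where(y == label)[0] for label in range(n_classes)]
    counts = np.array([len(idx) for idx in class_idx])

    # Same repetition factors as the notebook: floor(max_count / count)
    multiples = counts.max() // counts

    x_parts, y_parts = [], []
    for label in range(n_classes):
        reps = int(multiples[label])
        # Repeat the whole class block 'reps' times (rows only)
        x_parts.append(np.tile(x[class_idx[label]], (reps, 1)))
        y_parts.append(np.full(counts[label] * reps, label, dtype=int))

    return np.concatenate(x_parts, axis=0), np.concatenate(y_parts)

Like the notebook's loop, this yields only approximate balance, because the repetition factor is floored rather than exact.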
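
Each pickle written above holds a dict with keys 'x' and 'y'. A minimal loading sketch for the follow-on `ClassifyECG.ipynb` notebook (the actual classification notebook may load the data differently):

import pickle

with open('./Data/train_set.pickle', 'rb') as file:
    train_set = pickle.load(file)
with open('./Data/test_set.pickle', 'rb') as file:
    test_set = pickle.load(file)

# Arrays as saved by the preprocessing cells above
x_train, y_train = train_set['x'], train_set['y']
x_test, y_test = test_set['x'], test_set['y']
print('Train:', x_train.shape, 'Test:', x_test.shape)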