src/data.py

"""
The data is provided by the MIT-BIH Arrhythmia Database:
https://physionet.org/physiobank/database/html/mitdbdir/mitdbdir.htm
The recordings were digitized at 360 samples per second per channel with 11-bit resolution over a 10 mV range.
Two or more cardiologists independently annotated each record; disagreements were resolved to obtain the computer-readable
reference annotations for each beat (approximately 110,000 annotations in all) included with the database.
Code Description
N Normal beat (displayed as . by the PhysioBank ATM, LightWAVE, pschart, and psfd)
L Left bundle branch block beat
R Right bundle branch block beat
B Bundle branch block beat (unspecified)
A Atrial premature beat
a Aberrated atrial premature beat
J Nodal (junctional) premature beat
S Supraventricular premature or ectopic beat (atrial or nodal)
V Premature ventricular contraction
r R-on-T premature ventricular contraction
F Fusion of ventricular and normal beat
e Atrial escape beat
j Nodal (junctional) escape beat
n Supraventricular escape beat (atrial or nodal)
E Ventricular escape beat
/ Paced beat
f Fusion of paced and normal beat
Q Unclassifiable beat
? Beat not classified during learning
"""
from __future__ import division, print_function

import os

import h5py
import numpy as np
from scipy.signal import find_peaks
from sklearn import preprocessing
from tqdm import tqdm
from wfdb import rdann, rdrecord

from config import get_config
from utils import add_noise, mkdir_recursive


def preprocess(split):
    nums = ['100', '101', '102', '103', '104', '105', '106', '107', '108', '109',
            '111', '112', '113', '114', '115', '116', '117', '118', '119', '121',
            '122', '123', '124', '200', '201', '202', '203', '205', '207', '208',
            '209', '210', '212', '213', '214', '215', '217', '219', '220', '221',
            '222', '223', '228', '230', '231', '232', '233', '234']
    features = ['MLII', 'V1', 'V2', 'V4', 'V5']

    if split:
        testset = ['101', '105', '114', '118', '124', '201', '210', '217']
        trainset = [x for x in nums if x not in testset]

    def dataSaver(dataSet, datasetname, labelsname):
        # classes kept for training; excluded symbols: 'L', 'R', 'f', 'j', 'E',
        # 'a', 'J', 'Q', 'e', 'S'
        classes = ['N', 'V', '/', 'A', 'F', '~']
        Nclass = len(classes)
        datadict, datalabel = dict(), dict()
        for feature in features:
            datadict[feature] = list()
            datalabel[feature] = list()
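        # datadict maps a channel name ('MLII', 'V1', ...) to a list of fixed-size
        # beat windows; datalabel maps the same keys to the matching one-hot rows.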

        def dataprocess():
            input_size = config.input_size  # module-level config set in __main__ below
            for num in tqdm(dataSet):
                record = rdrecord('dataset/' + num, smooth_frames=True)
                # zero out NaNs, then z-score each of the two channels
                signals0 = preprocessing.scale(np.nan_to_num(record.p_signal[:, 0])).tolist()
                signals1 = preprocessing.scale(np.nan_to_num(record.p_signal[:, 1])).tolist()
                peaks, _ = find_peaks(signals0, distance=150)
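                # distance=150 samples is about 0.42 s at the 360 Hz sampling rate
                # quoted in the module docstring, so detected R-peaks can be no
                # closer together than roughly a 144 bpm heart rate would allow.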

                feature0, feature1 = record.sig_name[0], record.sig_name[1]
                # bind the append methods once per record; the nested to_dict()
                # closure below captures these locals
                lappend0 = datalabel[feature0].append
                lappend1 = datalabel[feature1].append
                dappend0 = datadict[feature0].append
                dappend1 = datadict[feature1].append
                # skip the first and last peaks so every window lies fully
                # inside the record
                for peak in tqdm(peaks[1:-1]):
                    start, end = peak - input_size // 2, peak + input_size // 2
                    ann = rdann('dataset/' + num, extension='atr',
                                sampfrom=start, sampto=end,
                                return_label_elements=['symbol'])

                    def to_dict(chosenSym):
                        y = [0] * Nclass
                        y[classes.index(chosenSym)] = 1
                        lappend0(y)
                        lappend1(y)
                        dappend0(signals0[start:end])
                        dappend1(signals1[start:end])

                    annSymbol = ann.symbol
                    # keep windows containing exactly one known annotation, and
                    # randomly drop ~85% of "N" beats to reduce class imbalance
                    if (len(annSymbol) == 1 and annSymbol[0] in classes
                            and (annSymbol[0] != "N" or np.random.random() < 0.15)):
                        to_dict(annSymbol[0])
print("processing data...")
dataprocess()
noises = add_noise(config)
for feature in ["MLII", "V1"]:
d = np.array(datadict[feature])
if len(d) > 15*10**3:
n = np.array(noises["trainset"])
else:
n = np.array(noises["testset"])
datadict[feature]=np.concatenate((d,n))
size, _ = n.shape
l = np.array(datalabel[feature])
noise_label = [0]*Nclass
noise_label[-1] = 1
noise_label = np.array([noise_label] * size)
datalabel[feature] = np.concatenate((l, noise_label))

        with h5py.File(datasetname, 'w') as f:
            for key, data in datadict.items():
                f.create_dataset(key, data=data)
        with h5py.File(labelsname, 'w') as f:
            for key, data in datalabel.items():
                f.create_dataset(key, data=data)

    # despite the .keras extension, these are plain HDF5 files written by h5py
    if split:
        dataSaver(trainset, 'dataset/train.keras', 'dataset/trainlabel.keras')
        dataSaver(testset, 'dataset/test.keras', 'dataset/testlabel.keras')
    else:
        dataSaver(nums, 'dataset/targetdata.keras', 'dataset/labeldata.keras')
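
# A minimal sketch of loading the saved arrays back (illustrative only, not
# part of the pipeline above):
#
#     import h5py
#     with h5py.File('dataset/train.keras', 'r') as f:
#         X = f['MLII'][:]   # beat windows, shape (n_windows, input_size)
#     with h5py.File('dataset/trainlabel.keras', 'r') as f:
#         y = f['MLII'][:]   # one-hot labels, shape (n_windows, 6)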


def main(config):
    def Downloadmitdb():
        ext = ['dat', 'hea', 'atr']
        nums = ['100', '101', '102', '103', '104', '105', '106', '107', '108', '109',
                '111', '112', '113', '114', '115', '116', '117', '118', '119', '121',
                '122', '123', '124', '200', '201', '202', '203', '205', '207', '208',
                '209', '210', '212', '213', '214', '215', '217', '219', '220', '221',
                '222', '223', '228', '230', '231', '232', '233', '234']
        mkdir_recursive('dataset')  # create the target directory once, up front
        for num in tqdm(nums):
            for e in ext:
                url = ("https://physionet.org/physiobank/database/mitdb/"
                       + num + "." + e)
                cmd = "cd dataset && curl -O " + url
                os.system(cmd)

    if config.downloading:
        Downloadmitdb()
    return preprocess(config.split)


if __name__ == "__main__":
    config = get_config()
    main(config)
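
# Example invocation (a sketch; the flag names below are hypothetical, since the
# real ones are defined in config.get_config(), which this file does not show.
# The code above relies only on config.downloading, config.split, and
# config.input_size):
#
#     python data.py --downloading True --split True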