--- a
+++ b/python/load_MITBIH.py
@@ -0,0 +1,600 @@
+#!/usr/bin/env python
+
+"""
+load_MITBIH.py
+
+Download .csv files and annotations from:
+    kaggle.com/mondejar/mitbih-database
+
+VARPA, University of Coruna
+Mondejar Guerra, Victor M.
+23 Oct 2017
+"""
+
+import os
+import csv
+import gc
+import operator
+import cPickle as pickle
+
+import numpy as np
+import matplotlib.pyplot as plt
+import scipy.signal
+from scipy.signal import medfilt
+import scipy.stats
+import pywt
+import time
+import sklearn
+from sklearn import decomposition
+from sklearn.decomposition import PCA, IncrementalPCA
+
+from features_ECG import *
+
+from numpy.polynomial.hermite import hermfit, hermval
+
+
+def create_features_labels_name(DS, winL, winR, do_preprocess, maxRR, use_RR, norm_RR, compute_morph, db_path, reduced_DS, leads_flag):
+
+    features_labels_name = db_path + 'features/' + 'w_' + str(winL) + '_' + str(winR) + '_' + DS
+
+    if do_preprocess:
+        features_labels_name += '_rm_bsline'
+
+    if maxRR:
+        features_labels_name += '_maxRR'
+
+    if use_RR:
+        features_labels_name += '_RR'
+
+    if norm_RR:
+        features_labels_name += '_norm_RR'
+
+    for descp in compute_morph:
+        features_labels_name += '_' + descp
+
+    if reduced_DS:
+        features_labels_name += '_reduced'
+
+    if leads_flag[0] == 1:
+        features_labels_name += '_MLII'
+
+    if leads_flag[1] == 1:
+        features_labels_name += '_V1'
+
+    features_labels_name += '.p'
+
+    return features_labels_name
+
+
+def save_wvlt_PCA(PCA, pca_k, family, level):
+    f = open('Wvlt_' + family + '_' + str(level) + '_PCA_' + str(pca_k) + '.p', 'wb')
+    pickle.dump(PCA, f, 2)
+    f.close()
+
+
+def load_wvlt_PCA(pca_k, family, level):
+    f = open('Wvlt_' + family + '_' + str(level) + '_PCA_' + str(pca_k) + '.p', 'rb')
+    # Disable the garbage collector: this noticeably reduces the loading time!
+    gc.disable()
+    PCA = pickle.load(f)
+    gc.enable()
+    f.close()
+
+    return PCA
+
+
+# Load the data with the configuration and features selected
+# Params:
+# - leads_flag = [MLII, V1]: set each value to 0 or 1 to indicate whether that lead is used
+# - reduced_DS = load the DS1, DS2 patient division (de Chazal) or the reduced version,
+#   i.e., only the patients in common that contain both MLII and V1
+def load_mit_db(DS, winL, winR, do_preprocess, maxRR, use_RR, norm_RR, compute_morph, db_path, reduced_DS, leads_flag):
+
+    features_labels_name = create_features_labels_name(DS, winL, winR, do_preprocess, maxRR, use_RR, norm_RR, compute_morph, db_path, reduced_DS, leads_flag)
+
+    if os.path.isfile(features_labels_name):
+        print("Loading pickle: " + features_labels_name + "...")
+        f = open(features_labels_name, 'rb')
+        # Disable the garbage collector: this noticeably reduces the loading time!
+        gc.disable()
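+        # NOTE: with cPickle, keeping the garbage collector off while the large
+        # feature/label arrays are deserialised avoids repeated collection passes
+        # triggered by the many new objects; it is re-enabled right after pickle.load().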
+        features, labels, patient_num_beats = pickle.load(f)
+        gc.enable()
+        f.close()
+
+    else:
+        print("Loading MIT BIH arr (" + DS + ") ...")
+
+        # ML-II
+        if reduced_DS == False:
+            DS1 = [101, 106, 108, 109, 112, 114, 115, 116, 118, 119, 122, 124, 201, 203, 205, 207, 208, 209, 215, 220, 223, 230]
+            DS2 = [100, 103, 105, 111, 113, 117, 121, 123, 200, 202, 210, 212, 213, 214, 219, 221, 222, 228, 231, 232, 233, 234]
+
+        # ML-II + V1
+        else:
+            DS1 = [101, 106, 108, 109, 112, 115, 118, 119, 201, 203, 205, 207, 208, 209, 215, 220, 223, 230]
+            DS2 = [105, 111, 113, 121, 200, 202, 210, 212, 213, 214, 219, 221, 222, 228, 231, 232, 233, 234]
+
+        mit_pickle_name = db_path + 'python_mit'
+        if reduced_DS:
+            mit_pickle_name = mit_pickle_name + '_reduced_'
+
+        if do_preprocess:
+            mit_pickle_name = mit_pickle_name + '_rm_bsline'
+
+        mit_pickle_name = mit_pickle_name + '_wL_' + str(winL) + '_wR_' + str(winR)
+        mit_pickle_name = mit_pickle_name + '_' + DS + '.p'
+
+        # If the data with that configuration has already been computed, load the pickle
+        if os.path.isfile(mit_pickle_name):
+            f = open(mit_pickle_name, 'rb')
+            # Disable the garbage collector: this noticeably reduces the loading time!
+            gc.disable()
+            my_db = pickle.load(f)
+            gc.enable()
+            f.close()
+        else:  # Load the data and compute the RR features
+            if DS == 'DS1':
+                my_db = load_signal(DS1, winL, winR, do_preprocess)
+            else:
+                my_db = load_signal(DS2, winL, winR, do_preprocess)
+
+            print("Saving signal processed data ...")
+            # Save data
+            # Protocol version 0 is the original ASCII protocol and is backwards compatible with earlier versions of Python.
+            # Protocol version 1 is the old binary format which is also compatible with earlier versions of Python.
+            # Protocol version 2 was introduced in Python 2.3. It provides much more efficient pickling of new-style classes.
+            f = open(mit_pickle_name, 'wb')
+            pickle.dump(my_db, f, 2)
+            f.close()
+
+        features = np.array([], dtype=float)
+        labels = np.array([], dtype=np.int32)
+
+        # This array contains the number of beats for each patient (for cross_val)
+        patient_num_beats = np.array([], dtype=np.int32)
+        for p in range(len(my_db.beat)):
+            patient_num_beats = np.append(patient_num_beats, len(my_db.beat[p]))
+
+        # Compute RR features
+        if use_RR or norm_RR:
+            if DS == 'DS1':
+                RR = [RR_intervals() for i in range(len(DS1))]
+            else:
+                RR = [RR_intervals() for i in range(len(DS2))]
+
+            print("Computing RR intervals ...")
+
+            for p in range(len(my_db.beat)):
+                if maxRR:
+                    RR[p] = compute_RR_intervals(my_db.R_pos[p])
+                else:
+                    RR[p] = compute_RR_intervals(my_db.orig_R_pos[p])
+
+                RR[p].pre_R = RR[p].pre_R[(my_db.valid_R[p] == 1)]
+                RR[p].post_R = RR[p].post_R[(my_db.valid_R[p] == 1)]
+                RR[p].local_R = RR[p].local_R[(my_db.valid_R[p] == 1)]
+                RR[p].global_R = RR[p].global_R[(my_db.valid_R[p] == 1)]
+
+        if use_RR:
+            f_RR = np.empty((0, 4))
+            for p in range(len(RR)):
+                row = np.column_stack((RR[p].pre_R, RR[p].post_R, RR[p].local_R, RR[p].global_R))
+                f_RR = np.vstack((f_RR, row))
+
+            features = np.column_stack((features, f_RR)) if features.size else f_RR
+
+        if norm_RR:
+            f_RR_norm = np.empty((0, 4))
+            for p in range(len(RR)):
+                # Compute the average values!
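+                # Normalised RR features: each interval is divided by its average over the
+                # patient's own recording, so the four values below describe relative changes
+                # rather than the patient's baseline heart rate.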
+                avg_pre_R = np.average(RR[p].pre_R)
+                avg_post_R = np.average(RR[p].post_R)
+                avg_local_R = np.average(RR[p].local_R)
+                avg_global_R = np.average(RR[p].global_R)
+
+                row = np.column_stack((RR[p].pre_R / avg_pre_R, RR[p].post_R / avg_post_R, RR[p].local_R / avg_local_R, RR[p].global_R / avg_global_R))
+                f_RR_norm = np.vstack((f_RR_norm, row))
+
+            features = np.column_stack((features, f_RR_norm)) if features.size else f_RR_norm
+
+        #########################################################################################
+        # Compute morphological features
+        print("Computing morphological features (" + DS + ") ...")
+
+        num_leads = np.sum(leads_flag)
+
+        # Raw
+        if 'resample_10' in compute_morph:
+            print("Resample_10 ...")
+            start = time.time()
+
+            f_raw = np.empty((0, 10 * num_leads))
+
+            for p in range(len(my_db.beat)):
+                for beat in my_db.beat[p]:
+                    f_raw_lead = np.empty([])
+                    for s in range(2):
+                        if leads_flag[s] == 1:
+                            resamp_beat = scipy.signal.resample(beat[s], 10)
+                            if f_raw_lead.size == 1:
+                                f_raw_lead = resamp_beat
+                            else:
+                                f_raw_lead = np.hstack((f_raw_lead, resamp_beat))
+                    f_raw = np.vstack((f_raw, f_raw_lead))
+
+            features = np.column_stack((features, f_raw)) if features.size else f_raw
+
+            end = time.time()
+            print("Time resample: " + str(format(end - start, '.2f')) + " sec")
+
+        if 'raw' in compute_morph:
+            print("Raw ...")
+            start = time.time()
+
+            f_raw = np.empty((0, (winL + winR) * num_leads))
+
+            for p in range(len(my_db.beat)):
+                for beat in my_db.beat[p]:
+                    f_raw_lead = np.empty([])
+                    for s in range(2):
+                        if leads_flag[s] == 1:
+                            if f_raw_lead.size == 1:
+                                f_raw_lead = beat[s]
+                            else:
+                                f_raw_lead = np.hstack((f_raw_lead, beat[s]))
+                    f_raw = np.vstack((f_raw, f_raw_lead))
+
+            features = np.column_stack((features, f_raw)) if features.size else f_raw
+
+            end = time.time()
+            print("Time raw: " + str(format(end - start, '.2f')) + " sec")
+
+        # LBP 1D
+        # "1D-local binary pattern based feature extraction for classification of epileptic EEG signals": 2014, about 55 citations, Q2-Q1 (Mathematics)
+        # https://ac.els-cdn.com/S0096300314008285/1-s2.0-S0096300314008285-main.pdf?_tid=8a8433a6-e57f-11e7-98ec-00000aab0f6c&acdnat=1513772341_eb5d4d26addb6c0b71ded4fd6cc23ed5
+
+        # The 1D-LBP method, derived from the implementation steps of 2D-LBP, was first proposed by Chatlani et al. for the detection of speech signals, which are non-stationary in nature [23]
+
+        # From the raw signal
+
+        # TODO: Some kind of preprocessing or cleaning of high-frequency noise?
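+
+        # Sketch of the 1D-LBP idea (as described in the papers cited above): every sample is
+        # compared with its P neighbours (P/2 before and P/2 after); each neighbour that is not
+        # smaller than the centre contributes a 1-bit, giving a P-bit code per sample, and the
+        # beat descriptor is the histogram of those codes (see compute_LBP / compute_Uniform_LBP
+        # in features_ECG).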
+
+        # Compute 2 histograms: LBP or uniform LBP
+        # LBP 8 = 0-255
+        # U-LBP 8 = 0-58
+        # Uniform LBP are only those patterns which present 2 (or fewer) transitions from 0-1 or 1-0
+        # All the non-uniform patterns are assigned to the same value in the histogram
+
+        if 'u-lbp' in compute_morph:
+            print("u-lbp ...")
+
+            f_lbp = np.empty((0, 59 * num_leads))
+
+            for p in range(len(my_db.beat)):
+                for beat in my_db.beat[p]:
+                    f_lbp_lead = np.empty([])
+                    for s in range(2):
+                        if leads_flag[s] == 1:
+                            if f_lbp_lead.size == 1:
+                                f_lbp_lead = compute_Uniform_LBP(beat[s], 8)
+                            else:
+                                f_lbp_lead = np.hstack((f_lbp_lead, compute_Uniform_LBP(beat[s], 8)))
+                    f_lbp = np.vstack((f_lbp, f_lbp_lead))
+
+            features = np.column_stack((features, f_lbp)) if features.size else f_lbp
+            print(features.shape)
+
+        if 'lbp' in compute_morph:
+            print("lbp ...")
+
+            f_lbp = np.empty((0, 16 * num_leads))
+
+            for p in range(len(my_db.beat)):
+                for beat in my_db.beat[p]:
+                    f_lbp_lead = np.empty([])
+                    for s in range(2):
+                        if leads_flag[s] == 1:
+                            if f_lbp_lead.size == 1:
+                                f_lbp_lead = compute_LBP(beat[s], 4)
+                            else:
+                                f_lbp_lead = np.hstack((f_lbp_lead, compute_LBP(beat[s], 4)))
+                    f_lbp = np.vstack((f_lbp, f_lbp_lead))
+
+            features = np.column_stack((features, f_lbp)) if features.size else f_lbp
+            print(features.shape)
+
+        if 'hbf5' in compute_morph:
+            print("hbf ...")
+
+            f_hbf = np.empty((0, 15 * num_leads))
+
+            for p in range(len(my_db.beat)):
+                for beat in my_db.beat[p]:
+                    f_hbf_lead = np.empty([])
+                    for s in range(2):
+                        if leads_flag[s] == 1:
+                            if f_hbf_lead.size == 1:
+                                f_hbf_lead = compute_HBF(beat[s])
+                            else:
+                                f_hbf_lead = np.hstack((f_hbf_lead, compute_HBF(beat[s])))
+                    f_hbf = np.vstack((f_hbf, f_hbf_lead))
+
+            features = np.column_stack((features, f_hbf)) if features.size else f_hbf
+            print(features.shape)
+
+        # Wavelets
+        if 'wvlt' in compute_morph:
+            print("Wavelets ...")
+
+            f_wav = np.empty((0, 23 * num_leads))
+
+            for p in range(len(my_db.beat)):
+                for b in my_db.beat[p]:
+                    f_wav_lead = np.empty([])
+                    for s in range(2):
+                        if leads_flag[s] == 1:
+                            if f_wav_lead.size == 1:
+                                f_wav_lead = compute_wavelet_descriptor(b[s], 'db1', 3)
+                            else:
+                                f_wav_lead = np.hstack((f_wav_lead, compute_wavelet_descriptor(b[s], 'db1', 3)))
+                    f_wav = np.vstack((f_wav, f_wav_lead))
+                    #f_wav = np.vstack((f_wav, compute_wavelet_descriptor(b, 'db1', 3)))
+
+            features = np.column_stack((features, f_wav)) if features.size else f_wav
+
+        # Wavelets + PCA
+        if 'wvlt+pca' in compute_morph:
+            pca_k = 7
+            print("Wavelets + PCA (" + str(pca_k) + ") ...")
+
+            family = 'db1'
+            level = 3
+
+            f_wav = np.empty((0, 23 * num_leads))
+
+            for p in range(len(my_db.beat)):
+                for b in my_db.beat[p]:
+                    f_wav_lead = np.empty([])
+                    for s in range(2):
+                        if leads_flag[s] == 1:
+                            if f_wav_lead.size == 1:
+                                f_wav_lead = compute_wavelet_descriptor(b[s], family, level)
+                            else:
+                                f_wav_lead = np.hstack((f_wav_lead, compute_wavelet_descriptor(b[s], family, level)))
+                    f_wav = np.vstack((f_wav, f_wav_lead))
+                    #f_wav = np.vstack((f_wav, compute_wavelet_descriptor(b, 'db1', 3)))
+
+            if DS == 'DS1':
+                # Compute PCA
+                #PCA = sklearn.decomposition.KernelPCA(pca_k) # gamma_pca
+                IPCA = IncrementalPCA(n_components=pca_k, batch_size=10)  # NOTE: due to memory errors, we employ IncrementalPCA
+                IPCA.fit(f_wav)
+
+                # Save PCA
+                save_wvlt_PCA(IPCA, pca_k, family, level)
+            else:
+                # Load PCA
+                IPCA = load_wvlt_PCA(pca_k, family, level)
+
+            # Extract the PCA
+            #f_wav_PCA = np.empty((0, pca_k * num_leads))
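+            # The PCA basis is fitted only on DS1 (the training split) and re-used, via the
+            # pickle saved/loaded above, when DS2 is processed, so the projection of the test
+            # beats never depends on test data.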
+            f_wav_PCA = IPCA.transform(f_wav)
+            features = np.column_stack((features, f_wav_PCA)) if features.size else f_wav_PCA
+
+        # HOS
+        if 'HOS' in compute_morph:
+            print("HOS ...")
+            n_intervals = 6
+            lag = int(round((winL + winR) / n_intervals))
+
+            f_HOS = np.empty((0, (n_intervals - 1) * 2 * num_leads))
+            for p in range(len(my_db.beat)):
+                for b in my_db.beat[p]:
+                    f_HOS_lead = np.empty([])
+                    for s in range(2):
+                        if leads_flag[s] == 1:
+                            if f_HOS_lead.size == 1:
+                                f_HOS_lead = compute_hos_descriptor(b[s], n_intervals, lag)
+                            else:
+                                f_HOS_lead = np.hstack((f_HOS_lead, compute_hos_descriptor(b[s], n_intervals, lag)))
+                    f_HOS = np.vstack((f_HOS, f_HOS_lead))
+                    #f_HOS = np.vstack((f_HOS, compute_hos_descriptor(b, n_intervals, lag)))
+
+            features = np.column_stack((features, f_HOS)) if features.size else f_HOS
+            print(features.shape)
+
+        # My morphological descriptor
+        if 'myMorph' in compute_morph:
+            print("My Descriptor ...")
+            f_myMorhp = np.empty((0, 4 * num_leads))
+            for p in range(len(my_db.beat)):
+                for b in my_db.beat[p]:
+                    f_myMorhp_lead = np.empty([])
+                    for s in range(2):
+                        if leads_flag[s] == 1:
+                            if f_myMorhp_lead.size == 1:
+                                f_myMorhp_lead = compute_my_own_descriptor(b[s], winL, winR)
+                            else:
+                                f_myMorhp_lead = np.hstack((f_myMorhp_lead, compute_my_own_descriptor(b[s], winL, winR)))
+                    f_myMorhp = np.vstack((f_myMorhp, f_myMorhp_lead))
+                    #f_myMorhp = np.vstack((f_myMorhp, compute_my_own_descriptor(b, winL, winR)))
+
+            features = np.column_stack((features, f_myMorhp)) if features.size else f_myMorhp
+
+        # Set the labels array!
+        labels = np.array(sum(my_db.class_ID, [])).flatten()
+        print("labels")
+
+        print('writing pickle: ' + features_labels_name + '...')
+        f = open(features_labels_name, 'wb')
+        pickle.dump([features, labels, patient_num_beats], f, 2)
+        f.close()
+
+    return features, labels, patient_num_beats
+
+
+# DS: contains the patient list to load
+# winL, winR: indicate the size of the window centred on the R-peak, to the left and right
+# do_preprocess: indicates whether baseline-removal preprocessing is applied to the signal
+def load_signal(DS, winL, winR, do_preprocess):
+
+    class_ID = [[] for i in range(len(DS))]
+    beat = [[] for i in range(len(DS))]  # record, beat, lead
+    R_poses = [np.array([]) for i in range(len(DS))]
+    Original_R_poses = [np.array([]) for i in range(len(DS))]
+    valid_R = [np.array([]) for i in range(len(DS))]
+    my_db = mit_db()
+    patients = []
+
+    # Lists
+    # beats = []
+    # classes = []
+    # valid_R = np.empty([])
+    # R_poses = np.empty([])
+    # Original_R_poses = np.empty([])
+
+    size_RR_max = 20
+
+    pathDB = '/home/mondejar/dataset/ECG/'
+    DB_name = 'mitdb'
+    fs = 360
+    jump_lines = 1
+
+    # Read files: signal (.csv) and annotations (.txt)
+    fRecords = list()
+    fAnnotations = list()
+
+    lst = os.listdir(pathDB + DB_name + "/csv")
+    lst.sort()
+    for file in lst:
+        if file.endswith(".csv"):
+            if int(file[0:3]) in DS:
+                fRecords.append(file)
+        elif file.endswith(".txt"):
+            if int(file[0:3]) in DS:
+                fAnnotations.append(file)
+
+    MITBIH_classes = ['N', 'L', 'R', 'e', 'j', 'A', 'a', 'J', 'S', 'V', 'E', 'F']  #, 'P', '/', 'f', 'u']
+    AAMI_classes = []
+    AAMI_classes.append(['N', 'L', 'R'])                  # N
+    AAMI_classes.append(['A', 'a', 'J', 'S', 'e', 'j'])   # SVEB
+    AAMI_classes.append(['V', 'E'])                       # VEB
+    AAMI_classes.append(['F'])                            # F
+    #AAMI_classes.append(['P', '/', 'f', 'u'])            # Q
+
+    RAW_signals = []
+    r_index = 0
+
+    #for r, a in zip(fRecords, fAnnotations):
+    for r in range(0, len(fRecords)):
+
+        print("Processing signal " + str(r) + " / " + str(len(fRecords)) + "...")
+
+        # 1. Read signal
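+        # Each row of the Kaggle .csv is assumed to hold the sample number followed by the two
+        # lead values: column 1 is MLII and column 2 is V1 for every record except 114, where
+        # the two leads are stored in the opposite order (handled just below).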
+        filename = pathDB + DB_name + "/csv/" + fRecords[r]
+        print(filename)
+        f = open(filename, 'rb')
+        reader = csv.reader(f, delimiter=',')
+        next(reader)  # skip the header line!
+        MLII_index = 1
+        V1_index = 2
+        if int(fRecords[r][0:3]) == 114:
+            MLII_index = 2
+            V1_index = 1
+
+        MLII = []
+        V1 = []
+        for row in reader:
+            MLII.append(int(row[MLII_index]))
+            V1.append(int(row[V1_index]))
+        f.close()
+
+        RAW_signals.append((MLII, V1))  ## NOTE: a copy must be created in order to preserve the original signal
+        # display_signal(MLII)
+
+        # 2. Read annotations
+        filename = pathDB + DB_name + "/csv/" + fAnnotations[r]
+        print(filename)
+        f = open(filename, 'rb')
+        next(f)  # skip the header line!
+
+        annotations = []
+        for line in f:
+            annotations.append(line)
+        f.close()
+
+        # 3. Preprocess the signal!
+        if do_preprocess:
+            # Baseline estimation with two cascaded median filters (scipy.signal.medfilt)
+            baseline = medfilt(MLII, 71)
+            baseline = medfilt(baseline, 215)
+
+            # Remove baseline
+            for i in range(0, len(MLII)):
+                MLII[i] = MLII[i] - baseline[i]
+
+            # TODO Remove high frequencies
+
+            baseline = medfilt(V1, 71)
+            baseline = medfilt(baseline, 215)
+
+            # Remove baseline
+            for i in range(0, len(V1)):
+                V1[i] = V1[i] - baseline[i]
+
+        # Extract the R-peaks from the annotations
+        for a in annotations:
+            aS = a.split()
+
+            pos = int(aS[1])
+            originalPos = int(aS[1])
+            classAnttd = aS[2]
+            if pos > size_RR_max and pos < (len(MLII) - size_RR_max):
+                index, value = max(enumerate(MLII[pos - size_RR_max : pos + size_RR_max]), key=operator.itemgetter(1))
+                pos = (pos - size_RR_max) + index
+
+            peak_type = 0
+            #pos = pos-1
+
+            if classAnttd in MITBIH_classes:
+                if pos > winL and pos < (len(MLII) - winR):
+                    beat[r].append((MLII[pos - winL : pos + winR], V1[pos - winL : pos + winR]))
+                    for i in range(0, len(AAMI_classes)):
+                        if classAnttd in AAMI_classes[i]:
+                            class_AAMI = i
+                            break  # exit loop
+                    # Convert class
+                    class_ID[r].append(class_AAMI)
+
+                    valid_R[r] = np.append(valid_R[r], 1)
+                else:
+                    valid_R[r] = np.append(valid_R[r], 0)
+            else:
+                valid_R[r] = np.append(valid_R[r], 0)
+
+            R_poses[r] = np.append(R_poses[r], pos)
+            Original_R_poses[r] = np.append(Original_R_poses[r], originalPos)
+
+        #R_poses[r] = R_poses[r][(valid_R[r] == 1)]
+        #Original_R_poses[r] = Original_R_poses[r][(valid_R[r] == 1)]
+
+    # Set the data into a bigger struct that keeps all the records!
+    my_db.filename = fRecords
+
+    my_db.raw_signal = RAW_signals
+    my_db.beat = beat  # record, beat, lead
+    my_db.class_ID = class_ID
+    my_db.valid_R = valid_R
+    my_db.R_pos = R_poses
+    my_db.orig_R_pos = Original_R_poses
+
+    return my_db
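+
+
+# Illustrative usage sketch (not part of the original module): the window sizes, db_path and
+# descriptor list below are assumed example values, shown only to document how load_mit_db()
+# is called; adapt them to your own dataset layout.
+if __name__ == '__main__':
+    example_features, example_labels, example_patient_num_beats = load_mit_db(
+        DS='DS1',                                       # training split (DS1) or test split (DS2)
+        winL=90, winR=90,                               # samples kept left/right of each R-peak (example values)
+        do_preprocess=True,                             # remove the baseline with the two median filters
+        maxRR=True,                                     # use the relocated (maximum-amplitude) R-peaks
+        use_RR=True, norm_RR=True,                      # raw and patient-normalised RR features
+        compute_morph=['wvlt', 'HOS', 'myMorph'],       # any subset of the descriptors handled above
+        db_path='/home/mondejar/dataset/ECG/mitdb/',    # assumed dataset location
+        reduced_DS=False,
+        leads_flag=[1, 0])                              # [MLII, V1]: here only MLII is used
+    print(example_features.shape)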