--- a
+++ b/python/load_MITBIH.py
@@ -0,0 +1,600 @@
+#!/usr/bin/env python
+
+"""
+load_MITBIH.py
+
+Download .csv files and annotations from:
+    kaggle.com/mondejar/mitbih-database
+
+VARPA, University of Coruna
+Mondejar Guerra, Victor M.
+23 Oct 2017
+"""
+
+import os
+import csv
+import gc
+import operator
+import cPickle as pickle
+
+import numpy as np
+import matplotlib.pyplot as plt
+import scipy.signal
+from scipy.signal import medfilt
+import scipy.stats
+import pywt
+import time
+import sklearn
+from sklearn import decomposition
+from sklearn.decomposition import PCA, IncrementalPCA
+
+from features_ECG import *
+
+from numpy.polynomial.hermite import hermfit, hermval
+
+
+def create_features_labels_name(DS, winL, winR, do_preprocess, maxRR, use_RR, norm_RR, compute_morph, db_path, reduced_DS, leads_flag):
+
+    features_labels_name = db_path + 'features/' + 'w_' + str(winL) + '_' + str(winR) + '_' + DS
+
+    if do_preprocess:
+        features_labels_name += '_rm_bsline'
+
+    if maxRR:
+        features_labels_name += '_maxRR'
+
+    if use_RR:
+        features_labels_name += '_RR'
+
+    if norm_RR:
+        features_labels_name += '_norm_RR'
+
+    for descp in compute_morph:
+        features_labels_name += '_' + descp
+
+    if reduced_DS:
+        features_labels_name += '_reduced'
+
+    if leads_flag[0] == 1:
+        features_labels_name += '_MLII'
+
+    if leads_flag[1] == 1:
+        features_labels_name += '_V1'
+
+    features_labels_name += '.p'
+
+    return features_labels_name
+
+
+def save_wvlt_PCA(PCA, pca_k, family, level):
+    f = open('Wvlt_' + family + '_' + str(level) + '_PCA_' + str(pca_k) + '.p', 'wb')
+    pickle.dump(PCA, f, 2)
+    f.close()
+
+
+def load_wvlt_PCA(pca_k, family, level):
+    f = open('Wvlt_' + family + '_' + str(level) + '_PCA_' + str(pca_k) + '.p', 'rb')
+    # Disable the garbage collector: this noticeably reduces the loading time!
+    gc.disable()
+    PCA = pickle.load(f)
+    gc.enable()
+    f.close()
+
+    return PCA
+
+
+# Load the data with the configuration and features selected
+# Params:
+# - leads_flag = [MLII, V1]: set each value to 0 or 1 to indicate whether that lead is used
+# - reduced_DS = load the DS1, DS2 patient division (de Chazal) or the reduced version,
+#   i.e., only the patients in common that contain both MLII and V1
+def load_mit_db(DS, winL, winR, do_preprocess, maxRR, use_RR, norm_RR, compute_morph, db_path, reduced_DS, leads_flag):
+
+    features_labels_name = create_features_labels_name(DS, winL, winR, do_preprocess, maxRR, use_RR, norm_RR, compute_morph, db_path, reduced_DS, leads_flag)
+
+    if os.path.isfile(features_labels_name):
+        print("Loading pickle: " + features_labels_name + "...")
+        f = open(features_labels_name, 'rb')
+        # Disable the garbage collector: this noticeably reduces the loading time!
+        gc.disable()
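+        # NOTE: with cPickle, keeping the garbage collector off while the large
+        # feature/label arrays are deserialised avoids repeated collection passes
+        # triggered by the many new objects; it is re-enabled right after pickle.load().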
+        features, labels, patient_num_beats = pickle.load(f)
+        gc.enable()
+        f.close()
+
+    else:
+        print("Loading MIT BIH arr (" + DS + ") ...")
+
+        # ML-II
+        if reduced_DS == False:
+            DS1 = [101, 106, 108, 109, 112, 114, 115, 116, 118, 119, 122, 124, 201, 203, 205, 207, 208, 209, 215, 220, 223, 230]
+            DS2 = [100, 103, 105, 111, 113, 117, 121, 123, 200, 202, 210, 212, 213, 214, 219, 221, 222, 228, 231, 232, 233, 234]
+
+        # ML-II + V1
+        else:
+            DS1 = [101, 106, 108, 109, 112, 115, 118, 119, 201, 203, 205, 207, 208, 209, 215, 220, 223, 230]
+            DS2 = [105, 111, 113, 121, 200, 202, 210, 212, 213, 214, 219, 221, 222, 228, 231, 232, 233, 234]
+
+        mit_pickle_name = db_path + 'python_mit'
+        if reduced_DS:
+            mit_pickle_name = mit_pickle_name + '_reduced_'
+
+        if do_preprocess:
+            mit_pickle_name = mit_pickle_name + '_rm_bsline'
+
+        mit_pickle_name = mit_pickle_name + '_wL_' + str(winL) + '_wR_' + str(winR)
+        mit_pickle_name = mit_pickle_name + '_' + DS + '.p'
+
+        # If the data with that configuration has already been computed, load the pickle
+        if os.path.isfile(mit_pickle_name):
+            f = open(mit_pickle_name, 'rb')
+            # Disable the garbage collector: this noticeably reduces the loading time!
+            gc.disable()
+            my_db = pickle.load(f)
+            gc.enable()
+            f.close()
+        else:  # Load the data and compute the RR features
+            if DS == 'DS1':
+                my_db = load_signal(DS1, winL, winR, do_preprocess)
+            else:
+                my_db = load_signal(DS2, winL, winR, do_preprocess)
+
+            print("Saving signal processed data ...")
+            # Save data
+            # Protocol version 0 is the original ASCII protocol and is backwards compatible with earlier versions of Python.
+            # Protocol version 1 is the old binary format which is also compatible with earlier versions of Python.
+            # Protocol version 2 was introduced in Python 2.3. It provides much more efficient pickling of new-style classes.
+            f = open(mit_pickle_name, 'wb')
+            pickle.dump(my_db, f, 2)
+            f.close()
+
+        features = np.array([], dtype=float)
+        labels = np.array([], dtype=np.int32)
+
+        # This array contains the number of beats for each patient (for cross_val)
+        patient_num_beats = np.array([], dtype=np.int32)
+        for p in range(len(my_db.beat)):
+            patient_num_beats = np.append(patient_num_beats, len(my_db.beat[p]))
+
+        # Compute RR features
+        if use_RR or norm_RR:
+            if DS == 'DS1':
+                RR = [RR_intervals() for i in range(len(DS1))]
+            else:
+                RR = [RR_intervals() for i in range(len(DS2))]
+
+            print("Computing RR intervals ...")
+
+            for p in range(len(my_db.beat)):
+                if maxRR:
+                    RR[p] = compute_RR_intervals(my_db.R_pos[p])
+                else:
+                    RR[p] = compute_RR_intervals(my_db.orig_R_pos[p])
+
+                RR[p].pre_R = RR[p].pre_R[(my_db.valid_R[p] == 1)]
+                RR[p].post_R = RR[p].post_R[(my_db.valid_R[p] == 1)]
+                RR[p].local_R = RR[p].local_R[(my_db.valid_R[p] == 1)]
+                RR[p].global_R = RR[p].global_R[(my_db.valid_R[p] == 1)]
+
+        if use_RR:
+            f_RR = np.empty((0, 4))
+            for p in range(len(RR)):
+                row = np.column_stack((RR[p].pre_R, RR[p].post_R, RR[p].local_R, RR[p].global_R))
+                f_RR = np.vstack((f_RR, row))
+
+            features = np.column_stack((features, f_RR)) if features.size else f_RR
+
+        if norm_RR:
+            f_RR_norm = np.empty((0, 4))
+            for p in range(len(RR)):
+                # Compute the average values!
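+                # Normalised RR features: each interval is divided by its average over the
+                # patient's own recording, so the four values below describe relative changes
+                # rather than the patient's baseline heart rate.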
+                avg_pre_R = np.average(RR[p].pre_R)
+                avg_post_R = np.average(RR[p].post_R)
+                avg_local_R = np.average(RR[p].local_R)
+                avg_global_R = np.average(RR[p].global_R)
+
+                row = np.column_stack((RR[p].pre_R / avg_pre_R, RR[p].post_R / avg_post_R, RR[p].local_R / avg_local_R, RR[p].global_R / avg_global_R))
+                f_RR_norm = np.vstack((f_RR_norm, row))
+
+            features = np.column_stack((features, f_RR_norm)) if features.size else f_RR_norm
+
+        #########################################################################################
+        # Compute morphological features
+        print("Computing morphological features (" + DS + ") ...")
+
+        num_leads = np.sum(leads_flag)
+
+        # Raw
+        if 'resample_10' in compute_morph:
+            print("Resample_10 ...")
+            start = time.time()
+
+            f_raw = np.empty((0, 10 * num_leads))
+
+            for p in range(len(my_db.beat)):
+                for beat in my_db.beat[p]:
+                    f_raw_lead = np.empty([])
+                    for s in range(2):
+                        if leads_flag[s] == 1:
+                            resamp_beat = scipy.signal.resample(beat[s], 10)
+                            if f_raw_lead.size == 1:
+                                f_raw_lead = resamp_beat
+                            else:
+                                f_raw_lead = np.hstack((f_raw_lead, resamp_beat))
+                    f_raw = np.vstack((f_raw, f_raw_lead))
+
+            features = np.column_stack((features, f_raw)) if features.size else f_raw
+
+            end = time.time()
+            print("Time resample: " + str(format(end - start, '.2f')) + " sec")
+
+        if 'raw' in compute_morph:
+            print("Raw ...")
+            start = time.time()
+
+            f_raw = np.empty((0, (winL + winR) * num_leads))
+
+            for p in range(len(my_db.beat)):
+                for beat in my_db.beat[p]:
+                    f_raw_lead = np.empty([])
+                    for s in range(2):
+                        if leads_flag[s] == 1:
+                            if f_raw_lead.size == 1:
+                                f_raw_lead = beat[s]
+                            else:
+                                f_raw_lead = np.hstack((f_raw_lead, beat[s]))
+                    f_raw = np.vstack((f_raw, f_raw_lead))
+
+            features = np.column_stack((features, f_raw)) if features.size else f_raw
+
+            end = time.time()
+            print("Time raw: " + str(format(end - start, '.2f')) + " sec")
+
+        # LBP 1D
+        # "1D-local binary pattern based feature extraction for classification of epileptic EEG signals": 2014, about 55 citations, Q2-Q1 (Mathematics)
+        # https://ac.els-cdn.com/S0096300314008285/1-s2.0-S0096300314008285-main.pdf?_tid=8a8433a6-e57f-11e7-98ec-00000aab0f6c&acdnat=1513772341_eb5d4d26addb6c0b71ded4fd6cc23ed5
+
+        # The 1D-LBP method, derived from the implementation steps of 2D-LBP, was first proposed by Chatlani et al. for the detection of speech signals, which are non-stationary in nature [23]
+
+        # From the raw signal
+
+        # TODO: Some kind of preprocessing or cleaning of high-frequency noise?
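+
+        # Sketch of the 1D-LBP idea (as described in the papers cited above): every sample is
+        # compared with its P neighbours (P/2 before and P/2 after); each neighbour that is not
+        # smaller than the centre contributes a 1-bit, giving a P-bit code per sample, and the
+        # beat descriptor is the histogram of those codes (see compute_LBP / compute_Uniform_LBP
+        # in features_ECG).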
+
+        # Compute 2 histograms: LBP or uniform LBP
+        # LBP 8 = 0-255
+        # U-LBP 8 = 0-58
+        # Uniform LBP are only those patterns which present 2 (or fewer) transitions from 0-1 or 1-0
+        # All the non-uniform patterns are assigned to the same value in the histogram
+
+        if 'u-lbp' in compute_morph:
+            print("u-lbp ...")
+
+            f_lbp = np.empty((0, 59 * num_leads))
+
+            for p in range(len(my_db.beat)):
+                for beat in my_db.beat[p]:
+                    f_lbp_lead = np.empty([])
+                    for s in range(2):
+                        if leads_flag[s] == 1:
+                            if f_lbp_lead.size == 1:
+                                f_lbp_lead = compute_Uniform_LBP(beat[s], 8)
+                            else:
+                                f_lbp_lead = np.hstack((f_lbp_lead, compute_Uniform_LBP(beat[s], 8)))
+                    f_lbp = np.vstack((f_lbp, f_lbp_lead))
+
+            features = np.column_stack((features, f_lbp)) if features.size else f_lbp
+            print(features.shape)
+
+        if 'lbp' in compute_morph:
+            print("lbp ...")
+
+            f_lbp = np.empty((0, 16 * num_leads))
+
+            for p in range(len(my_db.beat)):
+                for beat in my_db.beat[p]:
+                    f_lbp_lead = np.empty([])
+                    for s in range(2):
+                        if leads_flag[s] == 1:
+                            if f_lbp_lead.size == 1:
+                                f_lbp_lead = compute_LBP(beat[s], 4)
+                            else:
+                                f_lbp_lead = np.hstack((f_lbp_lead, compute_LBP(beat[s], 4)))
+                    f_lbp = np.vstack((f_lbp, f_lbp_lead))
+
+            features = np.column_stack((features, f_lbp)) if features.size else f_lbp
+            print(features.shape)
+
+        if 'hbf5' in compute_morph:
+            print("hbf ...")
+
+            f_hbf = np.empty((0, 15 * num_leads))
+
+            for p in range(len(my_db.beat)):
+                for beat in my_db.beat[p]:
+                    f_hbf_lead = np.empty([])
+                    for s in range(2):
+                        if leads_flag[s] == 1:
+                            if f_hbf_lead.size == 1:
+                                f_hbf_lead = compute_HBF(beat[s])
+                            else:
+                                f_hbf_lead = np.hstack((f_hbf_lead, compute_HBF(beat[s])))
+                    f_hbf = np.vstack((f_hbf, f_hbf_lead))
+
+            features = np.column_stack((features, f_hbf)) if features.size else f_hbf
+            print(features.shape)
+
+        # Wavelets
+        if 'wvlt' in compute_morph:
+            print("Wavelets ...")
+
+            f_wav = np.empty((0, 23 * num_leads))
+
+            for p in range(len(my_db.beat)):
+                for b in my_db.beat[p]:
+                    f_wav_lead = np.empty([])
+                    for s in range(2):
+                        if leads_flag[s] == 1:
+                            if f_wav_lead.size == 1:
+                                f_wav_lead = compute_wavelet_descriptor(b[s], 'db1', 3)
+                            else:
+                                f_wav_lead = np.hstack((f_wav_lead, compute_wavelet_descriptor(b[s], 'db1', 3)))
+                    f_wav = np.vstack((f_wav, f_wav_lead))
+                    #f_wav = np.vstack((f_wav, compute_wavelet_descriptor(b, 'db1', 3)))
+
+            features = np.column_stack((features, f_wav)) if features.size else f_wav
+
+        # Wavelets + PCA
+        if 'wvlt+pca' in compute_morph:
+            pca_k = 7
+            print("Wavelets + PCA (" + str(pca_k) + ") ...")
+
+            family = 'db1'
+            level = 3
+
+            f_wav = np.empty((0, 23 * num_leads))
+
+            for p in range(len(my_db.beat)):
+                for b in my_db.beat[p]:
+                    f_wav_lead = np.empty([])
+                    for s in range(2):
+                        if leads_flag[s] == 1:
+                            if f_wav_lead.size == 1:
+                                f_wav_lead = compute_wavelet_descriptor(b[s], family, level)
+                            else:
+                                f_wav_lead = np.hstack((f_wav_lead, compute_wavelet_descriptor(b[s], family, level)))
+                    f_wav = np.vstack((f_wav, f_wav_lead))
+                    #f_wav = np.vstack((f_wav, compute_wavelet_descriptor(b, 'db1', 3)))
+
+            if DS == 'DS1':
+                # Compute PCA
+                #PCA = sklearn.decomposition.KernelPCA(pca_k) # gamma_pca
+                IPCA = IncrementalPCA(n_components=pca_k, batch_size=10)  # NOTE: due to memory errors, we employ IncrementalPCA
+                IPCA.fit(f_wav)
+
+                # Save PCA
+                save_wvlt_PCA(IPCA, pca_k, family, level)
+            else:
+                # Load PCA
+                IPCA = load_wvlt_PCA(pca_k, family, level)
+
+            # Extract the PCA
+            #f_wav_PCA = np.empty((0, pca_k * num_leads))
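+            # The PCA basis is fitted only on DS1 (the training split) and re-used, via the
+            # pickle saved/loaded above, when DS2 is processed, so the projection of the test
+            # beats never depends on test data.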
+            f_wav_PCA = IPCA.transform(f_wav)
+            features = np.column_stack((features, f_wav_PCA)) if features.size else f_wav_PCA
+
+        # HOS
+        if 'HOS' in compute_morph:
+            print("HOS ...")
+            n_intervals = 6
+            lag = int(round((winL + winR) / n_intervals))
+
+            f_HOS = np.empty((0, (n_intervals - 1) * 2 * num_leads))
+            for p in range(len(my_db.beat)):
+                for b in my_db.beat[p]:
+                    f_HOS_lead = np.empty([])
+                    for s in range(2):
+                        if leads_flag[s] == 1:
+                            if f_HOS_lead.size == 1:
+                                f_HOS_lead = compute_hos_descriptor(b[s], n_intervals, lag)
+                            else:
+                                f_HOS_lead = np.hstack((f_HOS_lead, compute_hos_descriptor(b[s], n_intervals, lag)))
+                    f_HOS = np.vstack((f_HOS, f_HOS_lead))
+                    #f_HOS = np.vstack((f_HOS, compute_hos_descriptor(b, n_intervals, lag)))
+
+            features = np.column_stack((features, f_HOS)) if features.size else f_HOS
+            print(features.shape)
+
+        # My morphological descriptor
+        if 'myMorph' in compute_morph:
+            print("My Descriptor ...")
+            f_myMorhp = np.empty((0, 4 * num_leads))
+            for p in range(len(my_db.beat)):
+                for b in my_db.beat[p]:
+                    f_myMorhp_lead = np.empty([])
+                    for s in range(2):
+                        if leads_flag[s] == 1:
+                            if f_myMorhp_lead.size == 1:
+                                f_myMorhp_lead = compute_my_own_descriptor(b[s], winL, winR)
+                            else:
+                                f_myMorhp_lead = np.hstack((f_myMorhp_lead, compute_my_own_descriptor(b[s], winL, winR)))
+                    f_myMorhp = np.vstack((f_myMorhp, f_myMorhp_lead))
+                    #f_myMorhp = np.vstack((f_myMorhp, compute_my_own_descriptor(b, winL, winR)))
+
+            features = np.column_stack((features, f_myMorhp)) if features.size else f_myMorhp
+
+        # Set the labels array!
+        labels = np.array(sum(my_db.class_ID, [])).flatten()
+        print("labels")
+
+        print('writing pickle: ' + features_labels_name + '...')
+        f = open(features_labels_name, 'wb')
+        pickle.dump([features, labels, patient_num_beats], f, 2)
+        f.close()
+
+    return features, labels, patient_num_beats
+
+
+# DS: contains the patient list to load
+# winL, winR: indicate the size of the window centred on the R-peak, to the left and right
+# do_preprocess: indicates whether baseline-removal preprocessing is applied to the signal
+def load_signal(DS, winL, winR, do_preprocess):
+
+    class_ID = [[] for i in range(len(DS))]
+    beat = [[] for i in range(len(DS))]  # record, beat, lead
+    R_poses = [np.array([]) for i in range(len(DS))]
+    Original_R_poses = [np.array([]) for i in range(len(DS))]
+    valid_R = [np.array([]) for i in range(len(DS))]
+    my_db = mit_db()
+    patients = []
+
+    # Lists
+    # beats = []
+    # classes = []
+    # valid_R = np.empty([])
+    # R_poses = np.empty([])
+    # Original_R_poses = np.empty([])
+
+    size_RR_max = 20
+
+    pathDB = '/home/mondejar/dataset/ECG/'
+    DB_name = 'mitdb'
+    fs = 360
+    jump_lines = 1
+
+    # Read files: signal (.csv) and annotations (.txt)
+    fRecords = list()
+    fAnnotations = list()
+
+    lst = os.listdir(pathDB + DB_name + "/csv")
+    lst.sort()
+    for file in lst:
+        if file.endswith(".csv"):
+            if int(file[0:3]) in DS:
+                fRecords.append(file)
+        elif file.endswith(".txt"):
+            if int(file[0:3]) in DS:
+                fAnnotations.append(file)
+
+    MITBIH_classes = ['N', 'L', 'R', 'e', 'j', 'A', 'a', 'J', 'S', 'V', 'E', 'F']  #, 'P', '/', 'f', 'u']
+    AAMI_classes = []
+    AAMI_classes.append(['N', 'L', 'R'])                  # N
+    AAMI_classes.append(['A', 'a', 'J', 'S', 'e', 'j'])   # SVEB
+    AAMI_classes.append(['V', 'E'])                       # VEB
+    AAMI_classes.append(['F'])                            # F
+    #AAMI_classes.append(['P', '/', 'f', 'u'])            # Q
+
+    RAW_signals = []
+    r_index = 0
+
+    #for r, a in zip(fRecords, fAnnotations):
+    for r in range(0, len(fRecords)):
+
+        print("Processing signal " + str(r) + " / " + str(len(fRecords)) + "...")
+
+        # 1. Read signal
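+        # Each row of the Kaggle .csv is assumed to hold the sample number followed by the two
+        # lead values: column 1 is MLII and column 2 is V1 for every record except 114, where
+        # the two leads are stored in the opposite order (handled just below).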
+        filename = pathDB + DB_name + "/csv/" + fRecords[r]
+        print(filename)
+        f = open(filename, 'rb')
+        reader = csv.reader(f, delimiter=',')
+        next(reader)  # skip the header line!
+        MLII_index = 1
+        V1_index = 2
+        if int(fRecords[r][0:3]) == 114:
+            MLII_index = 2
+            V1_index = 1
+
+        MLII = []
+        V1 = []
+        for row in reader:
+            MLII.append(int(row[MLII_index]))
+            V1.append(int(row[V1_index]))
+        f.close()
+
+        RAW_signals.append((MLII, V1))  ## NOTE: a copy must be created in order to preserve the original signal
+        # display_signal(MLII)
+
+        # 2. Read annotations
+        filename = pathDB + DB_name + "/csv/" + fAnnotations[r]
+        print(filename)
+        f = open(filename, 'rb')
+        next(f)  # skip the header line!
+
+        annotations = []
+        for line in f:
+            annotations.append(line)
+        f.close()
+
+        # 3. Preprocess the signal!
+        if do_preprocess:
+            # Baseline estimation with two cascaded median filters (scipy.signal.medfilt)
+            baseline = medfilt(MLII, 71)
+            baseline = medfilt(baseline, 215)
+
+            # Remove baseline
+            for i in range(0, len(MLII)):
+                MLII[i] = MLII[i] - baseline[i]
+
+            # TODO Remove high frequencies
+
+            baseline = medfilt(V1, 71)
+            baseline = medfilt(baseline, 215)
+
+            # Remove baseline
+            for i in range(0, len(V1)):
+                V1[i] = V1[i] - baseline[i]
+
+        # Extract the R-peaks from the annotations
+        for a in annotations:
+            aS = a.split()
+
+            pos = int(aS[1])
+            originalPos = int(aS[1])
+            classAnttd = aS[2]
+            if pos > size_RR_max and pos < (len(MLII) - size_RR_max):
+                index, value = max(enumerate(MLII[pos - size_RR_max : pos + size_RR_max]), key=operator.itemgetter(1))
+                pos = (pos - size_RR_max) + index
+
+            peak_type = 0
+            #pos = pos-1
+
+            if classAnttd in MITBIH_classes:
+                if pos > winL and pos < (len(MLII) - winR):
+                    beat[r].append((MLII[pos - winL : pos + winR], V1[pos - winL : pos + winR]))
+                    for i in range(0, len(AAMI_classes)):
+                        if classAnttd in AAMI_classes[i]:
+                            class_AAMI = i
+                            break  # exit loop
+                    # Convert class
+                    class_ID[r].append(class_AAMI)
+
+                    valid_R[r] = np.append(valid_R[r], 1)
+                else:
+                    valid_R[r] = np.append(valid_R[r], 0)
+            else:
+                valid_R[r] = np.append(valid_R[r], 0)
+
+            R_poses[r] = np.append(R_poses[r], pos)
+            Original_R_poses[r] = np.append(Original_R_poses[r], originalPos)
+
+        #R_poses[r] = R_poses[r][(valid_R[r] == 1)]
+        #Original_R_poses[r] = Original_R_poses[r][(valid_R[r] == 1)]
+
+    # Set the data into a bigger struct that keeps all the records!
+    my_db.filename = fRecords
+
+    my_db.raw_signal = RAW_signals
+    my_db.beat = beat  # record, beat, lead
+    my_db.class_ID = class_ID
+    my_db.valid_R = valid_R
+    my_db.R_pos = R_poses
+    my_db.orig_R_pos = Original_R_poses
+
+    return my_db
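+
+
+# Illustrative usage sketch (not part of the original module): the window sizes, db_path and
+# descriptor list below are assumed example values, shown only to document how load_mit_db()
+# is called; adapt them to your own dataset layout.
+if __name__ == '__main__':
+    example_features, example_labels, example_patient_num_beats = load_mit_db(
+        DS='DS1',                                       # training split (DS1) or test split (DS2)
+        winL=90, winR=90,                               # samples kept left/right of each R-peak (example values)
+        do_preprocess=True,                             # remove the baseline with the two median filters
+        maxRR=True,                                     # use the relocated (maximum-amplitude) R-peaks
+        use_RR=True, norm_RR=True,                      # raw and patient-normalised RR features
+        compute_morph=['wvlt', 'HOS', 'myMorph'],       # any subset of the descriptors handled above
+        db_path='/home/mondejar/dataset/ECG/mitdb/',    # assumed dataset location
+        reduced_DS=False,
+        leads_flag=[1, 0])                              # [MLII, V1]: here only MLII is used
+    print(example_features.shape)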