--- a +++ b/python/train_SVM.py @@ -0,0 +1,361 @@ +#!/usr/bin/env python + +""" +train_SVM.py + +VARPA, University of Coruna +Mondejar Guerra, Victor M. +23 Oct 2017 +""" + +from load_MITBIH import * +from evaluation_AAMI import * +from aggregation_voting_strategies import * +from oversampling import * +from cross_validation import * +from feature_selection import * + +import sklearn +from sklearn.externals import joblib +from sklearn.preprocessing import StandardScaler +from sklearn import svm + +from sklearn import decomposition + +import os + +def create_svm_model_name(model_svm_path, winL, winR, do_preprocess, + maxRR, use_RR, norm_RR, compute_morph, use_weight_class, feature_selection, + oversamp_method, leads_flag, reduced_DS, pca_k, delimiter): + + if reduced_DS == True: + model_svm_path = model_svm_path + delimiter + 'exp_2' + + if leads_flag[0] == 1: + model_svm_path = model_svm_path + delimiter + 'MLII' + + if leads_flag[1] == 1: + model_svm_path = model_svm_path + delimiter + 'V1' + + if oversamp_method: + model_svm_path = model_svm_path + delimiter + oversamp_method + + if feature_selection: + model_svm_path = model_svm_path + delimiter + feature_selection + + if do_preprocess: + model_svm_path = model_svm_path + delimiter + 'rm_bsln' + + if maxRR: + model_svm_path = model_svm_path + delimiter + 'maxRR' + + if use_RR: + model_svm_path = model_svm_path + delimiter + 'RR' + + if norm_RR: + model_svm_path = model_svm_path + delimiter + 'norm_RR' + + for descp in compute_morph: + model_svm_path = model_svm_path + delimiter + descp + + if use_weight_class: + model_svm_path = model_svm_path + delimiter + 'weighted' + + if pca_k > 0: + model_svm_path = model_svm_path + delimiter + 'pca_' + str(pca_k) + + return model_svm_path + + +# Eval the SVM model and export the results +def eval_model(svm_model, features, labels, multi_mode, voting_strategy, output_path, C_value, gamma_value, DS): + if multi_mode == 'ovo': + decision_ovo = svm_model.decision_function(features) + + if voting_strategy == 'ovo_voting': + predict_ovo, counter = ovo_voting(decision_ovo, 4) + + elif voting_strategy == 'ovo_voting_both': + predict_ovo, counter = ovo_voting_both(decision_ovo, 4) + + elif voting_strategy == 'ovo_voting_exp': + predict_ovo, counter = ovo_voting_exp(decision_ovo, 4) + + # svm_model.predict_log_proba svm_model.predict_proba svm_model.predict ... + perf_measures = compute_AAMI_performance_measures(predict_ovo, labels) + + """ + elif multi_mode == 'ovr':cr + decision_ovr = svm_model.decision_function(features) + predict_ovr = svm_model.predict(features) + perf_measures = compute_AAMI_performance_measures(predict_ovr, labels) + """ + + # Write results and also predictions on DS2 + if not os.path.exists(output_path): + os.makedirs(output_path) + + if gamma_value != 0.0: + write_AAMI_results( perf_measures, output_path + '/' + DS + 'C_' + str(C_value) + 'g_' + str(gamma_value) + + '_score_Ijk_' + str(format(perf_measures.Ijk, '.2f')) + '_' + voting_strategy + '.txt') + else: + write_AAMI_results( perf_measures, output_path + '/' + DS + 'C_' + str(C_value) + + '_score_Ijk_' + str(format(perf_measures.Ijk, '.2f')) + '_' + voting_strategy + '.txt') + + # Array to .csv + if multi_mode == 'ovo': + if gamma_value != 0.0: + np.savetxt(output_path + '/' + DS + 'C_' + str(C_value) + 'g_' + str(gamma_value) + + '_decision_ovo.csv', decision_ovo) + np.savetxt(output_path + '/' + DS + 'C_' + str(C_value) + 'g_' + str(gamma_value) + + '_predict_' + voting_strategy + '.csv', predict_ovo.astype(int), '%.0f') + else: + np.savetxt(output_path + '/' + DS + 'C_' + str(C_value) + + '_decision_ovo.csv', decision_ovo) + np.savetxt(output_path + '/' + DS + 'C_' + str(C_value) + + '_predict_' + voting_strategy + '.csv', predict_ovo.astype(int), '%.0f') + + elif multi_mode == 'ovr': + np.savetxt(output_path + '/' + DS + 'C_' + str(C_value) + + '_decision_ovr.csv', prob_ovr) + np.savetxt(output_path + '/' + DS + 'C_' + str(C_value) + + '_predict_' + voting_strategy + '.csv', predict_ovr.astype(int), '%.0f') + + print("Results writed at " + output_path + '/' + DS + 'C_' + str(C_value)) + + + +def create_oversamp_name(reduced_DS, do_preprocess, compute_morph, winL, winR, maxRR, use_RR, norm_RR, pca_k): + oversamp_features_pickle_name = '' + if reduced_DS: + oversamp_features_pickle_name += '_reduced_' + + if do_preprocess: + oversamp_features_pickle_name += '_rm_bsline' + + if maxRR: + oversamp_features_pickle_name += '_maxRR' + + if use_RR: + oversamp_features_pickle_name += '_RR' + + if norm_RR: + oversamp_features_pickle_name += '_norm_RR' + + for descp in compute_morph: + oversamp_features_pickle_name += '_' + descp + + if pca_k > 0: + oversamp_features_pickle_name += '_pca_' + str(pca_k) + + oversamp_features_pickle_name += '_wL_' + str(winL) + '_wR_' + str(winR) + + return oversamp_features_pickle_name + + + +def main(multi_mode='ovo', winL=90, winR=90, do_preprocess=True, use_weight_class=True, + maxRR=True, use_RR=True, norm_RR=True, compute_morph={''}, oversamp_method = '', pca_k = '', feature_selection = '', do_cross_val = '', C_value = 0.001, gamma_value = 0.0, reduced_DS = False, leads_flag = [1,0]): + print("Runing train_SVM.py!") + + db_path = '/home/mondejar/dataset/ECG/mitdb/m_learning/scikit/' + + # Load train data + [tr_features, tr_labels, tr_patient_num_beats] = load_mit_db('DS1', winL, winR, do_preprocess, + maxRR, use_RR, norm_RR, compute_morph, db_path, reduced_DS, leads_flag) + + # Load Test data + [eval_features, eval_labels, eval_patient_num_beats] = load_mit_db('DS2', winL, winR, do_preprocess, + maxRR, use_RR, norm_RR, compute_morph, db_path, reduced_DS, leads_flag) + if reduced_DS == True: + np.savetxt('mit_db/' + 'exp_2_' + 'DS2_labels.csv', eval_labels.astype(int), '%.0f') + else: + np.savetxt('mit_db/' + 'DS2_labels.csv', eval_labels.astype(int), '%.0f') + + #if reduced_DS == True: + # np.savetxt('mit_db/' + 'exp_2_' + 'DS1_labels.csv', tr_labels.astype(int), '%.0f') + #else: + #np.savetxt('mit_db/' + 'DS1_labels.csv', tr_labels.astype(int), '%.0f') + + ############################################################## + # 0) TODO if feature_Selection: + # before oversamp!!????? + + # TODO perform normalization before the oversampling? + if oversamp_method: + # Filename + oversamp_features_pickle_name = create_oversamp_name(reduced_DS, do_preprocess, compute_morph, winL, winR, maxRR, use_RR, norm_RR, pca_k) + + # Do oversampling + tr_features, tr_labels = perform_oversampling(oversamp_method, db_path + 'oversamp/python_mit', oversamp_features_pickle_name, tr_features, tr_labels) + + # Normalization of the input data + # scaled: zero mean unit variance ( z-score ) + scaler = StandardScaler() + scaler.fit(tr_features) + tr_features_scaled = scaler.transform(tr_features) + + # scaled: zero mean unit variance ( z-score ) + eval_features_scaled = scaler.transform(eval_features) + ############################################################## + # 0) ????????????? feature_Selection: also after Oversampling??? + if feature_selection: + print("Runing feature selection") + best_features = 7 + tr_features_scaled, features_index_sorted = run_feature_selection(tr_features_scaled, tr_labels, feature_selection, best_features) + eval_features_scaled = eval_features_scaled[:, features_index_sorted[0:best_features]] + # 1) + if pca_k > 0: + + # Load if exists?? + # NOTE PCA do memory error! + + # NOTE 11 Enero: TEST WITH IPCA!!!!!! + start = time.time() + + print("Runing IPCA " + str(pca_k) + "...") + + # Run PCA + IPCA = sklearn.decomposition.IncrementalPCA(pca_k, batch_size=pca_k) # gamma_pca + + #tr_features_scaled = KPCA.fit_transform(tr_features_scaled) + IPCA.fit(tr_features_scaled) + + # Apply PCA on test data! + tr_features_scaled = IPCA.transform(tr_features_scaled) + eval_features_scaled = IPCA.transform(eval_features_scaled) + + """ + print("Runing TruncatedSVD (singular value decomposition (SVD)!!!) (alternative to PCA) " + str(pca_k) + "...") + + svd = decomposition.TruncatedSVD(n_components=pca_k, algorithm='arpack') + svd.fit(tr_features_scaled) + tr_features_scaled = svd.transform(tr_features_scaled) + eval_features_scaled = svd.transform(eval_features_scaled) + + """ + end = time.time() + + print("Time runing IPCA (rbf): " + str(format(end - start, '.2f')) + " sec" ) + ############################################################## + # 2) Cross-validation: + + if do_cross_val: + print("Runing cross val...") + start = time.time() + + # TODO Save data over the k-folds and ranked by the best average values in separated files + perf_measures_path = create_svm_model_name('/home/mondejar/Dropbox/ECG/code/ecg_classification/python/results/' + multi_mode, winL, winR, do_preprocess, + maxRR, use_RR, norm_RR, compute_morph, use_weight_class, feature_selection, oversamp_method, leads_flag, reduced_DS, pca_k, '/') + + # TODO implement this method! check to avoid NaN scores.... + + if do_cross_val == 'pat_cv': # Cross validation with one fold per patient + cv_scores, c_values = run_cross_val(tr_features_scaled, tr_labels, tr_patient_num_beats, do_cross_val, len(tr_patient_num_beats)) + + if not os.path.exists(perf_measures_path): + os.makedirs(perf_measures_path) + np.savetxt(perf_measures_path + '/cross_val_k-pat_cv_F_score.csv', (c_values, cv_scores.astype(float)), "%f") + + elif do_cross_val == 'beat_cv': # cross validation by class id samples + k_folds = {5} + for k in k_folds: + ijk_scores, c_values = run_cross_val(tr_features_scaled, tr_labels, tr_patient_num_beats, do_cross_val, k) + # TODO Save data over the k-folds and ranked by the best average values in separated files + perf_measures_path = create_svm_model_name('/home/mondejar/Dropbox/ECG/code/ecg_classification/python/results/' + multi_mode, winL, winR, do_preprocess, + maxRR, use_RR, norm_RR, compute_morph, use_weight_class, feature_selection, oversamp_method, leads_flag, reduced_DS, pca_k, '/') + + if not os.path.exists(perf_measures_path): + os.makedirs(perf_measures_path) + np.savetxt(perf_measures_path + '/cross_val_k-' + str(k) + '_Ijk_score.csv', (c_values, ijk_scores.astype(float)), "%f") + + end = time.time() + print("Time runing Cross Validation: " + str(format(end - start, '.2f')) + " sec" ) + else: + + ################################################################################################ + # 3) Train SVM model + + # TODO load best params from cross validation! + + use_probability = False + + model_svm_path = db_path + 'svm_models/' + multi_mode + '_rbf' + + model_svm_path = create_svm_model_name(model_svm_path, winL, winR, do_preprocess, + maxRR, use_RR, norm_RR, compute_morph, use_weight_class, feature_selection, + oversamp_method, leads_flag, reduced_DS, pca_k, '_') + + if gamma_value != 0.0: + model_svm_path = model_svm_path + '_C_' + str(C_value) + '_g_' + str(gamma_value) +'.joblib.pkl' + else: + model_svm_path = model_svm_path + '_C_' + str(C_value) + '.joblib.pkl' + + print("Training model on MIT-BIH DS1: " + model_svm_path + "...") + + if os.path.isfile(model_svm_path): + # Load the trained model! + svm_model = joblib.load(model_svm_path) + + else: + class_weights = {} + for c in range(4): + class_weights.update({c:len(tr_labels) / float(np.count_nonzero(tr_labels == c))}) + + #class_weight='balanced', + if gamma_value != 0.0: # NOTE 0.0 means 1/n_features default value + svm_model = svm.SVC(C=C_value, kernel='rbf', degree=3, gamma=gamma_value, + coef0=0.0, shrinking=True, probability=use_probability, tol=0.001, + cache_size=200, class_weight=class_weights, verbose=False, + max_iter=-1, decision_function_shape=multi_mode, random_state=None) + else: + svm_model = svm.SVC(C=C_value, kernel='rbf', degree=3, gamma='auto', + coef0=0.0, shrinking=True, probability=use_probability, tol=0.001, + cache_size=200, class_weight=class_weights, verbose=False, + max_iter=-1, decision_function_shape=multi_mode, random_state=None) + + # Let's Train! + + start = time.time() + svm_model.fit(tr_features_scaled, tr_labels) + end = time.time() + # TODO assert that the class_ID appears with the desired order, + # with the goal of ovo make the combinations properly + print("Trained completed!\n\t" + model_svm_path + "\n \ + \tTime required: " + str(format(end - start, '.2f')) + " sec" ) + + # Export model: save/write trained SVM model + joblib.dump(svm_model, model_svm_path) + + # TODO Export StandardScaler() + + ######################################################################### + # 4) Test SVM model + print("Testing model on MIT-BIH DS2: " + model_svm_path + "...") + + ############################################################################################################ + # EVALUATION + ############################################################################################################ + + # Evaluate the model on the training data + perf_measures_path = create_svm_model_name('/home/mondejar/Dropbox/ECG/code/ecg_classification/python/results/' + multi_mode, winL, winR, do_preprocess, + maxRR, use_RR, norm_RR, compute_morph, use_weight_class, feature_selection, oversamp_method, leads_flag, reduced_DS, pca_k, '/') + + # ovo_voting: + # Simply add 1 to the win class + print("Evaluation on DS1 ...") + eval_model(svm_model, tr_features_scaled, tr_labels, multi_mode, 'ovo_voting', perf_measures_path, C_value, gamma_value, 'Train_') + + # Let's test new data! + print("Evaluation on DS2 ...") + eval_model(svm_model, eval_features_scaled, eval_labels, multi_mode, 'ovo_voting', perf_measures_path, C_value, gamma_value, '') + + + # ovo_voting_exp: + # Consider the post prob adding to both classes + print("Evaluation on DS1 ...") + eval_model(svm_model, tr_features_scaled, tr_labels, multi_mode, 'ovo_voting_exp', perf_measures_path, C_value, gamma_value, 'Train_') + + # Let's test new data! + print("Evaluation on DS2 ...") + eval_model(svm_model, eval_features_scaled, eval_labels, multi_mode, 'ovo_voting_exp', perf_measures_path, C_value, gamma_value, '')