
python/cross_validation.py
#!/usr/bin/env python

"""
cross_validation.py

VARPA, University of Coruna
Mondejar Guerra, Victor M.
15 Dec 2017
"""
from evaluation_AAMI import *
from sklearn import svm
from aggregation_voting_strategies import *
import numpy as np


# Evaluate the SVM model on one fold and return the AAMI performance measures
def eval_crossval_fold(svm_model, features, labels, multi_mode, voting_strategy):
    if multi_mode == 'ovo':
        decision_ovo = svm_model.decision_function(features)

        if voting_strategy == 'ovo_voting':
            predict_ovo, counter = ovo_voting(decision_ovo, 4)

        elif voting_strategy == 'ovo_voting_both':
            predict_ovo, counter = ovo_voting_both(decision_ovo, 4)

        elif voting_strategy == 'ovo_voting_exp':
            predict_ovo, counter = ovo_voting_exp(decision_ovo, 4)

        # Alternatives: svm_model.predict_log_proba, svm_model.predict_proba, svm_model.predict, ...
        perf_measures = compute_AAMI_performance_measures(predict_ovo, labels)
    else:
        raise ValueError('Unsupported multi_mode: ' + str(multi_mode))

    return perf_measures
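
# With decision_function_shape='ovo' and 4 classes, decision_function returns an
# (n_samples, 6) array: one score per pairwise classifier (n*(n-1)/2 = 6). The
# ovo_voting* strategies aggregate those pairwise scores into a single label.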


def run_cross_val(features, labels, patient_num_beats, division_mode, k):
    print("Running cross validation...")

    # C values to explore (gamma values could be swept the same way)
    C_values = [0.0001, 0.001, 0.01, 0.1, 1, 10, 50, 100, 200, 1000]
    #ijk_scores = np.zeros(len(C_values))
    cv_scores = np.zeros(len(C_values))
    index_cv = 0
    n_classes = 4

    for c_svm in C_values:
        # for g in gamma_values: ...

        features_k_fold = [np.array([]) for i in range(k)]
        label_k_fold = [np.array([]) for i in range(k)]

        ################
        # PREPARE DATA
        ################
        if division_mode == 'pat_cv':
            # One fold per patient: override k and rebuild the fold lists
            k = 22
            features_k_fold = [np.array([]) for i in range(k)]
            label_k_fold = [np.array([]) for i in range(k)]
            base = 0
            for kk in range(k):
                features_k_fold[kk] = features[base:base + patient_num_beats[kk]]
                label_k_fold[kk] = labels[base:base + patient_num_beats[kk]]
                base += patient_num_beats[kk]
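
            # Equivalent slicing via cumulative offsets (a sketch; assumes
            # patient_num_beats holds the per-patient beat counts):
            #
            #   bounds = np.concatenate(([0], np.cumsum(patient_num_beats)))
            #   features_k_fold[kk] = features[bounds[kk]:bounds[kk + 1]]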

        # NOTE: 22 folds will be very computationally costly
        # NOTE: division by patient and oversampling cannot be used together!!!!
        if division_mode == 'beat_cv':

            # NOTE: sklearn.model_selection.StratifiedKFold(n_splits=3, shuffle=False, random_state=None)
            # Stratified K-Folds cross-validator.
            # Provides train/test indices to split data in train/test sets.
            # This cross-validation object is a variation of KFold that returns stratified folds:
            # the folds are made by preserving the percentage of samples of each class!!
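            #
            # A minimal sketch (not used here) of building the same folds with
            # StratifiedKFold directly, assuming labels is a 1-D integer array:
            #
            #   from sklearn.model_selection import StratifiedKFold
            #   skf = StratifiedKFold(n_splits=k, shuffle=False)
            #   folds = list(skf.split(features, labels))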

            # Group features and labels by class ID
            features_by_class = {}

            for c in range(n_classes):
                features_by_class[c] = features[labels == c]

                # Then split each class group into k folds, so every fold keeps
                # the same class proportions
                instances_class = len(features_by_class[c])
                increment = instances_class // k
                base = 0
                for kk in range(k):
                    chunk = features_by_class[c][base:base + increment]
                    if features_k_fold[kk].size:
                        features_k_fold[kk] = np.vstack((features_k_fold[kk], chunk))
                        label_k_fold[kk] = np.hstack((label_k_fold[kk], np.zeros(increment) + c))
                    else:
                        features_k_fold[kk] = chunk
                        label_k_fold[kk] = np.zeros(increment) + c
                    base += increment


        ################
        # RUN CROSS VAL
        ################
        for kk in range(k):
            # Rotate: each iteration uses one fold for validation and the rest for training

            # For each k-fold select the train and validation data
            val_features = features_k_fold[kk]
            val_labels = label_k_fold[kk]
            tr_features = np.array([])
            tr_labels = np.array([])
            for kkk in range(k):
                if kkk != kk:
                    # Stack the k-1 training folds
                    tr_features = np.vstack((tr_features, features_k_fold[kkk])) if tr_features.size else features_k_fold[kkk]
                    tr_labels = np.append(tr_labels, label_k_fold[kkk])

            # pipeline = Pipeline([('transformer', scaler), ('estimator', clf)])
            # instead of "StandardScaler()"
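            #
            # A minimal sketch (assuming the features are not standardized yet) of
            # wrapping scaling and the SVM in a single sklearn Pipeline:
            #
            #   from sklearn.pipeline import Pipeline
            #   from sklearn.preprocessing import StandardScaler
            #   clf = Pipeline([('scaler', StandardScaler()),
            #                   ('estimator', svm.SVC(C=c_svm, kernel='rbf'))])
            #   clf.fit(tr_features, tr_labels)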

            #####################################################
            # Train
            multi_mode = 'ovo'
            # Inverse-frequency class weights: n_samples / count(class c)
            class_weights = {}
            for c in range(n_classes):
                class_weights.update({c: len(tr_labels) / float(np.count_nonzero(tr_labels == c))})
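            # NOTE: class_weight='balanced' computes a similar weighting
            # automatically, n_samples / (n_classes * count(class c)), which only
            # differs from the weights above by the constant factor 1/n_classes:
            #
            #   svm.SVC(C=c_svm, kernel='rbf', class_weight='balanced', ...)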
            svm_model = svm.SVC(C=c_svm, kernel='rbf', degree=3, gamma='auto',
                                coef0=0.0, shrinking=True, probability=False, tol=0.001,
                                cache_size=200, class_weight=class_weights, verbose=False,
                                max_iter=-1, decision_function_shape=multi_mode, random_state=None)

            # Let's train!
            svm_model.fit(tr_features, tr_labels)

            #########################################################################
            # 4) Test the SVM model
            # ovo_voting would simply add 1 to the winning class; here the
            # exponential weighting variant ('ovo_voting_exp') is used
            perf_measures = eval_crossval_fold(svm_model, val_features, val_labels, multi_mode, 'ovo_voting_exp')

            # TODO: evaluate with Ijk itself? That way we would obtain the SVM
            # trained to maximize that measure...
            #ijk_scores[index_cv] += perf_measures.Ijk

            cv_scores[index_cv] += np.average(perf_measures.F_measure)

            # TODO: g-mean?
            # Zhang et al. compute the g-mean, but they computed the g-mean value
            # for each 1-vs-1 SVM model: NvsS, NvsV, ..., SvsV...

            # NOTE: we could use the F-measure average over the classes??
            print("C value " + str(c_svm) + ", cross val k " + str(kk + 1) + "/" + str(k) +
                  "  AVG(F-measure) = " + str(cv_scores[index_cv] / float(kk + 1)))
        # end for each k-fold

        cv_scores[index_cv] /= float(k)  # Average the result over the k folds
        # NOTE: which measure should be maximized in the cross validation????

        index_cv += 1
    # end for each C value

    return cv_scores, C_values
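

# A minimal usage sketch (hypothetical data names; 'beat_cv' with k = 5 folds):
#
#   cv_scores, C_values = run_cross_val(features, labels, patient_num_beats, 'beat_cv', 5)
#   best_C = C_values[np.argmax(cv_scores)]
#   print("Best C:", best_C)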