python/cross_validation.py

#!/usr/bin/env python

"""
cross_validation.py

VARPA, University of Coruna
Mondejar Guerra, Victor M.
15 Dec 2017
"""
import numpy as np

from evaluation_AAMI import *
from sklearn import svm
from aggregation_voting_strategies import *


# Eval the SVM model and export the results
def eval_crossval_fold(svm_model, features, labels, multi_mode, voting_strategy):
    if multi_mode == 'ovo':
        decision_ovo = svm_model.decision_function(features)

        if voting_strategy == 'ovo_voting':
            predict_ovo, counter = ovo_voting(decision_ovo, 4)

        elif voting_strategy == 'ovo_voting_both':
            predict_ovo, counter = ovo_voting_both(decision_ovo, 4)

        elif voting_strategy == 'ovo_voting_exp':
            predict_ovo, counter = ovo_voting_exp(decision_ovo, 4)

        # Alternatives: svm_model.predict_log_proba, svm_model.predict_proba, svm_model.predict, ...
        perf_measures = compute_AAMI_performance_measures(predict_ovo, labels)

    return perf_measures
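
# For context: with 4 classes and decision_function_shape='ovo', decision_function
# returns 4*(4-1)/2 = 6 pairwise scores per sample, ordered (0 vs 1), (0 vs 2),
# (0 vs 3), (1 vs 2), (1 vs 3), (2 vs 3). As an illustration only (this is NOT the
# implementation in aggregation_voting_strategies), plain majority voting over those
# scores could be sketched as below, assuming the usual convention that a positive
# value votes for the first class of the pair:
#
# from itertools import combinations
# def ovo_voting_sketch(decision_ovo, n_classes):
#     pairs = list(combinations(range(n_classes), 2))
#     votes = np.zeros((decision_ovo.shape[0], n_classes))
#     for col, (i, j) in enumerate(pairs):
#         winners = np.where(decision_ovo[:, col] > 0, i, j)
#         for row, w in enumerate(winners):
#             votes[row, w] += 1  # simply add 1 to the winning class
#     return np.argmax(votes, axis=1), votes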

def run_cross_val(features, labels, patient_num_beats, division_mode, k):
    print("Running cross-validation...")

    # Hyperparameter grid: C values (gamma_values could be searched the same way)
    C_values = [0.0001, 0.001, 0.01, 0.1, 1, 10, 50, 100, 200, 1000]
    # ijk_scores = np.zeros(len(C_values))
    cv_scores = np.zeros(len(C_values))
    index_cv = 0
    n_classes = 4

    for c_svm in C_values:
        # for g in gamma_values: ...

        features_k_fold = [np.array([]) for i in range(k)]
        label_k_fold = [np.array([]) for i in range(k)]

        ################
        # PREPARE DATA
        ################
        if division_mode == 'pat_cv':
            # One fold per patient record (k is forced to 22)
            k = 22
            base = 0
            for kk in range(k):
                features_k_fold[kk] = features[base:base + patient_num_beats[kk]]
                label_k_fold[kk] = labels[base:base + patient_num_beats[kk]]
                base = base + patient_num_beats[kk]

            # NOTE: 22 folds are very computationally expensive
            # NOTE: division by patient and oversampling cannot be used together!
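
            # The same patient-wise split could also be expressed with scikit-learn's
            # LeaveOneGroupOut. A minimal sketch, assuming a per-beat array patient_ids
            # (not built in this file) that marks the source record of each beat:
            #
            # from sklearn.model_selection import LeaveOneGroupOut
            # logo = LeaveOneGroupOut()
            # for tr_index, val_index in logo.split(features, labels, groups=patient_ids):
            #     tr_features, val_features = features[tr_index], features[val_index]
            #     tr_labels, val_labels = labels[tr_index], labels[val_index]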

        if division_mode == 'beat_cv':
            # NOTE: sklearn.model_selection.StratifiedKFold(n_splits=3, shuffle=False, random_state=None)
            # Stratified K-Folds cross-validator: provides train/test indices to split data
            # into train/test sets. This cross-validation object is a variation of KFold that
            # returns stratified folds: the folds are made by preserving the percentage of
            # samples for each class.

            # Group features and labels by class ID
            features_by_class = {}
            for c in range(n_classes):
                features_by_class[c] = features[labels == c]

                # Then split each class group into k folds, so that every fold keeps
                # the same class proportions
                instances_class = len(features_by_class[c])
                increment = instances_class // k
                base = 0
                for kk in range(k):
                    features_k_fold[kk] = np.vstack((features_k_fold[kk], features_by_class[c][base:base + increment])) if features_k_fold[kk].size else features_by_class[c][base:base + increment]
                    label_k_fold[kk] = np.hstack((label_k_fold[kk], np.zeros(increment) + c)) if label_k_fold[kk].size else np.zeros(increment) + c
                    base = base + increment
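
            # The same stratified folds could be produced with scikit-learn directly; a
            # minimal sketch (indices rather than copies, defaults for shuffle/random_state):
            #
            # from sklearn.model_selection import StratifiedKFold
            # skf = StratifiedKFold(n_splits=k)
            # for fold, (tr_index, val_index) in enumerate(skf.split(features, labels)):
            #     features_k_fold[fold] = features[val_index]
            #     label_k_fold[fold] = labels[val_index]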

        ################
        # RUN CROSS VAL
        ################
        for kk in range(k):
            # Each iteration rotates one fold for validation and uses the rest for training

            # For each k-fold select the train and validation data
            val_features = features_k_fold[kk]
            val_labels = label_k_fold[kk]
            tr_features = np.array([])
            tr_labels = np.array([])
            for kkk in range(k):
                if kkk != kk:
                    # Stack the selected training folds
                    tr_features = np.vstack((tr_features, features_k_fold[kkk])) if tr_features.size else features_k_fold[kkk]
                    tr_labels = np.append(tr_labels, label_k_fold[kkk])

            # pipeline = Pipeline([('transformer', scaler), ('estimator', clf)])
            # could be used instead of applying "StandardScaler()" by hand
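
            # A minimal sketch of that Pipeline idea (hypothetical here, since no scaler is
            # fitted in this file); the scaler would then be fit only on the training fold:
            #
            # from sklearn.pipeline import Pipeline
            # from sklearn.preprocessing import StandardScaler
            # pipeline = Pipeline([('scaler', StandardScaler()),
            #                      ('estimator', svm.SVC(C=c_svm, kernel='rbf', gamma='auto'))])
            # pipeline.fit(tr_features, tr_labels)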

            #####################################################
            # Train
            multi_mode = 'ovo'
            class_weights = {}
            for c in range(n_classes):
                class_weights.update({c: len(tr_labels) / float(np.count_nonzero(tr_labels == c))})
            # class_weight='balanced',
            svm_model = svm.SVC(C=c_svm, kernel='rbf', degree=3, gamma='auto',
                                coef0=0.0, shrinking=True, probability=False, tol=0.001,
                                cache_size=200, class_weight=class_weights, verbose=False,
                                max_iter=-1, decision_function_shape=multi_mode, random_state=None)
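
            # For reference, class_weight='balanced' would set each weight to
            # n_samples / (n_classes * n_samples_in_class), i.e. the manual weights above
            # divided by n_classes; since class weights multiply C, the two choices are not
            # strictly identical. A sketch of that variant:
            #
            # svm_model = svm.SVC(C=c_svm, kernel='rbf', gamma='auto',
            #                     class_weight='balanced',
            #                     decision_function_shape=multi_mode)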

            # Let's train!
            svm_model.fit(tr_features, tr_labels)

            #########################################################################
            # 4) Test SVM model
            # ovo_voting: simply adds 1 to the winning class (here 'ovo_voting_exp' is used)
            perf_measures = eval_crossval_fold(svm_model, val_features, val_labels, multi_mode, 'ovo_voting_exp')

            # TODO: evaluate with the Ijk index itself? That way the selected SVM would be
            # the one trained to maximize that measure...
            # ijk_scores[index_cv] += perf_measures.Ijk

            cv_scores[index_cv] += np.average(perf_measures.F_measure)

            # TODO: g-mean?
            # Zhang et al. compute the g-mean, but they compute it for each one-vs-one
            # SVM model: N vs S, N vs V, ..., S vs V.
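
            # A multi-class g-mean could be sketched as the geometric mean of the per-class
            # recalls (illustrative only; it assumes perf_measures exposes a per-class Recall
            # array, which may not match the names used in evaluation_AAMI):
            #
            # g_mean = np.prod(perf_measures.Recall) ** (1.0 / n_classes)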

            # NOTE: we could also use the F-measure averaged over all classes?

            print("C value (" + str(c_svm) + ") Cross val k " + str(kk) + "/" + str(k) +
                  " AVG(F-measure) = " + str(cv_scores[index_cv] / float(kk + 1)))
        # end for kk in range(k)

        cv_scores[index_cv] /= float(k)  # Average this result over the k folds
        # NOTE: which measure should be maximized in the cross-validation?

        index_cv += 1
    # end for c_svm in C_values

    return cv_scores, C_values
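

# Example usage (a sketch; it assumes features, labels and patient_num_beats have
# already been loaded elsewhere, since this file does not load any data):
#
# cv_scores, C_values = run_cross_val(features, labels, patient_num_beats,
#                                     division_mode='beat_cv', k=5)
# best_C = C_values[np.argmax(cv_scores)]
# print("Best C value: " + str(best_C))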