|
a |
|
b/baselines/baseline.py |
|
|
1 |
""" Code for baseline implementation """ |
|
|
2 |
import os |
|
|
3 |
|
|
|
4 |
import numpy as np |
|
|
5 |
import pickle as pkl |
|
|
6 |
import random |
|
|
7 |
import time |
|
|
8 |
|
|
|
9 |
from classifiers import * |
|
|
10 |
from mlp import MLP |
|
|
11 |
|
|
|
12 |
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler |
|
|
13 |
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV as random_search |
|
|
14 |
from sklearn.metrics import confusion_matrix, roc_auc_score, average_precision_score, auc, roc_curve, f1_score |
|
|
15 |
|
|
|
16 |
|
|
|
17 |
|
|
|
18 |
class Baseline(object):
    """Cross-validated baseline classifiers for one prediction target.

    Classifiers: lr, knn, svm, rf, xgb (taken from the ``classifiers``
    module) and the ``MLP`` class from ``mlp``.
    """

    def __init__(self, target, config=None):
        """Create an empty baseline for ``target``.

        Args:
            target: name of the target; used as the file-name prefix of the
                pickled matrices read by :meth:`load_data`.
            config: optional dict of settings. It is copied and stored as
                ``self.config``; nothing in this class reads it yet.
        """
        self.X_pos, self.y_pos = [], []
        self.X_neg, self.y_neg = [], []
        self.intmd_path = 'intermediate/'
        self.target = target
        # Copy so that instances never share (or mutate) a default dict.
        self.config = dict(config) if config else {}

    def _read_matrices(self, polarity):
        """Unpickle the ``(X, y)`` pair stored for ``polarity`` ('pos'/'neg')."""
        path = os.path.join(self.intmd_path,
                            self.target + '.' + polarity + '.mat.pkl')
        # NOTE(security): pickle can execute arbitrary code while loading;
        # only read intermediate files produced by a trusted pipeline.
        with open(path, 'rb') as f:
            return pkl.load(f)

    @staticmethod
    def _aggregate(X_mat, y_mat):
        """Sum each sample's matrix over axis 0 and pair it with its label."""
        features, labels = [], []
        for key, matrix in X_mat.items():
            features.append(np.sum(matrix, axis=0))
            labels.append(y_mat[key])
        return features, labels

    def load_data(self):
        """Load the positive and negative samples of ``self.target``.

        Reads ``<intmd_path><target>.pos.mat.pkl`` and
        ``<intmd_path><target>.neg.mat.pkl``. Each file holds a pair of
        mappings ``(X, y)`` indexed by the same keys; every ``X`` entry is
        reduced to one vector by summing over axis 0.

        Returns:
            ``((X_pos, X_neg), (y_pos, y_neg))`` where each element is a list.
            The same lists are stored on the instance. They are rebuilt on
            every call, so calling this twice does not duplicate samples.
        """
        X_pos_mat, y_pos_mat = self._read_matrices('pos')
        X_neg_mat, y_neg_mat = self._read_matrices('neg')

        print("The number of positive samles is: ", len(y_pos_mat))
        print("The number of negative samles is: ", len(y_neg_mat))

        # aggregate the data (one summed vector per sample)
        self.X_pos, self.y_pos = self._aggregate(X_pos_mat, y_pos_mat)
        self.X_neg, self.y_neg = self._aggregate(X_neg_mat, y_neg_mat)

        return (self.X_pos, self.X_neg), (self.y_pos, self.y_neg)

    def get_classifiers(self, X, y):
        """Evaluate every baseline model with 5-fold stratified CV.

        Positive and negative samples are concatenated and randomly permuted,
        then split with ``StratifiedKFold``. Each model is fitted on the
        training part of each fold and scored on the held-out part. Mean and
        standard deviation of every metric are printed; the statistics are
        pooled over all folds AND all models in ``models``.

        NOTE(review): the original docstring says lr, svm and mlp need
        normalization, but no scaling was ever applied (a ``StandardScaler``
        was created and never used). Features are still passed unscaled;
        confirm whether scaling should be fitted on each training fold.

        Args:
            X: pair ``(X_pos, X_neg)`` of feature sequences.
            y: pair ``(y_pos, y_neg)`` of label sequences. The four-way
                unpacking of the confusion matrix below requires exactly two
                distinct labels in every test fold.

        Returns:
            dict mapping metric name ('aucroc', 'spec', 'sen', 'aucprc',
            'avepre', 'f1score') to the list of per-(fold, model) values.
        """
        # Local import: the module-level import list does not include it.
        from sklearn.metrics import precision_recall_curve

        X_pos, X_neg = X
        y_pos, y_neg = y
        X = np.concatenate((X_pos, X_neg), axis=0)
        y = np.concatenate((y_pos, y_neg), axis=0)
        p = np.random.permutation(len(X))
        X, y = X[p], y[p]

        n_fold = 5
        # random_state only has an effect together with shuffle=True; recent
        # scikit-learn releases raise ValueError when it is given without it.
        skf = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=99991)
        # OPTION: choose one of the classifiers
        models = {"LR": lr, "KNN": knn, "SVM": svm, "RF": rf, "XGB": xgb, "MLP": MLP}
        metric_names = ('aucroc', 'spec', 'sen', 'aucprc', 'avepre', 'f1score')
        Res = {'aucroc': [], 'spec': [], 'sen': [], 'aucprc': [], 'avepre': [], 'f1score': []}

        for ifold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
            print("----------The %d-th fold-----------" % ifold)

            X_tr, X_te = X[train_index], X[test_index]
            y_tr, y_te = y[train_index], y[test_index]

            # Iterate over a snapshot because fitted estimators are written
            # back into ``models`` inside the loop.
            for k, m in list(models.items()):
                print("The current model for optimizing is: " + k)
                # train
                if k == "MLP":
                    # init: feature_dim, num_classes
                    mlp = m(X_tr.shape[1], 2)
                    mlp.fit(X_tr, y_tr, X_te, y_te)
                    # Only the predictions are needed. The other returned
                    # values are discarded so that the imported sklearn
                    # ``auc`` function is not shadowed by a local variable.
                    _, _, _, _, yhat = mlp.evaluate(X_te, y_te)
                    yhat = np.array(yhat, dtype="float32")
                else:
                    m = m.fit(X_tr, y_tr)
                    yhat = m.predict(X_te)
                    models[k] = m

                # eval: aucroc, aucprc
                aucroc = roc_auc_score(y_te, yhat)
                avepre = average_precision_score(y_te, yhat)
                precision, recall, _ = precision_recall_curve(y_te, yhat)
                aucprc = auc(recall, precision)
                # true negative, false positive, false negative, true positive
                tn, fp, fn, tp = confusion_matrix(y_te, yhat).ravel()
                # 'micro' must be passed as ``average``; given positionally
                # it would be interpreted as ``labels``.
                f1score = f1_score(y_te, yhat, average='micro')

                spec = tn / (tn + fp)
                sen = tp / (tp + fn)

                Res['aucroc'].append(aucroc)
                Res['spec'].append(spec)
                Res['sen'].append(sen)
                Res['aucprc'].append(aucprc)
                Res['avepre'].append(avepre)
                Res['f1score'].append(f1score)

        for metric in metric_names:
            values = np.array(Res[metric])
            print(metric + ' mean: ', np.mean(values))
            print(metric + ' std: ', np.std(values))

        return Res
|
|
119 |
|
|
|
120 |
#### Hyperparams Search #### |
|
|
121 |
####################### |
|
|
122 |
def classic_rsearch(x, y):
    """Randomized hyper-parameter search for the classic baseline models.

    Runs a 60-candidate, 5-fold ``RandomizedSearchCV`` (scored by ROC AUC)
    for four estimators in turn: L1-penalised LR, L2-penalised LR, SVM and RF.
    ``LR``, ``SVM`` and ``RF`` are expected to be estimator classes that are
    already in scope at module level.

    Args:
        x: feature matrix passed to ``RandomizedSearchCV.fit``.
        y: labels passed to ``RandomizedSearchCV.fit``.

    Returns:
        ``(results, models)``: ``results`` maps an estimator label
        ('LR_L1', 'LR_L2', 'SVM', 'RF') to that search's ``cv_results_``;
        ``models`` is the list of fitted search objects in the same order.
    """
    from scipy.stats import uniform as sp_rand
    from scipy.stats import randint as sp_randint

    lr1 = LR(warm_start = True, penalty = 'l1', verbose = 100, max_iter = 5000)
    lr2 = LR(warm_start = True, penalty = 'l2', verbose = 100, max_iter = 5000)
    svm = SVM(verbose = True, probability = False, max_iter= 5000)
    rf = RF(warm_start = True, verbose = 100)

    # random search params
    lr_params = {'C': sp_rand(1, 1e5)}
    rf_params = {'criterion': ['gini', 'entropy'], 'n_estimators': sp_randint(10, 200), 'max_features': ['auto', 'sqrt', 'log2', None]}
    # Kept for reference only: no MLP estimator is part of the search below.
    mlp_params = {'hidden_layer_sizes':[(64, 64), (128, 128), (256, 256), (512, 512)], 'alpha': sp_rand(1e-6, 1e-2)}
    svm_params = {'kernel': ['rbf', 'poly'], 'C':sp_rand (1, 1e5), 'gamma': sp_rand(1e-5, 1)}

    # One (label, estimator, search space) triple per search. The original
    # list of labels had three entries for four estimators, which mislabelled
    # the results and raised IndexError on the last estimator.
    candidates = [
        ('LR_L1', lr1, lr_params),
        ('LR_L2', lr2, lr_params),
        ('SVM', svm, svm_params),
        ('RF', rf, rf_params),
    ]

    n_iter_search = 60
    results = {}
    models = []
    for name, estimator, param_dist in candidates:
        start = time.time()
        # ``fit_params`` and ``iid`` are no longer accepted by the
        # RandomizedSearchCV constructor in current scikit-learn, so they are
        # not passed. ``fit_params=None`` was already the default.
        rsearch = random_search(estimator = estimator, param_distributions = param_dist, n_iter=n_iter_search,
                                scoring='roc_auc', n_jobs=1,
                                refit=True, cv=5, verbose=0, random_state=8)
        rsearch.fit(x, y)
        models.append(rsearch)
        results[name] = rsearch.cv_results_
        print (name+" results complete.")
        print("RandomizedSearchCV took %.2f seconds for %d candidates"
              " parameter settings." % ((time.time() - start), n_iter_search))
    # The original returned an undefined name ``data``; the collected
    # cv results are what is available to return here.
    return (results, models)
|
|
153 |
|
|
|
154 |
|
|
|
155 |
def main():
    """Load the "AD" data set and run the cross-validated baselines on it."""
    baseline = Baseline("AD")
    features, labels = baseline.load_data()
    baseline.get_classifiers(features, labels)


# Run the baselines only when this file is executed as a script.
if __name__ == "__main__":
    main()