[2b4aea]: / baselines / baseline.py


""" Code for baseline implementation """
import os
import numpy as np
import pickle as pkl
import random
import time
from classifiers import *
from mlp import MLP
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV as random_search
from sklearn.metrics import confusion_matrix, roc_auc_score, average_precision_score, auc, roc_curve, f1_score
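# NOTE (assumption): the `classifiers` star-import is taken to expose ready-made
# estimator instances (lr, knn, svm, rf, xgb) used in get_classifiers below,
# plus the estimator classes LR, SVM and RF used in classic_rsearch. A minimal
# sketch of such a module, under that assumption, would be:
#
#     from sklearn.linear_model import LogisticRegression as LR
#     from sklearn.neighbors import KNeighborsClassifier
#     from sklearn.svm import SVC as SVM
#     from sklearn.ensemble import RandomForestClassifier as RF
#     from xgboost import XGBClassifier
#
#     lr = LR(max_iter=5000)
#     knn = KNeighborsClassifier()
#     svm = SVM()
#     rf = RF()
#     xgb = XGBClassifier()
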
class Baseline(object):
    """
    Baseline classifiers: lr, svm, rf, gbdt, mlp.
    """
    def __init__(self, target, config=None):
        """
        Args:
            target: name of the prediction target (e.g. "AD"), used to locate
                the intermediate pickle files.
            config: optional dict of extra settings (currently unused).
        """
        self.X_pos, self.y_pos = [], []
        self.X_neg, self.y_neg = [], []
        self.intmd_path = 'intermediate/'
        self.target = target
    def load_data(self):
        with open(self.intmd_path + self.target + '.pos.mat.pkl', 'rb') as f:
            X_pos_mat, y_pos_mat = pkl.load(f)
        with open(self.intmd_path + self.target + '.neg.mat.pkl', 'rb') as f:
            X_neg_mat, y_neg_mat = pkl.load(f)
        print("The number of positive samples is:", len(y_pos_mat))
        print("The number of negative samples is:", len(y_neg_mat))
        # aggregate the data: sum each sample's per-row feature vectors
        # into a single feature vector
        for s, array in X_pos_mat.items():
            self.X_pos.append(np.sum(array, axis=0))
            self.y_pos.append(y_pos_mat[s])
        for s, array in X_neg_mat.items():
            self.X_neg.append(np.sum(array, axis=0))
            self.y_neg.append(y_neg_mat[s])
        return (self.X_pos, self.X_neg), (self.y_pos, self.y_neg)

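    # NOTE (assumption): each .mat.pkl file above is taken to hold a tuple
    # (X_mat, y_mat), where X_mat maps a sample id to a 2-D array of per-event
    # feature vectors and y_mat maps the same id to its binary label, e.g.:
    #
    #     X_mat = {'s1': np.array([[1., 0.], [0., 2.]])}   # summed to [1., 2.]
    #     y_mat = {'s1': 1}
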
    def get_classifiers(self, X, y):
        '''Split by StratifiedKFold, then fit the lr, svm, rf, gbdt and mlp
        classifiers. lr, svm and mlp need feature normalization.
        '''
        X_pos, X_neg = X
        y_pos, y_neg = y
        X = np.concatenate((X_pos, X_neg), axis=0)
        y = np.concatenate((y_pos, y_neg), axis=0)
        # shuffle once before cross-validation
        p = np.random.permutation(len(X))
        X, y = X[p], y[p]
        n_fold = 5
        # random_state requires shuffle=True in recent scikit-learn
        skf = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=99991)
        scaler = StandardScaler()
        # OPTION: choose one of the classifiers
        models = {"LR": lr, "KNN": knn, "SVM": svm, "RF": rf, "XGB": xgb, "MLP": MLP}
        ifold = 0
        results = dict()
        Res = {'aucroc': [], 'spec': [], 'sen': [], 'aucprc': [], 'avepre': [], 'f1score': []}
        for train_index, test_index in skf.split(X, y):
            ifold += 1
            print("----------The %d-th fold-----------" % ifold)
            results[ifold] = dict()
            X_tr, X_te = X[train_index], X[test_index]
            y_tr, y_te = y[train_index], y[test_index]
            # fit the scaler on the training fold only, to avoid leakage;
            # scaling is harmless for the tree-based models
            X_tr = scaler.fit_transform(X_tr)
            X_te = scaler.transform(X_te)
            for k, m in models.items():
                print("The current model for optimizing is: " + k)
                # train
                if k == "MLP":
                    # init: feature_dim, num_classes
                    mlp = m(X_tr.shape[1], 2)
                    fit_auc, fit_accuracy, fit_losses = mlp.fit(X_tr, y_tr, X_te, y_te)
                    # name the returned value auc_val so it does not shadow sklearn.metrics.auc
                    string, auc_val, accuracy, loss, yhat = mlp.evaluate(X_te, y_te)
                    yhat = np.array(yhat, dtype="float32")
                else:
                    m = m.fit(X_tr, y_tr)
                    yhat = m.predict(X_te)
                # eval: aucroc, aucprc
                aucroc = roc_auc_score(y_te, yhat)
                avepre = average_precision_score(y_te, yhat)
                precision, recall, _ = precision_recall_curve(y_te, yhat)
                aucprc = auc(recall, precision)
                # true negative, false positive, false negative, true positive
                tn, fp, fn, tp = confusion_matrix(y_te, yhat).ravel()
                f1score = f1_score(y_te, yhat, average='micro')
                spec = tn / (tn + fp)
                sen = tp / (tp + fn)
                # keep the fitted model (the MLP instance for the "MLP" entry)
                models[k] = mlp if k == "MLP" else m
                Res['aucroc'].append(aucroc)
                Res['spec'].append(spec)
                Res['sen'].append(sen)
                Res['aucprc'].append(aucprc)
                Res['avepre'].append(avepre)
                Res['f1score'].append(f1score)
        for metric in ('aucroc', 'spec', 'sen', 'aucprc', 'avepre', 'f1score'):
            scores = np.array(Res[metric])
            print('%s mean: ' % metric, np.mean(scores))
            print('%s std: ' % metric, np.std(scores))
#### Hyperparams Search ####
############################
def classic_rsearch(x, y):
    from scipy.stats import uniform as sp_rand
    from scipy.stats import randint as sp_randint
    # the default lbfgs solver does not support the l1 penalty, so use saga
    lr1 = LR(warm_start=True, penalty='l1', solver='saga', verbose=100, max_iter=5000)
    lr2 = LR(warm_start=True, penalty='l2', verbose=100, max_iter=5000)
    svm = SVM(verbose=True, probability=False, max_iter=5000)
    rf = RF(warm_start=True, verbose=100)
    # random-search parameter distributions
    lr_params = {'C': sp_rand(1, 1e5)}
    rf_params = {'criterion': ['gini', 'entropy'], 'n_estimators': sp_randint(10, 200),
                 'max_features': ['sqrt', 'log2', None]}
    # defined for completeness; the MLP is not searched below
    mlp_params = {'hidden_layer_sizes': [(64, 64), (128, 128), (256, 256), (512, 512)],
                  'alpha': sp_rand(1e-6, 1e-2)}
    svm_params = {'kernel': ['rbf', 'poly'], 'C': sp_rand(1, 1e5), 'gamma': sp_rand(1e-5, 1)}
    results = {}
    models = []
    lst = [lr1, lr2, svm, rf]
    # one name per estimator in lst
    names = ['LR1', 'LR2', 'SVM', 'RF']
    params = [lr_params, lr_params, svm_params, rf_params]
    for idx in range(len(lst)):
        n_iter_search = 60
        start = time.time()
        # fit_params and iid were removed from RandomizedSearchCV in recent scikit-learn
        rsearch = random_search(estimator=lst[idx], param_distributions=params[idx],
                                n_iter=n_iter_search, scoring='roc_auc', n_jobs=1,
                                refit=True, cv=5, verbose=0, random_state=8)
        rsearch.fit(x, y)
        models.append(rsearch)
        results[names[idx]] = rsearch.cv_results_
        print(names[idx] + " results complete.")
        print("RandomizedSearchCV took %.2f seconds for %d candidate"
              " parameter settings." % ((time.time() - start), n_iter_search))
    return (results, models)
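
# Example (hypothetical call, assuming the data returned by Baseline.load_data
# is first flattened into a single matrix and label vector):
#
#     (X_pos, X_neg), (y_pos, y_neg) = Baseline("AD").load_data()
#     x = np.concatenate((X_pos, X_neg), axis=0)
#     y = np.concatenate((y_pos, y_neg), axis=0)
#     results, models = classic_rsearch(x, y)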
def main():
    target = "AD"
    bl = Baseline(target)
    X, y = bl.load_data()
    bl.get_classifiers(X, y)


if __name__ == "__main__":
    main()