a b/baselines/baseline.py
1
""" Code for baseline implementation """
2
import os
3
4
import numpy as np
5
import pickle as pkl
6
import random
7
import time
8
9
from classifiers import *
10
from mlp import MLP
11
12
from sklearn.preprocessing import StandardScaler, MinMaxScaler, MaxAbsScaler
13
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV as random_search
14
from sklearn.metrics import confusion_matrix, roc_auc_score, average_precision_score, auc, roc_curve, f1_score
15
16
17
18
class Baseline(object):
    """Train and evaluate the baseline classifiers for one target.

    Classifiers: lr, knn, svm, rf, xgb, mlp (see ``get_classifiers``).
    """

    def __init__(self, target, config=None):
        """
        Args:
            target: file-name prefix of the pickled data under
                ``intermediate/`` (``<target>.pos.mat.pkl`` and
                ``<target>.neg.mat.pkl``).
            config: optional settings mapping. Accepted for interface
                compatibility; this class does not read it.
        """
        self.X_pos, self.y_pos = [], []
        self.X_neg, self.y_neg = [], []
        self.intmd_path = 'intermediate/'
        self.target = target

    def _load_pickle(self, label):
        """Unpickle and return the ``(X, y)`` pair stored for ``label`` ('pos' or 'neg')."""
        # NOTE: pickle can execute arbitrary code while loading; only point
        # this at files produced by a trusted pipeline.
        path = self.intmd_path + self.target + '.' + label + '.mat.pkl'
        with open(path, 'rb') as f:
            return pkl.load(f)

    def load_data(self):
        """Load the positive and negative samples and aggregate each one.

        Each pickle holds an ``(X, y)`` pair of mappings keyed by sample.
        Every ``X`` entry is reduced with ``np.sum(..., axis=0)`` and the
        label is looked up under the same key.

        Returns:
            ``((X_pos, X_neg), (y_pos, y_neg))`` where each element is a list
            with one entry per sample.
        """
        X_pos_mat, y_pos_mat = self._load_pickle('pos')
        X_neg_mat, y_neg_mat = self._load_pickle('neg')

        print ("The number of positive samles is: ", len(y_pos_mat))
        print ("The number of negative samles is: ", len(y_neg_mat))

        # Rebuild the lists from scratch so that calling load_data() more
        # than once does not duplicate samples.
        self.X_pos = [np.sum(mat, axis=0) for mat in X_pos_mat.values()]
        self.y_pos = [y_pos_mat[s] for s in X_pos_mat]
        self.X_neg = [np.sum(mat, axis=0) for mat in X_neg_mat.values()]
        self.y_neg = [y_neg_mat[s] for s in X_neg_mat]

        return (self.X_pos, self.X_neg), (self.y_pos, self.y_neg)

    def get_classifiers(self, X, y):
        '''Cross-validate every baseline classifier and print pooled metrics.

        The data is split with a 5-fold StratifiedKFold; in each fold the
        lr, knn, svm, rf, xgb and mlp models are trained and scored.

        NOTE(review): lr, svm and mlp are expected to need normalized input,
        but no scaling is applied in this method.

        Args:
            X: ``(X_pos, X_neg)`` as returned by ``load_data``.
            y: ``(y_pos, y_neg)`` as returned by ``load_data``.

        Returns:
            dict mapping metric name ('aucroc', 'spec', 'sen', 'aucprc',
            'avepre', 'f1score') to a list with one value per (fold, model)
            pair. The printed mean/std are taken over that whole list, i.e.
            pooled across all models and folds.
        '''
        # Imported here because the module-level sklearn.metrics import does
        # not bring this name in.
        from sklearn.metrics import precision_recall_curve

        X_pos, X_neg = X
        y_pos, y_neg = y
        X = np.concatenate((X_pos, X_neg), axis=0)
        y = np.concatenate((y_pos, y_neg), axis=0)
        # Interleave positives and negatives before splitting.
        p = np.random.permutation(len(X))
        X, y = X[p], y[p]

        n_fold = 5
        # random_state is only meaningful (and only accepted by current
        # scikit-learn) together with shuffle=True.
        skf = StratifiedKFold(n_splits=n_fold, shuffle=True, random_state=99991)
        # OPTION: choose one of the classifiers
        models = {"LR": lr, "KNN": knn, "SVM": svm, "RF": rf, "XGB": xgb, "MLP": MLP}
        Res = {'aucroc': [], 'spec': [], 'sen': [], 'aucprc': [], 'avepre': [], 'f1score': []}

        for ifold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
            print ("----------The %d-th fold-----------" %ifold)

            X_tr, X_te = X[train_index], X[test_index]
            y_tr, y_te = y[train_index], y[test_index]

            for k, m in models.items():
                print ("The current model for optimizing is: " + k)
                # train
                if k == "MLP":
                    # init: feature_dim, num_classes
                    mlp = m(X_tr.shape[1], 2)
                    mlp.fit(X_tr, y_tr, X_te, y_te)
                    # Only the predictions (last of the five returned values)
                    # are used. Nothing is bound to the name ``auc`` so that
                    # sklearn.metrics.auc stays usable below.
                    _, _, _, _, yhat = mlp.evaluate(X_te, y_te)
                    # NOTE(review): the metrics below that need hard labels
                    # (confusion_matrix, f1_score) assume yhat holds class
                    # labels rather than probabilities - confirm against MLP.
                    yhat = np.array(yhat, dtype="float32")
                else:
                    m = m.fit(X_tr, y_tr)
                    # Keep the fitted estimator; it is refit on the next fold.
                    models[k] = m
                    yhat = m.predict(X_te)

                # eval: aucroc, aucprc
                aucroc = roc_auc_score(y_te, yhat)
                avepre = average_precision_score(y_te, yhat)
                precision, recall, _ = precision_recall_curve(y_te, yhat)
                aucprc = auc(recall, precision)
                # true negative, false positive, false negative, true positive
                tn, fp, fn, tp = confusion_matrix(y_te, yhat).ravel()
                # ``average`` must be passed by keyword; the third positional
                # parameter of f1_score is ``labels``.
                f1score = f1_score(y_te, yhat, average='micro')

                spec = tn / (tn+fp)
                sen = tp / (tp+fn)

                Res['aucroc'].append(aucroc)
                Res['spec'].append(spec)
                Res['sen'].append(sen)
                Res['aucprc'].append(aucprc)
                Res['avepre'].append(avepre)
                Res['f1score'].append(f1score)

        for metric in ('aucroc', 'spec', 'sen', 'aucprc', 'avepre', 'f1score'):
            values = np.array(Res[metric])
            print (metric + ' mean: ', np.mean(values))
            print (metric + ' std: ', np.std(values))

        return Res
119
120
#### Hyperparams Search ####
121
#######################
122
def classic_rsearch(x,y):
    """Run a randomized hyper-parameter search for LR (l1 and l2), SVM and RF.

    Each estimator is tuned with 60 sampled candidates, 5-fold
    cross-validation and ROC-AUC scoring, then refit on all of ``x``/``y``.

    Args:
        x: training features, passed straight to ``fit``.
        y: training labels, passed straight to ``fit``.

    Returns:
        ``(results, models)``: ``results`` maps each estimator name
        ('LR_l1', 'LR_l2', 'SVM', 'RF') to its ``cv_results_``; ``models`` is
        the list of fitted search objects in that same order.
    """
    from scipy.stats import uniform as sp_rand
    from scipy.stats import randint as sp_randint

    n_iter_search = 60

    # random search params
    # scipy's uniform(loc, scale) samples from [loc, loc + scale].
    lr_params = {'C': sp_rand(1, 1e5)}
    rf_params = {'criterion': ['gini', 'entropy'], 'n_estimators': sp_randint(10, 200), 'max_features': ['auto', 'sqrt', 'log2', None]}
    svm_params = {'kernel': ['rbf', 'poly'], 'C':sp_rand (1, 1e5), 'gamma': sp_rand(1e-5, 1)}

    # One (name, estimator, search space) entry per model so the three stay
    # aligned; the two LR variants get distinct names so neither overwrites
    # the other in ``results``.
    # NOTE(review): LR/SVM/RF come from the ``classifiers`` module; whether
    # the default LR solver supports penalty='l1' should be confirmed there.
    candidates = [
        ('LR_l1', LR(warm_start = True, penalty = 'l1', verbose = 100, max_iter = 5000), lr_params),
        ('LR_l2', LR(warm_start = True, penalty = 'l2', verbose = 100, max_iter = 5000), lr_params),
        ('SVM', SVM(verbose = True, probability = False, max_iter= 5000), svm_params),
        ('RF', RF(warm_start = True, verbose = 100), rf_params),
    ]

    results = {}
    models = []
    for name, estimator, param_dist in candidates:
        start = time.time()
        # ``fit_params`` and ``iid`` are not passed: both were removed from
        # RandomizedSearchCV in later scikit-learn releases.
        rsearch = random_search(estimator = estimator, param_distributions = param_dist, n_iter=n_iter_search,
                                scoring='roc_auc', n_jobs=1,
                                refit=True, cv=5, verbose=0, random_state=8)
        rsearch.fit(x, y)
        models.append(rsearch)
        results[name] = rsearch.cv_results_
        print (name+" results complete.")
        print("RandomizedSearchCV took %.2f seconds for %d candidates"
        " parameter settings." % ((time.time() - start), n_iter_search))
    return (results, models)
153
154
155
def main():
    """Load the "AD" dataset and run every baseline classifier on it."""
    baseline = Baseline("AD")
    features, labels = baseline.load_data()
    baseline.get_classifiers(features, labels)


# Run the baselines only when executed as a script, not on import.
if __name__ == "__main__":
    main()