Diff of /StabilityFS.py [000000] .. [f85ae2]


a b/StabilityFS.py
from scipy.spatial import distance, distance_matrix
from sklearn import svm
from sklearn.ensemble import RandomForestClassifier
from sklearn.impute import KNNImputer
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, make_scorer
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.utils import class_weight, shuffle, resample
from stability_selection import StabilitySelection, plot_stability_path


train_original = pd.read_csv("DataUsed/method23_real2.csv")
test_original = pd.read_csv("DataUsed/method23_real2_valid.csv")
df = train_original

# df.insert(3, "num2", num2)
targetIndex = -1
# df = df.iloc[pd.isna(df.iloc[:, targetIndex]).values == False, :]
# df = df.drop(columns=["Num1"])

# The last column is the class label; all preceding columns are features.
feature_names = df.columns[:-1]
df = df.values
X = df[:, :-1]
Y = df[:, targetIndex]
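
# KNNImputer is imported above but not used. If the CSVs contained missing
# values, one option (a sketch, not part of the original workflow) would be:
#     X = KNNImputer(n_neighbors=5).fit_transform(X)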

# Stability selection: refit an L2-penalised logistic regression on many
# subsamples over a grid of C values and keep features that are selected in a
# large fraction of the fits. (An L1 penalty is more common here, since it can
# drive coefficients exactly to zero.)
base_estimator = Pipeline([
    ('scaler', StandardScaler()),
    ('model', LogisticRegression(penalty='l2'))
])

selector = StabilitySelection(base_estimator=base_estimator, lambda_name='model__C',
                              lambda_grid=np.logspace(-5, -1, 50)).fit(X, Y)
fig, ax = plot_stability_path(selector)
fig.show()
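
# stability_scores_ has shape (n_features, n_lambdas): for each feature and each
# value of C it holds the fraction of subsampled fits in which that feature was
# selected; plot_stability_path draws these fractions against the C grid.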

selected_variables = selector.get_support(indices=True)
selected_scores = selector.stability_scores_.mean(axis=1)

# Mean stability score of each selected feature, plotted and exported to Excel.
selectedFeatures = pd.DataFrame({"selectedVars": feature_names[selected_variables],
                                 "score": selected_scores[selected_variables]},
                                index=feature_names[selected_variables])
selectedFeatures.plot(kind='barh')
selectedFeatures.to_excel("stabilityFeatureSelection.xlsx")
# print(selector.get_support(indices=True))

# The rest of the script trains on all features; uncommenting the next line
# would restrict it to the features kept by stability selection.
# X = X[:, selected_variables]

# Split samples by class: class 1 (Y == 1) is treated as the majority class and
# class 0 (Y == 0) as the minority class (see minorClassSize below).
class1Data = X[Y == 1, :]
class2Data = X[Y == 0, :]
class1Target = Y[Y == 1]
class2Target = Y[Y == 0]

pipelines = []
res = []
split = 3
kf = KFold(n_splits=split)
fold = 1

# Per-class K-fold indices. KFold splits by position only, so the data and
# target splits of the same class yield identical index arrays.
trainIndexC1 = []
trainIndexC2 = []
testIndexC1 = []
testIndexC2 = []
targetTrainIndexC1 = []
targetTrainIndexC2 = []
targetTestIndexC1 = []
targetTestIndexC2 = []
for train_index, test_index in kf.split(class1Data):
    trainIndexC1.append(train_index)
    testIndexC1.append(test_index)
for train_index, test_index in kf.split(class2Data):
    trainIndexC2.append(train_index)
    testIndexC2.append(test_index)
for train_index, test_index in kf.split(class1Target):
    targetTrainIndexC1.append(train_index)
    targetTestIndexC1.append(test_index)
for train_index, test_index in kf.split(class2Target):
    targetTrainIndexC2.append(train_index)
    targetTestIndexC2.append(test_index)
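
# An alternative (not what this script does) would be a single stratified split
# over the full data, e.g. sklearn.model_selection.StratifiedKFold(n_splits=split)
# applied to (X, Y); the per-class KFold above achieves a similar class balance
# across folds by splitting each class separately.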

def spScore(y_true, y_pred):
    # Custom score: 100 * (2 * AUC - recall). Defined here but not used below.
    aucValue = roc_auc_score(y_true, y_pred)
    rec = recall_score(y_true, y_pred)
    return 100 * (2 * aucValue - rec)
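
# If spScore were to drive the hyper-parameter search instead of F1, it could be
# wrapped with make_scorer, e.g. (a sketch, not part of the original pipeline):
#     GridSearchCV(DecisionTreeClassifier(), param_grid=tree_param,
#                  scoring=make_scorer(spScore))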

for index in range(len(targetTestIndexC2)):
    c1DataTrain, c1DataTest = class1Data[trainIndexC1[index], :], class1Data[testIndexC1[index], :]
    c2DataTrain, c2DataTest = class2Data[trainIndexC2[index], :], class2Data[testIndexC2[index], :]
    c1TargetTrain, c1TargetTest = class1Target[targetTrainIndexC1[index]], class1Target[targetTestIndexC1[index]]
    c2TargetTrain, c2TargetTest = class2Target[targetTrainIndexC2[index]], class2Target[targetTestIndexC2[index]]
    minorClassSize = c2DataTrain.shape[0]

    # Build one roughly balanced training chunk per slice of the majority-class
    # training data: each chunk pairs minorClassSize majority samples with the
    # minority-class samples, and one model is tuned per chunk.
    for i in range(int(c1DataTrain.shape[0] / minorClassSize)):
        X = np.append(class2Data, c1DataTrain[range(i * minorClassSize, (i + 1) * minorClassSize), :], axis=0)
        X = np.append(X, c2DataTrain, axis=0)
        CMS = np.append(class2Target, c1TargetTrain[range(i * minorClassSize, (i + 1) * minorClassSize)], axis=0)
        CMS = np.append(CMS, c2TargetTrain, axis=0)

        X_train, X_test, y_train, y_test = train_test_split(X, CMS, test_size=0.05, stratify=CMS)
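
        # The 5% internal hold-out (X_test, y_test) is only consumed by the
        # commented-out per-chunk metrics below; the chunk model itself is fit
        # on the remaining 95%.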

        # smt = SMOTETomek()

        # tree_param = {'bootstrap': [True, False],
        #               'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
        #               'max_features': ['auto', 'sqrt'],
        #               'min_samples_leaf': [1, 2, 4],
        #               'min_samples_split': [2, 5, 10],
        #               'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600]}
        # grid = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=tree_param, n_iter=10, verbose=2, n_jobs=-1, scoring=make_scorer(roc_auc_score))

        # Tune a decision tree on this balanced chunk, scored by F1.
        tree_param = {'criterion': ['gini', 'entropy'], 'max_depth': [5, 9, 20, 30, 40, 50, 70, 90, 120, 150]}
        grid = GridSearchCV(DecisionTreeClassifier(), param_grid=tree_param, scoring=make_scorer(f1_score))
        pipeline = make_pipeline(grid)
        class_weights = class_weight.compute_class_weight('balanced',
                                                          classes=np.unique(CMS),
                                                          y=CMS)
        accuracy = []
        recall = []
        fscore = []
        auc = []

        # sss = StratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=0)
        # for train_index, test_index in sss.split(X, CMS):

        # df_ = resample(X_all, n_samples=500, replace=False, stratify=y_train)
        # y_ = np.round(df_[:, -1])
        # df = df.select_dtypes(include=['float32', 'float64', 'int'])
        # X_ = df_[:, 0:df_.shape[1] - 1:1]
        # X_, y_ = ros.fit_sample(X_train, y_train)
        # X_, y_ = rus.fit_sample(X_, y_)
        X_, y_ = X_train, y_train
        # X_, y_ = smt.fit_resample(X_train, y_train)
        # X_, y_ = resample(X_, y_, stratify=y_, n_samples=1000)
        # weights = np.zeros([1, len(y_)])
        # weights[0, y_ == 0] = class_weights[0]
        # weights[0, y_ == 1] = class_weights[1]
        pipeline.fit(X_, y_)
        pipelines.append(pipeline)
        y_pred = pipeline.predict(X_test)

        # acc = accuracy_score(y_test, y_pred)
        # rec = recall_score(y_test, y_pred)
        # f1Score = f1_score(y_test, y_pred)
        # aucValue = roc_auc_score(y_test, y_pred)
        # accuracy.append(acc)
        # recall.append(rec)
        # fscore.append(f1Score)
        # auc.append(aucValue)
        #
        # print("Acc: {}".format(acc))
        # print("recall: {}".format(rec))
        # print("f1Score: {}".format(f1Score))
        # print("AUC : {}".format(aucValue))

    # Stacking stage: collect every chunk model's predictions on the full fold
    # training and test sets and use them, alongside the raw features, as
    # meta-features for a second-level classifier.
    Xtrain = np.append(c2DataTrain, c1DataTrain, axis=0)
    CMSTrain = np.append(c2TargetTrain, c1TargetTrain, axis=0)
    X = np.append(c2DataTest, c1DataTest, axis=0)
    CMS = np.append(c2TargetTest, c1TargetTest, axis=0)
    y_pred_train_all = np.zeros([CMSTrain.shape[0], len(pipelines)])
    y_pred_test_all = np.zeros([CMS.shape[0], len(pipelines)])
    for i, pipelineItem in enumerate(pipelines):
        y_pred_train_all[:, i] = pipelineItem.predict(Xtrain)
        y_pred_test_all[:, i] = pipelineItem.predict(X)
    class_weights = class_weight.compute_class_weight('balanced',
                                                      classes=np.unique(CMSTrain),
                                                      y=CMSTrain)
    weights = np.zeros([1, len(CMSTrain)])
    weights[0, CMSTrain == 0] = class_weights[0]
    weights[0, CMSTrain == 1] = class_weights[1]
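
    # class_weights / weights are computed but never passed to a model. If they
    # were meant as per-sample weights for the meta-classifier, one option (a
    # sketch, not what the original does) would be:
    #     grid.fit(np.append(y_pred_train_all, Xtrain, axis=1), CMSTrain,
    #              sample_weight=weights.ravel())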
    # param_grid = dict(scale_pos_weight=[1, 10, 25, 50, 75, 99, 100, 1000, 10000])
    tree_param = {'criterion': ['gini', 'entropy'], 'max_depth': [4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30, 40, 50, 70, 90, 120, 150]}
    # tree_param = {'bootstrap': [True, False],
    #               'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
    #               'max_features': ['auto', 'sqrt'],
    #               'min_samples_leaf': [1, 2, 4],
    #               'min_samples_split': [2, 5, 10],
    #               'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600]}
    # grid = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=tree_param, n_iter=20, verbose=2, n_jobs=-1, scoring=make_scorer(roc_auc_score))

    # Meta-classifier: a decision tree tuned on [chunk predictions, raw features].
    grid = GridSearchCV(DecisionTreeClassifier(), param_grid=tree_param, scoring=make_scorer(f1_score))
    # grid = GridSearchCV(estimator=XGBClassifier(), param_grid=param_grid, n_jobs=-1, scoring=make_scorer(roc_auc_score))

    grid.fit(np.append(y_pred_train_all, Xtrain, axis=1), CMSTrain)
    y_pred_test = grid.predict(np.append(y_pred_test_all, X, axis=1))
    # y_pred_test = y_pred_test_all[:, 0]

    # Fold metrics; sklearn metric functions expect (y_true, y_pred).
    acc = accuracy_score(CMS, y_pred_test)
    rec = recall_score(CMS, y_pred_test)
    f1Score = f1_score(CMS, y_pred_test)
    aucValue = roc_auc_score(CMS, y_pred_test)

    # accuracy.append(acc)
    # recall.append(rec)
    # fscore.append(f1Score)
    # auc.append(aucValue)
    res.append(pd.DataFrame({"target": CMS, "prediction": y_pred_test}))

    # After the last fold, write each fold's targets and predictions to Excel.
    if fold == split:
        with pd.ExcelWriter('resultFolds.xlsx') as writer:
            for kk in range(len(res)):
                res[kk].to_excel(writer, sheet_name='Fold{}'.format(kk))

    fold = fold + 1
    print("Test ==============================")
    print("Test Acc: {}".format(acc))
    print("Test recall: {}".format(rec))
    print("Test f1Score: {}".format(f1Score))
    print("Test AUC : {}".format(aucValue))
    print("************************************************")
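
# test_original is loaded at the top but never used. A sketch of scoring that
# held-out file with the last fold's stacked model (an assumption about the
# intended use, not part of the original script):
#     X_valid = test_original.values[:, :-1]
#     y_valid = test_original.values[:, targetIndex]
#     valid_meta = np.column_stack([p.predict(X_valid) for p in pipelines])
#     y_valid_pred = grid.predict(np.append(valid_meta, X_valid, axis=1))
#     print("Validation AUC: {}".format(roc_auc_score(y_valid, y_valid_pred)))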