|
a |
|
b/StabilityFS.py |
|
|
1 |
|
|
|
2 |
|
|
|
3 |
from scipy.spatial import distance, distance_matrix |
|
|
4 |
from sklearn import svm |
|
|
5 |
from sklearn.ensemble import RandomForestClassifier |
|
|
6 |
from sklearn.impute import KNNImputer |
|
|
7 |
import pandas as pd |
|
|
8 |
import numpy as np |
|
|
9 |
from sklearn.linear_model import LogisticRegression |
|
|
10 |
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score, make_scorer |
|
|
11 |
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split, KFold, GridSearchCV, RandomizedSearchCV |
|
|
12 |
from sklearn.pipeline import Pipeline |
|
|
13 |
from sklearn.preprocessing import StandardScaler |
|
|
14 |
from sklearn.svm import LinearSVC |
|
|
15 |
from sklearn.tree import DecisionTreeClassifier |
|
|
16 |
from sklearn.utils import class_weight, shuffle, resample |
|
|
17 |
from stability_selection import StabilitySelection, plot_stability_path |
|
|
18 |
|
|
|
19 |
|
|
|
20 |
|
|
|
21 |
# --- Data loading --------------------------------------------------------
# Both tables are read, but only the training table is used by the code
# that follows.
train_original = pd.read_csv("DataUsed/method23_real2.csv")
test_original = pd.read_csv("DataUsed/method23_real2_valid.csv")
df = train_original

# The label lives in the last column; every other column is a feature.
targetIndex = -1
_feature_positions = range(len(df.columns) - 1)
vars = df.columns[_feature_positions]  # NOTE: shadows the builtin vars()
df = df.values
X = df[:, _feature_positions]
Y = df[:, targetIndex]

# --- Stability selection -------------------------------------------------
# Standardise the features, then fit an L2-penalised logistic regression;
# the regularisation strength C is the parameter swept by the selector.
base_estimator = Pipeline(steps=[
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(penalty="l2")),
])

_c_grid = np.logspace(-5, -1, 50)  # 50 values of C from 1e-5 to 1e-1
selector = StabilitySelection(
    base_estimator=base_estimator,
    lambda_name="model__C",
    lambda_grid=_c_grid,
)
selector = selector.fit(X, Y)

fig, ax = plot_stability_path(selector)
fig.show()

# --- Export of the selected features --------------------------------------
selected_variables = selector.get_support(indices=True)
# Mean of the stability scores along axis 1 (presumably the C grid —
# confirm against the stability_selection documentation).
selected_scores = selector.stability_scores_.mean(axis=1)

_selected_names = vars[selected_variables]
selectedFeatures = pd.DataFrame(
    {
        "selectedVars": _selected_names,
        "score": selected_scores[selected_variables],
    },
    index=_selected_names,
)
selectedFeatures.plot(kind="barh")
selectedFeatures.to_excel("stabilityFeatureSelection.xlsx")
|
|
52 |
# print(selector.get_support(indices=True)) |
|
|
53 |
|
|
|
54 |
# X = X[:, selected_variables] |
|
|
55 |
# Separate the samples by label so that each class is folded on its own.
_is_class1 = Y == 1
_is_class2 = Y == 0
class1Data, class1Target = X[_is_class1, :], Y[_is_class1]
class2Data, class2Target = X[_is_class2, :], Y[_is_class2]

# Accumulators filled by the cross-validation loop further down.
pipelines = []
res = []

# Fold configuration. KFold is built without a shuffle argument, so it
# relies on scikit-learn's default for that setting.
split = 3
kf = KFold(n_splits=split)
fold = 1


def _kfold_indices(samples):
    """Return (train index arrays, test index arrays), one entry per fold of ``kf``."""
    folds = list(kf.split(samples))
    return [train for train, _ in folds], [test for _, test in folds]


# Index arrays are computed independently for the feature rows and for the
# label vector of each class.
trainIndexC1, testIndexC1 = _kfold_indices(class1Data)
trainIndexC2, testIndexC2 = _kfold_indices(class2Data)
targetTrainIndexC1, targetTestIndexC1 = _kfold_indices(class1Target)
targetTrainIndexC2, targetTestIndexC2 = _kfold_indices(class2Target)
|
|
86 |
|
|
|
87 |
|
|
|
88 |
def spScore(y_true, y_pred):
    """Return ``100 * (2 * AUC - recall)`` for the given labels and predictions.

    Both metrics are computed by scikit-learn with their default settings.

    NOTE(review): when ``y_pred`` holds hard 0/1 labels, ROC AUC equals the
    mean of sensitivity and specificity, so this expression reduces to
    ``100 * specificity`` — confirm that this is the intended meaning.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels (or scores accepted by ``roc_auc_score``).

    Returns:
        The combined score as a float.
    """
    doubled_auc = 2 * roc_auc_score(y_true, y_pred)
    return 100 * (doubled_auc - recall_score(y_true, y_pred))
|
|
92 |
|
|
|
93 |
|
|
|
94 |
# Cross-validated evaluation of a two-level ensemble.
#
# For every fold:
#   1. The class-1 training rows are cut into consecutive chunks the size of
#      the class-2 training set; each chunk plus the class-2 training rows
#      forms one "bag", and a grid-searched decision tree is fitted per bag.
#   2. The bag models' predictions are prepended as extra columns to the
#      original features, and a second grid-searched decision tree is fitted
#      on the fold's full training set.
#   3. That second model is scored on the held-out rows of the fold.
# After the last fold, the per-fold targets/predictions are written to
# 'resultFolds.xlsx', one sheet per fold.

# Hyper-parameter grids do not change between iterations.
base_tree_param = {'criterion': ['gini', 'entropy'],
                   'max_depth': [5, 9, 20, 30, 40, 50, 70, 90, 120, 150]}
tree_param = {'criterion': ['gini', 'entropy'],
              'max_depth': [4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30, 40, 50, 70, 90, 120, 150]}

for index in range(len(targetTestIndexC2)):
    # Per-class train/test partitions of this fold.
    c1DataTrain, c1DataTest = class1Data[trainIndexC1[index], :], class1Data[testIndexC1[index], :]
    c2DataTrain, c2DataTest = class2Data[trainIndexC2[index], :], class2Data[testIndexC2[index], :]
    c1TargetTrain, c1TargetTest = class1Target[targetTrainIndexC1[index]], class1Target[targetTestIndexC1[index]]
    c2TargetTrain, c2TargetTest = class2Target[targetTrainIndexC2[index]], class2Target[targetTestIndexC2[index]]
    minorClassSize = c2DataTrain.shape[0]

    # Every fold gets its own set of base learners: a model fitted in
    # another fold has seen rows that are held out in this one.
    pipelines = []

    for i in range(c1DataTrain.shape[0] // minorClassSize):
        chunk = slice(i * minorClassSize, (i + 1) * minorClassSize)

        # Only this fold's training rows go into a bag, so the held-out
        # rows stay unseen by every base learner.
        X_bag = np.append(c1DataTrain[chunk, :], c2DataTrain, axis=0)
        y_bag = np.append(c1TargetTrain[chunk], c2TargetTrain, axis=0)

        # A stratified 5% of each bag is set aside and not used further;
        # the model is fitted on the remaining 95%.
        X_train, _, y_train, _ = train_test_split(X_bag, y_bag, test_size=0.05, stratify=y_bag)

        grid = GridSearchCV(DecisionTreeClassifier(), param_grid=base_tree_param,
                            scoring=make_scorer(f1_score))
        # Single-step pipeline around the grid search; only fit/predict are
        # used on it below.
        pipeline = Pipeline([('gridsearchcv', grid)])
        pipeline.fit(X_train, y_train)
        pipelines.append(pipeline)

    # Full training and held-out sets of this fold (class 2 rows first).
    # NOTE: this rebinds the module-level X to the held-out rows.
    Xtrain = np.append(c2DataTrain, c1DataTrain, axis=0)
    CMSTrain = np.append(c2TargetTrain, c1TargetTrain, axis=0)
    X = np.append(c2DataTest, c1DataTest, axis=0)
    CMS = np.append(c2TargetTest, c1TargetTest, axis=0)

    # One column of base-learner predictions per bag model.
    y_pred_train_all = np.zeros([CMSTrain.shape[0], len(pipelines)])
    y_pred_test_all = np.zeros([CMS.shape[0], len(pipelines)])
    for i, pipelineItem in enumerate(pipelines):
        y_pred_train_all[:, i] = pipelineItem.predict(Xtrain)
        y_pred_test_all[:, i] = pipelineItem.predict(X)

    # Second-level model: base predictions followed by the raw features.
    grid = GridSearchCV(DecisionTreeClassifier(), param_grid=tree_param,
                        scoring=make_scorer(f1_score))
    grid.fit(np.append(y_pred_train_all, Xtrain, axis=1), CMSTrain)
    y_pred_test = grid.predict(np.append(y_pred_test_all, X, axis=1))

    # scikit-learn metrics take (y_true, y_pred) in that order; recall and
    # ROC AUC are not symmetric in their arguments.
    acc = accuracy_score(CMS, y_pred_test)
    rec = recall_score(CMS, y_pred_test)
    f1Score = f1_score(CMS, y_pred_test)
    aucValue = roc_auc_score(CMS, y_pred_test)

    res.append(pd.DataFrame({"target": CMS, "prediction": y_pred_test}))

    print("Test ==============================")
    print("Test Acc: {}".format(acc))
    print("Test recal: {}".format(rec))
    print("Test f1Score:{}".format(f1Score))
    print("Test AUC : {}".format(aucValue))
    print("************************************************")

# Export every fold's targets and predictions, one sheet per fold.
if res:
    with pd.ExcelWriter('resultFolds.xlsx') as writer:
        for kk, fold_result in enumerate(res):
            fold_result.to_excel(writer, sheet_name='Fold{}'.format(kk))