Capstone code

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 3 09:37:21 2023

@author: ming
"""

import pandas as pd
from fancyimpute import KNN
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn import preprocessing

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

# load data
X = pd.read_csv(r"/Users/ming/Downloads/Xset1.csv")
Y = pd.read_csv(r"/Users/ming/Downloads/Y.csv")

# drop columns with fewer than 100 non-missing values
X = X.dropna(axis=1, thresh=100)

# impute missing values with KNN (k=6); fancyimpute returns a bare ndarray,
# so save and restore the column names
Xc = X.columns
X = pd.DataFrame(KNN(k=6).fit_transform(X))
X.columns = Xc

# standardize features to zero mean and unit variance
standardized_X = preprocessing.scale(X)

# oversampling: random oversampling with imblearn
from imblearn.over_sampling import RandomOverSampler

ros = RandomOverSampler(random_state=0)
# ravel the single-column label frame to the 1-d array imblearn expects
X_resampled, y_resampled = ros.fit_resample(standardized_X, Y.values.ravel())

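# Note: because oversampling happens before the train/test split below,
# duplicated minority samples can land in both splits and inflate test scores.
# A minimal alternative sketch (same names as above) would split first and
# oversample only the training portion:
#
#   trainX, testX, trainY, testY = train_test_split(
#       standardized_X, Y.values.ravel(), random_state=3, test_size=0.2)
#   trainX, trainY = ros.fit_resample(trainX, trainY)
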
models = {
    "knn": KNeighborsClassifier(n_neighbors=1),
    "naive_bayes": GaussianNB(),
    "logit": LogisticRegression(solver="lbfgs", multi_class="auto"),
    "svm": SVC(kernel="rbf", gamma="auto"),
    "decision_tree": DecisionTreeClassifier(criterion="gini", splitter="random",
                                            max_depth=15, min_samples_leaf=10,
                                            min_samples_split=10),
    "random_forest": RandomForestClassifier(n_estimators=60, max_depth=6,
                                            max_samples=70, max_features=5),
    "mlp": MLPClassifier(),
}

trainX, testX, trainY, testY = train_test_split(
    X_resampled, y_resampled, random_state=3, test_size=0.2)

print("using '{}' to build model...".format("random_forest"))
model = models["random_forest"]
model.fit(trainX, trainY)

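# optional: a minimal sketch looping over the models dict to compare all
# classifiers on the same split (only random_forest is fitted above); it uses
# only names already defined in this script
'''
for name, clf in models.items():
    clf.fit(trainX, trainY)
    print(name)
    print(classification_report(testY, clf.predict(testX)))
'''
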
'''
# grid search over random-forest hyperparameters
tree_param_grid = {'n_estimators': range(50, 100, 10),
                   'min_samples_split': range(30, 111, 20),
                   'max_features': range(5, 76, 10),
                   'max_depth': range(3, 22, 3)}

# GridSearchCV tuning
tree_best = GridSearchCV(model, param_grid=tree_param_grid, cv=3,
                         scoring="roc_auc", n_jobs=-1, verbose=100)
tree_best.fit(trainX, trainY)

print(tree_best.best_estimator_)
print(tree_best.best_params_, " ", "score:", tree_best.best_score_)
'''

# feature importances from the fitted random forest
features_importance = pd.DataFrame({"feature": Xc,
                                    "importance": model.feature_importances_})

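# optional: rank features by importance and show the ten largest
print(features_importance.sort_values("importance", ascending=False).head(10))
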
## prediction
print("evaluation...")
predictions = model.predict(testX)
print(classification_report(testY, predictions))
auc_score = roc_auc_score(testY, predictions)
print('CONFUSION MATRIX')
print(confusion_matrix(testY, predictions))
print("AUC")
print(auc_score)
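# note: the AUC above is computed from hard 0/1 predictions; scoring the
# predicted probability of the positive class is usually more informative.
# A minimal sketch with the same fitted model:
#
#   proba = model.predict_proba(testX)[:, 1]
#   print(roc_auc_score(testY, proba))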