Diff of /Capstone code [000000] .. [16f085]

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Feb  3 09:37:21 2023

@author: ming
"""

import pandas as pd
from fancyimpute import KNN
from imblearn.over_sampling import RandomOverSampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from sklearn import preprocessing

#load data
X = pd.read_csv(r"/Users/ming/Downloads/Xset1.csv")
Y = pd.read_csv(r"/Users/ming/Downloads/Y.csv")

#del columns with fewer than 100 non-missing values
#(dropna returns a new frame, so the result must be assigned back)
X = X.dropna(axis=1, thresh=100)

#impute the remaining missing values with KNN (k=6), keeping the original column names
Xc = X.columns

X = pd.DataFrame(KNN(k=6).fit_transform(X))
X.columns = Xc

#standardize features to zero mean and unit variance
standardized_X = preprocessing.scale(X)
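
# Optional check (a sketch, not in the original script): the KNN imputation
# above should leave no missing values behind.
assert X.isna().sum().sum() == 0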

#oversampling
#random oversampling with imblearn to balance the classes
ros = RandomOverSampler(random_state=0)
#ravel Y to a 1-d target to avoid column-vector shape warnings downstream
X_resampled, y_resampled = ros.fit_resample(standardized_X, Y.values.ravel())
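
# Optional check (a sketch, not in the original script): confirm the classes
# are balanced after oversampling.
from collections import Counter
print("class counts after resampling:", Counter(y_resampled))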

models = {
    "knn": KNeighborsClassifier(n_neighbors=1),
    "naive_bayes": GaussianNB(),
    "logit": LogisticRegression(solver="lbfgs", multi_class="auto"),
    "svm": SVC(kernel="rbf", gamma="auto"),
    "decision_tree": DecisionTreeClassifier(criterion='gini', splitter='random', max_depth=15, min_samples_leaf=10, min_samples_split=10),
    "random_forest": RandomForestClassifier(n_estimators=60, max_depth=6, max_samples=70, max_features=5),
    "mlp": MLPClassifier()
}

trainX, testX, trainY, testY = train_test_split(X_resampled, y_resampled, random_state=3, test_size=0.2)

print("building model with '{}'...".format("random_forest"))
model = models["random_forest"]
model.fit(trainX, trainY)

'''
#grid search to tune the random-forest hyperparameters (left commented out)
tree_param_grid = {'n_estimators': range(50, 100, 10),
                   'min_samples_split': range(30, 111, 20),
                   'max_features': range(5, 76, 10),
                   'max_depth': range(3, 22, 3)
                   }

tree_best = GridSearchCV(model, param_grid=tree_param_grid, cv=3, scoring="roc_auc", n_jobs=-1, verbose=100)
tree_best.fit(trainX, trainY)

print(tree_best.best_estimator_)
print(tree_best.best_params_, "  ", "score:", tree_best.best_score_)
'''
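
# If the grid search above is uncommented, the tuned forest can replace the
# manually configured one (a sketch, assuming tree_best has been fitted):
# model = tree_best.best_estimator_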

#pair each feature with its importance from the fitted forest
features_importance = pd.DataFrame({"feature": Xc, "importance": model.feature_importances_})
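
# Optional (a sketch, not in the original script): print the ten most
# important features, sorted by importance.
print(features_importance.sort_values("importance", ascending=False).head(10))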

## prediction
print("evaluation...")
predictions = model.predict(testX)
print(classification_report(testY, predictions))
#AUC from predicted probabilities rather than hard labels (assumes a binary target)
auc_score = roc_auc_score(testY, model.predict_proba(testX)[:, 1])
print('CONFUSION MATRIX')
print(confusion_matrix(testY, predictions))
print("AUC")
print(auc_score)
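
# Optional sketch (not in the original script): fit and score every classifier
# defined in the `models` dict above, so they can be compared on the same split.
# AUC on hard predictions assumes a binary target.
for name, clf in models.items():
    clf.fit(trainX, trainY)
    preds = clf.predict(testX)
    print(name, "AUC:", roc_auc_score(testY, preds))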