|
a |
|
b/XgBoost code |
|
|
1 |
#!/usr/bin/env python3 |
|
|
2 |
# -*- coding: utf-8 -*- |
|
|
3 |
""" |
|
|
4 |
Created on Sun Feb 26 15:36:18 2023 |
|
|
5 |
|
|
|
6 |
@author: ming |
|
|
7 |
""" |
|
|
8 |
|
|
|
9 |
|
|
|
10 |
|
|
|
11 |
#import packages |
|
|
12 |
from sklearn.metrics import precision_score |
|
|
13 |
|
|
|
14 |
import xgboost as xgb |
|
|
15 |
import numpy as np |
|
|
16 |
import pandas as pd |
|
|
17 |
import matplotlib.pyplot as plt |
|
|
18 |
from matplotlib.colors import ListedColormap |
|
|
19 |
from sklearn.preprocessing import LabelEncoder |
|
|
20 |
from sklearn.metrics import classification_report |
|
|
21 |
from sklearn.model_selection import GridSearchCV #网格搜索 |
|
|
22 |
import matplotlib.pyplot as plt#可视化 |
|
|
23 |
import seaborn as sns#绘图包 |
|
|
24 |
from sklearn.model_selection import train_test_split |
|
|
25 |
|
|
|
26 |
|
|
|
27 |
# Load the feature table (X) and the label table (Y) from local CSV files.
# Both come back as pandas DataFrames.
X, Y = (
    pd.read_csv(csv_path)
    for csv_path in (
        r"/Users/ming/Downloads/Xset3.csv",
        r"/Users/ming/Downloads/Y.csv",
    )
)
|
|
30 |
|
|
|
31 |
|
|
|
32 |
|
|
|
33 |
from imblearn.over_sampling import RandomOverSampler

# Split data.
# The hold-out set is carved out BEFORE oversampling. RandomOverSampler
# duplicates minority-class rows; resampling first would let copies of the
# same row land in both the training and the test partition, which leaks
# information and inflates every test metric.
trainX, testX, trainY, testY = train_test_split(X, Y, random_state=3, test_size=0.2)

# Balance the classes on the training partition only. The test partition keeps
# the original class distribution so the reported scores reflect real data.
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(trainX, trainY)
X_resampled.info()
trainX, trainY = X_resampled, y_resampled

# Train the model on the balanced training data.
model = xgb.XGBClassifier()
model.fit(trainX, trainY)

# Predicted class labels for the untouched hold-out set.
y_pred = model.predict(testX)
|
|
46 |
|
|
|
47 |
|
|
|
48 |
|
|
|
49 |
|
|
|
50 |
|
|
|
51 |
|
|
|
52 |
|
|
|
53 |
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,cohen_kappa_score

# Evaluation metrics.

# Count the predictions that match the ground truth.
# testY is a DataFrame, so its values have shape (n, 1); comparing that with
# the 1-D y_pred would broadcast to an (n, n) matrix and the sum would count
# pairwise matches instead of per-sample matches. Flatten to 1-D first.
# NOTE(review): assumes Y holds a single label column - confirm against Y.csv.
y_true = np.asarray(testY).ravel()
true = np.sum(y_pred == y_true)
print('the right number of prediction:', true)
print('预测错的的结果数目为:', testY.shape[0]-true)

# Accuracy, precision and recall are printed as percentages with 4
# significant digits; F1 and Cohen's kappa are printed as raw scores.
print('预测数据的准确率为: {:.4}%'.format(accuracy_score(testY,y_pred)*100))
print('预测数据的精确率为:{:.4}%'.format(
      precision_score(testY,y_pred)*100))
print('预测数据的召回率为:{:.4}%'.format(
      recall_score(testY,y_pred)*100))
print('预测数据的F1值为:',
      f1_score(testY,y_pred))
print('预测数据的Cohen’s Kappa系数为:',
      cohen_kappa_score(testY,y_pred))

# Print the per-class classification report.
print('the evaluation report:',
      classification_report(testY,y_pred))
|
|
74 |
|
|
|
75 |
from xgboost import plot_importance

# Draw the fitted model's feature-importance chart.
# The default figure size is widened first so the chart is created at 14x8.
plt.rcParams.update({"figure.figsize": (14, 8)})
plot_importance(model)
|
|
79 |
|
|
|
80 |
|
|
|
81 |
|
|
|
82 |
from sklearn.metrics import precision_recall_curve
from sklearn import metrics

# ROC curve and AUC for the test-set predictions.

# predict_proba returns one column per class (column 0 -> class 0,
# column 1 -> class 1); keep the class-1 column as the positive-class score.
y_pred_prob = model.predict_proba(testX)[:, 1]

# pos_label is the label treated as the positive class; it must match how the
# target is encoded (e.g. 0/1 versus 1/2).
fpr, tpr, thresholds = metrics.roc_curve(testY, y_pred_prob, pos_label=1)

# AUC is the area under the ROC curve.
roc_auc = metrics.auc(fpr, tpr)

plt.figure(figsize=(8, 6))
# Dashed diagonal is drawn first as a visual baseline, then the ROC curve.
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.plot(fpr, tpr, 'r', label=f'AUC = {roc_auc:0.2f}')
plt.legend(loc='lower right')
plt.xlim(0, 1.1)
plt.ylim(0, 1.1)
plt.xlabel('False Positive Rate')  # x axis: fpr
plt.ylabel('True Positive Rate')   # y axis: tpr
plt.title('Receiver operating characteristic example')
plt.show()
|
|
103 |
|
|
|
104 |
# Score the original feature table X with the fitted model:
# class-1 probabilities first, then hard class labels.
proba_all_rows = model.predict_proba(X)
y_pred_prob1 = proba_all_rows[:, 1]
y_pred1 = model.predict(X)
|
|
106 |
|
|
|
107 |
# 10-fold cross validation (cv=10).
from sklearn.model_selection import cross_val_score
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import make_pipeline

# The oversampler is placed inside the pipeline so that it is applied to the
# training folds only. Cross-validating data that was oversampled up front
# would put duplicates of the same row on both sides of a fold boundary and
# overstate the accuracy. cross_val_score clones the pipeline for every fold,
# so the already fitted `model` is not modified.
cv_pipeline = make_pipeline(RandomOverSampler(random_state=0), model)
scores = cross_val_score(cv_pipeline, X, Y, cv=10)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))
|
|
111 |
|