[16f085]: / XgBoost code

Download this file

112 lines (84 with data), 3.4 kB

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Feb 26 15:36:18 2023

@author: ming
"""



#import packages
from sklearn.metrics import precision_score

import xgboost as xgb
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV #网格搜索
import matplotlib.pyplot as plt#可视化
import seaborn as sns#绘图包
from sklearn.model_selection import train_test_split


# import data
X=pd.read_csv(r"/Users/ming/Downloads/Xset3.csv")
Y=pd.read_csv(r"/Users/ming/Downloads/Y.csv")



from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, Y)
X_resampled.info()

# split data
trainX, testX, trainY, testY = train_test_split(X_resampled, y_resampled, random_state=3, test_size=0.2)

model=xgb.XGBClassifier()
# 训练模型
model.fit(trainX,trainY)
# 预测值
y_pred = model.predict(testX)




 


#评估指标

# 求出预测和真实一样的数目
true = np.sum(y_pred == testY.values.tolist())
print('the right number of prediction:', true)
print('预测错的的结果数目为:', testY.shape[0]-true)
# 评估指标
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,cohen_kappa_score
print('预测数据的准确率为: {:.4}%'.format(accuracy_score(testY,y_pred)*100))
print('预测数据的精确率为:{:.4}%'.format(
      precision_score(testY,y_pred)*100))
print('预测数据的召回率为:{:.4}%'.format(
      recall_score(testY,y_pred)*100))
# print("训练数据的F1值为:", f1score_train)
print('预测数据的F1值为:',
      f1_score(testY,y_pred))
print('预测数据的Cohen’s Kappa系数为:',
      cohen_kappa_score(testY,y_pred))
# 打印分类报告
print('the evaluation report:',
      classification_report(testY,y_pred))

from xgboost import plot_importance
# plt.figure(figsize=(15,15))
plt.rcParams["figure.figsize"] = (14, 8)
plot_importance(model)



# ROC曲线、AUC
from sklearn.metrics import precision_recall_curve
from sklearn import metrics
# 预测正例的概率
y_pred_prob=model.predict_proba(testX)[:,1]
# y_pred_prob ,返回两列,第一列代表类别0,第二列代表类别1的概率
fpr, tpr, thresholds = metrics.roc_curve(testY,y_pred_prob, pos_label=1)
#pos_label,代表真阳性标签,就是说是分类里面的好的标签,这个要看你的特征目标标签是0,1,还是1,2
roc_auc = metrics.auc(fpr, tpr)  #auc为Roc曲线下的面积
# print(roc_auc)
plt.figure(figsize=(8,6))
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.plot(fpr, tpr, 'r',label='AUC = %0.2f'% roc_auc)
plt.legend(loc='lower right')
# plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0, 1.1])
plt.ylim([0, 1.1])
plt.xlabel('False Positive Rate') #横坐标是fpr
plt.ylabel('True Positive Rate')  #纵坐标是tpr
plt.title('Receiver operating characteristic example')
plt.show()

y_pred_prob1=model.predict_proba(X)[:,1]
y_pred1=model.predict(X)

#5-fold cross validation
from sklearn.model_selection import cross_val_score
scores = cross_val_score(model, X_resampled, y_resampled, cv=10)
print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std()))