--- a +++ b/XgBoost code @@ -0,0 +1,111 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Sun Feb 26 15:36:18 2023 + +@author: ming +""" + + + +#import packages +from sklearn.metrics import precision_score + +import xgboost as xgb +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from matplotlib.colors import ListedColormap +from sklearn.preprocessing import LabelEncoder +from sklearn.metrics import classification_report +from sklearn.model_selection import GridSearchCV #网格搜索 +import matplotlib.pyplot as plt#可视化 +import seaborn as sns#绘图包 +from sklearn.model_selection import train_test_split + + +# import data +X=pd.read_csv(r"/Users/ming/Downloads/Xset3.csv") +Y=pd.read_csv(r"/Users/ming/Downloads/Y.csv") + + + +from imblearn.over_sampling import RandomOverSampler +ros = RandomOverSampler(random_state=0) +X_resampled, y_resampled = ros.fit_resample(X, Y) +X_resampled.info() + +# split data +trainX, testX, trainY, testY = train_test_split(X_resampled, y_resampled, random_state=3, test_size=0.2) + +model=xgb.XGBClassifier() +# 训练模型 +model.fit(trainX,trainY) +# 预测值 +y_pred = model.predict(testX) + + + + + + + +#评估指标 + +# 求出预测和真实一样的数目 +true = np.sum(y_pred == testY.values.tolist()) +print('the right number of prediction:', true) +print('预测错的的结果数目为:', testY.shape[0]-true) +# 评估指标 +from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,cohen_kappa_score +print('预测数据的准确率为: {:.4}%'.format(accuracy_score(testY,y_pred)*100)) +print('预测数据的精确率为:{:.4}%'.format( + precision_score(testY,y_pred)*100)) +print('预测数据的召回率为:{:.4}%'.format( + recall_score(testY,y_pred)*100)) +# print("训练数据的F1值为:", f1score_train) +print('预测数据的F1值为:', + f1_score(testY,y_pred)) +print('预测数据的Cohen’s Kappa系数为:', + cohen_kappa_score(testY,y_pred)) +# 打印分类报告 +print('the evaluation report:', + classification_report(testY,y_pred)) + +from xgboost import plot_importance +# plt.figure(figsize=(15,15)) +plt.rcParams["figure.figsize"] = (14, 8) +plot_importance(model) + + + +# ROC曲线、AUC +from sklearn.metrics import precision_recall_curve +from sklearn import metrics +# 预测正例的概率 +y_pred_prob=model.predict_proba(testX)[:,1] +# y_pred_prob ,返回两列,第一列代表类别0,第二列代表类别1的概率 +fpr, tpr, thresholds = metrics.roc_curve(testY,y_pred_prob, pos_label=1) +#pos_label,代表真阳性标签,就是说是分类里面的好的标签,这个要看你的特征目标标签是0,1,还是1,2 +roc_auc = metrics.auc(fpr, tpr) #auc为Roc曲线下的面积 +# print(roc_auc) +plt.figure(figsize=(8,6)) +plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--') +plt.plot(fpr, tpr, 'r',label='AUC = %0.2f'% roc_auc) +plt.legend(loc='lower right') +# plt.plot([0, 1], [0, 1], 'r--') +plt.xlim([0, 1.1]) +plt.ylim([0, 1.1]) +plt.xlabel('False Positive Rate') #横坐标是fpr +plt.ylabel('True Positive Rate') #纵坐标是tpr +plt.title('Receiver operating characteristic example') +plt.show() + +y_pred_prob1=model.predict_proba(X)[:,1] +y_pred1=model.predict(X) + +#5-fold cross validation +from sklearn.model_selection import cross_val_score +scores = cross_val_score(model, X_resampled, y_resampled, cv=10) +print("%0.2f accuracy with a standard deviation of %0.2f" % (scores.mean(), scores.std())) +