--- a +++ b/Capstone code @@ -0,0 +1,105 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +""" +Created on Fri Feb 3 09:37:21 2023 + +@author: ming +""" + +import pandas as pd +from fancyimpute import KNN +from sklearn.neighbors import KNeighborsClassifier +from sklearn.naive_bayes import GaussianNB +from sklearn.linear_model import LogisticRegression +from sklearn.svm import SVC +from sklearn.tree import DecisionTreeClassifier +from sklearn.ensemble import RandomForestClassifier +from sklearn.neural_network import MLPClassifier +from sklearn.model_selection import train_test_split +from sklearn.metrics import classification_report,confusion_matrix +from sklearn import preprocessing + +from sklearn.metrics import roc_auc_score +from sklearn.model_selection import GridSearchCV + + +#load data +X=pd.read_csv(r"/Users/ming/Downloads/Xset1.csv") +Y=pd.read_csv(r"/Users/ming/Downloads/Y.csv") + +#del columns +X.dropna(thresh=100) + +#impute +Xc=X.columns + +X = pd.DataFrame(KNN(k=6).fit_transform(X)) +X.columns=Xc + +#standardize +standardized_X = preprocessing.scale(X) + +#oversampling + +#imblearn进行随机过采样 +from imblearn.over_sampling import RandomOverSampler +ros = RandomOverSampler(random_state=0) +X_resampled, y_resampled = ros.fit_resample(standardized_X, Y) + + +models = { + "knn": KNeighborsClassifier(n_neighbors=1), + "naive_bayes": GaussianNB(), + "logit": LogisticRegression(solver="lbfgs", multi_class="auto"), + "svm": SVC(kernel="rbf", gamma="auto"), + "decision_tree": DecisionTreeClassifier(criterion='gini',splitter='random',max_depth=15,min_samples_leaf=10,min_samples_split=10), + "random_forest": RandomForestClassifier(n_estimators=60,max_depth=6,max_samples=70,max_features=5), + "mlp": MLPClassifier() +} + +trainX, testX, trainY, testY = train_test_split(X_resampled, y_resampled, random_state=3, test_size=0.2) + +print("use '{}' bulid model...".format("random_forest")) +model = models["random_forest"] +model.fit(trainX, trainY) + +''' +tree_param_grid = {'n_estimators': range(50,100,10), + 'min_samples_split':range(30,111,20), + 'max_features':range(5,76,10), + 'max_depth':range(3,22,3) + } + +tree_best = GridSearchCV(model,param_grid = tree_param_grid,cv = 3,scoring="roc_auc",n_jobs= -1, verbose = 100) +#Gridsearch turning parameters +tree_best.fit(trainX, trainY) + +print(tree_best.best_estimator_) +print(tree_best.best_params_," ","得分:",tree_best.best_score_) +''' + +features_importance = pd.concat([pd.DataFrame(Xc).T,pd.DataFrame(model.feature_importances_).T],axis=0).T +features_importance.columns=["feature","importance"] + +## prediction +print("elvaluation...") +predictions = model.predict(testX) +print(classification_report(testY, predictions)) +auc_score = roc_auc_score(testY, predictions) +print('CONFUSION MATRIX') +print(confusion_matrix(testY, predictions)) +print("AUC") +print(auc_score) + + + + + + + + + + + + +