--- a +++ b/XGBClassification/main old.ipynb @@ -0,0 +1,859 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install `xlrd` for reading the `xls` file" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "# %conda install xlrd==2.0.1\n", + "# $ conda install -c conda-forge py-xgboost-gpu\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the path to the `xls` file" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "training_file = \"../TrainDataset2024.xls\"\n", + "# training_file = \"/kaggle/input/dataset/TrainDataset2024.xls\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-12-05 00:35:47.391919: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2024-12-05 00:35:47.557489: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", + "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", + "E0000 00:00:1733358947.620102 1213 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", + "E0000 00:00:1733358947.638547 1213 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", + "2024-12-05 00:35:47.796076: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + } + ], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# Add the parent directory to the system path\n", + "sys.path.append(os.path.abspath('../')) # Adjust the path as needed\n", + "\n", + "from my_util import df_to_corr_matrix, remove_outliers\n", + "\n", + "import tensorflow as tf\n", + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "import plotly.graph_objects as go\n", + "\n", + "from matplotlib.colors import Normalize\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, cross_val_predict, StratifiedKFold\n", + "from sklearn.preprocessing import StandardScaler, RobustScaler\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, make_scorer, balanced_accuracy_score\n", + "from sklearn.svm import SVC\n", + "from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif\n", + "from sklearn.impute import KNNImputer\n", + "\n", + "\n", + "from imblearn.over_sampling import SMOTE\n", + "from imblearn.pipeline 
import Pipeline\n", + "\n", + "from joblib import Parallel, delayed\n", + "\n", + "import xgboost as xgb\n", + "from xgboost import XGBClassifier\n", + "\n", + "from pickle import dump, load\n", + "\n", + "import warnings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read the data into X and y" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded '../FeatureSelection/pkl/corr_25_selected_features.pkl' into selected_features\n", + "(395, 25) (395,)\n", + "['Gene', 'HER2', 'PgR', 'ER', 'original_firstorder_10Percentile', 'original_ngtdm_Busyness', 'LNStatus', 'TumourStage', 'original_gldm_DependenceEntropy', 'original_firstorder_Skewness', 'original_glrlm_ShortRunHighGrayLevelEmphasis', 'original_ngtdm_Strength', 'original_gldm_SmallDependenceEmphasis', 'original_firstorder_InterquartileRange', 'original_shape_MajorAxisLength', 'original_glrlm_LongRunLowGrayLevelEmphasis', 'original_firstorder_Minimum', 'HistologyType', 'ChemoGrade', 'original_shape_Maximum2DDiameterRow', 'original_shape_Maximum2DDiameterColumn', 'original_shape_SurfaceVolumeRatio', 'original_shape_LeastAxisLength', 'original_glcm_Autocorrelation', 'original_shape_Sphericity']\n" + ] + } + ], + "source": [ + "NUM_OF_SELECTED_FEATURES = \"corr_25\"\n", + "\n", + "data = pd.read_excel(training_file)\n", + "data.replace(999, np.nan, inplace=True)  # 999 is the missing-value placeholder; replace with NaN\n", + "\n", + "data.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n", + "data.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n", + "\n", + "with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:\n", + "    selected_features = load(file)\n", + "    print(f\"Loaded '{file.name}' into selected_features\")\n", + "\n", + "X = data[selected_features]\n", + "y = data[\"pCR (outcome)\"]\n", + "print(X.shape, y.shape)\n", + "\n", + "print(selected_features)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "# # Set up the matplotlib figure\n", + "# plt.figure(figsize=(40, 30))\n", + "\n", + "# # Loop through each feature to create a scatter plot\n", + "# for i, feature in enumerate(X.columns):\n", + "#     plt.subplot(5, 6, i + 1) # Adjust the number of rows and columns based on the number of features\n", + "#     sns.scatterplot(x=y, y=X[feature], hue=y, style=y, palette='Set2', alpha=0.7)\n", + "#     plt.title(feature)\n", + "#     plt.xlabel('pCR (outcome)')\n", + "#     plt.ylabel(feature)\n", + "#     plt.xlim(-2, 3)\n", + "\n", + "# plt.tight_layout()\n", + "# plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [], + "source": [ + "# df_to_corr_matrix(X, size_factor=1.6, sep=150)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Split the data into train_full and test_reserved (untouched)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Split the data into train and test sets. The test set is not used during training; it is reserved for evaluating the XGB model. \n", + "The training set has 316 samples. The test set has 79 samples. 
\n", + "Positive ratio: \n", + "\tTrain: 0.21203\n", + "\tTest: 0.21519\n" + ] + } + ], + "source": [ + "# random_state values that give a similar positive-class ratio in train and test\n", + "# [14, 47, 49, 52, 62, 76, 83, 89, 92, 116, 118, 122, 136, 138, 144, 146, 150, 156, 157, 159, 170, 172, 174, 185]\n", + "\n", + "while True:\n", + "    X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=14) # similar distribution of 1 and 0\n", + "    # X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=None)\n", + "\n", + "    X_train_full.reset_index(drop=True, inplace=True)\n", + "    X_test_reserved.reset_index(drop=True, inplace=True)\n", + "    y_train_full.reset_index(drop=True, inplace=True)\n", + "    y_test_reserved.reset_index(drop=True, inplace=True)\n", + "\n", + "    ratio_train = sum(y_train_full[y_train_full==1]) / len(y_train_full)\n", + "    ratio_test = sum(y_test_reserved[y_test_reserved==1]) / len(y_test_reserved)\n", + "\n", + "    if abs(ratio_train - ratio_test) < 0.1:\n", + "        break\n", + "\n", + "print(\"Split the data into train and test sets. The test set is not used during training; it is reserved for evaluating the XGB model. \")\n", + "print(f\"The training set has {len(X_train_full)} samples. The test set has {len(X_test_reserved)} samples. \")\n", + "print(f\"Positive ratio: \\n\\tTrain: {ratio_train:.5f}\\n\\tTest: {ratio_test:.5f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Outliers" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "# # The result of keeping outliers is better\n", + "# X_train_full, y_train_full = remove_outliers(X_train_full, y_train_full, selected_features)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### XGBoost" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(316, 25)\n", + "(316,)\n" + ] + } + ], + "source": [ + "print(X_train_full.shape)\n", + "print(y_train_full.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 0\n", + "Best Parameters at Index 8161 : {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 6, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "Balanced accuracy score: 0.7762323390894819\n", + "F1 Score: 0.5732413323110996\n", + "Precision Score: 0.42922841143530793\n", + "Recall Score: 0.8659340659340661\n", + "Specificity Score: 0.686530612244898\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 1\n", + "Best Parameters at Index 3843 : {'gamma': 0, 'learning_rate': 0.25, 'max_bin': 4, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 50, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "Balanced accuracy score: 0.763171114599686\n", + "F1 Score: 0.5623509823509825\n", + "Precision Score: 0.4324091925871681\n", + "Recall Score: 0.823076923076923\n", + "Specificity Score: 0.703265306122449\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 2\n", + "Best Parameters at Index 1691 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 6, 'max_depth': 1, 'max_leaves': 2, 
'min_child_weight': 0, 'n_estimators': 100, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "Balanced accuracy score: 0.7733186813186812\n", + "F1 Score: 0.5808290356353484\n", + "Precision Score: 0.45272486772486775\n", + "Recall Score: 0.8197802197802198\n", + "Specificity Score: 0.7268571428571429\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 3\n", + "Best Parameters at Index 7083 : {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 4, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 50, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "Balanced accuracy score: 0.7692747252747253\n", + "F1 Score: 0.5710505619042204\n", + "Precision Score: 0.4382588000235059\n", + "Recall Score: 0.8362637362637363\n", + "Specificity Score: 0.7022857142857143\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 4\n", + "Best Parameters at Index 601 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 4, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "Balanced accuracy score: 0.7721507064364206\n", + "F1 Score: 0.5713989997742859\n", + "Precision Score: 0.43731630688152434\n", + "Recall Score: 0.8373626373626374\n", + "Specificity Score: 0.7069387755102041\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 5\n", + "Best Parameters at Index 1685 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 6, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 70, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "Balanced accuracy score: 0.7514662480376766\n", + "F1 Score: 0.5524357534869451\n", + "Precision Score: 0.42733333333333334\n", + "Recall Score: 0.787912087912088\n", + "Specificity Score: 0.7150204081632653\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 6\n", + "Best Parameters at Index 3309 : {'gamma': 0, 'learning_rate': 0.25, 'max_bin': 2, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 80, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "Balanced accuracy score: 0.7787849293563578\n", + "F1 Score: 0.577384971479965\n", + "Precision Score: 0.43828953655040614\n", + "Recall Score: 0.8505494505494505\n", + "Specificity Score: 0.7070204081632653\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 7\n", + "Best Parameters at Index 6540 : {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 2, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n", + "Balanced accuracy score: 0.7682605965463108\n", + "F1 Score: 0.5682149974832902\n", + "Precision Score: 0.4367256817256817\n", + "Recall Score: 0.8175824175824176\n", + "Specificity Score: 0.7189387755102039\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 8\n", + "Best Parameters at Index 600 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 4, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n", + "Balanced accuracy score: 0.7585651491365777\n", + "F1 Score: 0.5616951747439553\n", + "Precision Score: 0.4316868417377755\n", + "Recall Score: 0.8186813186813187\n", + "Specificity Score: 0.6984489795918367\n", + "\n", + "Fitting 5 folds for each 
of 51840 candidates, totalling 259200 fits\n", + "random_state: 9\n", + "Best Parameters at Index 6001 : {'gamma': 0, 'learning_rate': 0.25, 'max_bin': 10, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "Balanced accuracy score: 0.781927786499215\n", + "F1 Score: 0.5872351485663194\n", + "Precision Score: 0.44791534391534393\n", + "Recall Score: 0.8648351648351648\n", + "Specificity Score: 0.6990204081632653\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 10\n", + "Best Parameters at Index 4380 : {'gamma': 0, 'learning_rate': 0.25, 'max_bin': 5, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n", + "Balanced accuracy score: 0.7608854003139717\n", + "F1 Score: 0.5588994220573168\n", + "Precision Score: 0.4250603804797353\n", + "Recall Score: 0.8230769230769232\n", + "Specificity Score: 0.6986938775510205\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 11\n", + "Best Parameters at Index 60 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 2, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n", + "Balanced accuracy score: 0.7567284144427001\n", + "F1 Score: 0.5563016524439233\n", + "Precision Score: 0.42433391599131315\n", + "Recall Score: 0.8186813186813187\n", + "Specificity Score: 0.6947755102040816\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 12\n", + "Best Parameters at Index 8160 : {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 6, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n", + "Balanced accuracy score: 0.7668885400313972\n", + "F1 Score: 0.5754879804114064\n", + "Precision Score: 0.4512433862433863\n", + "Recall Score: 0.8065934065934066\n", + "Specificity Score: 0.7271836734693877\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 13\n", + "Best Parameters at Index 8161 : {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 6, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "Balanced accuracy score: 0.7853751962323391\n", + "F1 Score: 0.5821092385107072\n", + "Precision Score: 0.43648635235732014\n", + "Recall Score: 0.8802197802197803\n", + "Specificity Score: 0.690530612244898\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 14\n", + "Best Parameters at Index 60 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 2, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n", + "Balanced accuracy score: 0.7593155416012559\n", + "F1 Score: 0.5554589371980676\n", + "Precision Score: 0.4169422767248854\n", + "Recall Score: 0.8362637362637363\n", + "Specificity Score: 0.6823673469387755\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 15\n", + "Best Parameters at Index 600 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 4, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n", + "Balanced accuracy score: 0.7812119309262167\n", + "F1 
Score: 0.5818895255480622\n", + "Precision Score: 0.44234472934472935\n", + "Recall Score: 0.8516483516483516\n", + "Specificity Score: 0.7107755102040817\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 16\n", + "Best Parameters at Index 2761 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 10, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "Balanced accuracy score: 0.7639089481946626\n", + "F1 Score: 0.5590271923942298\n", + "Precision Score: 0.4172192749778957\n", + "Recall Score: 0.8494505494505494\n", + "Specificity Score: 0.6783673469387755\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 17\n", + "Best Parameters at Index 9780 : {'gamma': 0, 'learning_rate': 0.4, 'max_bin': 2, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n", + "Balanced accuracy score: 0.770967032967033\n", + "F1 Score: 0.5707573812580231\n", + "Precision Score: 0.43912169312169314\n", + "Recall Score: 0.823076923076923\n", + "Specificity Score: 0.7188571428571429\n", + "\n", + "Best run is the 13th run. Score: 0.7853751962323391\n" + ] + } + ], + "source": [ + "model = XGBClassifier(objective=\"binary:logistic\")\n", + "\n", + "# param_grid = {\n", + "# \"gamma\": [0.2, 0.3],\n", + "# \"learning_rate\": [0.3, 0.5],\n", + "# \"max_bin\": [2, 5, 10, 20],\n", + "# \"max_depth\": [1, 2, 3],\n", + "# \"max_leaves\": [1, 2, 3, 4],\n", + "# \"n_estimators\": [5, 10, 20, 30, 40, 50],\n", + "# \"scale_pos_weight\": [4.5], # imbalanced data\n", + "# }\n", + "param_grid = {\n", + " \"gamma\": [0, 0.01, 0.1, 0.3],\n", + " \"learning_rate\": [0.2, 0.25, 0.3, 0.4],\n", + " \"max_bin\": [2, 4, 5, 6, 8, 10],\n", + " \"max_depth\": [1, 2, 3],\n", + " \"max_leaves\": [1, 2, 3],\n", + " \"min_child_weight\": [0, 0.001, 0.005, 0.01, 1],\n", + " \"n_estimators\": [30, 50, 70, 75, 80, 100],\n", + " \"num_parallel_tree\": [1],\n", + " \"scale_pos_weight\": [3.8, 4.5], # imbalanced data\n", + "}\n", + "\n", + "best_score = 0\n", + "best_score_at = None\n", + "\n", + "for i in range(18):\n", + "\n", + " kf = StratifiedKFold(5, shuffle=True, random_state=i)\n", + "\n", + " # Set up the GridSearchCV\n", + " grid_search = GridSearchCV(\n", + " estimator=model,\n", + " param_grid=param_grid,\n", + " scoring={\n", + " \"f1\": \"f1\",\n", + " \"recall\": \"recall\",\n", + " \"specificity\": make_scorer(recall_score, pos_label=0),\n", + " \"precision\": \"precision\",\n", + " \"balanced_accuracy_score\": make_scorer(balanced_accuracy_score),\n", + " },\n", + " cv=kf,\n", + " # cv=5,\n", + " verbose=1,\n", + " n_jobs=-1,\n", + " return_train_score=True,\n", + " refit=\"balanced_accuracy_score\",\n", + " )\n", + "\n", + " # Fit the model\n", + " grid_search.fit(X_train_full, y_train_full)\n", + "\n", + " # Get the best parameters and best score\n", + " result = pd.DataFrame(grid_search.cv_results_)\n", + " best_params = grid_search.best_params_\n", + " best_index = grid_search.best_index_\n", + " best_f1 = result[\"mean_test_f1\"][best_index]\n", + " best_precision = result[\"mean_test_precision\"][best_index]\n", + " best_recall = result[\"mean_test_recall\"][best_index]\n", + " best_specificity = result[\"mean_test_specificity\"][best_index]\n", + " best_balanced_accuracy_score = result[\"mean_test_balanced_accuracy_score\"][best_index]\n", + "\n", + " print(f\"random_state: 
{i}\")\n", + " print(f\"Best Parameters at Index {best_index} :\", best_params)\n", + " print(f\"Balanced accuracy score: {best_balanced_accuracy_score}\")\n", + " print(f\"F1 Score: {best_f1}\")\n", + " print(f\"Precision Score: {best_precision}\")\n", + " print(f\"Recall Score: {best_recall}\")\n", + " print(f\"Specificity Score: {best_specificity}\")\n", + " print()\n", + "\n", + " if best_balanced_accuracy_score > best_score:\n", + " best_score = best_balanced_accuracy_score\n", + " best_score_at = i\n", + "\n", + " pd.DataFrame(grid_search.cv_results_).to_csv(f\"output{i}.csv\")\n", + "\n", + "print(f\"Best run is the {best_score_at}th run. Score: {best_score}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [], + "source": [ + "model = grid_search.best_estimator_\n", + "model.save_model(\"model.ubj\")" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['param_gamma', 'param_learning_rate', 'param_max_bin', 'param_max_depth', 'param_max_leaves', 'param_min_child_weight', 'param_n_estimators', 'param_num_parallel_tree', 'param_scale_pos_weight']\n" + ] + } + ], + "source": [ + "results = pd.DataFrame(grid_search.cv_results_)\n", + "result_start = 4\n", + "print(list(results.keys())[result_start:result_start+len(param_grid)])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = pd.DataFrame(grid_search.cv_results_)\n", + "\n", + "# filter = results['param_num_parallel_tree'] == 10\n", + "filter = pd.Series([True] * len(results)) # include all data\n", + "\n", + "fig = go.Figure()\n", + "# Add mean train score trace\n", + "fig.add_trace(go.Scatter(\n", + " x=list(range(len(results[\"mean_train_f1\"][filter]))),\n", + " y=results[\"mean_train_f1\"][filter],\n", + " mode='lines+markers',\n", + " name='Mean Train F1',\n", + " text=list(results['params'][filter]), # Display parameter values on hover\n", + " hoverinfo='text+y+x', # Show parameter values and y value\n", + "))\n", + "fig.add_trace(go.Scatter(\n", + " x=list(range(len(results[\"mean_train_recall\"][filter]))),\n", + " y=results[\"mean_train_recall\"][filter],\n", + " mode='lines+markers',\n", + " name='Mean Train Recall',\n", + " text=list(results['params'][filter]), # Display parameter values on hover\n", + " hoverinfo='text+y+x', # Show parameter values and y value\n", + " visible=\"legendonly\",\n", + "))\n", + "fig.add_trace(go.Scatter(\n", + " x=list(range(len(results[\"mean_train_specificity\"][filter]))),\n", + " y=results[\"mean_train_specificity\"][filter],\n", + " mode='lines+markers',\n", + " name='Mean Train Specificity',\n", + " text=list(results['params'][filter]), # Display parameter values on hover\n", + " hoverinfo='text+y+x', # Show parameter values and y value\n", + " visible=\"legendonly\",\n", + "))\n", + "# Add mean test score trace\n", + "fig.add_trace(go.Scatter(\n", + " x=list(range(len(results[\"mean_test_f1\"][filter]))),\n", + " y=results[\"mean_test_f1\"][filter],\n", + " mode='lines+markers',\n", + " name='Mean Test F1',\n", + " text=list(results['params'][filter]), # Display parameter values on hover\n", + " hoverinfo='text+y+x', # Show parameter values and y value\n", + "))\n", + "fig.add_trace(go.Scatter(\n", + " x=list(range(len(results[\"mean_test_recall\"][filter]))),\n", + " y=results[\"mean_test_recall\"][filter],\n", + " mode='lines+markers',\n", + " name='Mean 
Test Recall',\n", + " text=list(results['params'][filter]), # Display parameter values on hover\n", + " hoverinfo='text+y+x', # Show parameter values and y value\n", + " visible=\"legendonly\",\n", + "))\n", + "fig.add_trace(go.Scatter(\n", + " x=list(range(len(results[\"mean_test_specificity\"][filter]))),\n", + " y=results[\"mean_test_specificity\"][filter],\n", + " mode='lines+markers',\n", + " name='Mean Test Specificity',\n", + " text=list(results['params'][filter]), # Display parameter values on hover\n", + " hoverinfo='text+y+x', # Show parameter values and y value\n", + " visible=\"legendonly\",\n", + "))\n", + "\n", + "# Update layout\n", + "fig.update_layout(\n", + " title='Grid Search Mean Train and Test Scores',\n", + " xaxis_title='Parameter Combinations (Index)',\n", + " yaxis_title='Score',\n", + " legend_title='Scores',\n", + " hovermode='closest'\n", + ")\n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing set:\n", + "(79, 25)\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.95 0.60 0.73 62\n", + " 1.0 0.38 0.88 0.53 17\n", + "\n", + " accuracy 0.66 79\n", + " macro avg 0.66 0.74 0.63 79\n", + "weighted avg 0.83 0.66 0.69 79\n", + "\n", + "[[37 25]\n", + " [ 2 15]]\n", + "\n", + "Balanced accuracy score: 0.7395635673624288\n", + "F1 Score: 0.5263157894736842\n", + "Precision: 0.375\n", + "Recall: 0.8823529411764706\n", + "Specificity: 0.5967741935483871\n", + "\n", + "Splited the data into train and test. The test will not be used in the training, but just for test the xgb. \n", + "The training data has 316 data. The testing data has 79 data. \n", + "Positive ratio: \n", + "\tTrain: 0.21203\n", + "\tTest: 0.21519\n", + "Best Parameters at Index 1112 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 8, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 70, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "Balanced accuracy score: 0.7834536891679749\n", + "F1 Score: 0.5803091217725365\n", + "Precision Score: 0.4370269041303524\n", + "Recall Score: 0.868131868131868\n", + "Specificity Score: 0.6987755102040816\n", + "\n", + "Training set:\n", + "(316, 25)\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.97 0.69 0.81 249\n", + " 1.0 0.44 0.91 0.60 67\n", + "\n", + " accuracy 0.74 316\n", + " macro avg 0.70 0.80 0.70 316\n", + "weighted avg 0.86 0.74 0.76 316\n", + "\n", + "[[172 77]\n", + " [ 6 61]]\n", + "\n", + "Testing set:\n", + "(79, 25)\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.95 0.60 0.73 62\n", + " 1.0 0.38 0.88 0.53 17\n", + "\n", + " accuracy 0.66 79\n", + " macro avg 0.66 0.74 0.63 79\n", + "weighted avg 0.83 0.66 0.69 79\n", + "\n", + "[[37 25]\n", + " [ 2 15]]\n", + "\n", + "Balanced accuracy score: 0.7395635673624288\n", + "F1 Score: 0.5263157894736842\n", + "Precision: 0.375\n", + "Recall: 0.8823529411764706\n", + "Specificity: 0.5967741935483871\n" + ] + } + ], + "source": [ + "model = grid_search.best_estimator_\n", + "\n", + "X_test = X_test_reserved\n", + "\n", + "y_pred = model.predict(X_test)\n", + "report = classification_report(y_test_reserved, y_pred)\n", + "cm = confusion_matrix(y_test_reserved, y_pred)\n", + "\n", + "print(\"Testing set:\")\n", + "print(X_test_reserved.shape)\n", + "print(report)\n", + "print(cm)\n", + "print()\n", + "print(f\"Balanced accuracy score: {balanced_accuracy_score(y_test_reserved, y_pred)}\")\n", + "print(f\"F1 
Score: {f1_score(y_test_reserved, y_pred)}\")\n", + "print(f\"Precision: {precision_score(y_test_reserved, y_pred)}\")\n", + "print(f\"Recall: {recall_score(y_test_reserved, y_pred)}\")\n", + "print(f\"Specificity: {recall_score(y_test_reserved, y_pred, pos_label=0)}\")\n", + "\n", + "while True:\n", + "    X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=None)\n", + "\n", + "    X_train_full.reset_index(drop=True, inplace=True)\n", + "    X_test_reserved.reset_index(drop=True, inplace=True)\n", + "    y_train_full.reset_index(drop=True, inplace=True)\n", + "    y_test_reserved.reset_index(drop=True, inplace=True)\n", + "\n", + "    ratio_train = sum(y_train_full[y_train_full==1]) / len(y_train_full)\n", + "    ratio_test = sum(y_test_reserved[y_test_reserved==1]) / len(y_test_reserved)\n", + "\n", + "    if abs(ratio_train - ratio_test) < 0.01:\n", + "        break\n", + "\n", + "print()\n", + "print(\"Split the data into train and test sets. The test set is not used during training; it is reserved for evaluating the XGB model. \")\n", + "print(f\"The training set has {len(X_train_full)} samples. The test set has {len(X_test_reserved)} samples. \")\n", + "print(f\"Positive ratio: \\n\\tTrain: {ratio_train:.5f}\\n\\tTest: {ratio_test:.5f}\")\n", + "\n", + "# print(f\"TARGET_NUM_OF_FEATURES: {TARGET_NUM_OF_FEATURES}, scaler: {SCALER}, num_of_features: {num_of_features}\")\n", + "\n", + "print(f\"Best Parameters at Index {best_index} :\", best_params)\n", + "print(f\"Balanced accuracy score: {best_balanced_accuracy_score}\")\n", + "print(f\"F1 Score: {best_f1}\")\n", + "print(f\"Precision Score: {best_precision}\")\n", + "print(f\"Recall Score: {best_recall}\")\n", + "print(f\"Specificity Score: {best_specificity}\")\n", + "\n", + "model.fit(X_train_full, y_train_full)\n", + "\n", + "\n", + "\n", + "y_pred = model.predict(X_train_full)\n", + "report = classification_report(y_train_full, y_pred)\n", + "cm = confusion_matrix(y_train_full, y_pred)\n", + "\n", + "print(\"\\nTraining set:\")\n", + "print(X_train_full.shape)\n", + "print(report)\n", + "print(cm)\n", + "print()\n", + "\n", + "X_test = X_test_reserved\n", + "\n", + "y_pred = model.predict(X_test)\n", + "report = classification_report(y_test_reserved, y_pred)\n", + "cm = confusion_matrix(y_test_reserved, y_pred)\n", + "\n", + "print(\"Testing set:\")\n", + "print(X_test_reserved.shape)\n", + "print(report)\n", + "print(cm)\n", + "print()\n", + "print(f\"Balanced accuracy score: {balanced_accuracy_score(y_test_reserved, y_pred)}\")\n", + "print(f\"F1 Score: {f1_score(y_test_reserved, y_pred)}\")\n", + "print(f\"Precision: {precision_score(y_test_reserved, y_pred)}\")\n", + "print(f\"Recall: {recall_score(y_test_reserved, y_pred)}\")\n", + "print(f\"Specificity: {recall_score(y_test_reserved, y_pred, pos_label=0)}\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "F1 Score: 0.5403225806451613\n", + "Precision: 0.40853658536585363\n", + "Recall: 0.7976190476190477\n", + "Specificity: 0.6881028938906752\n", + "[[214 97]\n", + " [ 17 67]]\n" + ] + } + ], + "source": [ + "stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True)\n", + "\n", + "y_pred_cv = cross_val_predict(model, X, y, cv=stratified_kfold)\n", + "\n", + "# Calculate evaluation metrics\n", + "f1 = f1_score(y, y_pred_cv)\n", + "precision = precision_score(y, y_pred_cv)\n", + "recall = recall_score(y, y_pred_cv)\n", + "specificity = 
recall_score(y, y_pred_cv, pos_label=0)\n", + "\n", + "# Print evaluation metrics\n", + "print(\"F1 Score:\", f1)\n", + "print(\"Precision:\", precision)\n", + "print(\"Recall:\", recall)\n", + "print(\"Specificity:\", specificity)\n", + "\n", + "print(confusion_matrix(y, y_pred_cv))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`Best Parameters at Index 244 : {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 5, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0.001, 'n_estimators': 75, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}`\n", + "\n", + "`Best Parameters at Index 1112 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 8, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 70, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}`\n", + "\n", + "`random_state=13` `{'gamma': 0, 'learning_rate': 0.3, 'max_bin': 6, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}`\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "MLEAsm", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}