--- a +++ b/XGBClassification/main old.ipynb @@ -0,0 +1,859 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install `xlrd` for reading the `xls` file" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": {}, + "outputs": [], + "source": [ + "# %conda install xlrd==2.0.1\n", + "# $ conda install -c conda-forge py-xgboost-gpu\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the path to the `xls` file" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "training_file = \"../TrainDataset2024.xls\"\n", + "# training_file = \"/kaggle/input/dataset/TrainDataset2024.xls\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-12-05 00:35:47.391919: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2024-12-05 00:35:47.557489: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", + "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", + "E0000 00:00:1733358947.620102 1213 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", + "E0000 00:00:1733358947.638547 1213 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", + "2024-12-05 00:35:47.796076: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + } + ], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# Add the parent directory to the system path\n", + "sys.path.append(os.path.abspath('../')) # Adjust the path as needed\n", + "\n", + "from my_util import df_to_corr_matrix, remove_outliers\n", + "\n", + "import tensorflow as tf\n", + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "import plotly.graph_objects as go\n", + "\n", + "from matplotlib.colors import Normalize\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, cross_val_predict, StratifiedKFold\n", + "from sklearn.preprocessing import StandardScaler, RobustScaler\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, make_scorer, balanced_accuracy_score\n", + "from sklearn.svm import SVC\n", + "from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif\n", + "from sklearn.impute import KNNImputer\n", + "\n", + "\n", + "from imblearn.over_sampling import SMOTE\n", + "from imblearn.pipeline 
import Pipeline\n", + "\n", + "from joblib import Parallel, delayed\n", + "\n", + "import xgboost as xgb\n", + "from xgboost import XGBClassifier\n", + "\n", + "from pickle import dump, load\n", + "\n", + "import warnings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read the data into X and y" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded '../FeatureSelection/pkl/corr_25_selected_features.pkl' into selected_features\n", + "(395, 25) (395,)\n", + "['Gene', 'HER2', 'PgR', 'ER', 'original_firstorder_10Percentile', 'original_ngtdm_Busyness', 'LNStatus', 'TumourStage', 'original_gldm_DependenceEntropy', 'original_firstorder_Skewness', 'original_glrlm_ShortRunHighGrayLevelEmphasis', 'original_ngtdm_Strength', 'original_gldm_SmallDependenceEmphasis', 'original_firstorder_InterquartileRange', 'original_shape_MajorAxisLength', 'original_glrlm_LongRunLowGrayLevelEmphasis', 'original_firstorder_Minimum', 'HistologyType', 'ChemoGrade', 'original_shape_Maximum2DDiameterRow', 'original_shape_Maximum2DDiameterColumn', 'original_shape_SurfaceVolumeRatio', 'original_shape_LeastAxisLength', 'original_glcm_Autocorrelation', 'original_shape_Sphericity']\n" + ] + } + ], + "source": [ + "NUM_OF_SELECTED_FEATURES = \"corr_25\"\n", + "\n", + "data = pd.read_excel(training_file)\n", + "data.replace(999, np.nan, inplace=True)  # 999 is the missing-value placeholder; replace with NaN\n", + "\n", + "data.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n", + "data.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n", + "\n", + "with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:\n", + "    selected_features = load(file)\n", + "    print(f\"Loaded '{file.name}' into selected_features\")\n", + "\n", + "X = data[selected_features]\n", + "y = data[\"pCR (outcome)\"]\n", + "print(X.shape, y.shape)\n", + "\n", + "print(selected_features)" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": {}, + "outputs": [], + "source": [ + "# # Set up the matplotlib figure\n", + "# plt.figure(figsize=(40, 30))\n", + "\n", + "# # Loop through each feature to create a scatter plot\n", + "# for i, feature in enumerate(X.columns):\n", + "#     plt.subplot(5, 6, i + 1) # Adjust the number of rows and columns based on the number of features\n", + "#     sns.scatterplot(x=y, y=X[feature], hue=y, style=y, palette='Set2', alpha=0.7)\n", + "#     plt.title(feature)\n", + "#     plt.xlabel('pCR (outcome)')\n", + "#     plt.ylabel(feature)\n", + "#     plt.xlim(-2, 3)\n", + "\n", + "# plt.tight_layout()\n", + "# plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": {}, + "outputs": [], + "source": [ + "# df_to_corr_matrix(X, size_factor=1.6, sep=150)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Split the data into train_full and test_reserved (untouched)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Split the data into train and test sets. The test set is not used during training; it is reserved for evaluating the XGB model. \n", + "The training set has 316 samples. The test set has 79 samples. 
\n", + "Positive ratio: \n", + "\tTrain: 0.21203\n", + "\tTest: 0.21519\n" + ] + } + ], + "source": [ + "# random_state values that give a similar positive-class ratio in train and test\n", + "# [14, 47, 49, 52, 62, 76, 83, 89, 92, 116, 118, 122, 136, 138, 144, 146, 150, 156, 157, 159, 170, 172, 174, 185]\n", + "\n", + "while True:\n", + "    X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=14) # similar distribution of 1 and 0\n", + "    # X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=None)\n", + "\n", + "    X_train_full.reset_index(drop=True, inplace=True)\n", + "    X_test_reserved.reset_index(drop=True, inplace=True)\n", + "    y_train_full.reset_index(drop=True, inplace=True)\n", + "    y_test_reserved.reset_index(drop=True, inplace=True)\n", + "\n", + "    ratio_train = sum(y_train_full[y_train_full==1]) / len(y_train_full)\n", + "    ratio_test = sum(y_test_reserved[y_test_reserved==1]) / len(y_test_reserved)\n", + "\n", + "    if abs(ratio_train - ratio_test) < 0.1:\n", + "        break\n", + "\n", + "print(\"Split the data into train and test sets. The test set is not used during training; it is reserved for evaluating the XGB model. \")\n", + "print(f\"The training set has {len(X_train_full)} samples. The test set has {len(X_test_reserved)} samples. \")\n", + "print(f\"Positive ratio: \\n\\tTrain: {ratio_train:.5f}\\n\\tTest: {ratio_test:.5f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Outliers" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": {}, + "outputs": [], + "source": [ + "# # The result of keeping outliers is better\n", + "# X_train_full, y_train_full = remove_outliers(X_train_full, y_train_full, selected_features)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### XGBoost" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(316, 25)\n", + "(316,)\n" + ] + } + ], + "source": [ + "print(X_train_full.shape)\n", + "print(y_train_full.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 0\n", + "Best Parameters at Index 8161 : {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 6, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "Balanced accuracy score: 0.7762323390894819\n", + "F1 Score: 0.5732413323110996\n", + "Precision Score: 0.42922841143530793\n", + "Recall Score: 0.8659340659340661\n", + "Specificity Score: 0.686530612244898\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 1\n", + "Best Parameters at Index 3843 : {'gamma': 0, 'learning_rate': 0.25, 'max_bin': 4, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 50, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "Balanced accuracy score: 0.763171114599686\n", + "F1 Score: 0.5623509823509825\n", + "Precision Score: 0.4324091925871681\n", + "Recall Score: 0.823076923076923\n", + "Specificity Score: 0.703265306122449\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 2\n", + "Best Parameters at Index 1691 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 6, 'max_depth': 1, 'max_leaves': 2, 
'min_child_weight': 0, 'n_estimators': 100, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "Balanced accuracy score: 0.7733186813186812\n", + "F1 Score: 0.5808290356353484\n", + "Precision Score: 0.45272486772486775\n", + "Recall Score: 0.8197802197802198\n", + "Specificity Score: 0.7268571428571429\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 3\n", + "Best Parameters at Index 7083 : {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 4, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 50, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "Balanced accuracy score: 0.7692747252747253\n", + "F1 Score: 0.5710505619042204\n", + "Precision Score: 0.4382588000235059\n", + "Recall Score: 0.8362637362637363\n", + "Specificity Score: 0.7022857142857143\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 4\n", + "Best Parameters at Index 601 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 4, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "Balanced accuracy score: 0.7721507064364206\n", + "F1 Score: 0.5713989997742859\n", + "Precision Score: 0.43731630688152434\n", + "Recall Score: 0.8373626373626374\n", + "Specificity Score: 0.7069387755102041\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 5\n", + "Best Parameters at Index 1685 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 6, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 70, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "Balanced accuracy score: 0.7514662480376766\n", + "F1 Score: 0.5524357534869451\n", + "Precision Score: 0.42733333333333334\n", + "Recall Score: 0.787912087912088\n", + "Specificity Score: 0.7150204081632653\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 6\n", + "Best Parameters at Index 3309 : {'gamma': 0, 'learning_rate': 0.25, 'max_bin': 2, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 80, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "Balanced accuracy score: 0.7787849293563578\n", + "F1 Score: 0.577384971479965\n", + "Precision Score: 0.43828953655040614\n", + "Recall Score: 0.8505494505494505\n", + "Specificity Score: 0.7070204081632653\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 7\n", + "Best Parameters at Index 6540 : {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 2, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n", + "Balanced accuracy score: 0.7682605965463108\n", + "F1 Score: 0.5682149974832902\n", + "Precision Score: 0.4367256817256817\n", + "Recall Score: 0.8175824175824176\n", + "Specificity Score: 0.7189387755102039\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 8\n", + "Best Parameters at Index 600 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 4, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n", + "Balanced accuracy score: 0.7585651491365777\n", + "F1 Score: 0.5616951747439553\n", + "Precision Score: 0.4316868417377755\n", + "Recall Score: 0.8186813186813187\n", + "Specificity Score: 0.6984489795918367\n", + "\n", + "Fitting 5 folds for each 
of 51840 candidates, totalling 259200 fits\n", + "random_state: 9\n", + "Best Parameters at Index 6001 : {'gamma': 0, 'learning_rate': 0.25, 'max_bin': 10, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "Balanced accuracy score: 0.781927786499215\n", + "F1 Score: 0.5872351485663194\n", + "Precision Score: 0.44791534391534393\n", + "Recall Score: 0.8648351648351648\n", + "Specificity Score: 0.6990204081632653\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 10\n", + "Best Parameters at Index 4380 : {'gamma': 0, 'learning_rate': 0.25, 'max_bin': 5, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n", + "Balanced accuracy score: 0.7608854003139717\n", + "F1 Score: 0.5588994220573168\n", + "Precision Score: 0.4250603804797353\n", + "Recall Score: 0.8230769230769232\n", + "Specificity Score: 0.6986938775510205\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 11\n", + "Best Parameters at Index 60 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 2, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n", + "Balanced accuracy score: 0.7567284144427001\n", + "F1 Score: 0.5563016524439233\n", + "Precision Score: 0.42433391599131315\n", + "Recall Score: 0.8186813186813187\n", + "Specificity Score: 0.6947755102040816\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 12\n", + "Best Parameters at Index 8160 : {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 6, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n", + "Balanced accuracy score: 0.7668885400313972\n", + "F1 Score: 0.5754879804114064\n", + "Precision Score: 0.4512433862433863\n", + "Recall Score: 0.8065934065934066\n", + "Specificity Score: 0.7271836734693877\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 13\n", + "Best Parameters at Index 8161 : {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 6, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "Balanced accuracy score: 0.7853751962323391\n", + "F1 Score: 0.5821092385107072\n", + "Precision Score: 0.43648635235732014\n", + "Recall Score: 0.8802197802197803\n", + "Specificity Score: 0.690530612244898\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 14\n", + "Best Parameters at Index 60 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 2, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n", + "Balanced accuracy score: 0.7593155416012559\n", + "F1 Score: 0.5554589371980676\n", + "Precision Score: 0.4169422767248854\n", + "Recall Score: 0.8362637362637363\n", + "Specificity Score: 0.6823673469387755\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 15\n", + "Best Parameters at Index 600 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 4, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n", + "Balanced accuracy score: 0.7812119309262167\n", + "F1 
Score: 0.5818895255480622\n", + "Precision Score: 0.44234472934472935\n", + "Recall Score: 0.8516483516483516\n", + "Specificity Score: 0.7107755102040817\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 16\n", + "Best Parameters at Index 2761 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 10, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "Balanced accuracy score: 0.7639089481946626\n", + "F1 Score: 0.5590271923942298\n", + "Precision Score: 0.4172192749778957\n", + "Recall Score: 0.8494505494505494\n", + "Specificity Score: 0.6783673469387755\n", + "\n", + "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n", + "random_state: 17\n", + "Best Parameters at Index 9780 : {'gamma': 0, 'learning_rate': 0.4, 'max_bin': 2, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n", + "Balanced accuracy score: 0.770967032967033\n", + "F1 Score: 0.5707573812580231\n", + "Precision Score: 0.43912169312169314\n", + "Recall Score: 0.823076923076923\n", + "Specificity Score: 0.7188571428571429\n", + "\n", + "Best run is the 13th run. Score: 0.7853751962323391\n" + ] + } + ], + "source": [ + "model = XGBClassifier(objective=\"binary:logistic\")\n", + "\n", + "# param_grid = {\n", + "# \"gamma\": [0.2, 0.3],\n", + "# \"learning_rate\": [0.3, 0.5],\n", + "# \"max_bin\": [2, 5, 10, 20],\n", + "# \"max_depth\": [1, 2, 3],\n", + "# \"max_leaves\": [1, 2, 3, 4],\n", + "# \"n_estimators\": [5, 10, 20, 30, 40, 50],\n", + "# \"scale_pos_weight\": [4.5], # imbalanced data\n", + "# }\n", + "param_grid = {\n", + " \"gamma\": [0, 0.01, 0.1, 0.3],\n", + " \"learning_rate\": [0.2, 0.25, 0.3, 0.4],\n", + " \"max_bin\": [2, 4, 5, 6, 8, 10],\n", + " \"max_depth\": [1, 2, 3],\n", + " \"max_leaves\": [1, 2, 3],\n", + " \"min_child_weight\": [0, 0.001, 0.005, 0.01, 1],\n", + " \"n_estimators\": [30, 50, 70, 75, 80, 100],\n", + " \"num_parallel_tree\": [1],\n", + " \"scale_pos_weight\": [3.8, 4.5], # imbalanced data\n", + "}\n", + "\n", + "best_score = 0\n", + "best_score_at = None\n", + "\n", + "for i in range(18):\n", + "\n", + " kf = StratifiedKFold(5, shuffle=True, random_state=i)\n", + "\n", + " # Set up the GridSearchCV\n", + " grid_search = GridSearchCV(\n", + " estimator=model,\n", + " param_grid=param_grid,\n", + " scoring={\n", + " \"f1\": \"f1\",\n", + " \"recall\": \"recall\",\n", + " \"specificity\": make_scorer(recall_score, pos_label=0),\n", + " \"precision\": \"precision\",\n", + " \"balanced_accuracy_score\": make_scorer(balanced_accuracy_score),\n", + " },\n", + " cv=kf,\n", + " # cv=5,\n", + " verbose=1,\n", + " n_jobs=-1,\n", + " return_train_score=True,\n", + " refit=\"balanced_accuracy_score\",\n", + " )\n", + "\n", + " # Fit the model\n", + " grid_search.fit(X_train_full, y_train_full)\n", + "\n", + " # Get the best parameters and best score\n", + " result = pd.DataFrame(grid_search.cv_results_)\n", + " best_params = grid_search.best_params_\n", + " best_index = grid_search.best_index_\n", + " best_f1 = result[\"mean_test_f1\"][best_index]\n", + " best_precision = result[\"mean_test_precision\"][best_index]\n", + " best_recall = result[\"mean_test_recall\"][best_index]\n", + " best_specificity = result[\"mean_test_specificity\"][best_index]\n", + " best_balanced_accuracy_score = result[\"mean_test_balanced_accuracy_score\"][best_index]\n", + "\n", + " print(f\"random_state: 
{i}\")\n", + " print(f\"Best Parameters at Index {best_index} :\", best_params)\n", + " print(f\"Balanced accuracy score: {best_balanced_accuracy_score}\")\n", + " print(f\"F1 Score: {best_f1}\")\n", + " print(f\"Precision Score: {best_precision}\")\n", + " print(f\"Recall Score: {best_recall}\")\n", + " print(f\"Specificity Score: {best_specificity}\")\n", + " print()\n", + "\n", + " if best_balanced_accuracy_score > best_score:\n", + " best_score = best_balanced_accuracy_score\n", + " best_score_at = i\n", + "\n", + " pd.DataFrame(grid_search.cv_results_).to_csv(f\"output{i}.csv\")\n", + "\n", + "print(f\"Best run is the {best_score_at}th run. Score: {best_score}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 68, + "metadata": {}, + "outputs": [], + "source": [ + "model = grid_search.best_estimator_\n", + "model.save_model(\"model.ubj\")" + ] + }, + { + "cell_type": "code", + "execution_count": 69, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['param_gamma', 'param_learning_rate', 'param_max_bin', 'param_max_depth', 'param_max_leaves', 'param_min_child_weight', 'param_n_estimators', 'param_num_parallel_tree', 'param_scale_pos_weight']\n" + ] + } + ], + "source": [ + "results = pd.DataFrame(grid_search.cv_results_)\n", + "result_start = 4\n", + "print(list(results.keys())[result_start:result_start+len(param_grid)])\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "results = pd.DataFrame(grid_search.cv_results_)\n", + "\n", + "# filter = results['param_num_parallel_tree'] == 10\n", + "filter = pd.Series([True] * len(results)) # include all data\n", + "\n", + "fig = go.Figure()\n", + "# Add mean train score trace\n", + "fig.add_trace(go.Scatter(\n", + " x=list(range(len(results[\"mean_train_f1\"][filter]))),\n", + " y=results[\"mean_train_f1\"][filter],\n", + " mode='lines+markers',\n", + " name='Mean Train F1',\n", + " text=list(results['params'][filter]), # Display parameter values on hover\n", + " hoverinfo='text+y+x', # Show parameter values and y value\n", + "))\n", + "fig.add_trace(go.Scatter(\n", + " x=list(range(len(results[\"mean_train_recall\"][filter]))),\n", + " y=results[\"mean_train_recall\"][filter],\n", + " mode='lines+markers',\n", + " name='Mean Train Recall',\n", + " text=list(results['params'][filter]), # Display parameter values on hover\n", + " hoverinfo='text+y+x', # Show parameter values and y value\n", + " visible=\"legendonly\",\n", + "))\n", + "fig.add_trace(go.Scatter(\n", + " x=list(range(len(results[\"mean_train_specificity\"][filter]))),\n", + " y=results[\"mean_train_specificity\"][filter],\n", + " mode='lines+markers',\n", + " name='Mean Train Specificity',\n", + " text=list(results['params'][filter]), # Display parameter values on hover\n", + " hoverinfo='text+y+x', # Show parameter values and y value\n", + " visible=\"legendonly\",\n", + "))\n", + "# Add mean test score trace\n", + "fig.add_trace(go.Scatter(\n", + " x=list(range(len(results[\"mean_test_f1\"][filter]))),\n", + " y=results[\"mean_test_f1\"][filter],\n", + " mode='lines+markers',\n", + " name='Mean Test F1',\n", + " text=list(results['params'][filter]), # Display parameter values on hover\n", + " hoverinfo='text+y+x', # Show parameter values and y value\n", + "))\n", + "fig.add_trace(go.Scatter(\n", + " x=list(range(len(results[\"mean_test_recall\"][filter]))),\n", + " y=results[\"mean_test_recall\"][filter],\n", + " mode='lines+markers',\n", + " name='Mean 
Test Recall',\n", + " text=list(results['params'][filter]), # Display parameter values on hover\n", + " hoverinfo='text+y+x', # Show parameter values and y value\n", + " visible=\"legendonly\",\n", + "))\n", + "fig.add_trace(go.Scatter(\n", + " x=list(range(len(results[\"mean_test_specificity\"][filter]))),\n", + " y=results[\"mean_test_specificity\"][filter],\n", + " mode='lines+markers',\n", + " name='Mean Test Specificity',\n", + " text=list(results['params'][filter]), # Display parameter values on hover\n", + " hoverinfo='text+y+x', # Show parameter values and y value\n", + " visible=\"legendonly\",\n", + "))\n", + "\n", + "# Update layout\n", + "fig.update_layout(\n", + " title='Grid Search Mean Train and Test Scores',\n", + " xaxis_title='Parameter Combinations (Index)',\n", + " yaxis_title='Score',\n", + " legend_title='Scores',\n", + " hovermode='closest'\n", + ")\n", + "fig.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 71, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing set:\n", + "(79, 25)\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.95 0.60 0.73 62\n", + " 1.0 0.38 0.88 0.53 17\n", + "\n", + " accuracy 0.66 79\n", + " macro avg 0.66 0.74 0.63 79\n", + "weighted avg 0.83 0.66 0.69 79\n", + "\n", + "[[37 25]\n", + " [ 2 15]]\n", + "\n", + "Balanced accuracy score: 0.7395635673624288\n", + "F1 Score: 0.5263157894736842\n", + "Precision: 0.375\n", + "Recall: 0.8823529411764706\n", + "Specificity: 0.5967741935483871\n", + "\n", + "Splited the data into train and test. The test will not be used in the training, but just for test the xgb. \n", + "The training data has 316 data. The testing data has 79 data. \n", + "Positive ratio: \n", + "\tTrain: 0.21203\n", + "\tTest: 0.21519\n", + "Best Parameters at Index 1112 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 8, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 70, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "Balanced accuracy score: 0.7834536891679749\n", + "F1 Score: 0.5803091217725365\n", + "Precision Score: 0.4370269041303524\n", + "Recall Score: 0.868131868131868\n", + "Specificity Score: 0.6987755102040816\n", + "\n", + "Training set:\n", + "(316, 25)\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.97 0.69 0.81 249\n", + " 1.0 0.44 0.91 0.60 67\n", + "\n", + " accuracy 0.74 316\n", + " macro avg 0.70 0.80 0.70 316\n", + "weighted avg 0.86 0.74 0.76 316\n", + "\n", + "[[172 77]\n", + " [ 6 61]]\n", + "\n", + "Testing set:\n", + "(79, 25)\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.95 0.60 0.73 62\n", + " 1.0 0.38 0.88 0.53 17\n", + "\n", + " accuracy 0.66 79\n", + " macro avg 0.66 0.74 0.63 79\n", + "weighted avg 0.83 0.66 0.69 79\n", + "\n", + "[[37 25]\n", + " [ 2 15]]\n", + "\n", + "Balanced accuracy score: 0.7395635673624288\n", + "F1 Score: 0.5263157894736842\n", + "Precision: 0.375\n", + "Recall: 0.8823529411764706\n", + "Specificity: 0.5967741935483871\n" + ] + } + ], + "source": [ + "model = grid_search.best_estimator_\n", + "\n", + "X_test = X_test_reserved\n", + "\n", + "y_pred = model.predict(X_test)\n", + "report = classification_report(y_test_reserved, y_pred)\n", + "cm = confusion_matrix(y_test_reserved, y_pred)\n", + "\n", + "print(\"Testing set:\")\n", + "print(X_test_reserved.shape)\n", + "print(report)\n", + "print(cm)\n", + "print()\n", + "print(f\"Balanced accuracy score: {balanced_accuracy_score(y_test_reserved, y_pred)}\")\n", + "print(f\"F1 
Score: {f1_score(y_test_reserved, y_pred)}\")\n", + "print(f\"Precision: {precision_score(y_test_reserved, y_pred)}\")\n", + "print(f\"Recall: {recall_score(y_test_reserved, y_pred)}\")\n", + "print(f\"Specificity: {recall_score(y_test_reserved, y_pred, pos_label=0)}\")\n", + "\n", + "while True:\n", + "    X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=None)\n", + "\n", + "    X_train_full.reset_index(drop=True, inplace=True)\n", + "    X_test_reserved.reset_index(drop=True, inplace=True)\n", + "    y_train_full.reset_index(drop=True, inplace=True)\n", + "    y_test_reserved.reset_index(drop=True, inplace=True)\n", + "\n", + "    ratio_train = sum(y_train_full[y_train_full==1]) / len(y_train_full)\n", + "    ratio_test = sum(y_test_reserved[y_test_reserved==1]) / len(y_test_reserved)\n", + "\n", + "    if abs(ratio_train - ratio_test) < 0.01:\n", + "        break\n", + "\n", + "print()\n", + "print(\"Split the data into train and test sets. The test set is not used during training; it is reserved for evaluating the XGB model. \")\n", + "print(f\"The training set has {len(X_train_full)} samples. The test set has {len(X_test_reserved)} samples. \")\n", + "print(f\"Positive ratio: \\n\\tTrain: {ratio_train:.5f}\\n\\tTest: {ratio_test:.5f}\")\n", + "\n", + "# print(f\"TARGET_NUM_OF_FEATURES: {TARGET_NUM_OF_FEATURES}, scaler: {SCALER}, num_of_features: {num_of_features}\")\n", + "\n", + "print(f\"Best Parameters at Index {best_index} :\", best_params)\n", + "print(f\"Balanced accuracy score: {best_balanced_accuracy_score}\")\n", + "print(f\"F1 Score: {best_f1}\")\n", + "print(f\"Precision Score: {best_precision}\")\n", + "print(f\"Recall Score: {best_recall}\")\n", + "print(f\"Specificity Score: {best_specificity}\")\n", + "\n", + "model.fit(X_train_full, y_train_full)\n", + "\n", + "\n", + "\n", + "y_pred = model.predict(X_train_full)\n", + "report = classification_report(y_train_full, y_pred)\n", + "cm = confusion_matrix(y_train_full, y_pred)\n", + "\n", + "print(\"\\nTraining set:\")\n", + "print(X_train_full.shape)\n", + "print(report)\n", + "print(cm)\n", + "print()\n", + "\n", + "X_test = X_test_reserved\n", + "\n", + "y_pred = model.predict(X_test)\n", + "report = classification_report(y_test_reserved, y_pred)\n", + "cm = confusion_matrix(y_test_reserved, y_pred)\n", + "\n", + "print(\"Testing set:\")\n", + "print(X_test_reserved.shape)\n", + "print(report)\n", + "print(cm)\n", + "print()\n", + "print(f\"Balanced accuracy score: {balanced_accuracy_score(y_test_reserved, y_pred)}\")\n", + "print(f\"F1 Score: {f1_score(y_test_reserved, y_pred)}\")\n", + "print(f\"Precision: {precision_score(y_test_reserved, y_pred)}\")\n", + "print(f\"Recall: {recall_score(y_test_reserved, y_pred)}\")\n", + "print(f\"Specificity: {recall_score(y_test_reserved, y_pred, pos_label=0)}\")\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 72, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "F1 Score: 0.5403225806451613\n", + "Precision: 0.40853658536585363\n", + "Recall: 0.7976190476190477\n", + "Specificity: 0.6881028938906752\n", + "[[214 97]\n", + " [ 17 67]]\n" + ] + } + ], + "source": [ + "stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True)\n", + "\n", + "y_pred_cv = cross_val_predict(model, X, y, cv=stratified_kfold)\n", + "\n", + "# Calculate evaluation metrics\n", + "f1 = f1_score(y, y_pred_cv)\n", + "precision = precision_score(y, y_pred_cv)\n", + "recall = recall_score(y, y_pred_cv)\n", + "specificity = 
recall_score(y, y_pred_cv, pos_label=0)\n", + "\n", + "# Print evaluation metrics\n", + "print(\"F1 Score:\", f1)\n", + "print(\"Precision:\", precision)\n", + "print(\"Recall:\", recall)\n", + "print(\"Specificity:\", specificity)\n", + "\n", + "print(confusion_matrix(y, y_pred_cv))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "`Best Parameters at Index 244 : {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 5, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0.001, 'n_estimators': 75, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}`\n", + "\n", + "`Best Parameters at Index 1112 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 8, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 70, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}`\n", + "\n", + "`random_state=13` `{'gamma': 0, 'learning_rate': 0.3, 'max_bin': 6, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}`\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "MLEAsm", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}