ML_BreastCancerTreatment / Git / [e6e569] /XGBClassification/main old.ipynb

Models:
joseph-gordon/
ML_BreastCancerTreatment
Downloads: 1
[e6e569]: / XGBClassification / main old.ipynb
History
Download this file
860 lines (859 with data), 36.1 kB

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install `xlrd` for reading the `xls` file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %conda install xlrd==2.0.1\n",
    "# $ conda install -c conda-forge py-xgboost-gpu\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Set the path to the `xls` file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_file = \"../TrainDataset2024.xls\"\n",
    "# training_file = \"/kaggle/input/dataset/TrainDataset2024.xls\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-12-05 00:35:47.391919: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
      "2024-12-05 00:35:47.557489: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
      "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
      "E0000 00:00:1733358947.620102    1213 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
      "E0000 00:00:1733358947.638547    1213 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
      "2024-12-05 00:35:47.796076: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "import os\n",
    "\n",
    "# Add the parent directory to the system path\n",
    "sys.path.append(os.path.abspath('../'))  # Adjust the path as needed\n",
    "\n",
    "from my_util import df_to_corr_matrix, remove_outliers\n",
    "\n",
    "import tensorflow as tf\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import plotly.graph_objects as go\n",
    "\n",
    "from matplotlib.colors import Normalize\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, cross_val_predict, StratifiedKFold\n",
    "from sklearn.preprocessing import StandardScaler, RobustScaler\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, make_scorer, balanced_accuracy_score\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif\n",
    "from sklearn.impute import KNNImputer\n",
    "\n",
    "\n",
    "from imblearn.over_sampling import SMOTE\n",
    "from imblearn.pipeline import Pipeline\n",
    "\n",
    "from joblib import Parallel, delayed\n",
    "\n",
    "import xgboost as xgb\n",
    "from xgboost import XGBClassifier\n",
    "\n",
    "from pickle import dump , load\n",
    "\n",
    "import warnings"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read the data into X and y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded '../FeatureSelection/pkl/corr_25_selected_features.pkl' to selected_feature\n",
      "(395, 25) (395,)\n",
      "['Gene', 'HER2', 'PgR', 'ER', 'original_firstorder_10Percentile', 'original_ngtdm_Busyness', 'LNStatus', 'TumourStage', 'original_gldm_DependenceEntropy', 'original_firstorder_Skewness', 'original_glrlm_ShortRunHighGrayLevelEmphasis', 'original_ngtdm_Strength', 'original_gldm_SmallDependenceEmphasis', 'original_firstorder_InterquartileRange', 'original_shape_MajorAxisLength', 'original_glrlm_LongRunLowGrayLevelEmphasis', 'original_firstorder_Minimum', 'HistologyType', 'ChemoGrade', 'original_shape_Maximum2DDiameterRow', 'original_shape_Maximum2DDiameterColumn', 'original_shape_SurfaceVolumeRatio', 'original_shape_LeastAxisLength', 'original_glcm_Autocorrelation', 'original_shape_Sphericity']\n"
     ]
    }
   ],
   "source": [
    "NUM_OF_SELECTED_FEATURES = \"corr_25\"  # tag of the saved feature list (file-name prefix), not a count\n",
    "\n",
    "# Build the frame in one chain instead of mutating it in place across statements:\n",
    "#   * 999 is replaced by NaN (it is treated as the missing-value marker),\n",
    "#   * the ID and the RelapseFreeSurvival outcome columns are dropped,\n",
    "#   * rows without a pCR label are dropped because pCR is the target below.\n",
    "data = (\n",
    "    pd.read_excel(training_file)\n",
    "    .replace(999, np.nan)\n",
    "    .drop(columns=[\"ID\", \"RelapseFreeSurvival (outcome)\"])\n",
    "    .dropna(subset=[\"pCR (outcome)\"])\n",
    ")\n",
    "\n",
    "# NOTE(security): unpickling can execute arbitrary code; only open feature\n",
    "# lists that were produced by this project.\n",
    "with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:\n",
    "    selected_features = load(file)\n",
    "    print(f\"Loaded '{file.name}' to selected_feature\")\n",
    "\n",
    "# Guard against label leakage: the target must never be one of the inputs.\n",
    "if \"pCR (outcome)\" in list(selected_features):\n",
    "    raise ValueError(\"selected_features must not contain the target column 'pCR (outcome)'\")\n",
    "\n",
    "X = data[selected_features]\n",
    "y = data[\"pCR (outcome)\"]\n",
    "print(X.shape, y.shape)\n",
    "\n",
    "print(selected_features)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # Set up the matplotlib figure\n",
    "# plt.figure(figsize=(40, 30))\n",
    "\n",
    "# # Loop through each feature to create a scatter plot\n",
    "# for i, feature in enumerate(X.columns):\n",
    "#     plt.subplot(5, 6, i + 1)  # Adjust the number of rows and columns based on the number of features\n",
    "#     sns.scatterplot(x=y, y=X[feature], hue=y, style=y, palette='Set2', alpha=0.7)\n",
    "#     plt.title(feature)\n",
    "#     plt.xlabel('pCR (outcome)')\n",
    "#     plt.ylabel(feature)\n",
    "#     plt.xlim(-2, 3)\n",
    "\n",
    "# plt.tight_layout()\n",
    "# plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 63,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_to_corr_matrix(X, size_factor=1.6, sep=150)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Split the data into train_full and test_reserved (untouched)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Splited the data into train and test. The test will not be used in the training, but just for test the xgb. \n",
      "The training data has 316 data. The testing data has 79 data. \n",
      "Positive ratio: \n",
      "\tTrain: 0.21203\n",
      "\tTest: 0.21519\n"
     ]
    }
   ],
   "source": [
    "# Close ratio random_state\n",
    "# [14, 47, 49, 52, 62, 76, 83, 89, 92, 116, 118, 122, 136, 138, 144, 146, 150, 156, 157, 159, 170, 172, 174, 185]\n",
    "\n",
    "# The seed is fixed, so the split is deterministic: retrying it in a loop could\n",
    "# never change the outcome (it would either pass at once or spin forever).\n",
    "# Split exactly once and fail loudly if the class balance check does not hold.\n",
    "X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(\n",
    "    X, y, test_size=0.2, random_state=14  # similar distribution of 1 and 0\n",
    ")\n",
    "\n",
    "# Re-number the rows from 0 so positional and label indexing agree downstream.\n",
    "X_train_full = X_train_full.reset_index(drop=True)\n",
    "X_test_reserved = X_test_reserved.reset_index(drop=True)\n",
    "y_train_full = y_train_full.reset_index(drop=True)\n",
    "y_test_reserved = y_test_reserved.reset_index(drop=True)\n",
    "\n",
    "# Fraction of positive (== 1) labels in each part.\n",
    "ratio_train = (y_train_full == 1).mean()\n",
    "ratio_test = (y_test_reserved == 1).mean()\n",
    "\n",
    "if abs(ratio_train - ratio_test) >= 0.1:\n",
    "    raise ValueError(\n",
    "        f\"Positive ratio differs too much between train ({ratio_train:.5f}) \"\n",
    "        f\"and test ({ratio_test:.5f}); choose another random_state\"\n",
    "    )\n",
    "\n",
    "print(\"Splited the data into train and test. The test will not be used in the training, but just for test the xgb. \")\n",
    "print(f\"The training data has {len(X_train_full)} data. The testing data has {len(X_test_reserved)} data. \")\n",
    "print(f\"Positive ratio: \\n\\tTrain: {ratio_train:.5f}\\n\\tTest: {ratio_test:.5f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Outliers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 65,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # The result of keeping outliers is better\n",
    "# X_train_full, y_train_full = remove_outliers(X_train_full, y_train_full, selected_features)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### XGBoost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(316, 25)\n",
      "(316,)\n"
     ]
    }
   ],
   "source": [
    "# Show the shapes of the training features and labels, one per line\n",
    "print(X_train_full.shape, y_train_full.shape, sep=\"\\n\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n",
      "random_state: 0\n",
      "Best Parameters at Index 8161 : {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 6, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n",
      "Balanced accuracy score: 0.7762323390894819\n",
      "F1 Score: 0.5732413323110996\n",
      "Precision Score: 0.42922841143530793\n",
      "Recall Score: 0.8659340659340661\n",
      "Specificity Score: 0.686530612244898\n",
      "\n",
      "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n",
      "random_state: 1\n",
      "Best Parameters at Index 3843 : {'gamma': 0, 'learning_rate': 0.25, 'max_bin': 4, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 50, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n",
      "Balanced accuracy score: 0.763171114599686\n",
      "F1 Score: 0.5623509823509825\n",
      "Precision Score: 0.4324091925871681\n",
      "Recall Score: 0.823076923076923\n",
      "Specificity Score: 0.703265306122449\n",
      "\n",
      "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n",
      "random_state: 2\n",
      "Best Parameters at Index 1691 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 6, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 100, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n",
      "Balanced accuracy score: 0.7733186813186812\n",
      "F1 Score: 0.5808290356353484\n",
      "Precision Score: 0.45272486772486775\n",
      "Recall Score: 0.8197802197802198\n",
      "Specificity Score: 0.7268571428571429\n",
      "\n",
      "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n",
      "random_state: 3\n",
      "Best Parameters at Index 7083 : {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 4, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 50, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n",
      "Balanced accuracy score: 0.7692747252747253\n",
      "F1 Score: 0.5710505619042204\n",
      "Precision Score: 0.4382588000235059\n",
      "Recall Score: 0.8362637362637363\n",
      "Specificity Score: 0.7022857142857143\n",
      "\n",
      "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n",
      "random_state: 4\n",
      "Best Parameters at Index 601 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 4, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n",
      "Balanced accuracy score: 0.7721507064364206\n",
      "F1 Score: 0.5713989997742859\n",
      "Precision Score: 0.43731630688152434\n",
      "Recall Score: 0.8373626373626374\n",
      "Specificity Score: 0.7069387755102041\n",
      "\n",
      "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n",
      "random_state: 5\n",
      "Best Parameters at Index 1685 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 6, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 70, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n",
      "Balanced accuracy score: 0.7514662480376766\n",
      "F1 Score: 0.5524357534869451\n",
      "Precision Score: 0.42733333333333334\n",
      "Recall Score: 0.787912087912088\n",
      "Specificity Score: 0.7150204081632653\n",
      "\n",
      "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n",
      "random_state: 6\n",
      "Best Parameters at Index 3309 : {'gamma': 0, 'learning_rate': 0.25, 'max_bin': 2, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 80, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n",
      "Balanced accuracy score: 0.7787849293563578\n",
      "F1 Score: 0.577384971479965\n",
      "Precision Score: 0.43828953655040614\n",
      "Recall Score: 0.8505494505494505\n",
      "Specificity Score: 0.7070204081632653\n",
      "\n",
      "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n",
      "random_state: 7\n",
      "Best Parameters at Index 6540 : {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 2, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n",
      "Balanced accuracy score: 0.7682605965463108\n",
      "F1 Score: 0.5682149974832902\n",
      "Precision Score: 0.4367256817256817\n",
      "Recall Score: 0.8175824175824176\n",
      "Specificity Score: 0.7189387755102039\n",
      "\n",
      "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n",
      "random_state: 8\n",
      "Best Parameters at Index 600 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 4, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n",
      "Balanced accuracy score: 0.7585651491365777\n",
      "F1 Score: 0.5616951747439553\n",
      "Precision Score: 0.4316868417377755\n",
      "Recall Score: 0.8186813186813187\n",
      "Specificity Score: 0.6984489795918367\n",
      "\n",
      "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n",
      "random_state: 9\n",
      "Best Parameters at Index 6001 : {'gamma': 0, 'learning_rate': 0.25, 'max_bin': 10, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n",
      "Balanced accuracy score: 0.781927786499215\n",
      "F1 Score: 0.5872351485663194\n",
      "Precision Score: 0.44791534391534393\n",
      "Recall Score: 0.8648351648351648\n",
      "Specificity Score: 0.6990204081632653\n",
      "\n",
      "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n",
      "random_state: 10\n",
      "Best Parameters at Index 4380 : {'gamma': 0, 'learning_rate': 0.25, 'max_bin': 5, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n",
      "Balanced accuracy score: 0.7608854003139717\n",
      "F1 Score: 0.5588994220573168\n",
      "Precision Score: 0.4250603804797353\n",
      "Recall Score: 0.8230769230769232\n",
      "Specificity Score: 0.6986938775510205\n",
      "\n",
      "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n",
      "random_state: 11\n",
      "Best Parameters at Index 60 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 2, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n",
      "Balanced accuracy score: 0.7567284144427001\n",
      "F1 Score: 0.5563016524439233\n",
      "Precision Score: 0.42433391599131315\n",
      "Recall Score: 0.8186813186813187\n",
      "Specificity Score: 0.6947755102040816\n",
      "\n",
      "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n",
      "random_state: 12\n",
      "Best Parameters at Index 8160 : {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 6, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n",
      "Balanced accuracy score: 0.7668885400313972\n",
      "F1 Score: 0.5754879804114064\n",
      "Precision Score: 0.4512433862433863\n",
      "Recall Score: 0.8065934065934066\n",
      "Specificity Score: 0.7271836734693877\n",
      "\n",
      "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n",
      "random_state: 13\n",
      "Best Parameters at Index 8161 : {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 6, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n",
      "Balanced accuracy score: 0.7853751962323391\n",
      "F1 Score: 0.5821092385107072\n",
      "Precision Score: 0.43648635235732014\n",
      "Recall Score: 0.8802197802197803\n",
      "Specificity Score: 0.690530612244898\n",
      "\n",
      "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n",
      "random_state: 14\n",
      "Best Parameters at Index 60 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 2, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n",
      "Balanced accuracy score: 0.7593155416012559\n",
      "F1 Score: 0.5554589371980676\n",
      "Precision Score: 0.4169422767248854\n",
      "Recall Score: 0.8362637362637363\n",
      "Specificity Score: 0.6823673469387755\n",
      "\n",
      "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n",
      "random_state: 15\n",
      "Best Parameters at Index 600 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 4, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n",
      "Balanced accuracy score: 0.7812119309262167\n",
      "F1 Score: 0.5818895255480622\n",
      "Precision Score: 0.44234472934472935\n",
      "Recall Score: 0.8516483516483516\n",
      "Specificity Score: 0.7107755102040817\n",
      "\n",
      "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n",
      "random_state: 16\n",
      "Best Parameters at Index 2761 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 10, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n",
      "Balanced accuracy score: 0.7639089481946626\n",
      "F1 Score: 0.5590271923942298\n",
      "Precision Score: 0.4172192749778957\n",
      "Recall Score: 0.8494505494505494\n",
      "Specificity Score: 0.6783673469387755\n",
      "\n",
      "Fitting 5 folds for each of 51840 candidates, totalling 259200 fits\n",
      "random_state: 17\n",
      "Best Parameters at Index 9780 : {'gamma': 0, 'learning_rate': 0.4, 'max_bin': 2, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 3.8}\n",
      "Balanced accuracy score: 0.770967032967033\n",
      "F1 Score: 0.5707573812580231\n",
      "Precision Score: 0.43912169312169314\n",
      "Recall Score: 0.823076923076923\n",
      "Specificity Score: 0.7188571428571429\n",
      "\n",
      "Best run is the 13th run. Score: 0.7853751962323391\n"
     ]
    }
   ],
   "source": [
    "model = XGBClassifier(objective=\"binary:logistic\")\n",
    "\n",
    "param_grid = {\n",
    "    \"gamma\": [0, 0.01, 0.1, 0.3],\n",
    "    \"learning_rate\": [0.2, 0.25, 0.3, 0.4],\n",
    "    \"max_bin\": [2, 4, 5, 6, 8, 10],\n",
    "    \"max_depth\": [1, 2, 3],\n",
    "    \"max_leaves\": [1, 2, 3],\n",
    "    \"min_child_weight\": [0, 0.001, 0.005, 0.01, 1],\n",
    "    \"n_estimators\": [30, 50, 70, 75, 80, 100],\n",
    "    \"num_parallel_tree\": [1],\n",
    "    \"scale_pos_weight\": [3.8, 4.5],  # imbalanced data\n",
    "}\n",
    "\n",
    "# Metrics recorded for every candidate; the refit metric picks the winner.\n",
    "scoring = {\n",
    "    \"f1\": \"f1\",\n",
    "    \"recall\": \"recall\",\n",
    "    \"specificity\": make_scorer(recall_score, pos_label=0),  # recall of class 0\n",
    "    \"precision\": \"precision\",\n",
    "    \"balanced_accuracy_score\": make_scorer(balanced_accuracy_score),\n",
    "}\n",
    "\n",
    "\n",
    "def best_cv_means(cv_table, index):\n",
    "    \"\"\"Return the mean CV test scores stored in row `index` of a cv_results_ table.\n",
    "\n",
    "    The tuple order is (f1, precision, recall, specificity, balanced_accuracy_score).\n",
    "    \"\"\"\n",
    "    metric_names = (\"f1\", \"precision\", \"recall\", \"specificity\", \"balanced_accuracy_score\")\n",
    "    return tuple(cv_table.at[index, f\"mean_test_{name}\"] for name in metric_names)\n",
    "\n",
    "\n",
    "best_score = 0\n",
    "best_score_at = None\n",
    "best_grid_search = None  # fitted search of the best run, kept for the cells below\n",
    "\n",
    "# Repeat the whole grid search with differently shuffled stratified folds.\n",
    "# NOTE(review): picking the best of several fold shufflings makes the reported\n",
    "# CV score optimistic; treat the reserved test set as the unbiased estimate.\n",
    "for i in range(18):\n",
    "\n",
    "    kf = StratifiedKFold(5, shuffle=True, random_state=i)\n",
    "\n",
    "    # Set up the GridSearchCV\n",
    "    grid_search = GridSearchCV(\n",
    "        estimator=model,\n",
    "        param_grid=param_grid,\n",
    "        scoring=scoring,\n",
    "        cv=kf,\n",
    "        verbose=1,\n",
    "        n_jobs=-1,\n",
    "        return_train_score=True,\n",
    "        refit=\"balanced_accuracy_score\",\n",
    "    )\n",
    "\n",
    "    # Fit the model\n",
    "    grid_search.fit(X_train_full, y_train_full)\n",
    "\n",
    "    # Get the best parameters and best score of this run\n",
    "    result = pd.DataFrame(grid_search.cv_results_)\n",
    "    best_params = grid_search.best_params_\n",
    "    best_index = grid_search.best_index_\n",
    "    (\n",
    "        best_f1,\n",
    "        best_precision,\n",
    "        best_recall,\n",
    "        best_specificity,\n",
    "        best_balanced_accuracy_score,\n",
    "    ) = best_cv_means(result, best_index)\n",
    "\n",
    "    print(f\"random_state: {i}\")\n",
    "    print(f\"Best Parameters at Index {best_index} :\", best_params)\n",
    "    print(f\"Balanced accuracy score: {best_balanced_accuracy_score}\")\n",
    "    print(f\"F1 Score: {best_f1}\")\n",
    "    print(f\"Precision Score: {best_precision}\")\n",
    "    print(f\"Recall Score: {best_recall}\")\n",
    "    print(f\"Specificity Score: {best_specificity}\")\n",
    "    print()\n",
    "\n",
    "    if best_balanced_accuracy_score > best_score:\n",
    "        best_score = best_balanced_accuracy_score\n",
    "        best_score_at = i\n",
    "        best_grid_search = grid_search\n",
    "\n",
    "    # Keep the full table of every run for offline inspection\n",
    "    result.to_csv(f\"output{i}.csv\")\n",
    "\n",
    "print(f\"Best run is the {best_score_at}th run. Score: {best_score}\")\n",
    "\n",
    "# Later cells read `grid_search`, `best_params`, `best_index` and the `best_*`\n",
    "# metrics. Without this rebinding they would describe the LAST run of the loop\n",
    "# rather than the best run announced above.\n",
    "if best_grid_search is not None:\n",
    "    grid_search = best_grid_search\n",
    "    result = pd.DataFrame(grid_search.cv_results_)\n",
    "    best_params = grid_search.best_params_\n",
    "    best_index = grid_search.best_index_\n",
    "    (\n",
    "        best_f1,\n",
    "        best_precision,\n",
    "        best_recall,\n",
    "        best_specificity,\n",
    "        best_balanced_accuracy_score,\n",
    "    ) = best_cv_means(result, best_index)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 68,
   "metadata": {},
   "outputs": [],
   "source": [
    "model = grid_search.best_estimator_\n",
    "model.save_model(\"model.ubj\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['param_gamma', 'param_learning_rate', 'param_max_bin', 'param_max_depth', 'param_max_leaves', 'param_min_child_weight', 'param_n_estimators', 'param_num_parallel_tree', 'param_scale_pos_weight']\n"
     ]
    }
   ],
   "source": [
    "results = pd.DataFrame(grid_search.cv_results_)\n",
    "# Pick the hyper-parameter columns by their \"param_\" prefix instead of a\n",
    "# hard-coded column offset; the dict-valued \"params\" column is not matched.\n",
    "print([column for column in results.columns if column.startswith(\"param_\")])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "results = pd.DataFrame(grid_search.cv_results_)\n",
    "\n",
    "# Boolean mask of the candidates to plot (named so it does not shadow the builtin `filter`).\n",
    "# row_mask = results['param_num_parallel_tree'] == 10\n",
    "row_mask = pd.Series(True, index=results.index)  # include all data\n",
    "\n",
    "plotted = results[row_mask]\n",
    "x_positions = list(range(len(plotted)))\n",
    "hover_params = list(plotted['params'])  # Display parameter values on hover\n",
    "\n",
    "# (cv_results_ column, legend label, drawn initially?)\n",
    "# Traces that are not drawn initially can be switched on from the legend.\n",
    "trace_specs = [\n",
    "    (\"mean_train_f1\", 'Mean Train F1', True),\n",
    "    (\"mean_train_recall\", 'Mean Train Recall', False),\n",
    "    (\"mean_train_specificity\", 'Mean Train Specificity', False),\n",
    "    (\"mean_test_f1\", 'Mean Test F1', True),\n",
    "    (\"mean_test_recall\", 'Mean Test Recall', False),\n",
    "    (\"mean_test_specificity\", 'Mean Test Specificity', False),\n",
    "]\n",
    "\n",
    "fig = go.Figure()\n",
    "for column, label, drawn_initially in trace_specs:\n",
    "    fig.add_trace(go.Scatter(\n",
    "        x=x_positions,\n",
    "        y=plotted[column],\n",
    "        mode='lines+markers',\n",
    "        name=label,\n",
    "        text=hover_params,\n",
    "        hoverinfo='text+y+x',  # Show parameter values and y value\n",
    "        # None leaves the attribute unset, i.e. the trace is visible by default\n",
    "        visible=None if drawn_initially else \"legendonly\",\n",
    "    ))\n",
    "\n",
    "# Update layout\n",
    "fig.update_layout(\n",
    "    title='Grid Search Mean Train and Test Scores',\n",
    "    xaxis_title='Parameter Combinations (Index)',\n",
    "    yaxis_title='Score',\n",
    "    legend_title='Scores',\n",
    "    hovermode='closest'\n",
    ")\n",
    "fig.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 71,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Testing set:\n",
      "(79, 25)\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.95      0.60      0.73        62\n",
      "         1.0       0.38      0.88      0.53        17\n",
      "\n",
      "    accuracy                           0.66        79\n",
      "   macro avg       0.66      0.74      0.63        79\n",
      "weighted avg       0.83      0.66      0.69        79\n",
      "\n",
      "[[37 25]\n",
      " [ 2 15]]\n",
      "\n",
      "Balanced accuracy score: 0.7395635673624288\n",
      "F1 Score: 0.5263157894736842\n",
      "Precision: 0.375\n",
      "Recall: 0.8823529411764706\n",
      "Specificity: 0.5967741935483871\n",
      "\n",
      "Splited the data into train and test. The test will not be used in the training, but just for test the xgb. \n",
      "The training data has 316 data. The testing data has 79 data. \n",
      "Positive ratio: \n",
      "\tTrain: 0.21203\n",
      "\tTest: 0.21519\n",
      "Best Parameters at Index 1112 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 8, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 70, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n",
      "Balanced accuracy score: 0.7834536891679749\n",
      "F1 Score: 0.5803091217725365\n",
      "Precision Score: 0.4370269041303524\n",
      "Recall Score: 0.868131868131868\n",
      "Specificity Score: 0.6987755102040816\n",
      "\n",
      "Training set:\n",
      "(316, 25)\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.97      0.69      0.81       249\n",
      "         1.0       0.44      0.91      0.60        67\n",
      "\n",
      "    accuracy                           0.74       316\n",
      "   macro avg       0.70      0.80      0.70       316\n",
      "weighted avg       0.86      0.74      0.76       316\n",
      "\n",
      "[[172  77]\n",
      " [  6  61]]\n",
      "\n",
      "Testing set:\n",
      "(79, 25)\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.95      0.60      0.73        62\n",
      "         1.0       0.38      0.88      0.53        17\n",
      "\n",
      "    accuracy                           0.66        79\n",
      "   macro avg       0.66      0.74      0.63        79\n",
      "weighted avg       0.83      0.66      0.69        79\n",
      "\n",
      "[[37 25]\n",
      " [ 2 15]]\n",
      "\n",
      "Balanced accuracy score: 0.7395635673624288\n",
      "F1 Score: 0.5263157894736842\n",
      "Precision: 0.375\n",
      "Recall: 0.8823529411764706\n",
      "Specificity: 0.5967741935483871\n"
     ]
    }
   ],
   "source": [
    "def evaluate_split(estimator, features, labels, title, show_scores=False):\n",
    "    \"\"\"Print a classification report for one data split and return the results.\n",
    "\n",
    "    Parameters:\n",
    "        estimator: fitted classifier exposing `predict`.\n",
    "        features: feature matrix of the split (must expose `.shape`).\n",
    "        labels: true labels of the split.\n",
    "        title: heading printed above the report.\n",
    "        show_scores: when True, also print balanced accuracy, F1, precision,\n",
    "            recall and specificity (recall computed with `pos_label=0`).\n",
    "\n",
    "    Returns:\n",
    "        Tuple `(predictions, report_text, confusion)` for the split.\n",
    "    \"\"\"\n",
    "    predictions = estimator.predict(features)\n",
    "    report_text = classification_report(labels, predictions)\n",
    "    confusion = confusion_matrix(labels, predictions)\n",
    "\n",
    "    print(title)\n",
    "    print(features.shape)\n",
    "    print(report_text)\n",
    "    print(confusion)\n",
    "    print()\n",
    "    if show_scores:\n",
    "        print(f\"Balanced accuracy score: {balanced_accuracy_score(labels, predictions)}\")\n",
    "        print(f\"F1 Score: {f1_score(labels, predictions)}\")\n",
    "        print(f\"Precision: {precision_score(labels, predictions)}\")\n",
    "        print(f\"Recall: {recall_score(labels, predictions)}\")\n",
    "        print(f\"Specificity: {recall_score(labels, predictions, pos_label=0)}\")\n",
    "    return predictions, report_text, confusion\n",
    "\n",
    "\n",
    "model = grid_search.best_estimator_\n",
    "\n",
    "# Score the tuned model on the hold-out set as it exists when this cell starts.\n",
    "X_test = X_test_reserved\n",
    "y_pred, report, cm = evaluate_split(model, X_test, y_test_reserved, \"Testing set:\", show_scores=True)\n",
    "\n",
    "# Draw a fresh 80/20 split. `stratify=y` keeps the class proportions of train and\n",
    "# test aligned in a single call, replacing the previous unseeded retry loop that\n",
    "# re-sampled until the ratios happened to match (non-reproducible, no upper bound\n",
    "# on the number of retries). The fixed seed makes the split repeatable.\n",
    "SPLIT_RANDOM_STATE = 13\n",
    "X_train_full, X_test_reserved, y_train_full, y_test_reserved = (\n",
    "    part.reset_index(drop=True)  # non-mutating; gives each part a clean 0..n-1 index\n",
    "    for part in train_test_split(X, y, test_size=0.2, stratify=y, random_state=SPLIT_RANDOM_STATE)\n",
    ")\n",
    "\n",
    "# Share of samples labelled 1 in each part.\n",
    "ratio_train = (y_train_full == 1).mean()\n",
    "ratio_test = (y_test_reserved == 1).mean()\n",
    "\n",
    "print()\n",
    "print(\"Splited the data into train and test. The test will not be used in the training, but just for test the xgb. \")\n",
    "print(f\"The training data has {len(X_train_full)} data. The testing data has {len(X_test_reserved)} data. \")\n",
    "print(f\"Positive ratio: \\n\\tTrain: {ratio_train:.5f}\\n\\tTest: {ratio_test:.5f}\")\n",
    "\n",
    "# print(f\"TARGET_NUM_OF_FEATURES: {TARGET_NUM_OF_FEATURES}, scaler: {SCALER}, num_of_features: {num_of_features}\")\n",
    "\n",
    "# Summary of the selected configuration; these names are defined outside this cell.\n",
    "print(f\"Best Parameters at Index {best_index} :\", best_params)\n",
    "print(f\"Balanced accuracy score: {best_balanced_accuracy_score}\")\n",
    "print(f\"F1 Score: {best_f1}\")\n",
    "print(f\"Precision Score: {best_precision}\")\n",
    "print(f\"Recall Score: {best_recall}\")\n",
    "print(f\"Specificity Score: {best_specificity}\")\n",
    "\n",
    "# Refit the selected estimator on the new training part only.\n",
    "model.fit(X_train_full, y_train_full)\n",
    "\n",
    "# Training-set report (no scalar scores) ...\n",
    "y_pred, report, cm = evaluate_split(model, X_train_full, y_train_full, \"\\nTraining set:\")\n",
    "\n",
    "# ... followed by the full report on the new hold-out part.\n",
    "X_test = X_test_reserved\n",
    "y_pred, report, cm = evaluate_split(model, X_test, y_test_reserved, \"Testing set:\", show_scores=True)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "F1 Score: 0.5403225806451613\n",
      "Precision: 0.40853658536585363\n",
      "Recall: 0.7976190476190477\n",
      "Specificity: 0.6881028938906752\n",
      "[[214  97]\n",
      " [ 17  67]]\n"
     ]
    }
   ],
   "source": [
    "# Fixed seed so the shuffled folds, and therefore the metrics printed below,\n",
    "# are the same on every run (the splitter was previously unseeded).\n",
    "CV_RANDOM_STATE = 13\n",
    "stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=CV_RANDOM_STATE)\n",
    "\n",
    "# Cross-validated predictions for the whole dataset over the 5 stratified folds.\n",
    "y_pred_cv = cross_val_predict(model, X, y, cv=stratified_kfold)\n",
    "\n",
    "# Calculate evaluation metrics; specificity is recall computed with pos_label=0.\n",
    "f1 = f1_score(y, y_pred_cv)\n",
    "precision = precision_score(y, y_pred_cv)\n",
    "recall = recall_score(y, y_pred_cv)\n",
    "specificity = recall_score(y, y_pred_cv, pos_label=0)\n",
    "\n",
    "# Print evaluation metrics\n",
    "print(\"F1 Score:\", f1)\n",
    "print(\"Precision:\", precision)\n",
    "print(\"Recall:\", recall)\n",
    "print(\"Specificity:\", specificity)\n",
    "\n",
    "print(confusion_matrix(y, y_pred_cv))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "`Best Parameters at Index 244 : {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 5, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0.001, 'n_estimators': 75, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}`\n",
    "\n",
    "`Best Parameters at Index 1112 : {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 8, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 70, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}`\n",
    "\n",
    "`random_state=13` `{'gamma': 0, 'learning_rate': 0.3, 'max_bin': 6, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}`\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "MLEAsm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}