XGBClassification / main.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Install `xlrd` for reading the `xls` file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# %conda install xlrd==2.0.1\n",
    "# $ conda install -c conda-forge py-xgboost-gpu\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Set the path to the `xls` file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "training_file = \"../TrainDataset2024.xls\"\n",
    "# training_file = \"/kaggle/input/dataset/TrainDataset2024.xls\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import os\n",
    "\n",
    "# Add the parent directory to the system path\n",
    "sys.path.append(os.path.abspath('../'))  # Adjust the path as needed\n",
    "\n",
    "from my_util import df_to_corr_matrix, remove_outliers\n",
    "\n",
    "import tensorflow as tf\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import plotly.graph_objects as go\n",
    "\n",
    "from matplotlib.colors import Normalize\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, cross_val_predict, StratifiedKFold\n",
    "from sklearn.preprocessing import StandardScaler, RobustScaler\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, make_scorer, balanced_accuracy_score\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif\n",
    "from sklearn.impute import KNNImputer\n",
    "\n",
    "from skopt import BayesSearchCV\n",
    "from skopt.space import Real, Integer, Categorical\n",
    "\n",
    "from imblearn.over_sampling import SMOTE\n",
    "from imblearn.pipeline import Pipeline\n",
    "\n",
    "from joblib import Parallel, delayed\n",
    "\n",
    "import xgboost as xgb\n",
    "from xgboost import XGBClassifier\n",
    "\n",
    "from pickle import dump , load\n",
    "\n",
    "import warnings"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read the data into X and y"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded '../FeatureSelection/pkl/corr_15_selected_features.pkl' to selected_feature\n",
      "(395, 15) (395,)\n",
      "['Gene', 'HER2', 'PgR', 'ER', 'original_firstorder_10Percentile', 'original_ngtdm_Busyness', 'LNStatus', 'TumourStage', 'original_gldm_DependenceEntropy', 'original_firstorder_Skewness', 'original_glrlm_ShortRunHighGrayLevelEmphasis', 'original_ngtdm_Strength', 'original_gldm_SmallDependenceEmphasis', 'original_firstorder_InterquartileRange', 'original_shape_MajorAxisLength']\n"
     ]
    }
   ],
   "source": [
    "NUM_OF_SELECTED_FEATURES = \"15\"\n",
    "FEATURES_FILE_PREFIX = F\"corr_{NUM_OF_SELECTED_FEATURES}\"\n",
    "\n",
    "\n",
    "data = pd.read_excel(training_file)\n",
    "data.replace(999, np.nan, inplace=True)\n",
    "\n",
    "data.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n",
    "data.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n",
    "\n",
    "with open(f'../FeatureSelection/pkl/{FEATURES_FILE_PREFIX}_selected_features.pkl', mode='rb') as file:\n",
    "    selected_features = load(file)\n",
    "    print(f\"Loaded '{file.name}' to selected_feature\")\n",
    "\n",
    "X = data[selected_features]\n",
    "y = data[\"pCR (outcome)\"]\n",
    "print(X.shape, y.shape)\n",
    "\n",
    "print(selected_features)"
   ]
  },
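  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The `999` placeholders are converted to `NaN` above and then left alone, since XGBoost handles missing values natively. If imputation were preferred instead, a minimal sketch using the already-imported `KNNImputer` could look like the cell below (kept commented out, like the other optional cells, because it is not part of this pipeline)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # Optional sketch (not used here): fill NaNs with a KNN imputer instead of\n",
    "# # relying on XGBoost's native missing-value handling\n",
    "# imputer = KNNImputer(n_neighbors=5)\n",
    "# X = pd.DataFrame(imputer.fit_transform(X), columns=X.columns, index=X.index)\n",
    "# print(X.isna().sum().sum())  # should print 0 after imputation"
   ]
  },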
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # Set up the matplotlib figure\n",
    "# plt.figure(figsize=(40, 30))\n",
    "\n",
    "# # Loop through each feature to create a scatter plot\n",
    "# for i, feature in enumerate(X.columns):\n",
    "#     plt.subplot(5, 6, i + 1)  # Adjust the number of rows and columns based on the number of features\n",
    "#     sns.scatterplot(x=y, y=X[feature], hue=y, style=y, palette='Set2', alpha=0.7)\n",
    "#     plt.title(feature)\n",
    "#     plt.xlabel('pCR (outcome)')\n",
    "#     plt.ylabel(feature)\n",
    "#     plt.xlim(-2, 3)\n",
    "\n",
    "# plt.tight_layout()\n",
    "# plt.show()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "# df_to_corr_matrix(X, size_factor=1.6, sep=150)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Split the data into train_full and test_reserved (untouched)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
     {
      "name": "stdout",
      "output_type": "stream",
      "text": [
       "Split the data into train and test. The test set is not used during training; it is reserved only for evaluating the XGBoost model.\n",
       "The training set has 316 samples. The testing set has 79 samples.\n",
      "Positive ratio: \n",
      "\tTrain: 0.21203\n",
      "\tTest: 0.21519\n"
     ]
    }
   ],
   "source": [
    "# random_state values that give similar positive-class ratios in train and test:\n",
    "# [14, 47, 49, 52, 62, 76, 83, 89, 92, 116, 118, 122, 136, 138, 144, 146, 150, 156, 157, 159, 170, 172, 174, 185]\n",
    "\n",
    "while True:\n",
    "    X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=14) # similar distribution of 1 and 0\n",
    "    # X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=None)\n",
    "\n",
    "    X_train_full.reset_index(drop=True, inplace=True)\n",
    "    X_test_reserved.reset_index(drop=True, inplace=True)\n",
    "    y_train_full.reset_index(drop=True, inplace=True)\n",
    "    y_test_reserved.reset_index(drop=True, inplace=True)\n",
    "\n",
    "    ratio_train = (y_train_full == 1).mean()\n",
    "    ratio_test = (y_test_reserved == 1).mean()\n",
    "\n",
    "    if abs(ratio_train - ratio_test) < 0.1:\n",
    "        break\n",
    "\n",
    "print(\"Split the data into train and test. The test set is not used during training; it is reserved only for evaluating the XGBoost model.\")\n",
    "print(f\"The training set has {len(X_train_full)} samples. The testing set has {len(X_test_reserved)} samples.\")\n",
    "print(f\"Positive ratio: \\n\\tTrain: {ratio_train:.5f}\\n\\tTest: {ratio_test:.5f}\")"
   ]
  },
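  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The loop above re-splits until the positive-class ratios of train and test are close. A simpler way to get matching ratios by construction is the `stratify` argument of `train_test_split`; the cell below is only an illustrative sketch and its variables are not used later."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Illustrative alternative (not used downstream): a stratified split keeps the\n",
    "# positive-class ratio almost identical in train and test by construction\n",
    "X_tr_s, X_te_s, y_tr_s, y_te_s = train_test_split(\n",
    "    X, y, test_size=0.2, random_state=14, stratify=y\n",
    ")\n",
    "print(f\"Stratified positive ratio:\\n\\tTrain: {(y_tr_s == 1).mean():.5f}\\n\\tTest: {(y_te_s == 1).mean():.5f}\")"
   ]
  },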
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Outliers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [],
   "source": [
    "# # Keeping the outliers gave better results, so outlier removal is left disabled\n",
    "# X_train_full, y_train_full = remove_outliers(X_train_full, y_train_full, selected_features)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### XGBoost"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(316, 15)\n",
      "(316,)\n"
     ]
    }
   ],
   "source": [
    "print(X_train_full.shape)\n",
    "print(y_train_full.shape)"
   ]
  },
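  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "A common starting point for `scale_pos_weight` (suggested in the XGBoost documentation) is the ratio of negative to positive samples in the training set. The cell below only computes that heuristic for reference; the search space in the next cell keeps the hand-picked candidates 3.8 and 4.5, which are close to this value."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Heuristic from the XGBoost docs: scale_pos_weight ~ sum(negative) / sum(positive)\n",
    "neg = (y_train_full == 0).sum()\n",
    "pos = (y_train_full == 1).sum()\n",
    "print(f\"negatives: {neg}, positives: {pos}, suggested scale_pos_weight: {neg / pos:.2f}\")"
   ]
  },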
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Best Parameters at Index 17 : OrderedDict([('gamma', 0.01), ('learning_rate', 0.041870422386972375), ('max_bin', 13), ('max_depth', 1), ('max_leaves', 2), ('min_child_weight', 2.8986170391945087), ('n_estimators', 180), ('num_parallel_tree', 2), ('scale_pos_weight', 4.5)])\n",
      "Balanced accuracy score: 0.7540690737833595\n",
      "F1 Score: 0.5519510577941229\n",
      "Precision Score: 0.4183051747014595\n",
      "Recall Score: 0.8373626373626374\n",
      "Specificity Score: 0.6707755102040817\n",
      "\n"
     ]
    }
   ],
   "source": [
    "model = XGBClassifier(objective=\"binary:logistic\")\n",
    "\n",
    "search_space = {\n",
    "    \"gamma\": Categorical([0, 0.01, 0.1, 0.3, 0.5, 0.7]),\n",
    "    \"learning_rate\": Real(1e-4, 0.5, prior='log-uniform'),\n",
    "    \"max_bin\": Integer(2, 20),\n",
    "    \"max_depth\": Integer(1, 5),\n",
    "    \"max_leaves\": Integer(1, 5),\n",
    "    \"min_child_weight\": Real(0, 10, prior='uniform'),\n",
    "    \"n_estimators\": Integer(30, 200),\n",
    "    \"num_parallel_tree\": Categorical([1, 2]),\n",
    "    \"scale_pos_weight\": Categorical([3.8, 4.5]),\n",
    "}\n",
    "\n",
    "kf = StratifiedKFold(5, shuffle=True, random_state=42)\n",
    "\n",
    "# Set up the BayesSearchCV (Bayesian hyperparameter search)\n",
    "bayes_search = BayesSearchCV(\n",
    "    estimator=model,\n",
    "    search_spaces=search_space,\n",
    "    scoring={\n",
    "        \"f1\": \"f1\",\n",
    "        \"recall\": \"recall\",\n",
    "        \"specificity\": make_scorer(recall_score, pos_label=0),\n",
    "        \"precision\": \"precision\",\n",
    "        \"balanced_accuracy_score\": make_scorer(balanced_accuracy_score),\n",
    "    },\n",
    "    cv=kf,\n",
    "    n_iter=100,\n",
    "    verbose=0,\n",
    "    n_jobs=-1,\n",
    "    return_train_score=True,\n",
    "    refit=\"balanced_accuracy_score\",\n",
    ")\n",
    "\n",
    "# Fit the model\n",
    "bayes_search.fit(X_train_full, y_train_full)\n",
    "\n",
    "# Get the best parameters and best score\n",
    "result = pd.DataFrame(bayes_search.cv_results_)\n",
    "best_params = bayes_search.best_params_\n",
    "best_index = bayes_search.best_index_\n",
    "best_f1 = result[\"mean_test_f1\"][best_index]\n",
    "best_precision = result[\"mean_test_precision\"][best_index]\n",
    "best_recall = result[\"mean_test_recall\"][best_index]\n",
    "best_specificity = result[\"mean_test_specificity\"][best_index]\n",
    "best_balanced_accuracy_score = result[\"mean_test_balanced_accuracy_score\"][best_index]\n",
    "\n",
    "print(f\"Best Parameters at Index {best_index} :\", best_params)\n",
    "print(f\"Balanced accuracy score: {best_balanced_accuracy_score}\")\n",
    "print(f\"F1 Score: {best_f1}\")\n",
    "print(f\"Precision Score: {best_precision}\")\n",
    "print(f\"Recall Score: {best_recall}\")\n",
    "print(f\"Specificity Score: {best_specificity}\")\n",
    "print()\n",
    "\n",
    "pd.DataFrame(bayes_search.cv_results_).to_csv(f\"output.csv\")"
   ]
  },
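  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Beyond the single best configuration, `cv_results_` keeps every configuration the Bayesian search evaluated. The cell below sorts by the refit metric to show the runner-up parameter sets; it is inspection only, nothing downstream depends on it."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the top configurations found by the search (read-only)\n",
    "top = result.sort_values(\"mean_test_balanced_accuracy_score\", ascending=False)\n",
    "print(top[[\"mean_test_balanced_accuracy_score\", \"mean_test_f1\", \"params\"]].head())"
   ]
  },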
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Testing set:\n",
      "(79, 15)\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.95      0.56      0.71        62\n",
      "         1.0       0.36      0.88      0.51        17\n",
      "\n",
      "    accuracy                           0.63        79\n",
      "   macro avg       0.65      0.72      0.61        79\n",
      "weighted avg       0.82      0.63      0.66        79\n",
      "\n",
      "[[35 27]\n",
      " [ 2 15]]\n",
      "\n",
      "Balanced accuracy score: 0.7234345351043643\n",
      "F1 Score: 0.5084745762711864\n",
      "Precision: 0.35714285714285715\n",
      "Recall: 0.8823529411764706\n",
      "Specificity: 0.5645161290322581\n"
     ]
    }
   ],
   "source": [
    "model = bayes_search.best_estimator_\n",
    "\n",
    "X_test = X_test_reserved\n",
    "\n",
    "y_pred = model.predict(X_test)\n",
    "report = classification_report(y_test_reserved, y_pred)\n",
    "cm = confusion_matrix(y_test_reserved, y_pred)\n",
    "\n",
    "print(\"Testing set:\")\n",
    "print(X_test_reserved.shape)\n",
    "print(report)\n",
    "print(cm)\n",
    "print()\n",
    "print(f\"Balanced accuracy score: {balanced_accuracy_score(y_test_reserved, y_pred)}\")\n",
    "print(f\"F1 Score: {f1_score(y_test_reserved, y_pred)}\")\n",
    "print(f\"Precision: {precision_score(y_test_reserved, y_pred)}\")\n",
    "print(f\"Recall: {recall_score(y_test_reserved, y_pred)}\")\n",
    "print(f\"Specificity: {recall_score(y_test_reserved, y_pred, pos_label=0)}\")"
   ]
  },
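  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "The confusion matrix printed above can also be drawn as a heatmap with the already-imported seaborn; this cell is purely for visualisation."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Visualise the confusion matrix on the reserved test set\n",
    "plt.figure(figsize=(4, 3))\n",
    "sns.heatmap(cm, annot=True, fmt=\"d\", cmap=\"Blues\",\n",
    "            xticklabels=[\"pred 0\", \"pred 1\"], yticklabels=[\"true 0\", \"true 1\"])\n",
    "plt.title(\"Confusion matrix (reserved test set)\")\n",
    "plt.tight_layout()\n",
    "plt.show()"
   ]
  },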
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Saved file pkl/best_params_15.pkl.\n",
      "Saved file pkl/model_15.pkl\n",
      "{'gamma': 0.01, 'learning_rate': 0.041870422386972375, 'max_bin': 13, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 2.8986170391945087, 'n_estimators': 180, 'num_parallel_tree': 2, 'scale_pos_weight': 4.5}\n"
     ]
    }
   ],
   "source": [
    "bp = dict(bayes_search.best_params_)\n",
    "with open(f\"pkl/best_params_{NUM_OF_SELECTED_FEATURES}.pkl\", \"wb\") as file:\n",
    "    dump(bp, file)\n",
    "    print(f\"Saved file {file.name}.\")\n",
    "\n",
    "model = bayes_search.best_estimator_\n",
    "with open(f\"pkl/model_{NUM_OF_SELECTED_FEATURES}.pkl\", \"wb\") as file:\n",
    "    dump(model, file)\n",
    "    print(f\"Saved file {file.name}\")\n",
    "\n",
    "print(bp)"
   ]
  }
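  ,
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "As a quick round-trip check, the saved pickles can be reloaded and the model re-applied to the reserved test set; the predictions should match `y_pred` from the evaluation above. This is only a sanity-check sketch."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Sanity check: reload the saved parameters and model, then re-predict\n",
    "with open(f\"pkl/best_params_{NUM_OF_SELECTED_FEATURES}.pkl\", \"rb\") as file:\n",
    "    reloaded_params = load(file)\n",
    "with open(f\"pkl/model_{NUM_OF_SELECTED_FEATURES}.pkl\", \"rb\") as file:\n",
    "    reloaded_model = load(file)\n",
    "\n",
    "reloaded_pred = reloaded_model.predict(X_test_reserved)\n",
    "print(reloaded_params)\n",
    "print(\"Predictions match:\", (reloaded_pred == y_pred).all())"
   ]
  }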
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "MLEAsm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}