ML_BreastCancerTreatment / Git / Diff of /XGBClassification/main.ipynb

Models:
joseph-gordon/
ML_BreastCancerTreatment
Downloads: 1
Diff of /XGBClassification/main.ipynb [000000] .. [e6e569]
Switch to side-by-side view

--- a
+++ b/XGBClassification/main.ipynb
@@ -0,0 +1,439 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Install `xlrd` for reading the `xls` file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# %conda install xlrd==2.0.1\n",
+    "# $ conda install -c conda-forge py-xgboost-gpu\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Set the path to the `xls` file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "training_file = \"../TrainDataset2024.xls\"\n",
+    "# training_file = \"/kaggle/input/dataset/TrainDataset2024.xls\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Import libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "import os\n",
+    "\n",
+    "# Add the parent directory to the system path\n",
+    "sys.path.append(os.path.abspath('../'))  # Adjust the path as needed\n",
+    "\n",
+    "from my_util import df_to_corr_matrix, remove_outliers\n",
+    "\n",
+    "import tensorflow as tf\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "import plotly.graph_objects as go\n",
+    "\n",
+    "from matplotlib.colors import Normalize\n",
+    "from sklearn.decomposition import PCA\n",
+    "from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, cross_val_predict, StratifiedKFold\n",
+    "from sklearn.preprocessing import StandardScaler, RobustScaler\n",
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, make_scorer, balanced_accuracy_score\n",
+    "from sklearn.svm import SVC\n",
+    "from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif\n",
+    "from sklearn.impute import KNNImputer\n",
+    "\n",
+    "from skopt import BayesSearchCV\n",
+    "from skopt.space import Real, Integer, Categorical\n",
+    "\n",
+    "from imblearn.over_sampling import SMOTE\n",
+    "from imblearn.pipeline import Pipeline\n",
+    "\n",
+    "from joblib import Parallel, delayed\n",
+    "\n",
+    "import xgboost as xgb\n",
+    "from xgboost import XGBClassifier\n",
+    "\n",
+    "from pickle import dump , load\n",
+    "\n",
+    "import warnings"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Read the data into X and y"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded '../FeatureSelection/pkl/corr_15_selected_features.pkl' to selected_feature\n",
+      "(395, 15) (395,)\n",
+      "['Gene', 'HER2', 'PgR', 'ER', 'original_firstorder_10Percentile', 'original_ngtdm_Busyness', 'LNStatus', 'TumourStage', 'original_gldm_DependenceEntropy', 'original_firstorder_Skewness', 'original_glrlm_ShortRunHighGrayLevelEmphasis', 'original_ngtdm_Strength', 'original_gldm_SmallDependenceEmphasis', 'original_firstorder_InterquartileRange', 'original_shape_MajorAxisLength']\n"
+     ]
+    }
+   ],
+   "source": [
+    "NUM_OF_SELECTED_FEATURES = \"15\"\n",
+    "FEATURES_FILE_PREFIX = F\"corr_{NUM_OF_SELECTED_FEATURES}\"\n",
+    "\n",
+    "\n",
+    "data = pd.read_excel(training_file)\n",
+    "data.replace(999, np.nan, inplace=True)\n",
+    "\n",
+    "data.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n",
+    "data.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n",
+    "\n",
+    "with open(f'../FeatureSelection/pkl/{FEATURES_FILE_PREFIX}_selected_features.pkl', mode='rb') as file:\n",
+    "    selected_features = load(file)\n",
+    "    print(f\"Loaded '{file.name}' to selected_feature\")\n",
+    "\n",
+    "X = data[selected_features]\n",
+    "y = data[\"pCR (outcome)\"]\n",
+    "print(X.shape, y.shape)\n",
+    "\n",
+    "print(selected_features)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # Set up the matplotlib figure\n",
+    "# plt.figure(figsize=(40, 30))\n",
+    "\n",
+    "# # Loop through each feature to create a scatter plot\n",
+    "# for i, feature in enumerate(X.columns):\n",
+    "#     plt.subplot(5, 6, i + 1)  # Adjust the number of rows and columns based on the number of features\n",
+    "#     sns.scatterplot(x=y, y=X[feature], hue=y, style=y, palette='Set2', alpha=0.7)\n",
+    "#     plt.title(feature)\n",
+    "#     plt.xlabel('pCR (outcome)')\n",
+    "#     plt.ylabel(feature)\n",
+    "#     plt.xlim(-2, 3)\n",
+    "\n",
+    "# plt.tight_layout()\n",
+    "# plt.show()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# df_to_corr_matrix(X, size_factor=1.6, sep=150)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Split the data into train_full and test_reserved (untouch)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 19,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Splited the data into train and test. The test will not be used in the training, but just for test the xgb. \n",
+      "The training data has 316 data. The testing data has 79 data. \n",
+      "Positive ratio: \n",
+      "\tTrain: 0.21203\n",
+      "\tTest: 0.21519\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Close ratio random_state\n",
+    "# [14, 47, 49, 52, 62, 76, 83, 89, 92, 116, 118, 122, 136, 138, 144, 146, 150, 156, 157, 159, 170, 172, 174, 185]\n",
+    "\n",
+    "while True:  \n",
+    "    X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=14) # similar distribution of 1 and 0\n",
+    "    # X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=None)\n",
+    "\n",
+    "    X_train_full.reset_index(drop=True, inplace=True)\n",
+    "    X_test_reserved.reset_index(drop=True, inplace=True)\n",
+    "    y_train_full.reset_index(drop=True, inplace=True)\n",
+    "    y_test_reserved.reset_index(drop=True, inplace=True)\n",
+    "\n",
+    "    ratio_train = sum(y_train_full[y_train_full==1]) / len(y_train_full)\n",
+    "    ratio_test = sum(y_test_reserved[y_test_reserved==1]) / len(y_test_reserved)\n",
+    "\n",
+    "    if abs(ratio_train - ratio_test) < 0.1:\n",
+    "        break\n",
+    "\n",
+    "print(\"Splited the data into train and test. The test will not be used in the training, but just for test the xgb. \")\n",
+    "print(f\"The training data has {len(X_train_full)} data. The testing data has {len(X_test_reserved)} data. \")\n",
+    "print(f\"Positive ratio: \\n\\tTrain: {ratio_train:.5f}\\n\\tTest: {ratio_test:.5f}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Outliers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# # The result of keeping outliers is better\n",
+    "# X_train_full, y_train_full = remove_outliers(X_train_full, y_train_full, selected_features)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### XGBoost"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "(316, 15)\n",
+      "(316,)\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(X_train_full.shape)\n",
+    "print(y_train_full.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Best Parameters at Index 17 : OrderedDict([('gamma', 0.01), ('learning_rate', 0.041870422386972375), ('max_bin', 13), ('max_depth', 1), ('max_leaves', 2), ('min_child_weight', 2.8986170391945087), ('n_estimators', 180), ('num_parallel_tree', 2), ('scale_pos_weight', 4.5)])\n",
+      "Balanced accuracy score: 0.7540690737833595\n",
+      "F1 Score: 0.5519510577941229\n",
+      "Precision Score: 0.4183051747014595\n",
+      "Recall Score: 0.8373626373626374\n",
+      "Specificity Score: 0.6707755102040817\n",
+      "\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = XGBClassifier(objective=\"binary:logistic\")\n",
+    "\n",
+    "search_space = {\n",
+    "    \"gamma\": Categorical([0, 0.01, 0.1, 0.3, 0.5, 0.7]),\n",
+    "    \"learning_rate\": Real(1e-4, 0.5, prior='log-uniform'),\n",
+    "    \"max_bin\": Integer(2, 20),\n",
+    "    \"max_depth\": Integer(1, 5),\n",
+    "    \"max_leaves\": Integer(1, 5),\n",
+    "    \"min_child_weight\": Real(0, 10, prior='uniform'),\n",
+    "    \"n_estimators\": Integer(30, 200),\n",
+    "    \"num_parallel_tree\": Categorical([1, 2]),\n",
+    "    \"scale_pos_weight\": Categorical([3.8, 4.5]),\n",
+    "}\n",
+    "\n",
+    "kf = StratifiedKFold(5, shuffle=True, random_state=42)\n",
+    "\n",
+    "# Set up the GridSearchCV\n",
+    "bayes_search = BayesSearchCV(\n",
+    "    estimator=model,\n",
+    "    search_spaces = search_space,\n",
+    "    scoring={\n",
+    "        \"f1\": \"f1\",\n",
+    "        \"recall\": \"recall\",\n",
+    "        \"specificity\": make_scorer(recall_score, pos_label=0),\n",
+    "        \"precision\": \"precision\",\n",
+    "        \"balanced_accuracy_score\": make_scorer(balanced_accuracy_score),\n",
+    "    },\n",
+    "    cv=kf,\n",
+    "    n_iter=100,\n",
+    "    verbose=0,\n",
+    "    n_jobs=-1,\n",
+    "    return_train_score=True,\n",
+    "    refit=\"balanced_accuracy_score\",\n",
+    ")\n",
+    "\n",
+    "# Fit the model\n",
+    "bayes_search.fit(X_train_full, y_train_full)\n",
+    "\n",
+    "# Get the best parameters and best score\n",
+    "result = pd.DataFrame(bayes_search.cv_results_)\n",
+    "best_params = bayes_search.best_params_\n",
+    "best_index = bayes_search.best_index_\n",
+    "best_f1 = result[\"mean_test_f1\"][best_index]\n",
+    "best_precision = result[\"mean_test_precision\"][best_index]\n",
+    "best_recall = result[\"mean_test_recall\"][best_index]\n",
+    "best_specificity = result[\"mean_test_specificity\"][best_index]\n",
+    "best_balanced_accuracy_score = result[\"mean_test_balanced_accuracy_score\"][best_index]\n",
+    "\n",
+    "print(f\"Best Parameters at Index {best_index} :\", best_params)\n",
+    "print(f\"Balanced accuracy score: {best_balanced_accuracy_score}\")\n",
+    "print(f\"F1 Score: {best_f1}\")\n",
+    "print(f\"Precision Score: {best_precision}\")\n",
+    "print(f\"Recall Score: {best_recall}\")\n",
+    "print(f\"Specificity Score: {best_specificity}\")\n",
+    "print()\n",
+    "\n",
+    "pd.DataFrame(bayes_search.cv_results_).to_csv(f\"output.csv\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Testing set:\n",
+      "(79, 15)\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "         0.0       0.95      0.56      0.71        62\n",
+      "         1.0       0.36      0.88      0.51        17\n",
+      "\n",
+      "    accuracy                           0.63        79\n",
+      "   macro avg       0.65      0.72      0.61        79\n",
+      "weighted avg       0.82      0.63      0.66        79\n",
+      "\n",
+      "[[35 27]\n",
+      " [ 2 15]]\n",
+      "\n",
+      "Balanced accuracy score: 0.7234345351043643\n",
+      "F1 Score: 0.5084745762711864\n",
+      "Precision: 0.35714285714285715\n",
+      "Recall: 0.8823529411764706\n",
+      "Specificity: 0.5645161290322581\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = bayes_search.best_estimator_\n",
+    "\n",
+    "X_test = X_test_reserved\n",
+    "\n",
+    "y_pred = model.predict(X_test)\n",
+    "report = classification_report(y_test_reserved, y_pred)\n",
+    "cm = confusion_matrix(y_test_reserved, y_pred)\n",
+    "\n",
+    "print(\"Testing set:\")\n",
+    "print(X_test_reserved.shape)\n",
+    "print(report)\n",
+    "print(cm)\n",
+    "print()\n",
+    "print(f\"Balanced accuracy score: {balanced_accuracy_score(y_test_reserved, y_pred)}\")\n",
+    "print(f\"F1 Score: {f1_score(y_test_reserved, y_pred)}\")\n",
+    "print(f\"Precision: {precision_score(y_test_reserved, y_pred)}\")\n",
+    "print(f\"Recall: {recall_score(y_test_reserved, y_pred)}\")\n",
+    "print(f\"Specificity: {recall_score(y_test_reserved, y_pred, pos_label=0)}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Saved file pkl/best_params_15.pkl.\n",
+      "Saved file pkl/model_15.pkl\n",
+      "{'gamma': 0.01, 'learning_rate': 0.041870422386972375, 'max_bin': 13, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 2.8986170391945087, 'n_estimators': 180, 'num_parallel_tree': 2, 'scale_pos_weight': 4.5}\n"
+     ]
+    }
+   ],
+   "source": [
+    "bp = dict(bayes_search.best_params_)\n",
+    "with open(f\"pkl/best_params_{NUM_OF_SELECTED_FEATURES}.pkl\", \"wb\") as file:\n",
+    "    dump(bp, file)\n",
+    "    print(f\"Saved file {file.name}.\")\n",
+    "\n",
+    "model = bayes_search.best_estimator_\n",
+    "with open(f\"pkl/model_{NUM_OF_SELECTED_FEATURES}.pkl\", \"wb\") as file:\n",
+    "    dump(model, file)\n",
+    "    print(f\"Saved file {file.name}\")\n",
+    "\n",
+    "print(bp)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "MLEAsm",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}