--- a +++ b/XGBClassification/main.ipynb @@ -0,0 +1,439 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Install `xlrd` for reading the `xls` file" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "# %conda install xlrd==2.0.1\n", + "# $ conda install -c conda-forge py-xgboost-gpu\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the path to the `xls` file" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [], + "source": [ + "training_file = \"../TrainDataset2024.xls\"\n", + "# training_file = \"/kaggle/input/dataset/TrainDataset2024.xls\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# Add the parent directory to the system path\n", + "sys.path.append(os.path.abspath('../')) # Adjust the path as needed\n", + "\n", + "from my_util import df_to_corr_matrix, remove_outliers\n", + "\n", + "import tensorflow as tf\n", + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "import plotly.graph_objects as go\n", + "\n", + "from matplotlib.colors import Normalize\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, cross_val_predict, StratifiedKFold\n", + "from sklearn.preprocessing import StandardScaler, RobustScaler\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, make_scorer, balanced_accuracy_score\n", + "from sklearn.svm import SVC\n", + "from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif\n", + "from sklearn.impute import KNNImputer\n", + "\n", + "from skopt import BayesSearchCV\n", + "from skopt.space import Real, Integer, Categorical\n", + "\n", + "from imblearn.over_sampling import SMOTE\n", + "from imblearn.pipeline import Pipeline\n", + "\n", + "from joblib import Parallel, delayed\n", + "\n", + "import xgboost as xgb\n", + "from xgboost import XGBClassifier\n", + "\n", + "from pickle import dump , load\n", + "\n", + "import warnings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read the data into X and y" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded '../FeatureSelection/pkl/corr_15_selected_features.pkl' to selected_feature\n", + "(395, 15) (395,)\n", + "['Gene', 'HER2', 'PgR', 'ER', 'original_firstorder_10Percentile', 'original_ngtdm_Busyness', 'LNStatus', 'TumourStage', 'original_gldm_DependenceEntropy', 'original_firstorder_Skewness', 'original_glrlm_ShortRunHighGrayLevelEmphasis', 'original_ngtdm_Strength', 'original_gldm_SmallDependenceEmphasis', 'original_firstorder_InterquartileRange', 'original_shape_MajorAxisLength']\n" + ] + } + ], + "source": [ + "NUM_OF_SELECTED_FEATURES = \"15\"\n", + "FEATURES_FILE_PREFIX = F\"corr_{NUM_OF_SELECTED_FEATURES}\"\n", + "\n", + "\n", + "data = pd.read_excel(training_file)\n", + "data.replace(999, np.nan, inplace=True)\n", + "\n", + "data.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n", + "data.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n", + "\n", + "with open(f'../FeatureSelection/pkl/{FEATURES_FILE_PREFIX}_selected_features.pkl', mode='rb') as file:\n", + " selected_features = load(file)\n", + " print(f\"Loaded '{file.name}' to selected_feature\")\n", + "\n", + "X = data[selected_features]\n", + "y = data[\"pCR (outcome)\"]\n", + "print(X.shape, y.shape)\n", + "\n", + "print(selected_features)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "# # Set up the matplotlib figure\n", + "# plt.figure(figsize=(40, 30))\n", + "\n", + "# # Loop through each feature to create a scatter plot\n", + "# for i, feature in enumerate(X.columns):\n", + "# plt.subplot(5, 6, i + 1) # Adjust the number of rows and columns based on the number of features\n", + "# sns.scatterplot(x=y, y=X[feature], hue=y, style=y, palette='Set2', alpha=0.7)\n", + "# plt.title(feature)\n", + "# plt.xlabel('pCR (outcome)')\n", + "# plt.ylabel(feature)\n", + "# plt.xlim(-2, 3)\n", + "\n", + "# plt.tight_layout()\n", + "# plt.show()" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "# df_to_corr_matrix(X, size_factor=1.6, sep=150)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Split the data into train_full and test_reserved (untouch)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Splited the data into train and test. The test will not be used in the training, but just for test the xgb. \n", + "The training data has 316 data. The testing data has 79 data. \n", + "Positive ratio: \n", + "\tTrain: 0.21203\n", + "\tTest: 0.21519\n" + ] + } + ], + "source": [ + "# Close ratio random_state\n", + "# [14, 47, 49, 52, 62, 76, 83, 89, 92, 116, 118, 122, 136, 138, 144, 146, 150, 156, 157, 159, 170, 172, 174, 185]\n", + "\n", + "while True: \n", + " X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=14) # similar distribution of 1 and 0\n", + " # X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=None)\n", + "\n", + " X_train_full.reset_index(drop=True, inplace=True)\n", + " X_test_reserved.reset_index(drop=True, inplace=True)\n", + " y_train_full.reset_index(drop=True, inplace=True)\n", + " y_test_reserved.reset_index(drop=True, inplace=True)\n", + "\n", + " ratio_train = sum(y_train_full[y_train_full==1]) / len(y_train_full)\n", + " ratio_test = sum(y_test_reserved[y_test_reserved==1]) / len(y_test_reserved)\n", + "\n", + " if abs(ratio_train - ratio_test) < 0.1:\n", + " break\n", + "\n", + "print(\"Splited the data into train and test. The test will not be used in the training, but just for test the xgb. \")\n", + "print(f\"The training data has {len(X_train_full)} data. The testing data has {len(X_test_reserved)} data. \")\n", + "print(f\"Positive ratio: \\n\\tTrain: {ratio_train:.5f}\\n\\tTest: {ratio_test:.5f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Outliers" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# # The result of keeping outliers is better\n", + "# X_train_full, y_train_full = remove_outliers(X_train_full, y_train_full, selected_features)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### XGBoost" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "(316, 15)\n", + "(316,)\n" + ] + } + ], + "source": [ + "print(X_train_full.shape)\n", + "print(y_train_full.shape)" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Best Parameters at Index 17 : OrderedDict([('gamma', 0.01), ('learning_rate', 0.041870422386972375), ('max_bin', 13), ('max_depth', 1), ('max_leaves', 2), ('min_child_weight', 2.8986170391945087), ('n_estimators', 180), ('num_parallel_tree', 2), ('scale_pos_weight', 4.5)])\n", + "Balanced accuracy score: 0.7540690737833595\n", + "F1 Score: 0.5519510577941229\n", + "Precision Score: 0.4183051747014595\n", + "Recall Score: 0.8373626373626374\n", + "Specificity Score: 0.6707755102040817\n", + "\n" + ] + } + ], + "source": [ + "model = XGBClassifier(objective=\"binary:logistic\")\n", + "\n", + "search_space = {\n", + " \"gamma\": Categorical([0, 0.01, 0.1, 0.3, 0.5, 0.7]),\n", + " \"learning_rate\": Real(1e-4, 0.5, prior='log-uniform'),\n", + " \"max_bin\": Integer(2, 20),\n", + " \"max_depth\": Integer(1, 5),\n", + " \"max_leaves\": Integer(1, 5),\n", + " \"min_child_weight\": Real(0, 10, prior='uniform'),\n", + " \"n_estimators\": Integer(30, 200),\n", + " \"num_parallel_tree\": Categorical([1, 2]),\n", + " \"scale_pos_weight\": Categorical([3.8, 4.5]),\n", + "}\n", + "\n", + "kf = StratifiedKFold(5, shuffle=True, random_state=42)\n", + "\n", + "# Set up the GridSearchCV\n", + "bayes_search = BayesSearchCV(\n", + " estimator=model,\n", + " search_spaces = search_space,\n", + " scoring={\n", + " \"f1\": \"f1\",\n", + " \"recall\": \"recall\",\n", + " \"specificity\": make_scorer(recall_score, pos_label=0),\n", + " \"precision\": \"precision\",\n", + " \"balanced_accuracy_score\": make_scorer(balanced_accuracy_score),\n", + " },\n", + " cv=kf,\n", + " n_iter=100,\n", + " verbose=0,\n", + " n_jobs=-1,\n", + " return_train_score=True,\n", + " refit=\"balanced_accuracy_score\",\n", + ")\n", + "\n", + "# Fit the model\n", + "bayes_search.fit(X_train_full, y_train_full)\n", + "\n", + "# Get the best parameters and best score\n", + "result = pd.DataFrame(bayes_search.cv_results_)\n", + "best_params = bayes_search.best_params_\n", + "best_index = bayes_search.best_index_\n", + "best_f1 = result[\"mean_test_f1\"][best_index]\n", + "best_precision = result[\"mean_test_precision\"][best_index]\n", + "best_recall = result[\"mean_test_recall\"][best_index]\n", + "best_specificity = result[\"mean_test_specificity\"][best_index]\n", + "best_balanced_accuracy_score = result[\"mean_test_balanced_accuracy_score\"][best_index]\n", + "\n", + "print(f\"Best Parameters at Index {best_index} :\", best_params)\n", + "print(f\"Balanced accuracy score: {best_balanced_accuracy_score}\")\n", + "print(f\"F1 Score: {best_f1}\")\n", + "print(f\"Precision Score: {best_precision}\")\n", + "print(f\"Recall Score: {best_recall}\")\n", + "print(f\"Specificity Score: {best_specificity}\")\n", + "print()\n", + "\n", + "pd.DataFrame(bayes_search.cv_results_).to_csv(f\"output.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Testing set:\n", + "(79, 15)\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.95 0.56 0.71 62\n", + " 1.0 0.36 0.88 0.51 17\n", + "\n", + " accuracy 0.63 79\n", + " macro avg 0.65 0.72 0.61 79\n", + "weighted avg 0.82 0.63 0.66 79\n", + "\n", + "[[35 27]\n", + " [ 2 15]]\n", + "\n", + "Balanced accuracy score: 0.7234345351043643\n", + "F1 Score: 0.5084745762711864\n", + "Precision: 0.35714285714285715\n", + "Recall: 0.8823529411764706\n", + "Specificity: 0.5645161290322581\n" + ] + } + ], + "source": [ + "model = bayes_search.best_estimator_\n", + "\n", + "X_test = X_test_reserved\n", + "\n", + "y_pred = model.predict(X_test)\n", + "report = classification_report(y_test_reserved, y_pred)\n", + "cm = confusion_matrix(y_test_reserved, y_pred)\n", + "\n", + "print(\"Testing set:\")\n", + "print(X_test_reserved.shape)\n", + "print(report)\n", + "print(cm)\n", + "print()\n", + "print(f\"Balanced accuracy score: {balanced_accuracy_score(y_test_reserved, y_pred)}\")\n", + "print(f\"F1 Score: {f1_score(y_test_reserved, y_pred)}\")\n", + "print(f\"Precision: {precision_score(y_test_reserved, y_pred)}\")\n", + "print(f\"Recall: {recall_score(y_test_reserved, y_pred)}\")\n", + "print(f\"Specificity: {recall_score(y_test_reserved, y_pred, pos_label=0)}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Saved file pkl/best_params_15.pkl.\n", + "Saved file pkl/model_15.pkl\n", + "{'gamma': 0.01, 'learning_rate': 0.041870422386972375, 'max_bin': 13, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 2.8986170391945087, 'n_estimators': 180, 'num_parallel_tree': 2, 'scale_pos_weight': 4.5}\n" + ] + } + ], + "source": [ + "bp = dict(bayes_search.best_params_)\n", + "with open(f\"pkl/best_params_{NUM_OF_SELECTED_FEATURES}.pkl\", \"wb\") as file:\n", + " dump(bp, file)\n", + " print(f\"Saved file {file.name}.\")\n", + "\n", + "model = bayes_search.best_estimator_\n", + "with open(f\"pkl/model_{NUM_OF_SELECTED_FEATURES}.pkl\", \"wb\") as file:\n", + " dump(model, file)\n", + " print(f\"Saved file {file.name}\")\n", + "\n", + "print(bp)" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "MLEAsm", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}