[e6e569]: / XGBClassification / test old.ipynb

Download this file

423 lines (422 with data), 17.4 kB

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Parameter\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "grid1 = {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 5, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0.001, 'n_estimators': 75, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n",
    "grid2 = {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 8, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 70, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n",
    "grid3 = {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 6, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-12-08 13:43:03.611456: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
      "2024-12-08 13:43:03.698524: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
      "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
      "E0000 00:00:1733665383.730998   32201 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
      "E0000 00:00:1733665383.739369   32201 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
      "2024-12-08 13:43:03.817875: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "import os\n",
    "\n",
    "# Add the parent directory to the system path\n",
    "sys.path.append(os.path.abspath('../'))  # Adjust the path as needed\n",
    "\n",
    "from my_util import df_to_corr_matrix, remove_outliers\n",
    "\n",
    "import tensorflow as tf\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import plotly.graph_objects as go\n",
    "\n",
    "from matplotlib.colors import Normalize\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, cross_val_predict, StratifiedKFold\n",
    "from sklearn.preprocessing import StandardScaler, RobustScaler\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, make_scorer, balanced_accuracy_score\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif\n",
    "from sklearn.impute import KNNImputer\n",
    "\n",
    "from imblearn.over_sampling import SMOTE\n",
    "from imblearn.pipeline import Pipeline\n",
    "\n",
    "from joblib import Parallel, delayed\n",
    "\n",
    "import xgboost as xgb\n",
    "from xgboost import XGBClassifier\n",
    "\n",
    "from pickle import dump , load\n",
    "\n",
    "import warnings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_file_path = '../TestDatasetExample.xls'"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded '../FeatureSelection/pkl/corr_25_selected_features.pkl' to selected_feature\n",
      "Loaded selected_features to X\n"
     ]
    }
   ],
   "source": [
    "X = pd.read_excel(test_file_path)\n",
    "\n",
    "X.replace(999, np.nan, inplace=True)\n",
    "\n",
    "NUM_OF_SELECTED_FEATURES = \"corr_25\"\n",
    "\n",
    "with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:\n",
    "    selected_features = load(file)\n",
    "    print(f\"Loaded '{file.name}' to selected_feature\")\n",
    "\n",
    "X = X[selected_features]\n",
    "print('Loaded selected_features to X')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['Gene', 'HER2', 'PgR', 'ER', 'original_firstorder_10Percentile', 'original_ngtdm_Busyness', 'LNStatus', 'TumourStage', 'original_gldm_DependenceEntropy', 'original_firstorder_Skewness', 'original_glrlm_ShortRunHighGrayLevelEmphasis', 'original_ngtdm_Strength', 'original_gldm_SmallDependenceEmphasis', 'original_firstorder_InterquartileRange', 'original_shape_MajorAxisLength', 'original_glrlm_LongRunLowGrayLevelEmphasis', 'original_firstorder_Minimum', 'HistologyType', 'ChemoGrade', 'original_shape_Maximum2DDiameterRow', 'original_shape_Maximum2DDiameterColumn', 'original_shape_SurfaceVolumeRatio', 'original_shape_LeastAxisLength', 'original_glcm_Autocorrelation', 'original_shape_Sphericity']\n"
     ]
    }
   ],
   "source": [
    "model = XGBClassifier()\n",
    "model.load_model(\"model.ubj\")\n",
    "\n",
    "print(selected_features)\n",
    "y_pred = model.predict(X)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([0, 1, 1])"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "y_pred"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Retrain the model with different data and evaluate the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded '../FeatureSelection/pkl/corr_25_selected_features.pkl' to selected_feature\n",
      "(395, 25) (395,)\n",
      "Split data using train_test_split with random_state=14\n",
      "Splited the data into train and test. The test will not be used in the training, but just for test the xgb. \n",
      "The training data has 316 data. The testing data has 79 data. \n",
      "Positive ratio: \n",
      "\tTrain: 0.21203\n",
      "\tTest: 0.21519\n"
     ]
    },
    {
     "ename": "NameError",
     "evalue": "name 'grid' is not defined",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[9], line 45\u001b[0m\n\u001b[1;32m     42\u001b[0m rs \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m13\u001b[39m\n\u001b[1;32m     43\u001b[0m stratified_kfold \u001b[38;5;241m=\u001b[39m StratifiedKFold(n_splits\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m5\u001b[39m, shuffle\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m13\u001b[39m)\n\u001b[0;32m---> 45\u001b[0m model\u001b[38;5;241m.\u001b[39mset_params(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[43mgrid\u001b[49m)\n\u001b[1;32m     47\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mCross validation for the train set using StratifiedKFold with random_state=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrs\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mX_train_full\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m     49\u001b[0m y_pred_cv \u001b[38;5;241m=\u001b[39m cross_val_predict(model, X_train_full, y_train_full, cv\u001b[38;5;241m=\u001b[39mstratified_kfold)\n",
      "\u001b[0;31mNameError\u001b[0m: name 'grid' is not defined"
     ]
    }
   ],
   "source": [
    "model = XGBClassifier()\n",
    "\n",
    "NUM_OF_SELECTED_FEATURES = \"corr_25\"\n",
    "\n",
    "data = pd.read_excel(\"../TrainDataset2024.xls\")\n",
    "data.replace(999, np.nan, inplace=True)\n",
    "\n",
    "data.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n",
    "data.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n",
    "\n",
    "with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:\n",
    "    selected_features = load(file)\n",
    "    print(f\"Loaded '{file.name}' to selected_feature\")\n",
    "\n",
    "X = data[selected_features]\n",
    "y = data[\"pCR (outcome)\"]\n",
    "print(X.shape, y.shape)\n",
    "\n",
    "rs = 10\n",
    "while True:  \n",
    "    # X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=46) # similar distribution of 1 and 0\n",
    "    X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=rs)\n",
    "\n",
    "    X_train_full.reset_index(drop=True, inplace=True)\n",
    "    X_test_reserved.reset_index(drop=True, inplace=True)\n",
    "    y_train_full.reset_index(drop=True, inplace=True)\n",
    "    y_test_reserved.reset_index(drop=True, inplace=True)\n",
    "\n",
    "    ratio_train = sum(y_train_full[y_train_full==1]) / len(y_train_full)\n",
    "    ratio_test = sum(y_test_reserved[y_test_reserved==1]) / len(y_test_reserved)\n",
    "\n",
    "    if abs(ratio_train - ratio_test) < 0.01:\n",
    "        print(f\"Split data using train_test_split with random_state={rs}\")\n",
    "        break\n",
    "    rs+=1\n",
    "\n",
    "print(\"Splited the data into train and test. The test will not be used in the training, but just for test the xgb. \")\n",
    "print(f\"The training data has {len(X_train_full)} data. The testing data has {len(X_test_reserved)} data. \")\n",
    "print(f\"Positive ratio: \\n\\tTrain: {ratio_train:.5f}\\n\\tTest: {ratio_test:.5f}\")\n",
    "\n",
    "# stratified_kfold = StratifiedKFold(n_splits=5, shuffle=False)\n",
    "rs = 13\n",
    "stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)\n",
    "\n",
    "model.set_params(**grid)\n",
    "\n",
    "print(f\"\\nCross validation for the train set using StratifiedKFold with random_state={rs}: {X_train_full.shape}\")\n",
    "\n",
    "y_pred_cv = cross_val_predict(model, X_train_full, y_train_full, cv=stratified_kfold)\n",
    "print(confusion_matrix(y_train_full, y_pred_cv))\n",
    "print(classification_report(y_train_full, y_pred_cv))\n",
    "print(f\"Balanced accuracy score: {balanced_accuracy_score(y_train_full, y_pred_cv)}\")\n",
    "print(f\"F1 Score: {f1_score(y_train_full, y_pred_cv)}\")\n",
    "print(f\"Precision: {precision_score(y_train_full, y_pred_cv)}\")\n",
    "print(f\"Recall: {recall_score(y_train_full, y_pred_cv)}\")\n",
    "print(f\"Specificity: {recall_score(y_train_full, y_pred_cv, pos_label=0)}\")\n",
    "print()\n",
    "\n",
    "model.fit(X_train_full, y_train_full)\n",
    "y_pred = model.predict(X_test_reserved)\n",
    "\n",
    "print(f\"\\nResult of the test set: {X_test_reserved.shape}\")\n",
    "\n",
    "print(confusion_matrix(y_test_reserved, y_pred))\n",
    "print(classification_report(y_test_reserved, y_pred))\n",
    "print(f\"Balanced accuracy score: {balanced_accuracy_score(y_test_reserved, y_pred)}\")\n",
    "print(f\"F1 Score: {f1_score(y_test_reserved, y_pred)}\")\n",
    "print(f\"Precision: {precision_score(y_test_reserved, y_pred)}\")\n",
    "print(f\"Recall: {recall_score(y_test_reserved, y_pred)}\")\n",
    "print(f\"Specificity: {recall_score(y_test_reserved, y_pred, pos_label=0)}\")\n",
    "\n",
    "\n",
    "print(\"\\nUse the whole data to train and do CV using StratifiedKFold with random_state={rs}\")\n",
    "y_pred_cv = cross_val_predict(model, X, y, cv=stratified_kfold)\n",
    "print(confusion_matrix(y, y_pred_cv))\n",
    "print(classification_report(y, y_pred_cv))\n",
    "print(f\"Balanced accuracy score: {balanced_accuracy_score(y, y_pred_cv)}\")\n",
    "print(f\"F1 Score: {f1_score(y, y_pred_cv)}\")\n",
    "print(f\"Precision: {precision_score(y, y_pred_cv)}\")\n",
    "print(f\"Recall: {recall_score(y, y_pred_cv)}\")\n",
    "print(f\"Specificity: {recall_score(y, y_pred_cv, pos_label=0)}\")\n",
    "print()\n",
    "\n",
    "\n",
    "print(f\"Predict the test file:\")\n",
    "\n",
    "X = pd.read_excel(test_file_path)\n",
    "\n",
    "X.replace(999, np.nan, inplace=True)\n",
    "\n",
    "NUM_OF_SELECTED_FEATURES = \"corr_25\"\n",
    "\n",
    "with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:\n",
    "    selected_features = load(file)\n",
    "    print(f\"Loaded '{file.name}' to selected_feature\")\n",
    "\n",
    "X = X[selected_features]\n",
    "y_pred = model.predict(X)\n",
    "\n",
    "print(y_pred)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded '../FeatureSelection/pkl/corr_25_selected_features.pkl' to selected_feature\n",
      "File 0\n",
      "[[44 18]\n",
      " [ 6 11]]\n",
      "0.6783681214421253\n",
      "File 1\n",
      "[[38 24]\n",
      " [ 1 16]]\n",
      "0.7770398481973435\n",
      "File 2\n",
      "[[42 20]\n",
      " [ 5 12]]\n",
      "0.6916508538899431\n",
      "Averaged balanced accuracy: 0.715686274509804\n"
     ]
    }
   ],
   "source": [
    "\n",
    "NUM_OF_SELECTED_FEATURES = \"corr_25\"\n",
    "\n",
    "with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:\n",
    "    selected_features = load(file)\n",
    "    print(f\"Loaded '{file.name}' to selected_feature\")\n",
    "\n",
    "\n",
    "files = [(\"../train_data.xls\", \"../test_data.xls\"), (\"../train_data_2.xls\", \"../test_data_2.xls\"), (\"../train_data_3.xls\", \"../test_data_3.xls\")]\n",
    "\n",
    "ba = []\n",
    "\n",
    "for index, (train_file, test_file) in enumerate(files):    \n",
    "    data = pd.read_excel(train_file)\n",
    "    data.replace(999, np.nan, inplace=True)\n",
    "\n",
    "    data.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n",
    "    data.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n",
    "\n",
    "    X = data.drop(columns='pCR (outcome)', axis=1)\n",
    "    X = X[selected_features]\n",
    "    y = data[\"pCR (outcome)\"]\n",
    "    # print(X.shape, y.shape)\n",
    "\n",
    "    testdata = pd.read_excel(test_file)\n",
    "    testdata.replace(999, np.nan, inplace=True)\n",
    "\n",
    "    testdata.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n",
    "    testdata.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n",
    "\n",
    "    X_test = testdata.drop(columns='pCR (outcome)', axis=1)\n",
    "    X_test = X_test[selected_features]\n",
    "    y_test = testdata[\"pCR (outcome)\"]\n",
    "    # print(X_test.shape, y_test.shape)\n",
    "\n",
    "    model1 = XGBClassifier()\n",
    "    model1.set_params(**grid1)\n",
    "    model2 = XGBClassifier()\n",
    "    model2.set_params(**grid2)\n",
    "    model3 = XGBClassifier()\n",
    "    model3.set_params(**grid3)\n",
    "\n",
    "    model1.fit(X, y)\n",
    "    model2.fit(X, y)\n",
    "    model3.fit(X, y)\n",
    "\n",
    "    y_pred = []\n",
    "    y_pred.append(model1.predict(X_test))\n",
    "    y_pred.append(model2.predict(X_test))\n",
    "    y_pred.append(model3.predict(X_test))\n",
    "    y_pred = np.array(y_pred)\n",
    "\n",
    "    yp = np.round(np.average(y_pred, axis=0))\n",
    "    print(f\"File {index}\")\n",
    "    print(confusion_matrix(y_test, yp))\n",
    "    ba.append(balanced_accuracy_score(y_test, yp))\n",
    "    print(ba[-1])\n",
    "print(f\"Averaged balanced accuracy: {np.mean(ba)}\")\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "MLEAsm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}