Switch to side-by-side view

--- a
+++ b/XGBClassification/test old.ipynb
@@ -0,0 +1,422 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Parameters\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "grid1 = {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 5, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0.001, 'n_estimators': 75, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n",
+    "grid2 = {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 8, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 70, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n",
+    "grid3 = {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 6, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-12-08 13:43:03.611456: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
+      "2024-12-08 13:43:03.698524: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
+      "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
+      "E0000 00:00:1733665383.730998   32201 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
+      "E0000 00:00:1733665383.739369   32201 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
+      "2024-12-08 13:43:03.817875: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
+      "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sys\n",
+    "import os\n",
+    "\n",
+    "# Add the parent directory to the system path\n",
+    "sys.path.append(os.path.abspath('../'))  # Adjust the path as needed\n",
+    "\n",
+    "from my_util import df_to_corr_matrix, remove_outliers\n",
+    "\n",
+    "import tensorflow as tf\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "import plotly.graph_objects as go\n",
+    "\n",
+    "from matplotlib.colors import Normalize\n",
+    "from sklearn.decomposition import PCA\n",
+    "from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, cross_val_predict, StratifiedKFold\n",
+    "from sklearn.preprocessing import StandardScaler, RobustScaler\n",
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, make_scorer, balanced_accuracy_score\n",
+    "from sklearn.svm import SVC\n",
+    "from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif\n",
+    "from sklearn.impute import KNNImputer\n",
+    "\n",
+    "from imblearn.over_sampling import SMOTE\n",
+    "from imblearn.pipeline import Pipeline\n",
+    "\n",
+    "from joblib import Parallel, delayed\n",
+    "\n",
+    "import xgboost as xgb\n",
+    "from xgboost import XGBClassifier\n",
+    "\n",
+    "from pickle import dump , load\n",
+    "\n",
+    "import warnings"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_file_path = '../TestDatasetExample.xls'"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Load data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded '../FeatureSelection/pkl/corr_25_selected_features.pkl' to selected_feature\n",
+      "Loaded selected_features to X\n"
+     ]
+    }
+   ],
+   "source": [
+    "X = pd.read_excel(test_file_path)\n",
+    "\n",
+    "X.replace(999, np.nan, inplace=True)\n",
+    "\n",
+    "NUM_OF_SELECTED_FEATURES = \"corr_25\"\n",
+    "\n",
+    "with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:\n",
+    "    selected_features = load(file)\n",
+    "    print(f\"Loaded '{file.name}' to selected_feature\")\n",
+    "\n",
+    "X = X[selected_features]\n",
+    "print('Loaded selected_features to X')"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Load model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['Gene', 'HER2', 'PgR', 'ER', 'original_firstorder_10Percentile', 'original_ngtdm_Busyness', 'LNStatus', 'TumourStage', 'original_gldm_DependenceEntropy', 'original_firstorder_Skewness', 'original_glrlm_ShortRunHighGrayLevelEmphasis', 'original_ngtdm_Strength', 'original_gldm_SmallDependenceEmphasis', 'original_firstorder_InterquartileRange', 'original_shape_MajorAxisLength', 'original_glrlm_LongRunLowGrayLevelEmphasis', 'original_firstorder_Minimum', 'HistologyType', 'ChemoGrade', 'original_shape_Maximum2DDiameterRow', 'original_shape_Maximum2DDiameterColumn', 'original_shape_SurfaceVolumeRatio', 'original_shape_LeastAxisLength', 'original_glcm_Autocorrelation', 'original_shape_Sphericity']\n"
+     ]
+    }
+   ],
+   "source": [
+    "model = XGBClassifier()\n",
+    "model.load_model(\"model.ubj\")\n",
+    "\n",
+    "print(selected_features)\n",
+    "y_pred = model.predict(X)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "array([0, 1, 1])"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "y_pred"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Retrain the model with different data and evaluate the model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded '../FeatureSelection/pkl/corr_25_selected_features.pkl' to selected_feature\n",
+      "(395, 25) (395,)\n",
+      "Split data using train_test_split with random_state=14\n",
+      "Splited the data into train and test. The test will not be used in the training, but just for test the xgb. \n",
+      "The training data has 316 data. The testing data has 79 data. \n",
+      "Positive ratio: \n",
+      "\tTrain: 0.21203\n",
+      "\tTest: 0.21519\n"
+     ]
+    },
+    {
+     "ename": "NameError",
+     "evalue": "name 'grid' is not defined",
+     "output_type": "error",
+     "traceback": [
+      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+      "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
+      "Cell \u001b[0;32mIn[9], line 45\u001b[0m\n\u001b[1;32m     42\u001b[0m rs \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m13\u001b[39m\n\u001b[1;32m     43\u001b[0m stratified_kfold \u001b[38;5;241m=\u001b[39m StratifiedKFold(n_splits\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m5\u001b[39m, shuffle\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m13\u001b[39m)\n\u001b[0;32m---> 45\u001b[0m model\u001b[38;5;241m.\u001b[39mset_params(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[43mgrid\u001b[49m)\n\u001b[1;32m     47\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mCross validation for the train set using StratifiedKFold with random_state=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrs\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mX_train_full\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m     49\u001b[0m y_pred_cv \u001b[38;5;241m=\u001b[39m cross_val_predict(model, X_train_full, y_train_full, cv\u001b[38;5;241m=\u001b[39mstratified_kfold)\n",
+      "\u001b[0;31mNameError\u001b[0m: name 'grid' is not defined"
+     ]
+    }
+   ],
+   "source": [
+    "model = XGBClassifier()\n",
+    "\n",
+    "NUM_OF_SELECTED_FEATURES = \"corr_25\"\n",
+    "\n",
+    "data = pd.read_excel(\"../TrainDataset2024.xls\")\n",
+    "data.replace(999, np.nan, inplace=True)\n",
+    "\n",
+    "data.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n",
+    "data.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n",
+    "\n",
+    "with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:\n",
+    "    selected_features = load(file)\n",
+    "    print(f\"Loaded '{file.name}' to selected_feature\")\n",
+    "\n",
+    "X = data[selected_features]\n",
+    "y = data[\"pCR (outcome)\"]\n",
+    "print(X.shape, y.shape)\n",
+    "\n",
+    "rs = 10\n",
+    "while True:  \n",
+    "    # X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=46) # similar distribution of 1 and 0\n",
+    "    X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=rs)\n",
+    "\n",
+    "    X_train_full.reset_index(drop=True, inplace=True)\n",
+    "    X_test_reserved.reset_index(drop=True, inplace=True)\n",
+    "    y_train_full.reset_index(drop=True, inplace=True)\n",
+    "    y_test_reserved.reset_index(drop=True, inplace=True)\n",
+    "\n",
+    "    ratio_train = sum(y_train_full[y_train_full==1]) / len(y_train_full)\n",
+    "    ratio_test = sum(y_test_reserved[y_test_reserved==1]) / len(y_test_reserved)\n",
+    "\n",
+    "    if abs(ratio_train - ratio_test) < 0.01:\n",
+    "        print(f\"Split data using train_test_split with random_state={rs}\")\n",
+    "        break\n",
+    "    rs+=1\n",
+    "\n",
+    "print(\"Split the data into train and test. The test will not be used in the training, but just to test the xgb. \")\n",
+    "print(f\"The training data has {len(X_train_full)} data. The testing data has {len(X_test_reserved)} data. \")\n",
+    "print(f\"Positive ratio: \\n\\tTrain: {ratio_train:.5f}\\n\\tTest: {ratio_test:.5f}\")\n",
+    "\n",
+    "# stratified_kfold = StratifiedKFold(n_splits=5, shuffle=False)\n",
+    "rs = 13\n",
+    "stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)\n",
+    "\n",
+    "model.set_params(**grid1)  # was **grid (undefined -> NameError); grid1 is the tuned parameter set\n",
+    "\n",
+    "print(f\"\\nCross validation for the train set using StratifiedKFold with random_state={rs}: {X_train_full.shape}\")\n",
+    "\n",
+    "y_pred_cv = cross_val_predict(model, X_train_full, y_train_full, cv=stratified_kfold)\n",
+    "print(confusion_matrix(y_train_full, y_pred_cv))\n",
+    "print(classification_report(y_train_full, y_pred_cv))\n",
+    "print(f\"Balanced accuracy score: {balanced_accuracy_score(y_train_full, y_pred_cv)}\")\n",
+    "print(f\"F1 Score: {f1_score(y_train_full, y_pred_cv)}\")\n",
+    "print(f\"Precision: {precision_score(y_train_full, y_pred_cv)}\")\n",
+    "print(f\"Recall: {recall_score(y_train_full, y_pred_cv)}\")\n",
+    "print(f\"Specificity: {recall_score(y_train_full, y_pred_cv, pos_label=0)}\")\n",
+    "print()\n",
+    "\n",
+    "model.fit(X_train_full, y_train_full)\n",
+    "y_pred = model.predict(X_test_reserved)\n",
+    "\n",
+    "print(f\"\\nResult of the test set: {X_test_reserved.shape}\")\n",
+    "\n",
+    "print(confusion_matrix(y_test_reserved, y_pred))\n",
+    "print(classification_report(y_test_reserved, y_pred))\n",
+    "print(f\"Balanced accuracy score: {balanced_accuracy_score(y_test_reserved, y_pred)}\")\n",
+    "print(f\"F1 Score: {f1_score(y_test_reserved, y_pred)}\")\n",
+    "print(f\"Precision: {precision_score(y_test_reserved, y_pred)}\")\n",
+    "print(f\"Recall: {recall_score(y_test_reserved, y_pred)}\")\n",
+    "print(f\"Specificity: {recall_score(y_test_reserved, y_pred, pos_label=0)}\")\n",
+    "\n",
+    "\n",
+    "print(f\"\\nUse the whole data to train and do CV using StratifiedKFold with random_state={rs}\")\n",
+    "y_pred_cv = cross_val_predict(model, X, y, cv=stratified_kfold)\n",
+    "print(confusion_matrix(y, y_pred_cv))\n",
+    "print(classification_report(y, y_pred_cv))\n",
+    "print(f\"Balanced accuracy score: {balanced_accuracy_score(y, y_pred_cv)}\")\n",
+    "print(f\"F1 Score: {f1_score(y, y_pred_cv)}\")\n",
+    "print(f\"Precision: {precision_score(y, y_pred_cv)}\")\n",
+    "print(f\"Recall: {recall_score(y, y_pred_cv)}\")\n",
+    "print(f\"Specificity: {recall_score(y, y_pred_cv, pos_label=0)}\")\n",
+    "print()\n",
+    "\n",
+    "\n",
+    "print(f\"Predict the test file:\")\n",
+    "\n",
+    "X = pd.read_excel(test_file_path)\n",
+    "\n",
+    "X.replace(999, np.nan, inplace=True)\n",
+    "\n",
+    "NUM_OF_SELECTED_FEATURES = \"corr_25\"\n",
+    "\n",
+    "with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:\n",
+    "    selected_features = load(file)\n",
+    "    print(f\"Loaded '{file.name}' to selected_feature\")\n",
+    "\n",
+    "X = X[selected_features]\n",
+    "y_pred = model.predict(X)\n",
+    "\n",
+    "print(y_pred)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded '../FeatureSelection/pkl/corr_25_selected_features.pkl' to selected_feature\n",
+      "File 0\n",
+      "[[44 18]\n",
+      " [ 6 11]]\n",
+      "0.6783681214421253\n",
+      "File 1\n",
+      "[[38 24]\n",
+      " [ 1 16]]\n",
+      "0.7770398481973435\n",
+      "File 2\n",
+      "[[42 20]\n",
+      " [ 5 12]]\n",
+      "0.6916508538899431\n",
+      "Averaged balanced accuracy: 0.715686274509804\n"
+     ]
+    }
+   ],
+   "source": [
+    "\n",
+    "NUM_OF_SELECTED_FEATURES = \"corr_25\"\n",
+    "\n",
+    "with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:\n",
+    "    selected_features = load(file)\n",
+    "    print(f\"Loaded '{file.name}' to selected_feature\")\n",
+    "\n",
+    "\n",
+    "files = [(\"../train_data.xls\", \"../test_data.xls\"), (\"../train_data_2.xls\", \"../test_data_2.xls\"), (\"../train_data_3.xls\", \"../test_data_3.xls\")]\n",
+    "\n",
+    "ba = []\n",
+    "\n",
+    "for index, (train_file, test_file) in enumerate(files):    \n",
+    "    data = pd.read_excel(train_file)\n",
+    "    data.replace(999, np.nan, inplace=True)\n",
+    "\n",
+    "    data.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n",
+    "    data.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n",
+    "\n",
+    "    X = data.drop(columns='pCR (outcome)', axis=1)\n",
+    "    X = X[selected_features]\n",
+    "    y = data[\"pCR (outcome)\"]\n",
+    "    # print(X.shape, y.shape)\n",
+    "\n",
+    "    testdata = pd.read_excel(test_file)\n",
+    "    testdata.replace(999, np.nan, inplace=True)\n",
+    "\n",
+    "    testdata.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n",
+    "    testdata.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n",
+    "\n",
+    "    X_test = testdata.drop(columns='pCR (outcome)', axis=1)\n",
+    "    X_test = X_test[selected_features]\n",
+    "    y_test = testdata[\"pCR (outcome)\"]\n",
+    "    # print(X_test.shape, y_test.shape)\n",
+    "\n",
+    "    model1 = XGBClassifier()\n",
+    "    model1.set_params(**grid1)\n",
+    "    model2 = XGBClassifier()\n",
+    "    model2.set_params(**grid2)\n",
+    "    model3 = XGBClassifier()\n",
+    "    model3.set_params(**grid3)\n",
+    "\n",
+    "    model1.fit(X, y)\n",
+    "    model2.fit(X, y)\n",
+    "    model3.fit(X, y)\n",
+    "\n",
+    "    y_pred = []\n",
+    "    y_pred.append(model1.predict(X_test))\n",
+    "    y_pred.append(model2.predict(X_test))\n",
+    "    y_pred.append(model3.predict(X_test))\n",
+    "    y_pred = np.array(y_pred)\n",
+    "\n",
+    "    yp = np.round(np.average(y_pred, axis=0))\n",
+    "    print(f\"File {index}\")\n",
+    "    print(confusion_matrix(y_test, yp))\n",
+    "    ba.append(balanced_accuracy_score(y_test, yp))\n",
+    "    print(ba[-1])\n",
+    "print(f\"Averaged balanced accuracy: {np.mean(ba)}\")\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "MLEAsm",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}