ML_BreastCancerTreatment / Git / Diff of /XGBClassification/test.ipynb

Models:
joseph-gordon/
ML_BreastCancerTreatment
Downloads: 1
Diff of /XGBClassification/test.ipynb [000000] .. [e6e569]
Switch to side-by-side view

--- a
+++ b/XGBClassification/test.ipynb
@@ -0,0 +1,398 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Imports"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "2024-12-12 20:34:16.323182: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
+      "2024-12-12 20:34:16.497025: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
+      "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
+      "E0000 00:00:1734035656.563220   77052 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
+      "E0000 00:00:1734035656.581797   77052 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
+      "2024-12-12 20:34:16.741729: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
+      "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
+     ]
+    }
+   ],
+   "source": [
+    "import sys\n",
+    "import os\n",
+    "\n",
+    "# Add the parent directory to the system path\n",
+    "sys.path.append(os.path.abspath('../'))  # Adjust the path as needed\n",
+    "\n",
+    "from my_util import df_to_corr_matrix, remove_outliers\n",
+    "\n",
+    "import tensorflow as tf\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "import plotly.graph_objects as go\n",
+    "\n",
+    "from matplotlib.colors import Normalize\n",
+    "from sklearn.decomposition import PCA\n",
+    "from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, cross_val_predict, StratifiedKFold\n",
+    "from sklearn.preprocessing import StandardScaler, RobustScaler\n",
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, make_scorer, balanced_accuracy_score\n",
+    "from sklearn.svm import SVC\n",
+    "from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif\n",
+    "from sklearn.impute import KNNImputer\n",
+    "\n",
+    "from imblearn.over_sampling import SMOTE\n",
+    "from imblearn.pipeline import Pipeline\n",
+    "\n",
+    "from joblib import Parallel, delayed\n",
+    "\n",
+    "import xgboost as xgb\n",
+    "from xgboost import XGBClassifier\n",
+    "\n",
+    "from pickle import dump , load\n",
+    "\n",
+    "import warnings"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Parameter\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "params = []\n",
+    "with open(\"pkl/best_params_15.pkl\", 'rb') as file:\n",
+    "  params.append(load(file))\n",
+    "with open(\"pkl/best_params_20.pkl\", 'rb') as file:\n",
+    "  params.append(load(file))\n",
+    "with open(\"pkl/best_params_25.pkl\", 'rb') as file:\n",
+    "  params.append(load(file))\n",
+    "with open(\"pkl/best_params_30.pkl\", 'rb') as file:\n",
+    "  params.append(load(file))\n",
+    "with open(\"pkl/best_params_35.pkl\", 'rb') as file:\n",
+    "  params.append(load(file))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Varify model's robustness using different datasets."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "File 0:\n",
+      "[[43 19]\n",
+      " [ 5 12]]\n",
+      "0.6997153700189753\n",
+      "File 1:\n",
+      "[[31 31]\n",
+      " [ 0 17]]\n",
+      "0.75\n",
+      "File 2:\n",
+      "[[40 22]\n",
+      " [ 3 14]]\n",
+      "0.7343453510436433\n",
+      "Averaged balanced accuracy: 0.7280202403542062\n"
+     ]
+    }
+   ],
+   "source": [
+    "NUM_OF_SELECTED_FEATURES = [25, 30, 35]\n",
+    "\n",
+    "files = [(\"../train_data.xls\", \"../test_data.xls\"), (\"../train_data_2.xls\", \"../test_data_2.xls\"), (\"../train_data_3.xls\", \"../test_data_3.xls\")]\n",
+    "\n",
+    "ba = []\n",
+    "\n",
+    "for index, (train_file, test_file) in enumerate(files):\n",
+    "    data = pd.read_excel(train_file)\n",
+    "    data.replace(999, np.nan, inplace=True)\n",
+    "\n",
+    "    data.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n",
+    "    data.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n",
+    "\n",
+    "    X = data.drop(columns='pCR (outcome)', axis=1)\n",
+    "    y = data[\"pCR (outcome)\"]\n",
+    "    # print(X.shape, y.shape)\n",
+    "\n",
+    "    testdata = pd.read_excel(test_file)\n",
+    "    testdata.replace(999, np.nan, inplace=True)\n",
+    "\n",
+    "    testdata.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n",
+    "    testdata.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n",
+    "\n",
+    "    X_test = testdata.drop(columns='pCR (outcome)', axis=1)\n",
+    "    y_test = testdata[\"pCR (outcome)\"]\n",
+    "    # print(X_test.shape, y_test.shape)\n",
+    "\n",
+    "    models = len(NUM_OF_SELECTED_FEATURES)*[XGBClassifier()]\n",
+    "\n",
+    "    selected_features = []\n",
+    "\n",
+    "    for i in NUM_OF_SELECTED_FEATURES:\n",
+    "        FEATURES_FILE_PREFIX = F\"corr_{i}\"\n",
+    "        with open(f'../FeatureSelection/pkl/{FEATURES_FILE_PREFIX}_selected_features.pkl', mode='rb') as file:\n",
+    "            selected_features.append(load(file))\n",
+    "            # print(f\"Loaded '{file.name}' to selected_feature\")\n",
+    "\n",
+    "    y_pred = []\n",
+    "\n",
+    "    for i, model in enumerate(models):\n",
+    "        X_train_temp = X[selected_features[i]]\n",
+    "        X_test_temp = X_test[selected_features[i]]\n",
+    "        model.set_params(**params[5-len(NUM_OF_SELECTED_FEATURES)+i])\n",
+    "        model.fit(X, y)\n",
+    "        y_pred.append(model.predict(X_test))\n",
+    "\n",
+    "    y_pred = np.array(y_pred)\n",
+    "\n",
+    "    yp = np.round(np.average(y_pred, axis=0))\n",
+    "\n",
+    "    print(f\"File {index}:\")\n",
+    "    print(confusion_matrix(y_test, yp))\n",
+    "    ba.append(balanced_accuracy_score(y_test, yp))\n",
+    "    print(ba[-1])\n",
+    "\n",
+    "print(f\"Averaged balanced accuracy: {np.mean(ba)}\")\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Predict data"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "NUM_OF_SELECTED_FEATURES = [25, 30, 35]\n",
+    "\n",
+    "data = pd.read_excel(\"../TrainDataset2024.xls\")\n",
+    "data.replace(999, np.nan, inplace=True)\n",
+    "\n",
+    "data.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n",
+    "data.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n",
+    "\n",
+    "X_train = data.drop(columns='pCR (outcome)', axis=1)\n",
+    "y_train = data[\"pCR (outcome)\"]\n",
+    "# print(X.shape, y.shape)\n",
+    "\n",
+    "testdata = pd.read_excel(\"../TestDatasetExample.xls\")\n",
+    "testdata.replace(999, np.nan, inplace=True)\n",
+    "\n",
+    "id = testdata[\"ID\"]\n",
+    "\n",
+    "testdata.drop([\"ID\"], axis=1, inplace=True)\n",
+    "\n",
+    "X_test = testdata\n",
+    "\n",
+    "models = len(NUM_OF_SELECTED_FEATURES)*[XGBClassifier()]\n",
+    "\n",
+    "selected_features = []\n",
+    "\n",
+    "for i in NUM_OF_SELECTED_FEATURES:\n",
+    "    FEATURES_FILE_PREFIX = F\"corr_{i}\"\n",
+    "    with open(f'../FeatureSelection/pkl/{FEATURES_FILE_PREFIX}_selected_features.pkl', mode='rb') as file:\n",
+    "        selected_features.append(load(file))\n",
+    "        # print(f\"Loaded '{file.name}' to selected_feature\")\n",
+    "\n",
+    "y_pred = []\n",
+    "y_pred_train = []\n",
+    "\n",
+    "for i, model in enumerate(models):\n",
+    "    X_train_temp = X_train[selected_features[i]]\n",
+    "    X_test_temp = X_test[selected_features[i]]\n",
+    "    model.set_params(**params[5-len(NUM_OF_SELECTED_FEATURES)+i])\n",
+    "    model.fit(X_train, y_train)\n",
+    "    y_pred.append(model.predict(X_test))\n",
+    "\n",
+    "y_pred = np.array(y_pred)\n",
+    "\n",
+    "yp = np.round(np.average(y_pred, axis=0))\n",
+    "\n",
+    "yp = pd.concat([id, pd.Series(yp)], axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style scoped>\n",
+       "    .dataframe tbody tr th:only-of-type {\n",
+       "        vertical-align: middle;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>ID</th>\n",
+       "      <th>0</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>TRG002728</td>\n",
+       "      <td>0.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>TRG002649</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>TRG002628</td>\n",
+       "      <td>1.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "          ID    0\n",
+       "0  TRG002728  0.0\n",
+       "1  TRG002649  1.0\n",
+       "2  TRG002628  1.0"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "yp"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "NUM_OF_SELECTED_FEATURES = [25, 30, 35]\n",
+    "\n",
+    "data = pd.read_excel(\"../TrainDataset2024.xls\")\n",
+    "data.replace(999, np.nan, inplace=True)\n",
+    "\n",
+    "\n",
+    "data.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n",
+    "data.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n",
+    "\n",
+    "X_train = data.drop(columns='pCR (outcome)', axis=1)\n",
+    "y_train = data[\"pCR (outcome)\"]\n",
+    "# print(X.shape, y.shape)\n",
+    "\n",
+    "testdata = pd.read_excel(\"../FinalTestDataset2024.xls\")\n",
+    "testdata.replace(999, np.nan, inplace=True)\n",
+    "\n",
+    "id = testdata[\"ID\"]\n",
+    "\n",
+    "testdata.drop([\"ID\"], axis=1, inplace=True)\n",
+    "\n",
+    "X_test = testdata\n",
+    "\n",
+    "models = len(NUM_OF_SELECTED_FEATURES)*[XGBClassifier()]\n",
+    "\n",
+    "selected_features = []\n",
+    "\n",
+    "for i in NUM_OF_SELECTED_FEATURES:\n",
+    "    FEATURES_FILE_PREFIX = F\"corr_{i}\"\n",
+    "    with open(f'../FeatureSelection/pkl/{FEATURES_FILE_PREFIX}_selected_features.pkl', mode='rb') as file:\n",
+    "        selected_features.append(load(file))\n",
+    "        # print(f\"Loaded '{file.name}' to selected_feature\")\n",
+    "\n",
+    "y_pred = []\n",
+    "y_pred_train = []\n",
+    "\n",
+    "for i, model in enumerate(models):\n",
+    "    X_train_temp = X_train[selected_features[i]]\n",
+    "    X_test_temp = X_test[selected_features[i]]\n",
+    "    model.set_params(**params[5-len(NUM_OF_SELECTED_FEATURES)+i])\n",
+    "    model.fit(X_train, y_train)\n",
+    "    y_pred.append(model.predict(X_test))\n",
+    "\n",
+    "y_pred = np.array(y_pred)\n",
+    "\n",
+    "yp = np.round(np.average(y_pred, axis=0))\n",
+    "\n",
+    "yp = pd.concat([id, pd.Series(yp)], axis=1)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "yp.to_csv(\"predicted.csv\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "MLEAsm",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}