XGBClassification/test.ipynb

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "2024-12-12 20:34:16.323182: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
      "2024-12-12 20:34:16.497025: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
      "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
      "E0000 00:00:1734035656.563220   77052 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
      "E0000 00:00:1734035656.581797   77052 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
      "2024-12-12 20:34:16.741729: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
      "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
     ]
    }
   ],
   "source": [
    "import sys\n",
    "import os\n",
    "\n",
    "# Add the parent directory to the system path\n",
    "sys.path.append(os.path.abspath('../'))  # Adjust the path as needed\n",
    "\n",
    "from my_util import df_to_corr_matrix, remove_outliers\n",
    "\n",
    "import tensorflow as tf\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import plotly.graph_objects as go\n",
    "\n",
    "from matplotlib.colors import Normalize\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, cross_val_predict, StratifiedKFold\n",
    "from sklearn.preprocessing import StandardScaler, RobustScaler\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, make_scorer, balanced_accuracy_score\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif\n",
    "from sklearn.impute import KNNImputer\n",
    "\n",
    "from imblearn.over_sampling import SMOTE\n",
    "from imblearn.pipeline import Pipeline\n",
    "\n",
    "from joblib import Parallel, delayed\n",
    "\n",
    "import xgboost as xgb\n",
    "from xgboost import XGBClassifier\n",
    "\n",
    "from pickle import dump , load\n",
    "\n",
    "import warnings"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Parameter\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "params = []\n",
    "with open(\"pkl/best_params_15.pkl\", 'rb') as file:\n",
    "  params.append(load(file))\n",
    "with open(\"pkl/best_params_20.pkl\", 'rb') as file:\n",
    "  params.append(load(file))\n",
    "with open(\"pkl/best_params_25.pkl\", 'rb') as file:\n",
    "  params.append(load(file))\n",
    "with open(\"pkl/best_params_30.pkl\", 'rb') as file:\n",
    "  params.append(load(file))\n",
    "with open(\"pkl/best_params_35.pkl\", 'rb') as file:\n",
    "  params.append(load(file))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Varify model's robustness using different datasets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File 0:\n",
      "[[43 19]\n",
      " [ 5 12]]\n",
      "0.6997153700189753\n",
      "File 1:\n",
      "[[31 31]\n",
      " [ 0 17]]\n",
      "0.75\n",
      "File 2:\n",
      "[[40 22]\n",
      " [ 3 14]]\n",
      "0.7343453510436433\n",
      "Averaged balanced accuracy: 0.7280202403542062\n"
     ]
    }
   ],
   "source": [
    "NUM_OF_SELECTED_FEATURES = [25, 30, 35]\n",
    "\n",
    "files = [(\"../train_data.xls\", \"../test_data.xls\"), (\"../train_data_2.xls\", \"../test_data_2.xls\"), (\"../train_data_3.xls\", \"../test_data_3.xls\")]\n",
    "\n",
    "ba = []\n",
    "\n",
    "for index, (train_file, test_file) in enumerate(files):\n",
    "    data = pd.read_excel(train_file)\n",
    "    data.replace(999, np.nan, inplace=True)\n",
    "\n",
    "    data.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n",
    "    data.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n",
    "\n",
    "    X = data.drop(columns='pCR (outcome)', axis=1)\n",
    "    y = data[\"pCR (outcome)\"]\n",
    "    # print(X.shape, y.shape)\n",
    "\n",
    "    testdata = pd.read_excel(test_file)\n",
    "    testdata.replace(999, np.nan, inplace=True)\n",
    "\n",
    "    testdata.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n",
    "    testdata.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n",
    "\n",
    "    X_test = testdata.drop(columns='pCR (outcome)', axis=1)\n",
    "    y_test = testdata[\"pCR (outcome)\"]\n",
    "    # print(X_test.shape, y_test.shape)\n",
    "\n",
    "    models = len(NUM_OF_SELECTED_FEATURES)*[XGBClassifier()]\n",
    "\n",
    "    selected_features = []\n",
    "\n",
    "    for i in NUM_OF_SELECTED_FEATURES:\n",
    "        FEATURES_FILE_PREFIX = F\"corr_{i}\"\n",
    "        with open(f'../FeatureSelection/pkl/{FEATURES_FILE_PREFIX}_selected_features.pkl', mode='rb') as file:\n",
    "            selected_features.append(load(file))\n",
    "            # print(f\"Loaded '{file.name}' to selected_feature\")\n",
    "\n",
    "    y_pred = []\n",
    "\n",
    "    for i, model in enumerate(models):\n",
    "        X_train_temp = X[selected_features[i]]\n",
    "        X_test_temp = X_test[selected_features[i]]\n",
    "        model.set_params(**params[5-len(NUM_OF_SELECTED_FEATURES)+i])\n",
    "        model.fit(X, y)\n",
    "        y_pred.append(model.predict(X_test))\n",
    "\n",
    "    y_pred = np.array(y_pred)\n",
    "\n",
    "    yp = np.round(np.average(y_pred, axis=0))\n",
    "\n",
    "    print(f\"File {index}:\")\n",
    "    print(confusion_matrix(y_test, yp))\n",
    "    ba.append(balanced_accuracy_score(y_test, yp))\n",
    "    print(ba[-1])\n",
    "\n",
    "print(f\"Averaged balanced accuracy: {np.mean(ba)}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Predict data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "NUM_OF_SELECTED_FEATURES = [25, 30, 35]\n",
    "\n",
    "data = pd.read_excel(\"../TrainDataset2024.xls\")\n",
    "data.replace(999, np.nan, inplace=True)\n",
    "\n",
    "data.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n",
    "data.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n",
    "\n",
    "X_train = data.drop(columns='pCR (outcome)', axis=1)\n",
    "y_train = data[\"pCR (outcome)\"]\n",
    "# print(X.shape, y.shape)\n",
    "\n",
    "testdata = pd.read_excel(\"../TestDatasetExample.xls\")\n",
    "testdata.replace(999, np.nan, inplace=True)\n",
    "\n",
    "id = testdata[\"ID\"]\n",
    "\n",
    "testdata.drop([\"ID\"], axis=1, inplace=True)\n",
    "\n",
    "X_test = testdata\n",
    "\n",
    "models = len(NUM_OF_SELECTED_FEATURES)*[XGBClassifier()]\n",
    "\n",
    "selected_features = []\n",
    "\n",
    "for i in NUM_OF_SELECTED_FEATURES:\n",
    "    FEATURES_FILE_PREFIX = F\"corr_{i}\"\n",
    "    with open(f'../FeatureSelection/pkl/{FEATURES_FILE_PREFIX}_selected_features.pkl', mode='rb') as file:\n",
    "        selected_features.append(load(file))\n",
    "        # print(f\"Loaded '{file.name}' to selected_feature\")\n",
    "\n",
    "y_pred = []\n",
    "y_pred_train = []\n",
    "\n",
    "for i, model in enumerate(models):\n",
    "    X_train_temp = X_train[selected_features[i]]\n",
    "    X_test_temp = X_test[selected_features[i]]\n",
    "    model.set_params(**params[5-len(NUM_OF_SELECTED_FEATURES)+i])\n",
    "    model.fit(X_train, y_train)\n",
    "    y_pred.append(model.predict(X_test))\n",
    "\n",
    "y_pred = np.array(y_pred)\n",
    "\n",
    "yp = np.round(np.average(y_pred, axis=0))\n",
    "\n",
    "yp = pd.concat([id, pd.Series(yp)], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>ID</th>\n",
       "      <th>0</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>TRG002728</td>\n",
       "      <td>0.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>TRG002649</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>TRG002628</td>\n",
       "      <td>1.0</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "          ID    0\n",
       "0  TRG002728  0.0\n",
       "1  TRG002649  1.0\n",
       "2  TRG002628  1.0"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "yp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "NUM_OF_SELECTED_FEATURES = [25, 30, 35]\n",
    "\n",
    "data = pd.read_excel(\"../TrainDataset2024.xls\")\n",
    "data.replace(999, np.nan, inplace=True)\n",
    "\n",
    "\n",
    "data.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n",
    "data.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n",
    "\n",
    "X_train = data.drop(columns='pCR (outcome)', axis=1)\n",
    "y_train = data[\"pCR (outcome)\"]\n",
    "# print(X.shape, y.shape)\n",
    "\n",
    "testdata = pd.read_excel(\"../FinalTestDataset2024.xls\")\n",
    "testdata.replace(999, np.nan, inplace=True)\n",
    "\n",
    "id = testdata[\"ID\"]\n",
    "\n",
    "testdata.drop([\"ID\"], axis=1, inplace=True)\n",
    "\n",
    "X_test = testdata\n",
    "\n",
    "models = len(NUM_OF_SELECTED_FEATURES)*[XGBClassifier()]\n",
    "\n",
    "selected_features = []\n",
    "\n",
    "for i in NUM_OF_SELECTED_FEATURES:\n",
    "    FEATURES_FILE_PREFIX = F\"corr_{i}\"\n",
    "    with open(f'../FeatureSelection/pkl/{FEATURES_FILE_PREFIX}_selected_features.pkl', mode='rb') as file:\n",
    "        selected_features.append(load(file))\n",
    "        # print(f\"Loaded '{file.name}' to selected_feature\")\n",
    "\n",
    "y_pred = []\n",
    "y_pred_train = []\n",
    "\n",
    "for i, model in enumerate(models):\n",
    "    X_train_temp = X_train[selected_features[i]]\n",
    "    X_test_temp = X_test[selected_features[i]]\n",
    "    model.set_params(**params[5-len(NUM_OF_SELECTED_FEATURES)+i])\n",
    "    model.fit(X_train, y_train)\n",
    "    y_pred.append(model.predict(X_test))\n",
    "\n",
    "y_pred = np.array(y_pred)\n",
    "\n",
    "yp = np.round(np.average(y_pred, axis=0))\n",
    "\n",
    "yp = pd.concat([id, pd.Series(yp)], axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "yp.to_csv(\"predicted.csv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "MLEAsm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}