{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# XGBoost ensemble: robustness check and final pCR predictions\n",
    "\n",
    "Trains one `XGBClassifier` per selected-feature subset, combines their predictions by majority vote, scores the ensemble on three train/test splits, and writes the final predictions to `predicted.csv`."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Imports"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import os\n",
    "\n",
    "# Add the parent directory to the system path\n",
    "sys.path.append(os.path.abspath('../'))  # Adjust the path as needed\n",
    "\n",
    "from my_util import df_to_corr_matrix, remove_outliers\n",
    "\n",
    "import tensorflow as tf\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import plotly.graph_objects as go\n",
    "\n",
    "from matplotlib.colors import Normalize\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, cross_val_predict, StratifiedKFold\n",
    "from sklearn.preprocessing import StandardScaler, RobustScaler\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, make_scorer, balanced_accuracy_score\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif\n",
    "from sklearn.impute import KNNImputer\n",
    "\n",
    "from imblearn.over_sampling import SMOTE\n",
    "from imblearn.pipeline import Pipeline\n",
    "\n",
    "from joblib import Parallel, delayed\n",
    "\n",
    "import xgboost as xgb\n",
    "from xgboost import XGBClassifier\n",
    "\n",
    "from pickle import dump, load\n",
    "\n",
    "import warnings"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Parameter\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Subset sizes for which tuned hyper-parameters are stored under pkl/.\n",
    "PARAM_FEATURE_COUNTS = [15, 20, 25, 30, 35]\n",
    "# Subset sizes whose models take part in the voting ensemble.\n",
    "NUM_OF_SELECTED_FEATURES = [25, 30, 35]\n",
    "\n",
    "ID_COLUMN = \"ID\"\n",
    "TARGET_COLUMN = \"pCR (outcome)\"\n",
    "RFS_COLUMN = \"RelapseFreeSurvival (outcome)\"\n",
    "MISSING_MARKER = 999  # replaced by NaN when a sheet is read\n",
    "\n",
    "\n",
    "def load_pickle(path):\n",
    "    \"\"\"Return the object stored in the pickle file at `path`.\"\"\"\n",
    "    # pickle can run arbitrary code while loading: only open files created by this project.\n",
    "    with open(path, \"rb\") as file:\n",
    "        return load(file)\n",
    "\n",
    "\n",
    "params = [load_pickle(f\"pkl/best_params_{count}.pkl\") for count in PARAM_FEATURE_COUNTS]\n",
    "# Look parameters up by subset size instead of by position in `params`.\n",
    "params_by_count = dict(zip(PARAM_FEATURE_COUNTS, params))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Helpers"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_selected_features(count):\n",
    "    \"\"\"Return the selected-feature column list saved for a subset of `count` features.\"\"\"\n",
    "    FEATURES_FILE_PREFIX = f\"corr_{count}\"\n",
    "    return load_pickle(f'../FeatureSelection/pkl/{FEATURES_FILE_PREFIX}_selected_features.pkl')\n",
    "\n",
    "\n",
    "def read_dataset(path):\n",
    "    \"\"\"Read an Excel sheet and replace the 999 missing-value marker with NaN.\"\"\"\n",
    "    return pd.read_excel(path).replace(MISSING_MARKER, np.nan)\n",
    "\n",
    "\n",
    "def load_labelled(path):\n",
    "    \"\"\"Return `(X, y)` for a labelled sheet.\n",
    "\n",
    "    The ID and relapse-free-survival columns are dropped, rows without a pCR\n",
    "    label are removed, and the pCR column is split off as the target `y`.\n",
    "    \"\"\"\n",
    "    labelled = (\n",
    "        read_dataset(path)\n",
    "        .drop(columns=[ID_COLUMN, RFS_COLUMN])\n",
    "        .dropna(subset=[TARGET_COLUMN])\n",
    "    )\n",
    "    return labelled.drop(columns=TARGET_COLUMN), labelled[TARGET_COLUMN]\n",
    "\n",
    "\n",
    "def ensemble_predict(X_train, y_train, X_test, feature_counts=None):\n",
    "    \"\"\"Fit one XGBClassifier per feature subset and return the voted prediction.\n",
    "\n",
    "    Each model is trained and evaluated on its own selected-feature columns\n",
    "    with the hyper-parameters stored for that subset size. The result is the\n",
    "    rounded mean of the individual predictions (a majority vote when the\n",
    "    labels are 0/1 and the number of models is odd).\n",
    "\n",
    "    `feature_counts` defaults to `NUM_OF_SELECTED_FEATURES`.\n",
    "    \"\"\"\n",
    "    if feature_counts is None:\n",
    "        feature_counts = NUM_OF_SELECTED_FEATURES\n",
    "\n",
    "    votes = []\n",
    "    for count in feature_counts:\n",
    "        features = load_selected_features(count)\n",
    "        # A fresh estimator per subset: a shared instance would carry parameters\n",
    "        # set for one ensemble member over to the next one.\n",
    "        model = XGBClassifier().set_params(**params_by_count[count])\n",
    "        model.fit(X_train[features], y_train)\n",
    "        votes.append(model.predict(X_test[features]))\n",
    "\n",
    "    return np.round(np.average(np.array(votes), axis=0))\n",
    "\n",
    "\n",
    "def predict_file(train_path, test_path):\n",
    "    \"\"\"Train on the labelled sheet and return a frame of test IDs and predictions.\n",
    "\n",
    "    The returned frame has the columns `ID` and `0` (the voted prediction).\n",
    "    \"\"\"\n",
    "    X_train, y_train = load_labelled(train_path)\n",
    "\n",
    "    testdata = read_dataset(test_path)\n",
    "    test_ids = testdata[ID_COLUMN]\n",
    "    X_test = testdata.drop(columns=[ID_COLUMN])\n",
    "\n",
    "    votes = ensemble_predict(X_train, y_train, X_test)\n",
    "    # Reuse the ID index so every prediction stays on the row of its ID.\n",
    "    return pd.concat([test_ids, pd.Series(votes, index=test_ids.index)], axis=1)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Verify model's robustness using different datasets."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "files = [(\"../train_data.xls\", \"../test_data.xls\"), (\"../train_data_2.xls\", \"../test_data_2.xls\"), (\"../train_data_3.xls\", \"../test_data_3.xls\")]\n",
    "\n",
    "ba = []\n",
    "\n",
    "for index, (train_file, test_file) in enumerate(files):\n",
    "    X, y = load_labelled(train_file)\n",
    "    X_test, y_test = load_labelled(test_file)\n",
    "\n",
    "    y_vote = ensemble_predict(X, y, X_test)\n",
    "\n",
    "    print(f\"File {index}:\")\n",
    "    print(confusion_matrix(y_test, y_vote))\n",
    "    ba.append(balanced_accuracy_score(y_test, y_vote))\n",
    "    print(ba[-1])\n",
    "\n",
    "print(f\"Averaged balanced accuracy: {np.mean(ba)}\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Predict data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "yp_example = predict_file(\"../TrainDataset2024.xls\", \"../TestDatasetExample.xls\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "yp_example"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "yp = predict_file(\"../TrainDataset2024.xls\", \"../FinalTestDataset2024.xls\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "yp.to_csv(\"predicted.csv\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "MLEAsm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}