399 lines (398 with data), 13.1 kB
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Imports"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"2024-12-12 20:34:16.323182: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n",
"2024-12-12 20:34:16.497025: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n",
"WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n",
"E0000 00:00:1734035656.563220 77052 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n",
"E0000 00:00:1734035656.581797 77052 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n",
"2024-12-12 20:34:16.741729: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n",
"To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n"
]
}
],
"source": [
"import sys\n",
"import os\n",
"\n",
"# Add the parent directory to the system path\n",
"sys.path.append(os.path.abspath('../')) # Adjust the path as needed\n",
"\n",
"from my_util import df_to_corr_matrix, remove_outliers\n",
"\n",
"import tensorflow as tf\n",
"import pandas as pd\n",
"import numpy as np\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"import plotly.graph_objects as go\n",
"\n",
"from matplotlib.colors import Normalize\n",
"from sklearn.decomposition import PCA\n",
"from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, cross_val_predict, StratifiedKFold\n",
"from sklearn.preprocessing import StandardScaler, RobustScaler\n",
"from sklearn.tree import DecisionTreeClassifier\n",
"from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, make_scorer, balanced_accuracy_score\n",
"from sklearn.svm import SVC\n",
"from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif\n",
"from sklearn.impute import KNNImputer\n",
"\n",
"from imblearn.over_sampling import SMOTE\n",
"from imblearn.pipeline import Pipeline\n",
"\n",
"from joblib import Parallel, delayed\n",
"\n",
"import xgboost as xgb\n",
"from xgboost import XGBClassifier\n",
"\n",
"from pickle import dump , load\n",
"\n",
"import warnings"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Parameters\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Collect the pickled parameter sets in ascending order of the number in the file name,\n",
"# so params[0] comes from best_params_15.pkl and params[4] from best_params_35.pkl.\n",
"# NOTE(review): unpickling can execute arbitrary code; only load files produced by this project.\n",
"params = []\n",
"for feature_count in (15, 20, 25, 30, 35):\n",
"    with open(f\"pkl/best_params_{feature_count}.pkl\", 'rb') as params_file:\n",
"        params.append(load(params_file))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Verify the model's robustness using different datasets."
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File 0:\n",
"[[43 19]\n",
" [ 5 12]]\n",
"0.6997153700189753\n",
"File 1:\n",
"[[31 31]\n",
" [ 0 17]]\n",
"0.75\n",
"File 2:\n",
"[[40 22]\n",
" [ 3 14]]\n",
"0.7343453510436433\n",
"Averaged balanced accuracy: 0.7280202403542062\n"
]
}
],
"source": [
"NUM_OF_SELECTED_FEATURES = [25, 30, 35]\n",
"\n",
"# (train, test) spreadsheet pairs; each pair is evaluated independently.\n",
"files = [(\"../train_data.xls\", \"../test_data.xls\"), (\"../train_data_2.xls\", \"../test_data_2.xls\"), (\"../train_data_3.xls\", \"../test_data_3.xls\")]\n",
"\n",
"\n",
"def load_labelled(path):\n",
"    \"\"\"Read one spreadsheet and return (feature frame, pCR label series).\n",
"\n",
"    999 is replaced by NaN, the ID and RelapseFreeSurvival columns are dropped,\n",
"    and rows whose pCR label is missing are removed.\n",
"    \"\"\"\n",
"    frame = pd.read_excel(path).replace(999, np.nan)\n",
"    frame = frame.drop(columns=[\"ID\", \"RelapseFreeSurvival (outcome)\"])\n",
"    frame = frame.dropna(subset=[\"pCR (outcome)\"])\n",
"    return frame.drop(columns=\"pCR (outcome)\"), frame[\"pCR (outcome)\"]\n",
"\n",
"\n",
"# The feature lists do not depend on the data split, so read them once up front.\n",
"selected_features = []\n",
"for feature_count in NUM_OF_SELECTED_FEATURES:\n",
"    with open(f'../FeatureSelection/pkl/corr_{feature_count}_selected_features.pkl', mode='rb') as file:\n",
"        selected_features.append(load(file))\n",
"\n",
"# `params` holds one entry per best_params_*.pkl in ascending order (15 ... 35);\n",
"# the ensemble uses the trailing entries that match NUM_OF_SELECTED_FEATURES.\n",
"params_offset = len(params) - len(NUM_OF_SELECTED_FEATURES)\n",
"\n",
"ba = []\n",
"\n",
"for index, (train_file, test_file) in enumerate(files):\n",
"    X, y = load_labelled(train_file)\n",
"    X_test, y_test = load_labelled(test_file)\n",
"\n",
"    # A list comprehension builds separate estimators; `n * [XGBClassifier()]`\n",
"    # would repeat one and the same object n times.\n",
"    models = [XGBClassifier() for _ in NUM_OF_SELECTED_FEATURES]\n",
"\n",
"    y_pred = []\n",
"\n",
"    for i, model in enumerate(models):\n",
"        # Train and predict on the selected columns only; fitting on the full\n",
"        # frame would silently ignore the feature selection.\n",
"        X_train_temp = X[selected_features[i]]\n",
"        X_test_temp = X_test[selected_features[i]]\n",
"        model.set_params(**params[params_offset + i])\n",
"        model.fit(X_train_temp, y)\n",
"        y_pred.append(model.predict(X_test_temp))\n",
"\n",
"    y_pred = np.array(y_pred)\n",
"\n",
"    # Rounding the mean of the 0/1 predictions is a majority vote\n",
"    # (no ties are possible with an odd number of models).\n",
"    yp = np.round(np.average(y_pred, axis=0))\n",
"\n",
"    print(f\"File {index}:\")\n",
"    print(confusion_matrix(y_test, yp))\n",
"    ba.append(balanced_accuracy_score(y_test, yp))\n",
"    print(ba[-1])\n",
"\n",
"print(f\"Averaged balanced accuracy: {np.mean(ba)}\")\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Predict data"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"NUM_OF_SELECTED_FEATURES = [25, 30, 35]\n",
"\n",
"# Training data: 999 is replaced by NaN and rows without a pCR label are dropped.\n",
"data = pd.read_excel(\"../TrainDataset2024.xls\").replace(999, np.nan)\n",
"data = data.drop(columns=[\"ID\", \"RelapseFreeSurvival (outcome)\"])\n",
"data = data.dropna(subset=[\"pCR (outcome)\"])\n",
"\n",
"X_train = data.drop(columns='pCR (outcome)')\n",
"y_train = data[\"pCR (outcome)\"]\n",
"\n",
"# Test data: keep the IDs aside so they can be attached to the predictions.\n",
"testdata = pd.read_excel(\"../TestDatasetExample.xls\").replace(999, np.nan)\n",
"test_ids = testdata[\"ID\"]  # not named `id`, which would shadow the builtin\n",
"X_test = testdata.drop(columns=[\"ID\"])\n",
"\n",
"# A list comprehension builds separate estimators; `n * [XGBClassifier()]`\n",
"# would repeat one and the same object n times.\n",
"models = [XGBClassifier() for _ in NUM_OF_SELECTED_FEATURES]\n",
"\n",
"selected_features = []\n",
"for feature_count in NUM_OF_SELECTED_FEATURES:\n",
"    with open(f'../FeatureSelection/pkl/corr_{feature_count}_selected_features.pkl', mode='rb') as file:\n",
"        selected_features.append(load(file))\n",
"\n",
"# `params` holds one entry per best_params_*.pkl in ascending order (15 ... 35);\n",
"# the ensemble uses the trailing entries that match NUM_OF_SELECTED_FEATURES.\n",
"params_offset = len(params) - len(NUM_OF_SELECTED_FEATURES)\n",
"\n",
"y_pred = []\n",
"\n",
"for i, model in enumerate(models):\n",
"    # Train and predict on the selected columns only; fitting on the full\n",
"    # frame would silently ignore the feature selection.\n",
"    X_train_temp = X_train[selected_features[i]]\n",
"    X_test_temp = X_test[selected_features[i]]\n",
"    model.set_params(**params[params_offset + i])\n",
"    model.fit(X_train_temp, y_train)\n",
"    y_pred.append(model.predict(X_test_temp))\n",
"\n",
"y_pred = np.array(y_pred)\n",
"\n",
"# Rounding the mean of the 0/1 predictions is a majority vote\n",
"# (no ties are possible with an odd number of models).\n",
"yp = np.round(np.average(y_pred, axis=0))\n",
"\n",
"# Side-by-side table: ID column plus the voted label (the unnamed Series becomes column 0).\n",
"yp = pd.concat([test_ids, pd.Series(yp)], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>ID</th>\n",
" <th>0</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>TRG002728</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>TRG002649</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>TRG002628</td>\n",
" <td>1.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" ID 0\n",
"0 TRG002728 0.0\n",
"1 TRG002649 1.0\n",
"2 TRG002628 1.0"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"yp"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"NUM_OF_SELECTED_FEATURES = [25, 30, 35]\n",
"\n",
"# Training data: 999 is replaced by NaN and rows without a pCR label are dropped.\n",
"data = pd.read_excel(\"../TrainDataset2024.xls\").replace(999, np.nan)\n",
"data = data.drop(columns=[\"ID\", \"RelapseFreeSurvival (outcome)\"])\n",
"data = data.dropna(subset=[\"pCR (outcome)\"])\n",
"\n",
"X_train = data.drop(columns='pCR (outcome)')\n",
"y_train = data[\"pCR (outcome)\"]\n",
"\n",
"# Final test data: keep the IDs aside so they can be attached to the predictions.\n",
"testdata = pd.read_excel(\"../FinalTestDataset2024.xls\").replace(999, np.nan)\n",
"test_ids = testdata[\"ID\"]  # not named `id`, which would shadow the builtin\n",
"X_test = testdata.drop(columns=[\"ID\"])\n",
"\n",
"# A list comprehension builds separate estimators; `n * [XGBClassifier()]`\n",
"# would repeat one and the same object n times.\n",
"models = [XGBClassifier() for _ in NUM_OF_SELECTED_FEATURES]\n",
"\n",
"selected_features = []\n",
"for feature_count in NUM_OF_SELECTED_FEATURES:\n",
"    with open(f'../FeatureSelection/pkl/corr_{feature_count}_selected_features.pkl', mode='rb') as file:\n",
"        selected_features.append(load(file))\n",
"\n",
"# `params` holds one entry per best_params_*.pkl in ascending order (15 ... 35);\n",
"# the ensemble uses the trailing entries that match NUM_OF_SELECTED_FEATURES.\n",
"params_offset = len(params) - len(NUM_OF_SELECTED_FEATURES)\n",
"\n",
"y_pred = []\n",
"\n",
"for i, model in enumerate(models):\n",
"    # Train and predict on the selected columns only; fitting on the full\n",
"    # frame would silently ignore the feature selection.\n",
"    X_train_temp = X_train[selected_features[i]]\n",
"    X_test_temp = X_test[selected_features[i]]\n",
"    model.set_params(**params[params_offset + i])\n",
"    model.fit(X_train_temp, y_train)\n",
"    y_pred.append(model.predict(X_test_temp))\n",
"\n",
"y_pred = np.array(y_pred)\n",
"\n",
"# Rounding the mean of the 0/1 predictions is a majority vote\n",
"# (no ties are possible with an odd number of models).\n",
"yp = np.round(np.average(y_pred, axis=0))\n",
"\n",
"# Side-by-side table: ID column plus the voted label (the unnamed Series becomes column 0).\n",
"yp = pd.concat([test_ids, pd.Series(yp)], axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"yp.to_csv(\"predicted.csv\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "MLEAsm",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 2
}