--- a +++ b/XGBClassification/test old.ipynb @@ -0,0 +1,422 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Parameter\n" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "grid1 = {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 5, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0.001, 'n_estimators': 75, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "grid2 = {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 8, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 70, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n", + "grid3 = {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 6, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-12-08 13:43:03.611456: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.\n", + "2024-12-08 13:43:03.698524: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered\n", + "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", + "E0000 00:00:1733665383.730998 32201 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered\n", + "E0000 00:00:1733665383.739369 32201 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered\n", + "2024-12-08 13:43:03.817875: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\n", + "To enable the following instructions: AVX2 AVX512F AVX512_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n" + ] + } + ], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# Add the parent directory to the system path\n", + "sys.path.append(os.path.abspath('../')) # Adjust the path as needed\n", + "\n", + "from my_util import df_to_corr_matrix, remove_outliers\n", + "\n", + "import tensorflow as tf\n", + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "import plotly.graph_objects as go\n", + "\n", + "from matplotlib.colors import Normalize\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, cross_val_predict, StratifiedKFold\n", + "from sklearn.preprocessing import StandardScaler, RobustScaler\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, make_scorer, balanced_accuracy_score\n", + "from sklearn.svm import SVC\n", + "from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif\n", + "from sklearn.impute import KNNImputer\n", + "\n", + "from imblearn.over_sampling import SMOTE\n", + "from 
imblearn.pipeline import Pipeline\n", + "\n", + "from joblib import Parallel, delayed\n", + "\n", + "import xgboost as xgb\n", + "from xgboost import XGBClassifier\n", + "\n", + "from pickle import dump , load\n", + "\n", + "import warnings" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "test_file_path = '../TestDatasetExample.xls'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded '../FeatureSelection/pkl/corr_25_selected_features.pkl' to selected_feature\n", + "Loaded selected_features to X\n" + ] + } + ], + "source": [ + "X = pd.read_excel(test_file_path)\n", + "\n", + "X.replace(999, np.nan, inplace=True)\n", + "\n", + "NUM_OF_SELECTED_FEATURES = \"corr_25\"\n", + "\n", + "with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:\n", + " selected_features = load(file)\n", + " print(f\"Loaded '{file.name}' to selected_feature\")\n", + "\n", + "X = X[selected_features]\n", + "print('Loaded selected_features to X')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Load model" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Gene', 'HER2', 'PgR', 'ER', 'original_firstorder_10Percentile', 'original_ngtdm_Busyness', 'LNStatus', 'TumourStage', 'original_gldm_DependenceEntropy', 'original_firstorder_Skewness', 'original_glrlm_ShortRunHighGrayLevelEmphasis', 'original_ngtdm_Strength', 'original_gldm_SmallDependenceEmphasis', 'original_firstorder_InterquartileRange', 'original_shape_MajorAxisLength', 'original_glrlm_LongRunLowGrayLevelEmphasis', 'original_firstorder_Minimum', 'HistologyType', 'ChemoGrade', 'original_shape_Maximum2DDiameterRow', 'original_shape_Maximum2DDiameterColumn', 'original_shape_SurfaceVolumeRatio', 'original_shape_LeastAxisLength', 'original_glcm_Autocorrelation', 'original_shape_Sphericity']\n" + ] + } + ], + "source": [ + "model = XGBClassifier()\n", + "model.load_model(\"model.ubj\")\n", + "\n", + "print(selected_features)\n", + "y_pred = model.predict(X)" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array([0, 1, 1])" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "y_pred" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Retrain the model with different data and evaluate the model" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded '../FeatureSelection/pkl/corr_25_selected_features.pkl' to selected_feature\n", + "(395, 25) (395,)\n", + "Split data using train_test_split with random_state=14\n", + "Splited the data into train and test. The test will not be used in the training, but just for test the xgb. \n", + "The training data has 316 data. The testing data has 79 data. 
\n", + "Positive ratio: \n", + "\tTrain: 0.21203\n", + "\tTest: 0.21519\n" + ] + }, + { + "ename": "NameError", + "evalue": "name 'grid' is not defined", + "output_type": "error", + "traceback": [ + "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", + "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)", + "Cell \u001b[0;32mIn[9], line 45\u001b[0m\n\u001b[1;32m 42\u001b[0m rs \u001b[38;5;241m=\u001b[39m \u001b[38;5;241m13\u001b[39m\n\u001b[1;32m 43\u001b[0m stratified_kfold \u001b[38;5;241m=\u001b[39m StratifiedKFold(n_splits\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m5\u001b[39m, shuffle\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m, random_state\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m13\u001b[39m)\n\u001b[0;32m---> 45\u001b[0m model\u001b[38;5;241m.\u001b[39mset_params(\u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39m\u001b[43mgrid\u001b[49m)\n\u001b[1;32m 47\u001b[0m \u001b[38;5;28mprint\u001b[39m(\u001b[38;5;124mf\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[38;5;124mCross validation for the train set using StratifiedKFold with random_state=\u001b[39m\u001b[38;5;132;01m{\u001b[39;00mrs\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mX_train_full\u001b[38;5;241m.\u001b[39mshape\u001b[38;5;132;01m}\u001b[39;00m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m 49\u001b[0m y_pred_cv \u001b[38;5;241m=\u001b[39m cross_val_predict(model, X_train_full, y_train_full, cv\u001b[38;5;241m=\u001b[39mstratified_kfold)\n", + "\u001b[0;31mNameError\u001b[0m: name 'grid' is not defined" + ] + } + ], + "source": [ + "model = XGBClassifier()\n", + "\n", + "NUM_OF_SELECTED_FEATURES = \"corr_25\"\n", + "\n", + "data = pd.read_excel(\"../TrainDataset2024.xls\")\n", + "data.replace(999, np.nan, inplace=True)\n", + "\n", + "data.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n", + "data.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n", + "\n", + "with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:\n", + " selected_features = load(file)\n", + " print(f\"Loaded '{file.name}' to selected_feature\")\n", + "\n", + "X = data[selected_features]\n", + "y = data[\"pCR (outcome)\"]\n", + "print(X.shape, y.shape)\n", + "\n", + "rs = 10\n", + "while True: \n", + " # X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=46) # similar distribution of 1 and 0\n", + " X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=rs)\n", + "\n", + " X_train_full.reset_index(drop=True, inplace=True)\n", + " X_test_reserved.reset_index(drop=True, inplace=True)\n", + " y_train_full.reset_index(drop=True, inplace=True)\n", + " y_test_reserved.reset_index(drop=True, inplace=True)\n", + "\n", + " ratio_train = sum(y_train_full[y_train_full==1]) / len(y_train_full)\n", + " ratio_test = sum(y_test_reserved[y_test_reserved==1]) / len(y_test_reserved)\n", + "\n", + " if abs(ratio_train - ratio_test) < 0.01:\n", + " print(f\"Split data using train_test_split with random_state={rs}\")\n", + " break\n", + " rs+=1\n", + "\n", + "print(\"Splited the data into train and test. The test will not be used in the training, but just for test the xgb. \")\n", + "print(f\"The training data has {len(X_train_full)} data. The testing data has {len(X_test_reserved)} data. 
\")\n", + "print(f\"Positive ratio: \\n\\tTrain: {ratio_train:.5f}\\n\\tTest: {ratio_test:.5f}\")\n", + "\n", + "# stratified_kfold = StratifiedKFold(n_splits=5, shuffle=False)\n", + "rs = 13\n", + "stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=13)\n", + "\n", + "model.set_params(**grid)\n", + "\n", + "print(f\"\\nCross validation for the train set using StratifiedKFold with random_state={rs}: {X_train_full.shape}\")\n", + "\n", + "y_pred_cv = cross_val_predict(model, X_train_full, y_train_full, cv=stratified_kfold)\n", + "print(confusion_matrix(y_train_full, y_pred_cv))\n", + "print(classification_report(y_train_full, y_pred_cv))\n", + "print(f\"Balanced accuracy score: {balanced_accuracy_score(y_train_full, y_pred_cv)}\")\n", + "print(f\"F1 Score: {f1_score(y_train_full, y_pred_cv)}\")\n", + "print(f\"Precision: {precision_score(y_train_full, y_pred_cv)}\")\n", + "print(f\"Recall: {recall_score(y_train_full, y_pred_cv)}\")\n", + "print(f\"Specificity: {recall_score(y_train_full, y_pred_cv, pos_label=0)}\")\n", + "print()\n", + "\n", + "model.fit(X_train_full, y_train_full)\n", + "y_pred = model.predict(X_test_reserved)\n", + "\n", + "print(f\"\\nResult of the test set: {X_test_reserved.shape}\")\n", + "\n", + "print(confusion_matrix(y_test_reserved, y_pred))\n", + "print(classification_report(y_test_reserved, y_pred))\n", + "print(f\"Balanced accuracy score: {balanced_accuracy_score(y_test_reserved, y_pred)}\")\n", + "print(f\"F1 Score: {f1_score(y_test_reserved, y_pred)}\")\n", + "print(f\"Precision: {precision_score(y_test_reserved, y_pred)}\")\n", + "print(f\"Recall: {recall_score(y_test_reserved, y_pred)}\")\n", + "print(f\"Specificity: {recall_score(y_test_reserved, y_pred, pos_label=0)}\")\n", + "\n", + "\n", + "print(\"\\nUse the whole data to train and do CV using StratifiedKFold with random_state={rs}\")\n", + "y_pred_cv = cross_val_predict(model, X, y, cv=stratified_kfold)\n", + "print(confusion_matrix(y, y_pred_cv))\n", + "print(classification_report(y, y_pred_cv))\n", + "print(f\"Balanced accuracy score: {balanced_accuracy_score(y, y_pred_cv)}\")\n", + "print(f\"F1 Score: {f1_score(y, y_pred_cv)}\")\n", + "print(f\"Precision: {precision_score(y, y_pred_cv)}\")\n", + "print(f\"Recall: {recall_score(y, y_pred_cv)}\")\n", + "print(f\"Specificity: {recall_score(y, y_pred_cv, pos_label=0)}\")\n", + "print()\n", + "\n", + "\n", + "print(f\"Predict the test file:\")\n", + "\n", + "X = pd.read_excel(test_file_path)\n", + "\n", + "X.replace(999, np.nan, inplace=True)\n", + "\n", + "NUM_OF_SELECTED_FEATURES = \"corr_25\"\n", + "\n", + "with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:\n", + " selected_features = load(file)\n", + " print(f\"Loaded '{file.name}' to selected_feature\")\n", + "\n", + "X = X[selected_features]\n", + "y_pred = model.predict(X)\n", + "\n", + "print(y_pred)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded '../FeatureSelection/pkl/corr_25_selected_features.pkl' to selected_feature\n", + "File 0\n", + "[[44 18]\n", + " [ 6 11]]\n", + "0.6783681214421253\n", + "File 1\n", + "[[38 24]\n", + " [ 1 16]]\n", + "0.7770398481973435\n", + "File 2\n", + "[[42 20]\n", + " [ 5 12]]\n", + "0.6916508538899431\n", + "Averaged balanced accuracy: 0.715686274509804\n" + ] + } + ], + "source": [ + "\n", + "NUM_OF_SELECTED_FEATURES = \"corr_25\"\n", + "\n", + "with 
open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:\n", + " selected_features = load(file)\n", + " print(f\"Loaded '{file.name}' to selected_feature\")\n", + "\n", + "\n", + "files = [(\"../train_data.xls\", \"../test_data.xls\"), (\"../train_data_2.xls\", \"../test_data_2.xls\"), (\"../train_data_3.xls\", \"../test_data_3.xls\")]\n", + "\n", + "ba = []\n", + "\n", + "for index, (train_file, test_file) in enumerate(files): \n", + " data = pd.read_excel(train_file)\n", + " data.replace(999, np.nan, inplace=True)\n", + "\n", + " data.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n", + " data.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n", + "\n", + " X = data.drop(columns='pCR (outcome)', axis=1)\n", + " X = X[selected_features]\n", + " y = data[\"pCR (outcome)\"]\n", + " # print(X.shape, y.shape)\n", + "\n", + " testdata = pd.read_excel(test_file)\n", + " testdata.replace(999, np.nan, inplace=True)\n", + "\n", + " testdata.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n", + " testdata.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n", + "\n", + " X_test = testdata.drop(columns='pCR (outcome)', axis=1)\n", + " X_test = X_test[selected_features]\n", + " y_test = testdata[\"pCR (outcome)\"]\n", + " # print(X_test.shape, y_test.shape)\n", + "\n", + " model1 = XGBClassifier()\n", + " model1.set_params(**grid1)\n", + " model2 = XGBClassifier()\n", + " model2.set_params(**grid2)\n", + " model3 = XGBClassifier()\n", + " model3.set_params(**grid3)\n", + "\n", + " model1.fit(X, y)\n", + " model2.fit(X, y)\n", + " model3.fit(X, y)\n", + "\n", + " y_pred = []\n", + " y_pred.append(model1.predict(X_test))\n", + " y_pred.append(model2.predict(X_test))\n", + " y_pred.append(model3.predict(X_test))\n", + " y_pred = np.array(y_pred)\n", + "\n", + " yp = np.round(np.average(y_pred, axis=0))\n", + " print(f\"File {index}\")\n", + " print(confusion_matrix(y_test, yp))\n", + " ba.append(balanced_accuracy_score(y_test, yp))\n", + " print(ba[-1])\n", + "print(f\"Averaged balanced accuracy: {np.mean(ba)}\")\n" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "MLEAsm", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}
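The last cell of this notebook evaluates a small voting ensemble: three XGBClassifiers, one per parameter grid defined at the top (`grid1`–`grid3`), are fit on each training file, and their 0/1 predictions are averaged and rounded (a majority vote across the three models) before computing the confusion matrix and balanced accuracy on the matching test file. The sketch below condenses that procedure into a self-contained script. It is a sketch only, not a drop-in replacement for the cell: the pickle path, the `pCR (outcome)` / `ID` / `RelapseFreeSurvival (outcome)` column names, the 999-as-missing convention, and the train/test file pairs are copied from the notebook and assumed to exist on disk.

```python
# Condensed sketch of the notebook's final ensemble-evaluation cell.
# Assumes the feature pickle and the ../train_data*.xls / ../test_data*.xls
# files referenced in the notebook are present.
import numpy as np
import pandas as pd
from pickle import load
from xgboost import XGBClassifier
from sklearn.metrics import balanced_accuracy_score, confusion_matrix

# Hyperparameter grids copied from the first code cell of the notebook.
grid1 = {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 5, 'max_depth': 1, 'max_leaves': 2,
         'min_child_weight': 0.001, 'n_estimators': 75, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}
grid2 = {'gamma': 0, 'learning_rate': 0.2, 'max_bin': 8, 'max_depth': 1, 'max_leaves': 2,
         'min_child_weight': 0, 'n_estimators': 70, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}
grid3 = {'gamma': 0, 'learning_rate': 0.3, 'max_bin': 6, 'max_depth': 1, 'max_leaves': 2,
         'min_child_weight': 0, 'n_estimators': 30, 'num_parallel_tree': 1, 'scale_pos_weight': 4.5}


def load_xy(path, selected_features):
    """Read one spreadsheet, treat 999 as missing, and split into features/target."""
    df = pd.read_excel(path)
    df.replace(999, np.nan, inplace=True)
    df.drop(["ID", "RelapseFreeSurvival (outcome)"], axis=1, inplace=True)
    df.dropna(subset=["pCR (outcome)"], inplace=True)
    return df[selected_features], df["pCR (outcome)"]


def ensemble_predict(X_train, y_train, X_test, param_sets):
    """Fit one XGBClassifier per parameter set and majority-vote the 0/1 predictions.

    Rounding the mean of three binary predictions is equivalent to a majority vote.
    """
    preds = []
    for params in param_sets:
        clf = XGBClassifier(**params)
        clf.fit(X_train, y_train)
        preds.append(clf.predict(X_test))
    return np.round(np.mean(preds, axis=0))


if __name__ == "__main__":
    with open("../FeatureSelection/pkl/corr_25_selected_features.pkl", "rb") as f:
        selected_features = load(f)

    files = [("../train_data.xls", "../test_data.xls"),
             ("../train_data_2.xls", "../test_data_2.xls"),
             ("../train_data_3.xls", "../test_data_3.xls")]

    scores = []
    for index, (train_file, test_file) in enumerate(files):
        X_train, y_train = load_xy(train_file, selected_features)
        X_test, y_test = load_xy(test_file, selected_features)
        y_pred = ensemble_predict(X_train, y_train, X_test, [grid1, grid2, grid3])
        print(f"File {index}")
        print(confusion_matrix(y_test, y_pred))
        scores.append(balanced_accuracy_score(y_test, y_pred))
        print(scores[-1])
    print(f"Averaged balanced accuracy: {np.mean(scores)}")
```

One caveat worth noting when rerunning the retraining cell: it calls `model.set_params(**grid)`, but only `grid1`, `grid2`, and `grid3` are defined in this notebook, which is why the recorded output ends in a `NameError`; substituting one of the defined grids (as the sketch above does implicitly by constructing each model from its own grid) is an assumption, since the intended `grid` is not shown here.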