Install `xlrd` for reading the `xls` file

In [13]:
# %conda install xlrd==2.0.1
# $ conda install -c conda-forge py-xgboost-gpu


Set the path to the `xls` file

In [14]:
training_file = "../TrainDataset2024.xls"
# training_file = "/kaggle/input/dataset/TrainDataset2024.xls"

Import libraries

In [15]:
import sys
import os

# Add the parent directory to the system path
sys.path.append(os.path.abspath('../'))  # Adjust the path as needed

from my_util import df_to_corr_matrix, remove_outliers

import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from matplotlib.colors import Normalize
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, cross_val_predict, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, make_scorer, balanced_accuracy_score
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif
from sklearn.impute import KNNImputer

from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from joblib import Parallel, delayed

import xgboost as xgb
from xgboost import XGBClassifier

from pickle import dump , load

import warnings

### Read the data into X and y

In [16]:
NUM_OF_SELECTED_FEATURES = "15"
FEATURES_FILE_PREFIX = F"corr_{NUM_OF_SELECTED_FEATURES}"


data = pd.read_excel(training_file)
data.replace(999, np.nan, inplace=True)

data.drop(["ID", "RelapseFreeSurvival (outcome)"], axis=1, inplace=True)
data.dropna(subset=["pCR (outcome)"], inplace=True)

with open(f'../FeatureSelection/pkl/{FEATURES_FILE_PREFIX}_selected_features.pkl', mode='rb') as file:
    selected_features = load(file)
    print(f"Loaded '{file.name}' to selected_feature")

X = data[selected_features]
y = data["pCR (outcome)"]
print(X.shape, y.shape)

print(selected_features)

Loaded '../FeatureSelection/pkl/corr_15_selected_features.pkl' to selected_feature
(395, 15) (395,)
['Gene', 'HER2', 'PgR', 'ER', 'original_firstorder_10Percentile', 'original_ngtdm_Busyness', 'LNStatus', 'TumourStage', 'original_gldm_DependenceEntropy', 'original_firstorder_Skewness', 'original_glrlm_ShortRunHighGrayLevelEmphasis', 'original_ngtdm_Strength', 'original_gldm_SmallDependenceEmphasis', 'original_firstorder_InterquartileRange', 'original_shape_MajorAxisLength']


In [17]:
# # Set up the matplotlib figure
# plt.figure(figsize=(40, 30))

# # Loop through each feature to create a scatter plot
# for i, feature in enumerate(X.columns):
#     plt.subplot(5, 6, i + 1)  # Adjust the number of rows and columns based on the number of features
#     sns.scatterplot(x=y, y=X[feature], hue=y, style=y, palette='Set2', alpha=0.7)
#     plt.title(feature)
#     plt.xlabel('pCR (outcome)')
#     plt.ylabel(feature)
#     plt.xlim(-2, 3)

# plt.tight_layout()
# plt.show()

In [18]:
# df_to_corr_matrix(X, size_factor=1.6, sep=150)

### Split the data into train_full and test_reserved (untouch)

In [19]:
# Close ratio random_state
# [14, 47, 49, 52, 62, 76, 83, 89, 92, 116, 118, 122, 136, 138, 144, 146, 150, 156, 157, 159, 170, 172, 174, 185]

while True:  
    X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=14) # similar distribution of 1 and 0
    # X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=None)

    X_train_full.reset_index(drop=True, inplace=True)
    X_test_reserved.reset_index(drop=True, inplace=True)
    y_train_full.reset_index(drop=True, inplace=True)
    y_test_reserved.reset_index(drop=True, inplace=True)

    ratio_train = sum(y_train_full[y_train_full==1]) / len(y_train_full)
    ratio_test = sum(y_test_reserved[y_test_reserved==1]) / len(y_test_reserved)

    if abs(ratio_train - ratio_test) < 0.1:
        break

print("Splited the data into train and test. The test will not be used in the training, but just for test the xgb. ")
print(f"The training data has {len(X_train_full)} data. The testing data has {len(X_test_reserved)} data. ")
print(f"Positive ratio: \n\tTrain: {ratio_train:.5f}\n\tTest: {ratio_test:.5f}")

Splited the data into train and test. The test will not be used in the training, but just for test the xgb. 
The training data has 316 data. The testing data has 79 data. 
Positive ratio: 
	Train: 0.21203
	Test: 0.21519


### Outliers

In [20]:
# # The result of keeping outliers is better
# X_train_full, y_train_full = remove_outliers(X_train_full, y_train_full, selected_features)

### XGBoost

In [21]:
print(X_train_full.shape)
print(y_train_full.shape)

(316, 15)
(316,)


In [22]:
model = XGBClassifier(objective="binary:logistic")

search_space = {
    "gamma": Categorical([0, 0.01, 0.1, 0.3, 0.5, 0.7]),
    "learning_rate": Real(1e-4, 0.5, prior='log-uniform'),
    "max_bin": Integer(2, 20),
    "max_depth": Integer(1, 5),
    "max_leaves": Integer(1, 5),
    "min_child_weight": Real(0, 10, prior='uniform'),
    "n_estimators": Integer(30, 200),
    "num_parallel_tree": Categorical([1, 2]),
    "scale_pos_weight": Categorical([3.8, 4.5]),
}

kf = StratifiedKFold(5, shuffle=True, random_state=42)

# Set up the GridSearchCV
bayes_search = BayesSearchCV(
    estimator=model,
    search_spaces = search_space,
    scoring={
        "f1": "f1",
        "recall": "recall",
        "specificity": make_scorer(recall_score, pos_label=0),
        "precision": "precision",
        "balanced_accuracy_score": make_scorer(balanced_accuracy_score),
    },
    cv=kf,
    n_iter=100,
    verbose=0,
    n_jobs=-1,
    return_train_score=True,
    refit="balanced_accuracy_score",
)

# Fit the model
bayes_search.fit(X_train_full, y_train_full)

# Get the best parameters and best score
result = pd.DataFrame(bayes_search.cv_results_)
best_params = bayes_search.best_params_
best_index = bayes_search.best_index_
best_f1 = result["mean_test_f1"][best_index]
best_precision = result["mean_test_precision"][best_index]
best_recall = result["mean_test_recall"][best_index]
best_specificity = result["mean_test_specificity"][best_index]
best_balanced_accuracy_score = result["mean_test_balanced_accuracy_score"][best_index]

print(f"Best Parameters at Index {best_index} :", best_params)
print(f"Balanced accuracy score: {best_balanced_accuracy_score}")
print(f"F1 Score: {best_f1}")
print(f"Precision Score: {best_precision}")
print(f"Recall Score: {best_recall}")
print(f"Specificity Score: {best_specificity}")
print()

pd.DataFrame(bayes_search.cv_results_).to_csv(f"output.csv")

Best Parameters at Index 17 : OrderedDict([('gamma', 0.01), ('learning_rate', 0.041870422386972375), ('max_bin', 13), ('max_depth', 1), ('max_leaves', 2), ('min_child_weight', 2.8986170391945087), ('n_estimators', 180), ('num_parallel_tree', 2), ('scale_pos_weight', 4.5)])
Balanced accuracy score: 0.7540690737833595
F1 Score: 0.5519510577941229
Precision Score: 0.4183051747014595
Recall Score: 0.8373626373626374
Specificity Score: 0.6707755102040817



In [23]:
model = bayes_search.best_estimator_

X_test = X_test_reserved

y_pred = model.predict(X_test)
report = classification_report(y_test_reserved, y_pred)
cm = confusion_matrix(y_test_reserved, y_pred)

print("Testing set:")
print(X_test_reserved.shape)
print(report)
print(cm)
print()
print(f"Balanced accuracy score: {balanced_accuracy_score(y_test_reserved, y_pred)}")
print(f"F1 Score: {f1_score(y_test_reserved, y_pred)}")
print(f"Precision: {precision_score(y_test_reserved, y_pred)}")
print(f"Recall: {recall_score(y_test_reserved, y_pred)}")
print(f"Specificity: {recall_score(y_test_reserved, y_pred, pos_label=0)}")

Testing set:
(79, 15)
              precision    recall  f1-score   support

         0.0       0.95      0.56      0.71        62
         1.0       0.36      0.88      0.51        17

    accuracy                           0.63        79
   macro avg       0.65      0.72      0.61        79
weighted avg       0.82      0.63      0.66        79

[[35 27]
 [ 2 15]]

Balanced accuracy score: 0.7234345351043643
F1 Score: 0.5084745762711864
Precision: 0.35714285714285715
Recall: 0.8823529411764706
Specificity: 0.5645161290322581


In [24]:
bp = dict(bayes_search.best_params_)
with open(f"pkl/best_params_{NUM_OF_SELECTED_FEATURES}.pkl", "wb") as file:
    dump(bp, file)
    print(f"Saved file {file.name}.")

model = bayes_search.best_estimator_
with open(f"pkl/model_{NUM_OF_SELECTED_FEATURES}.pkl", "wb") as file:
    dump(model, file)
    print(f"Saved file {file.name}")

print(bp)

Saved file pkl/best_params_15.pkl.
Saved file pkl/model_15.pkl
{'gamma': 0.01, 'learning_rate': 0.041870422386972375, 'max_bin': 13, 'max_depth': 1, 'max_leaves': 2, 'min_child_weight': 2.8986170391945087, 'n_estimators': 180, 'num_parallel_tree': 2, 'scale_pos_weight': 4.5}
