### Imports

In [1]:
import sys
import os

# Add the parent directory to the system path
sys.path.append(os.path.abspath('../'))  # Adjust the path as needed

from my_util import df_to_corr_matrix, remove_outliers

import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from matplotlib.colors import Normalize
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV, cross_val_predict, StratifiedKFold
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score, f1_score, make_scorer, balanced_accuracy_score
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif
from sklearn.impute import KNNImputer

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from joblib import Parallel, delayed

import xgboost as xgb
from xgboost import XGBClassifier

from pickle import dump , load

import warnings

2024-12-12 20:34:16.323182: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-12 20:34:16.497025: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1734035656.563220   77052 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1734035656.581797   77052 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-12 20:34:16.741729: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

### Parameter


In [2]:
params = []
with open("pkl/best_params_15.pkl", 'rb') as file:
  params.append(load(file))
with open("pkl/best_params_20.pkl", 'rb') as file:
  params.append(load(file))
with open("pkl/best_params_25.pkl", 'rb') as file:
  params.append(load(file))
with open("pkl/best_params_30.pkl", 'rb') as file:
  params.append(load(file))
with open("pkl/best_params_35.pkl", 'rb') as file:
  params.append(load(file))

### Varify model's robustness using different datasets.

In [3]:
NUM_OF_SELECTED_FEATURES = [25, 30, 35]

files = [("../train_data.xls", "../test_data.xls"), ("../train_data_2.xls", "../test_data_2.xls"), ("../train_data_3.xls", "../test_data_3.xls")]

ba = []

for index, (train_file, test_file) in enumerate(files):
    data = pd.read_excel(train_file)
    data.replace(999, np.nan, inplace=True)

    data.drop(["ID", "RelapseFreeSurvival (outcome)"], axis=1, inplace=True)
    data.dropna(subset=["pCR (outcome)"], inplace=True)

    X = data.drop(columns='pCR (outcome)', axis=1)
    y = data["pCR (outcome)"]
    # print(X.shape, y.shape)

    testdata = pd.read_excel(test_file)
    testdata.replace(999, np.nan, inplace=True)

    testdata.drop(["ID", "RelapseFreeSurvival (outcome)"], axis=1, inplace=True)
    testdata.dropna(subset=["pCR (outcome)"], inplace=True)

    X_test = testdata.drop(columns='pCR (outcome)', axis=1)
    y_test = testdata["pCR (outcome)"]
    # print(X_test.shape, y_test.shape)

    models = len(NUM_OF_SELECTED_FEATURES)*[XGBClassifier()]

    selected_features = []

    for i in NUM_OF_SELECTED_FEATURES:
        FEATURES_FILE_PREFIX = F"corr_{i}"
        with open(f'../FeatureSelection/pkl/{FEATURES_FILE_PREFIX}_selected_features.pkl', mode='rb') as file:
            selected_features.append(load(file))
            # print(f"Loaded '{file.name}' to selected_feature")

    y_pred = []

    for i, model in enumerate(models):
        X_train_temp = X[selected_features[i]]
        X_test_temp = X_test[selected_features[i]]
        model.set_params(**params[5-len(NUM_OF_SELECTED_FEATURES)+i])
        model.fit(X, y)
        y_pred.append(model.predict(X_test))

    y_pred = np.array(y_pred)

    yp = np.round(np.average(y_pred, axis=0))

    print(f"File {index}:")
    print(confusion_matrix(y_test, yp))
    ba.append(balanced_accuracy_score(y_test, yp))
    print(ba[-1])

print(f"Averaged balanced accuracy: {np.mean(ba)}")


File 0:
[[43 19]
 [ 5 12]]
0.6997153700189753
File 1:
[[31 31]
 [ 0 17]]
0.75
File 2:
[[40 22]
 [ 3 14]]
0.7343453510436433
Averaged balanced accuracy: 0.7280202403542062


### Predict data

In [4]:
NUM_OF_SELECTED_FEATURES = [25, 30, 35]

data = pd.read_excel("../TrainDataset2024.xls")
data.replace(999, np.nan, inplace=True)

data.drop(["ID", "RelapseFreeSurvival (outcome)"], axis=1, inplace=True)
data.dropna(subset=["pCR (outcome)"], inplace=True)

X_train = data.drop(columns='pCR (outcome)', axis=1)
y_train = data["pCR (outcome)"]
# print(X.shape, y.shape)

testdata = pd.read_excel("../TestDatasetExample.xls")
testdata.replace(999, np.nan, inplace=True)

id = testdata["ID"]

testdata.drop(["ID"], axis=1, inplace=True)

X_test = testdata

models = len(NUM_OF_SELECTED_FEATURES)*[XGBClassifier()]

selected_features = []

for i in NUM_OF_SELECTED_FEATURES:
    FEATURES_FILE_PREFIX = F"corr_{i}"
    with open(f'../FeatureSelection/pkl/{FEATURES_FILE_PREFIX}_selected_features.pkl', mode='rb') as file:
        selected_features.append(load(file))
        # print(f"Loaded '{file.name}' to selected_feature")

y_pred = []
y_pred_train = []

for i, model in enumerate(models):
    X_train_temp = X_train[selected_features[i]]
    X_test_temp = X_test[selected_features[i]]
    model.set_params(**params[5-len(NUM_OF_SELECTED_FEATURES)+i])
    model.fit(X_train, y_train)
    y_pred.append(model.predict(X_test))

y_pred = np.array(y_pred)

yp = np.round(np.average(y_pred, axis=0))

yp = pd.concat([id, pd.Series(yp)], axis=1)

In [5]:
yp

Unnamed: 0,ID,0
0,TRG002728,0.0
1,TRG002649,1.0
2,TRG002628,1.0


In [7]:
NUM_OF_SELECTED_FEATURES = [25, 30, 35]

data = pd.read_excel("../TrainDataset2024.xls")
data.replace(999, np.nan, inplace=True)


data.drop(["ID", "RelapseFreeSurvival (outcome)"], axis=1, inplace=True)
data.dropna(subset=["pCR (outcome)"], inplace=True)

X_train = data.drop(columns='pCR (outcome)', axis=1)
y_train = data["pCR (outcome)"]
# print(X.shape, y.shape)

testdata = pd.read_excel("../FinalTestDataset2024.xls")
testdata.replace(999, np.nan, inplace=True)

id = testdata["ID"]

testdata.drop(["ID"], axis=1, inplace=True)

X_test = testdata

models = len(NUM_OF_SELECTED_FEATURES)*[XGBClassifier()]

selected_features = []

for i in NUM_OF_SELECTED_FEATURES:
    FEATURES_FILE_PREFIX = F"corr_{i}"
    with open(f'../FeatureSelection/pkl/{FEATURES_FILE_PREFIX}_selected_features.pkl', mode='rb') as file:
        selected_features.append(load(file))
        # print(f"Loaded '{file.name}' to selected_feature")

y_pred = []
y_pred_train = []

for i, model in enumerate(models):
    X_train_temp = X_train[selected_features[i]]
    X_test_temp = X_test[selected_features[i]]
    model.set_params(**params[5-len(NUM_OF_SELECTED_FEATURES)+i])
    model.fit(X_train, y_train)
    y_pred.append(model.predict(X_test))

y_pred = np.array(y_pred)

yp = np.round(np.average(y_pred, axis=0))

yp = pd.concat([id, pd.Series(yp)], axis=1)

In [9]:
yp.to_csv("predicted.csv")