Set the path to the `xls` file

In [703]:
training_file = "../TrainDataset2024.xls"
# training_file = "/kaggle/input/dataset/TrainDataset2024.xls"

Import libraries

In [704]:
import sys
import os

# Add the parent directory to the system path
sys.path.append(os.path.abspath('../'))  # Adjust the path as needed

from my_util import df_to_corr_matrix

import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as go

from matplotlib.colors import Normalize
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif
from sklearn.impute import KNNImputer

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

from joblib import Parallel, delayed

from pickle import dump , load

import warnings

### Read the data and load the selected features

In [705]:
NUM_OF_SELECTED_FEATURES = 20

data = pd.read_excel(training_file)
data.replace(999, np.nan, inplace=True)

data.drop(["ID", "RelapseFreeSurvival (outcome)"], axis=1, inplace=True)
data.dropna(subset=["pCR (outcome)"], inplace=True)

with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:
    selected_features = load(file)
    print(f"Loaded '{file.name}' to selected_feature")

X = data[selected_features]
y = data["pCR (outcome)"]
print(X.shape, y.shape)

Loaded '../FeatureSelection/pkl/20_selected_features.pkl' to selected_feature
(395, 20) (395,)


### Imputer

In [706]:
with open('pkl/imputer.pkl', 'rb') as file:
  imputer = load(file)
X = pd.DataFrame(imputer.transform(X), columns=X.columns)

### Normalisation

In [707]:
with open('pkl/StandardScaler.pkl', 'rb') as file:
  scaler = load(file)
X = pd.DataFrame(scaler.transform(X), columns=X.columns)

### Split the data into train_full and test_reserved (untouch)

In [758]:
# Close ratio random_state
# [14, 47, 49, 52, 62, 76, 83, 89, 92, 116, 118, 122, 136, 138, 144, 146, 150, 156, 157, 159, 170, 172, 174, 185]

while True:  
    # X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=14) # similar distribution of 1 and 0
    X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=None)

    X_train_full.reset_index(drop=True, inplace=True)
    X_test_reserved.reset_index(drop=True, inplace=True)
    y_train_full.reset_index(drop=True, inplace=True)
    y_test_reserved.reset_index(drop=True, inplace=True)

    ratio_train = sum(y_train_full[y_train_full==1]) / len(y_train_full)
    ratio_test = sum(y_test_reserved[y_test_reserved==1]) / len(y_test_reserved)

    if abs(ratio_train - ratio_test) < 0.1:
        break

print("Splited the data into train and test. The test will not be used in the training, but just for test the model. ")
print(f"The training data has {len(X_train_full)} data. The testing data has {len(X_test_reserved)} data. ")
print(f"Positive ratio: \n\tTrain: {ratio_train:.5f}\n\tTest: {ratio_test:.5f}")

Splited the data into train and test. The test will not be used in the training, but just for test the model. 
The training data has 316 data. The testing data has 79 data. 
Positive ratio: 
	Train: 0.22468
	Test: 0.16456


### Model

In [None]:
pipeline = Pipeline(
    [
        ("pca", PCA()),
        ("sampling", SMOTE()),
        ("svc", SVC(max_iter=100_000_000)),
    ]
)

params = {
    "pca__n_components": 9,
    "svc__C": 0.07,
    "svc__degree": 7,
    "svc__gamma": "auto",
    "svc__kernel": "linear",
}

# params = {
#     "pca__n_components": 11,
#     "svc__C": 0.15,
#     "svc__degree": 1,
#     "svc__gamma": 5,
#     "svc__kernel": "poly",
# }

# params = {
#     "pca__n_components": 11,
#     "svc__C": 0.2,
#     "svc__degree": 4,
#     "svc__gamma": "scale",
#     "svc__kernel": "sigmoid",
# }

# params = {
#     "pca__n_components": 11,
#     "svc__C": 0.2,
#     "svc__degree": 4,
#     "svc__gamma": 0.05,
#     "svc__kernel": "rbf",
# }

pipeline.set_params(**params)
pipeline.fit(X_train_full, y_train_full)

y_pred = pipeline.predict(X_train_full)
print("Training report:")
print(classification_report(y_train_full, y_pred))
print(confusion_matrix(y_train_full, y_pred))

y_pred = pipeline.predict(X_test_reserved)
print("Testing report:")
print(classification_report(y_test_reserved, y_pred))
print(confusion_matrix(y_test_reserved, y_pred))

Training report:
              precision    recall  f1-score   support

         0.0       0.92      0.69      0.79       245
         1.0       0.43      0.80      0.56        71

    accuracy                           0.72       316
   macro avg       0.68      0.75      0.68       316
weighted avg       0.81      0.72      0.74       316

[[170  75]
 [ 14  57]]
Training report:
              precision    recall  f1-score   support

         0.0       0.96      0.74      0.84        66
         1.0       0.39      0.85      0.54        13

    accuracy                           0.76        79
   macro avg       0.68      0.79      0.69        79
weighted avg       0.87      0.76      0.79        79

[[49 17]
 [ 2 11]]
