[e6e569]: / svm / test.ipynb

Download this file

312 lines (311 with data), 9.2 kB

{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Set the path to the `xls` file"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 703,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Relative path to the training spreadsheet that is read with pd.read_excel further down.\n",
    "training_file = \"../TrainDataset2024.xls\"\n",
    "# Alternative path under /kaggle/input; uncomment to use it instead of the one above.\n",
    "# training_file = \"/kaggle/input/dataset/TrainDataset2024.xls\""
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Import libraries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 704,
   "metadata": {},
   "outputs": [],
   "source": [
    "import sys\n",
    "import os\n",
    "\n",
    "# Add the parent directory to the system path\n",
    "sys.path.append(os.path.abspath('../'))  # Adjust the path as needed\n",
    "\n",
    "from my_util import df_to_corr_matrix\n",
    "\n",
    "import tensorflow as tf\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import seaborn as sns\n",
    "import matplotlib.pyplot as plt\n",
    "import plotly.graph_objects as go\n",
    "\n",
    "from matplotlib.colors import Normalize\n",
    "from sklearn.decomposition import PCA\n",
    "from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV\n",
    "from sklearn.preprocessing import StandardScaler, RobustScaler\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif\n",
    "from sklearn.impute import KNNImputer\n",
    "\n",
    "from imblearn.over_sampling import SMOTE\n",
    "from imblearn.pipeline import Pipeline\n",
    "\n",
    "from joblib import Parallel, delayed\n",
    "\n",
    "from pickle import dump , load\n",
    "\n",
    "import warnings"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read the data and load the selected features"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 705,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded '../FeatureSelection/pkl/20_selected_features.pkl' to selected_feature\n",
      "(395, 20) (395,)\n"
     ]
    }
   ],
   "source": [
    "# Size of the pre-computed feature selection; it also determines which pickle is opened.\n",
    "NUM_OF_SELECTED_FEATURES = 20\n",
    "\n",
    "# Read the sheet, replace every 999 with NaN (presumably the missing-value code),\n",
    "# drop the two columns not used in this notebook, and discard rows without a pCR label.\n",
    "data = (\n",
    "    pd.read_excel(training_file)\n",
    "    .replace(999, np.nan)\n",
    "    .drop(columns=[\"ID\", \"RelapseFreeSurvival (outcome)\"])\n",
    "    .dropna(subset=[\"pCR (outcome)\"])\n",
    ")\n",
    "\n",
    "# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.\n",
    "features_path = f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl'\n",
    "with open(features_path, mode='rb') as file:\n",
    "    selected_features = load(file)\n",
    "print(f\"Loaded '{features_path}' to selected_feature\")\n",
    "\n",
    "# Feature matrix restricted to the selected columns, and the binary target.\n",
    "X = data[selected_features]\n",
    "y = data[\"pCR (outcome)\"]\n",
    "print(X.shape, y.shape)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Imputer"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 706,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the already-fitted imputer (only transform() is called here, never fit()).\n",
    "# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.\n",
    "with open('pkl/imputer.pkl', 'rb') as file:\n",
    "    imputer = load(file)\n",
    "\n",
    "# Rebuild the DataFrame with X's own row index: after dropna() the index has gaps,\n",
    "# and a fresh 0..n-1 index would no longer share labels with y.\n",
    "X = pd.DataFrame(imputer.transform(X), columns=X.columns, index=X.index)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Normalisation"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 707,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Load the already-fitted scaler (only transform() is called here, never fit()).\n",
    "# NOTE: unpickling can execute arbitrary code; only load pickle files you trust.\n",
    "with open('pkl/StandardScaler.pkl', 'rb') as file:\n",
    "    scaler = load(file)\n",
    "\n",
    "# Keep X's row index when wrapping the scaled array so the rows keep the same\n",
    "# labels they had before scaling instead of being renumbered 0..n-1.\n",
    "X = pd.DataFrame(scaler.transform(X), columns=X.columns, index=X.index)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Split the data into train_full and test_reserved (untouched)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 758,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Splited the data into train and test. The test will not be used in the training, but just for test the model. \n",
      "The training data has 316 data. The testing data has 79 data. \n",
      "Positive ratio: \n",
      "\tTrain: 0.22468\n",
      "\tTest: 0.16456\n"
     ]
    }
   ],
   "source": [
    "# A stratified split keeps the positive (pCR == 1) ratio nearly identical in both\n",
    "# partitions, and the fixed seed makes the split reproducible on Restart & Run All.\n",
    "# This replaces re-drawing unseeded random splits until the two ratios happened to\n",
    "# differ by less than 0.1.\n",
    "SPLIT_SEED = 14\n",
    "\n",
    "X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(\n",
    "    X, y, test_size=0.2, stratify=y, random_state=SPLIT_SEED\n",
    ")\n",
    "\n",
    "# Discard the shuffled row labels so every partition is indexed 0..n-1.\n",
    "X_train_full = X_train_full.reset_index(drop=True)\n",
    "X_test_reserved = X_test_reserved.reset_index(drop=True)\n",
    "y_train_full = y_train_full.reset_index(drop=True)\n",
    "y_test_reserved = y_test_reserved.reset_index(drop=True)\n",
    "\n",
    "# Fraction of positive samples in each partition.\n",
    "ratio_train = (y_train_full == 1).mean()\n",
    "ratio_test = (y_test_reserved == 1).mean()\n",
    "\n",
    "print(\"Split the data into train and test. The test will not be used in the training, but just for test the model. \")\n",
    "print(f\"The training data has {len(X_train_full)} data. The testing data has {len(X_test_reserved)} data. \")\n",
    "print(f\"Positive ratio: \\n\\tTrain: {ratio_train:.5f}\\n\\tTest: {ratio_test:.5f}\")"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Training report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.92      0.69      0.79       245\n",
      "         1.0       0.43      0.80      0.56        71\n",
      "\n",
      "    accuracy                           0.72       316\n",
      "   macro avg       0.68      0.75      0.68       316\n",
      "weighted avg       0.81      0.72      0.74       316\n",
      "\n",
      "[[170  75]\n",
      " [ 14  57]]\n",
      "Testing report:\n",
      "              precision    recall  f1-score   support\n",
      "\n",
      "         0.0       0.96      0.74      0.84        66\n",
      "         1.0       0.39      0.85      0.54        13\n",
      "\n",
      "    accuracy                           0.76        79\n",
      "   macro avg       0.68      0.79      0.69        79\n",
      "weighted avg       0.87      0.76      0.79        79\n",
      "\n",
      "[[49 17]\n",
      " [ 2 11]]\n"
     ]
    }
   ],
   "source": [
    "# Fixed seed so SMOTE synthesises the same minority samples on every run;\n",
    "# without it the reported metrics change between executions.\n",
    "MODEL_SEED = 42\n",
    "\n",
    "# PCA -> SMOTE oversampling -> SVC. The imblearn Pipeline is used so the sampler\n",
    "# is applied during fit only, not when predicting.\n",
    "pipeline = Pipeline(\n",
    "    [\n",
    "        (\"pca\", PCA()),\n",
    "        (\"sampling\", SMOTE(random_state=MODEL_SEED)),\n",
    "        (\"svc\", SVC(max_iter=100_000_000)),\n",
    "    ]\n",
    ")\n",
    "\n",
    "# Alternative hyper-parameter sets, keyed by SVC kernel.\n",
    "# SVC only uses `degree` for kernel=\"poly\" and does not use `gamma` for\n",
    "# kernel=\"linear\"; those entries are kept unchanged but have no effect there.\n",
    "PARAM_SETS = {\n",
    "    \"linear\": {\n",
    "        \"pca__n_components\": 9,\n",
    "        \"svc__C\": 0.07,\n",
    "        \"svc__degree\": 7,\n",
    "        \"svc__gamma\": \"auto\",\n",
    "        \"svc__kernel\": \"linear\",\n",
    "    },\n",
    "    \"poly\": {\n",
    "        \"pca__n_components\": 11,\n",
    "        \"svc__C\": 0.15,\n",
    "        \"svc__degree\": 1,\n",
    "        \"svc__gamma\": 5,\n",
    "        \"svc__kernel\": \"poly\",\n",
    "    },\n",
    "    \"sigmoid\": {\n",
    "        \"pca__n_components\": 11,\n",
    "        \"svc__C\": 0.2,\n",
    "        \"svc__degree\": 4,\n",
    "        \"svc__gamma\": \"scale\",\n",
    "        \"svc__kernel\": \"sigmoid\",\n",
    "    },\n",
    "    \"rbf\": {\n",
    "        \"pca__n_components\": 11,\n",
    "        \"svc__C\": 0.2,\n",
    "        \"svc__degree\": 4,\n",
    "        \"svc__gamma\": 0.05,\n",
    "        \"svc__kernel\": \"rbf\",\n",
    "    },\n",
    "}\n",
    "\n",
    "# Change the key to evaluate one of the other sets.\n",
    "params = PARAM_SETS[\"linear\"]\n",
    "\n",
    "pipeline.set_params(**params)\n",
    "pipeline.fit(X_train_full, y_train_full)\n",
    "\n",
    "# Report on the training partition first, then on the reserved test partition.\n",
    "# After the loop, y_pred holds the predictions for the test partition.\n",
    "for label, X_part, y_part in (\n",
    "    (\"Training\", X_train_full, y_train_full),\n",
    "    (\"Testing\", X_test_reserved, y_test_reserved),\n",
    "):\n",
    "    y_pred = pipeline.predict(X_part)\n",
    "    print(f\"{label} report:\")\n",
    "    print(classification_report(y_part, y_pred))\n",
    "    print(confusion_matrix(y_part, y_pred))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "MLEAsm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}