Diff of /svm/test.ipynb [000000] .. [e6e569]

--- a
+++ b/svm/test.ipynb
@@ -0,0 +1,311 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Set the path to the `xls` file"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 703,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "training_file = \"../TrainDataset2024.xls\"\n",
+    "# training_file = \"/kaggle/input/dataset/TrainDataset2024.xls\""
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Import libraries"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 704,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "import os\n",
+    "\n",
+    "# Add the parent directory to the system path\n",
+    "sys.path.append(os.path.abspath('../'))  # Adjust the path as needed\n",
+    "\n",
+    "from my_util import df_to_corr_matrix\n",
+    "\n",
+    "import tensorflow as tf\n",
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "import seaborn as sns\n",
+    "import matplotlib.pyplot as plt\n",
+    "import plotly.graph_objects as go\n",
+    "\n",
+    "from matplotlib.colors import Normalize\n",
+    "from sklearn.decomposition import PCA\n",
+    "from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV\n",
+    "from sklearn.preprocessing import StandardScaler, RobustScaler\n",
+    "from sklearn.tree import DecisionTreeClassifier\n",
+    "from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score\n",
+    "from sklearn.svm import SVC\n",
+    "from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif\n",
+    "from sklearn.impute import KNNImputer\n",
+    "\n",
+    "from imblearn.over_sampling import SMOTE\n",
+    "from imblearn.pipeline import Pipeline\n",
+    "\n",
+    "from joblib import Parallel, delayed\n",
+    "\n",
+    "from pickle import dump , load\n",
+    "\n",
+    "import warnings"
+   ]
+  },
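+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`warnings` is imported above but never used below. If the intent was to silence noisy library warnings during the repeated fits, a minimal (assumed) way to do that is:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Assumed intent of the warnings import: hide noisy library warnings\n",
+    "# (e.g. sklearn convergence messages). Remove this line if warnings are wanted.\n",
+    "warnings.filterwarnings(\"ignore\")"
+   ]
+  },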
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Read the data and load the selected features"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 705,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Loaded '../FeatureSelection/pkl/20_selected_features.pkl' to selected_feature\n",
+      "(395, 20) (395,)\n"
+     ]
+    }
+   ],
+   "source": [
+    "NUM_OF_SELECTED_FEATURES = 20\n",
+    "\n",
+    "data = pd.read_excel(training_file)\n",
+    "data.replace(999, np.nan, inplace=True)\n",
+    "\n",
+    "data.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n",
+    "data.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n",
+    "\n",
+    "with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:\n",
+    "    selected_features = load(file)\n",
+    "    print(f\"Loaded '{file.name}' to selected_feature\")\n",
+    "\n",
+    "X = data[selected_features]\n",
+    "y = data[\"pCR (outcome)\"]\n",
+    "print(X.shape, y.shape)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Imputer"
+   ]
+  },
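+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The next cell applies an imputer that was fitted elsewhere and pickled to `pkl/imputer.pkl`. As a sketch only (not the original fitting code), the cell below shows how such an imputer could be fitted with the `KNNImputer` imported above; `n_neighbors=5` (the default) is an assumption."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hypothetical reconstruction of how the pickled imputer could have been fitted;\n",
+    "# n_neighbors=5 is an assumption. X still contains NaNs at this point.\n",
+    "sketch_imputer = KNNImputer(n_neighbors=5).fit(X)\n",
+    "print(np.isnan(sketch_imputer.transform(X)).sum(), \"missing values left after imputation\")"
+   ]
+  },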
+  {
+   "cell_type": "code",
+   "execution_count": 706,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open('pkl/imputer.pkl', 'rb') as file:\n",
+    "  imputer = load(file)\n",
+    "X = pd.DataFrame(imputer.transform(X), columns=X.columns)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Normalisation"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 707,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "with open('pkl/StandardScaler.pkl', 'rb') as file:\n",
+    "  scaler = load(file)\n",
+    "X = pd.DataFrame(scaler.transform(X), columns=X.columns)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Split the data into train_full and test_reserved (untouch)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 758,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Splited the data into train and test. The test will not be used in the training, but just for test the model. \n",
+      "The training data has 316 data. The testing data has 79 data. \n",
+      "Positive ratio: \n",
+      "\tTrain: 0.22468\n",
+      "\tTest: 0.16456\n"
+     ]
+    }
+   ],
+   "source": [
+    "# Close ratio random_state\n",
+    "# [14, 47, 49, 52, 62, 76, 83, 89, 92, 116, 118, 122, 136, 138, 144, 146, 150, 156, 157, 159, 170, 172, 174, 185]\n",
+    "\n",
+    "while True:  \n",
+    "    # X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=14) # similar distribution of 1 and 0\n",
+    "    X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=None)\n",
+    "\n",
+    "    X_train_full.reset_index(drop=True, inplace=True)\n",
+    "    X_test_reserved.reset_index(drop=True, inplace=True)\n",
+    "    y_train_full.reset_index(drop=True, inplace=True)\n",
+    "    y_test_reserved.reset_index(drop=True, inplace=True)\n",
+    "\n",
+    "    ratio_train = sum(y_train_full[y_train_full==1]) / len(y_train_full)\n",
+    "    ratio_test = sum(y_test_reserved[y_test_reserved==1]) / len(y_test_reserved)\n",
+    "\n",
+    "    if abs(ratio_train - ratio_test) < 0.1:\n",
+    "        break\n",
+    "\n",
+    "print(\"Splited the data into train and test. The test will not be used in the training, but just for test the model. \")\n",
+    "print(f\"The training data has {len(X_train_full)} data. The testing data has {len(X_test_reserved)} data. \")\n",
+    "print(f\"Positive ratio: \\n\\tTrain: {ratio_train:.5f}\\n\\tTest: {ratio_test:.5f}\")"
+   ]
+  },
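+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The loop above redraws random splits until the positive ratios roughly match. A deterministic alternative (not what this notebook uses) is a stratified split; the sketch below uses illustrative variable names and an arbitrary `random_state`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hypothetical alternative to the ratio-matching loop: stratify on y so the\n",
+    "# positive ratio is preserved in both splits by construction.\n",
+    "X_tr_s, X_te_s, y_tr_s, y_te_s = train_test_split(\n",
+    "    X, y, test_size=0.2, stratify=y, random_state=0\n",
+    ")\n",
+    "print(f\"Stratified positive ratio - train: {(y_tr_s == 1).mean():.5f}, test: {(y_te_s == 1).mean():.5f}\")"
+   ]
+  },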
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Model"
+   ]
+  },
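+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The hyper-parameters in the next cell look hand-chosen, and `GridSearchCV` is imported above but never used. The sketch below shows one way such values could be searched for; the grid, scoring metric and fold count are assumptions, not the procedure actually used here."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hypothetical hyper-parameter search over the same pipeline structure used\n",
+    "# in the next cell; the grid below is illustrative, not exhaustive.\n",
+    "search_pipeline = Pipeline(\n",
+    "    [(\"pca\", PCA()), (\"sampling\", SMOTE()), (\"svc\", SVC(max_iter=100_000_000))]\n",
+    ")\n",
+    "param_grid = {\n",
+    "    \"pca__n_components\": [9, 11],\n",
+    "    \"svc__C\": [0.07, 0.15, 0.2],\n",
+    "    \"svc__kernel\": [\"linear\", \"rbf\"],\n",
+    "}\n",
+    "search = GridSearchCV(search_pipeline, param_grid, scoring=\"balanced_accuracy\", cv=5)\n",
+    "search.fit(X_train_full, y_train_full)\n",
+    "print(search.best_params_)\n",
+    "print(f\"Best CV balanced accuracy: {search.best_score_:.3f}\")"
+   ]
+  },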
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Training report:\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "         0.0       0.92      0.69      0.79       245\n",
+      "         1.0       0.43      0.80      0.56        71\n",
+      "\n",
+      "    accuracy                           0.72       316\n",
+      "   macro avg       0.68      0.75      0.68       316\n",
+      "weighted avg       0.81      0.72      0.74       316\n",
+      "\n",
+      "[[170  75]\n",
+      " [ 14  57]]\n",
+      "Training report:\n",
+      "              precision    recall  f1-score   support\n",
+      "\n",
+      "         0.0       0.96      0.74      0.84        66\n",
+      "         1.0       0.39      0.85      0.54        13\n",
+      "\n",
+      "    accuracy                           0.76        79\n",
+      "   macro avg       0.68      0.79      0.69        79\n",
+      "weighted avg       0.87      0.76      0.79        79\n",
+      "\n",
+      "[[49 17]\n",
+      " [ 2 11]]\n"
+     ]
+    }
+   ],
+   "source": [
+    "pipeline = Pipeline(\n",
+    "    [\n",
+    "        (\"pca\", PCA()),\n",
+    "        (\"sampling\", SMOTE()),\n",
+    "        (\"svc\", SVC(max_iter=100_000_000)),\n",
+    "    ]\n",
+    ")\n",
+    "\n",
+    "params = {\n",
+    "    \"pca__n_components\": 9,\n",
+    "    \"svc__C\": 0.07,\n",
+    "    \"svc__degree\": 7,\n",
+    "    \"svc__gamma\": \"auto\",\n",
+    "    \"svc__kernel\": \"linear\",\n",
+    "}\n",
+    "\n",
+    "# params = {\n",
+    "#     \"pca__n_components\": 11,\n",
+    "#     \"svc__C\": 0.15,\n",
+    "#     \"svc__degree\": 1,\n",
+    "#     \"svc__gamma\": 5,\n",
+    "#     \"svc__kernel\": \"poly\",\n",
+    "# }\n",
+    "\n",
+    "# params = {\n",
+    "#     \"pca__n_components\": 11,\n",
+    "#     \"svc__C\": 0.2,\n",
+    "#     \"svc__degree\": 4,\n",
+    "#     \"svc__gamma\": \"scale\",\n",
+    "#     \"svc__kernel\": \"sigmoid\",\n",
+    "# }\n",
+    "\n",
+    "# params = {\n",
+    "#     \"pca__n_components\": 11,\n",
+    "#     \"svc__C\": 0.2,\n",
+    "#     \"svc__degree\": 4,\n",
+    "#     \"svc__gamma\": 0.05,\n",
+    "#     \"svc__kernel\": \"rbf\",\n",
+    "# }\n",
+    "\n",
+    "pipeline.set_params(**params)\n",
+    "pipeline.fit(X_train_full, y_train_full)\n",
+    "\n",
+    "y_pred = pipeline.predict(X_train_full)\n",
+    "print(\"Training report:\")\n",
+    "print(classification_report(y_train_full, y_pred))\n",
+    "print(confusion_matrix(y_train_full, y_pred))\n",
+    "\n",
+    "y_pred = pipeline.predict(X_test_reserved)\n",
+    "print(\"Testing report:\")\n",
+    "print(classification_report(y_test_reserved, y_pred))\n",
+    "print(confusion_matrix(y_test_reserved, y_pred))"
+   ]
+  },
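+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "`dump` is imported at the top but never used. Below is a minimal sketch of persisting the fitted pipeline for later reuse; the file name is an assumption, chosen only to sit next to the other pickles."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Hypothetical: persist the fitted pipeline. 'pkl/svc_pipeline.pkl' is an\n",
+    "# assumed path, not an artefact referenced elsewhere in the repository.\n",
+    "with open('pkl/svc_pipeline.pkl', 'wb') as file:\n",
+    "    dump(pipeline, file)"
+   ]
+  }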
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "MLEAsm",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}