--- a +++ b/svm/test.ipynb @@ -0,0 +1,311 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Set the path to the `xls` file" + ] + }, + { + "cell_type": "code", + "execution_count": 703, + "metadata": {}, + "outputs": [], + "source": [ + "training_file = \"../TrainDataset2024.xls\"\n", + "# training_file = \"/kaggle/input/dataset/TrainDataset2024.xls\"" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Import libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 704, + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "import os\n", + "\n", + "# Add the parent directory to the system path\n", + "sys.path.append(os.path.abspath('../')) # Adjust the path as needed\n", + "\n", + "from my_util import df_to_corr_matrix\n", + "\n", + "import tensorflow as tf\n", + "import pandas as pd\n", + "import numpy as np\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "import plotly.graph_objects as go\n", + "\n", + "from matplotlib.colors import Normalize\n", + "from sklearn.decomposition import PCA\n", + "from sklearn.model_selection import train_test_split, KFold, cross_val_score, GridSearchCV\n", + "from sklearn.preprocessing import StandardScaler, RobustScaler\n", + "from sklearn.tree import DecisionTreeClassifier\n", + "from sklearn.metrics import classification_report, confusion_matrix, precision_score, recall_score, accuracy_score\n", + "from sklearn.svm import SVC\n", + "from sklearn.feature_selection import SelectKBest, f_classif, chi2, mutual_info_classif\n", + "from sklearn.impute import KNNImputer\n", + "\n", + "from imblearn.over_sampling import SMOTE\n", + "from imblearn.pipeline import Pipeline\n", + "\n", + "from joblib import Parallel, delayed\n", + "\n", + "from pickle import dump , load\n", + "\n", + "import warnings" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Read the data and load the selected features" + ] + }, + { + "cell_type": "code", + "execution_count": 705, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded '../FeatureSelection/pkl/20_selected_features.pkl' to selected_feature\n", + "(395, 20) (395,)\n" + ] + } + ], + "source": [ + "NUM_OF_SELECTED_FEATURES = 20\n", + "\n", + "data = pd.read_excel(training_file)\n", + "data.replace(999, np.nan, inplace=True)\n", + "\n", + "data.drop([\"ID\", \"RelapseFreeSurvival (outcome)\"], axis=1, inplace=True)\n", + "data.dropna(subset=[\"pCR (outcome)\"], inplace=True)\n", + "\n", + "with open(f'../FeatureSelection/pkl/{NUM_OF_SELECTED_FEATURES}_selected_features.pkl', mode='rb') as file:\n", + " selected_features = load(file)\n", + " print(f\"Loaded '{file.name}' to selected_feature\")\n", + "\n", + "X = data[selected_features]\n", + "y = data[\"pCR (outcome)\"]\n", + "print(X.shape, y.shape)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Imputer" + ] + }, + { + "cell_type": "code", + "execution_count": 706, + "metadata": {}, + "outputs": [], + "source": [ + "with open('pkl/imputer.pkl', 'rb') as file:\n", + " imputer = load(file)\n", + "X = pd.DataFrame(imputer.transform(X), columns=X.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Normalisation" + ] + }, + { + "cell_type": "code", + "execution_count": 707, + "metadata": {}, + "outputs": [], + "source": [ + "with open('pkl/StandardScaler.pkl', 'rb') as file:\n", + " scaler = load(file)\n", + "X = pd.DataFrame(scaler.transform(X), columns=X.columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Split the data into train_full and test_reserved (untouch)" + ] + }, + { + "cell_type": "code", + "execution_count": 758, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Splited the data into train and test. The test will not be used in the training, but just for test the model. \n", + "The training data has 316 data. The testing data has 79 data. \n", + "Positive ratio: \n", + "\tTrain: 0.22468\n", + "\tTest: 0.16456\n" + ] + } + ], + "source": [ + "# Close ratio random_state\n", + "# [14, 47, 49, 52, 62, 76, 83, 89, 92, 116, 118, 122, 136, 138, 144, 146, 150, 156, 157, 159, 170, 172, 174, 185]\n", + "\n", + "while True: \n", + " # X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=14) # similar distribution of 1 and 0\n", + " X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=None)\n", + "\n", + " X_train_full.reset_index(drop=True, inplace=True)\n", + " X_test_reserved.reset_index(drop=True, inplace=True)\n", + " y_train_full.reset_index(drop=True, inplace=True)\n", + " y_test_reserved.reset_index(drop=True, inplace=True)\n", + "\n", + " ratio_train = sum(y_train_full[y_train_full==1]) / len(y_train_full)\n", + " ratio_test = sum(y_test_reserved[y_test_reserved==1]) / len(y_test_reserved)\n", + "\n", + " if abs(ratio_train - ratio_test) < 0.1:\n", + " break\n", + "\n", + "print(\"Splited the data into train and test. The test will not be used in the training, but just for test the model. \")\n", + "print(f\"The training data has {len(X_train_full)} data. The testing data has {len(X_test_reserved)} data. \")\n", + "print(f\"Positive ratio: \\n\\tTrain: {ratio_train:.5f}\\n\\tTest: {ratio_test:.5f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Training report:\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.92 0.69 0.79 245\n", + " 1.0 0.43 0.80 0.56 71\n", + "\n", + " accuracy 0.72 316\n", + " macro avg 0.68 0.75 0.68 316\n", + "weighted avg 0.81 0.72 0.74 316\n", + "\n", + "[[170 75]\n", + " [ 14 57]]\n", + "Training report:\n", + " precision recall f1-score support\n", + "\n", + " 0.0 0.96 0.74 0.84 66\n", + " 1.0 0.39 0.85 0.54 13\n", + "\n", + " accuracy 0.76 79\n", + " macro avg 0.68 0.79 0.69 79\n", + "weighted avg 0.87 0.76 0.79 79\n", + "\n", + "[[49 17]\n", + " [ 2 11]]\n" + ] + } + ], + "source": [ + "pipeline = Pipeline(\n", + " [\n", + " (\"pca\", PCA()),\n", + " (\"sampling\", SMOTE()),\n", + " (\"svc\", SVC(max_iter=100_000_000)),\n", + " ]\n", + ")\n", + "\n", + "params = {\n", + " \"pca__n_components\": 9,\n", + " \"svc__C\": 0.07,\n", + " \"svc__degree\": 7,\n", + " \"svc__gamma\": \"auto\",\n", + " \"svc__kernel\": \"linear\",\n", + "}\n", + "\n", + "# params = {\n", + "# \"pca__n_components\": 11,\n", + "# \"svc__C\": 0.15,\n", + "# \"svc__degree\": 1,\n", + "# \"svc__gamma\": 5,\n", + "# \"svc__kernel\": \"poly\",\n", + "# }\n", + "\n", + "# params = {\n", + "# \"pca__n_components\": 11,\n", + "# \"svc__C\": 0.2,\n", + "# \"svc__degree\": 4,\n", + "# \"svc__gamma\": \"scale\",\n", + "# \"svc__kernel\": \"sigmoid\",\n", + "# }\n", + "\n", + "# params = {\n", + "# \"pca__n_components\": 11,\n", + "# \"svc__C\": 0.2,\n", + "# \"svc__degree\": 4,\n", + "# \"svc__gamma\": 0.05,\n", + "# \"svc__kernel\": \"rbf\",\n", + "# }\n", + "\n", + "pipeline.set_params(**params)\n", + "pipeline.fit(X_train_full, y_train_full)\n", + "\n", + "y_pred = pipeline.predict(X_train_full)\n", + "print(\"Training report:\")\n", + "print(classification_report(y_train_full, y_pred))\n", + "print(confusion_matrix(y_train_full, y_pred))\n", + "\n", + "y_pred = pipeline.predict(X_test_reserved)\n", + "print(\"Testing report:\")\n", + "print(classification_report(y_test_reserved, y_pred))\n", + "print(confusion_matrix(y_test_reserved, y_pred))" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "MLEAsm", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.15" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}