--- a
+++ b/split_data.ipynb
@@ -0,0 +1,83 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Splited the data into train and test. The test will not be used in the training, but just for test the xgb. \n",
+      "The training data has 316 data. The testing data has 79 data. \n",
+      "Positive ratio: \n",
+      "\tTrain: 0.21203\n",
+      "\tTest: 0.21519\n"
+     ]
+    }
+   ],
+   "source": [
+    "import pandas as pd\n",
+    "import numpy as np\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "\n",
+    "# Purpose: reserve a held-out test set from the labelled rows of TrainDataset2024.xls\n",
+    "# and write the train / test partitions to separate Excel files.\n",
+    "\n",
+    "# --- Configuration: tune these values rather than editing the logic below ---\n",
+    "TARGET_COLUMN = \"pCR (outcome)\"\n",
+    "ID_COLUMN = \"ID\"\n",
+    "MISSING_LABEL = 999    # rows whose target equals this value are excluded from the split\n",
+    "TEST_FRACTION = 0.2\n",
+    "SPLIT_SEED = 42        # fixed seed: Restart & Run All always reserves the same test rows\n",
+    "\n",
+    "raw_data = pd.read_excel(\"TrainDataset2024.xls\")\n",
+    "\n",
+    "# Keep only rows with a usable target; derived from raw_data so the filter is idempotent.\n",
+    "data = raw_data[raw_data[TARGET_COLUMN] != MISSING_LABEL]\n",
+    "\n",
+    "X = data.drop(columns=[TARGET_COLUMN])\n",
+    "y = data[TARGET_COLUMN]\n",
+    "\n",
+    "# A single stratified split keeps the class proportions of y (almost) identical in both\n",
+    "# partitions. This replaces the previous unseeded retry-until-ratios-match loop, which\n",
+    "# produced a different test set on every run and had no guaranteed termination.\n",
+    "# NOTE: stratify requires every distinct target value to occur at least twice.\n",
+    "X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(\n",
+    "    X, y, test_size=TEST_FRACTION, stratify=y, random_state=SPLIT_SEED\n",
+    ")\n",
+    "\n",
+    "# Renumber rows from 0 so features and target align positionally in the concat below.\n",
+    "X_train_full = X_train_full.reset_index(drop=True)\n",
+    "X_test_reserved = X_test_reserved.reset_index(drop=True)\n",
+    "y_train_full = y_train_full.reset_index(drop=True)\n",
+    "y_test_reserved = y_test_reserved.reset_index(drop=True)\n",
+    "\n",
+    "# Sanity check: the two partitions together cover every labelled row exactly once.\n",
+    "assert len(X_train_full) + len(X_test_reserved) == len(X)\n",
+    "\n",
+    "# Fraction of rows whose target equals 1 in each partition.\n",
+    "ratio_train = (y_train_full == 1).mean()\n",
+    "ratio_test = (y_test_reserved == 1).mean()\n",
+    "\n",
+    "print(\"Splited the data into train and test. The test will not be used in the training, but just for test the xgb. \")\n",
+    "print(f\"The training data has {len(X_train_full)} data. The testing data has {len(X_test_reserved)} data. \")\n",
+    "print(f\"Positive ratio: \\n\\tTrain: {ratio_train:.5f}\\n\\tTest: {ratio_test:.5f}\")\n",
+    "\n",
+    "# Output column order is: ID, target, then the remaining feature columns.\n",
+    "# NOTE(review): the openpyxl engine writes the xlsx format, so these files hold xlsx\n",
+    "# content under an .xls name. The names are kept because other code presumably reads\n",
+    "# these exact paths - confirm before renaming.\n",
+    "train_ids = X_train_full[ID_COLUMN]\n",
+    "X_train_full = X_train_full.drop(columns=[ID_COLUMN])\n",
+    "data_train = pd.concat([train_ids, y_train_full, X_train_full], axis=1)\n",
+    "data_train.to_excel(\"train_data.xls\", engine='openpyxl', index=False)\n",
+    "\n",
+    "test_ids = X_test_reserved[ID_COLUMN]\n",
+    "X_test_reserved = X_test_reserved.drop(columns=[ID_COLUMN])\n",
+    "data_test = pd.concat([test_ids, y_test_reserved, X_test_reserved], axis=1)\n",
+    "data_test.to_excel(\"test_data.xls\", engine='openpyxl', index=False)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "MLEAsm",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.15"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}