# Split the labelled dataset into a training set and a reserved test set.
#
# Steps:
#   1. Load SOURCE_PATH and drop rows whose outcome equals EXCLUDED_OUTCOME.
#   2. Make one seeded, stratified train/test split so both parts keep
#      (almost) the same positive ratio and the split is reproducible.
#   3. Write each part to Excel with columns ordered: ID, outcome, features.
import pandas as pd
import numpy as np

# --- configuration ---------------------------------------------------------
SOURCE_PATH = "TrainDataset2024.xls"
TRAIN_PATH = "train_data.xls"
TEST_PATH = "test_data.xls"
TARGET_COL = "pCR (outcome)"
ID_COL = "ID"
# Rows with this outcome value are dropped before splitting.
# NOTE(review): presumably a "missing label" sentinel - confirm against the
# dataset description.
EXCLUDED_OUTCOME = 999
TEST_SIZE = 0.2
SEED = 42  # fixed so Restart & Run All reproduces the same split


def positive_ratio(labels):
    """Return the fraction of entries in ``labels`` that are equal to 1."""
    return float((pd.Series(labels) == 1).mean())


def split_stratified(features, labels, test_size=TEST_SIZE, seed=SEED):
    """Split features/labels into train and test parts, stratified on labels.

    Stratifying keeps the class proportions of ``labels`` (almost) identical
    in both parts, which replaces re-rolling unseeded random splits until the
    ratios happen to match.

    Returns:
        (X_train, X_test, y_train, y_test), each with a fresh 0..n-1 index.
    """
    # Imported here so the pandas-only helpers in this cell stay importable
    # (and testable) in environments without scikit-learn.
    from sklearn.model_selection import train_test_split

    parts = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=seed,
        stratify=labels,
    )
    return tuple(part.reset_index(drop=True) for part in parts)


def assemble_export_frame(features, labels, id_col=ID_COL):
    """Return one frame with columns ordered: id column, labels, other features.

    Both inputs get a fresh 0..n-1 index first so they are concatenated
    row-by-row in their current order rather than aligned on stale labels.
    """
    features = features.reset_index(drop=True)
    labels = labels.reset_index(drop=True)
    return pd.concat(
        [features[id_col], labels, features.drop(columns=[id_col])],
        axis=1,
    )


# In a notebook kernel __name__ is "__main__", so this runs when the cell is
# executed; the guard only keeps the file I/O from firing on a plain import.
if __name__ == "__main__":
    raw_data = pd.read_excel(SOURCE_PATH)
    data = raw_data[raw_data[TARGET_COL] != EXCLUDED_OUTCOME]

    X = data.drop(columns=[TARGET_COL])
    y = data[TARGET_COL]

    X_train_full, X_test_reserved, y_train_full, y_test_reserved = split_stratified(X, y)

    ratio_train = positive_ratio(y_train_full)
    ratio_test = positive_ratio(y_test_reserved)

    print("Splited the data into train and test. The test will not be used in the training, but just for test the xgb. ")
    print(f"The training data has {len(X_train_full)} data. The testing data has {len(X_test_reserved)} data. ")
    print(f"Positive ratio: \n\tTrain: {ratio_train:.5f}\n\tTest: {ratio_test:.5f}")

    # NOTE(review): openpyxl produces .xlsx-format workbooks; the ".xls" names
    # are kept unchanged because other code may read these exact paths.
    # Consider renaming both files to ".xlsx" together with their readers.
    data_train = assemble_export_frame(X_train_full, y_train_full)
    data_train.to_excel(TRAIN_PATH, engine='openpyxl', index=False)

    data_test = assemble_export_frame(X_test_reserved, y_test_reserved)
    data_test.to_excel(TEST_PATH, engine='openpyxl', index=False)