[e6e569]: / split_data.ipynb

Download this file

84 lines (83 with data), 2.9 kB

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Splited the data into train and test. The test will not be used in the training, but just for test the xgb. \n",
      "The training data has 316 data. The testing data has 79 data. \n",
      "Positive ratio: \n",
      "\tTrain: 0.21203\n",
      "\tTest: 0.21519\n"
     ]
    }
   ],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "data = pd.read_excel(\"TrainDataset2024.xls\")\n",
    "\n",
    "data = data[data[\"pCR (outcome)\"] != 999]\n",
    "\n",
    "X = data.drop(columns=[\"pCR (outcome)\"], axis=1)\n",
    "y = data[\"pCR (outcome)\"]\n",
    "\n",
    "while True:  \n",
    "    X_train_full, X_test_reserved, y_train_full, y_test_reserved = train_test_split(X, y, test_size=0.2, random_state=None)\n",
    "\n",
    "    X_train_full.reset_index(drop=True, inplace=True)\n",
    "    X_test_reserved.reset_index(drop=True, inplace=True)\n",
    "    y_train_full.reset_index(drop=True, inplace=True)\n",
    "    y_test_reserved.reset_index(drop=True, inplace=True)\n",
    "\n",
    "    ratio_train = sum(y_train_full[y_train_full==1]) / len(y_train_full)\n",
    "    ratio_test = sum(y_test_reserved[y_test_reserved==1]) / len(y_test_reserved)\n",
    "\n",
    "    if abs(ratio_train - ratio_test) < 0.01:\n",
    "        break\n",
    "\n",
    "print(\"Splited the data into train and test. The test will not be used in the training, but just for test the xgb. \")\n",
    "print(f\"The training data has {len(X_train_full)} data. The testing data has {len(X_test_reserved)} data. \")\n",
    "print(f\"Positive ratio: \\n\\tTrain: {ratio_train:.5f}\\n\\tTest: {ratio_test:.5f}\")\n",
    "\n",
    "id = X_train_full[\"ID\"]\n",
    "X_train_full = X_train_full.drop(columns=[\"ID\"])\n",
    "data_train = pd.concat([id, y_train_full, X_train_full], axis=1)\n",
    "data_train.to_excel(\"train_data.xls\", engine='openpyxl', index=False)\n",
    "\n",
    "id = X_test_reserved[\"ID\"]\n",
    "X_test_reserved = X_test_reserved.drop(columns=[\"ID\"])\n",
    "data_test = pd.concat([id, y_test_reserved, X_test_reserved], axis=1)\n",
    "data_test.to_excel(\"test_data.xls\", engine='openpyxl', index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "MLEAsm",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.15"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}