Switch to side-by-side view

--- a
+++ b/datasets/tjh/preprocess_new.ipynb
@@ -0,0 +1,381 @@
+{
+ "cells": [
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Preprocess the TJH Dataset"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Import packages"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.model_selection import StratifiedKFold\n",
+    "\n",
+    "from pathlib import Path\n",
+    "\n",
+    "import sys\n",
+    "sys.path.append('../')\n",
+    "from utils.tools import forward_fill_pipeline, normalize_dataframe"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Read data from files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Relative path of the raw TJH spreadsheet.\n",
+    "# NOTE(review): 'prerpocess' looks like a typo but presumably mirrors the file name on disk -- confirm before renaming.\n",
+    "raw_data_path = './raw_data/time_series_375_prerpocess_en.xlsx'\n",
+    "df = pd.read_excel(raw_data_path)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Preprocess Data"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Rename columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Map the raw spreadsheet headers onto the column names used in the rest of this notebook\n",
+    "column_renames = {\n",
+    "    \"PATIENT_ID\": \"PatientID\",\n",
+    "    \"outcome\": \"Outcome\",\n",
+    "    \"gender\": \"Sex\",\n",
+    "    \"age\": \"Age\",\n",
+    "    \"RE_DATE\": \"RecordTime\",\n",
+    "    \"Admission time\": \"AdmissionTime\",\n",
+    "    \"Discharge time\": \"DischargeTime\",\n",
+    "}\n",
+    "df = df.rename(columns=column_renames)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Fill PatientID column"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Forward-fill PatientID so every row carries the most recent non-missing ID above it.\n",
+    "# The result is assigned back instead of calling fillna(..., inplace=True) on the selected column:\n",
+    "# an in-place edit of a column selection may not reach `df` (pandas Copy-on-Write), and\n",
+    "# fillna(method=...) is deprecated in favour of ffill().\n",
+    "df['PatientID'] = df['PatientID'].ffill()"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Format data values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# gender transformation: 1--male, 0--female (the raw code 2 is mapped to 0)\n",
+    "# Assigned back rather than replace(..., inplace=True) on a column selection, which may not update `df`\n",
+    "df['Sex'] = df['Sex'].replace(2, 0)\n",
+    "\n",
+    "# only reserve y-m-d precision for the `RecordTime`, `DischargeTime` and `AdmissionTime` columns\n",
+    "# (strftime turns the datetime values into 'YYYY-MM-DD' strings)\n",
+    "for time_column in ['RecordTime', 'DischargeTime', 'AdmissionTime']:\n",
+    "    df[time_column] = df[time_column].dt.strftime('%Y-%m-%d')"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Exclude patients with missing labels"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# A record is kept only if none of these columns is missing\n",
+    "required_columns = ['PatientID', 'RecordTime', 'DischargeTime']\n",
+    "df = df.dropna(subset=required_columns, how='any')"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Calculate the Length-of-Stay (LOS) label"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# LOS = whole days from the record date to the discharge date\n",
+    "df['LOS'] = (pd.to_datetime(df['DischargeTime']) - pd.to_datetime(df['RecordTime'])).dt.days\n",
+    "\n",
+    "# Notice: Set negative LOS values to 0 (vectorized clip instead of a per-row Python lambda)\n",
+    "df['LOS'] = df['LOS'].clip(lower=0)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Drop columns whose values are all the same or all NaN"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Remove the '2019-nCoV nucleic acid detection' column\n",
+    "df = df.drop('2019-nCoV nucleic acid detection', axis='columns')"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Record feature names"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Identifier and timestamp columns of each record\n",
+    "basic_records = ['PatientID', 'RecordTime', 'AdmissionTime', 'DischargeTime']\n",
+    "# Label columns: Outcome and the LOS value computed earlier in this notebook\n",
+    "target_features = ['Outcome', 'LOS']\n",
+    "# Demographic columns\n",
+    "demographic_features = ['Sex', 'Age']\n",
+    "# Lab test columns. These names are used to index the dataframe columns directly, so the trailing\n",
+    "# spaces inside some names (e.g. 'mean corpuscular hemoglobin ') must be kept exactly as written.\n",
+    "labtest_features = ['Hypersensitive cardiac troponinI', 'hemoglobin', 'Serum chloride', 'Prothrombin time', 'procalcitonin', 'eosinophils(%)', 'Interleukin 2 receptor', 'Alkaline phosphatase', 'albumin', 'basophil(%)', 'Interleukin 10', 'Total bilirubin', 'Platelet count', 'monocytes(%)', 'antithrombin', 'Interleukin 8', 'indirect bilirubin', 'Red blood cell distribution width ', 'neutrophils(%)', 'total protein', 'Quantification of Treponema pallidum antibodies', 'Prothrombin activity', 'HBsAg', 'mean corpuscular volume', 'hematocrit', 'White blood cell count', 'Tumor necrosis factorα', 'mean corpuscular hemoglobin concentration', 'fibrinogen', 'Interleukin 1β', 'Urea', 'lymphocyte count', 'PH value', 'Red blood cell count', 'Eosinophil count', 'Corrected calcium', 'Serum potassium', 'glucose', 'neutrophils count', 'Direct bilirubin', 'Mean platelet volume', 'ferritin', 'RBC distribution width SD', 'Thrombin time', '(%)lymphocyte', 'HCV antibody quantification', 'D-D dimer', 'Total cholesterol', 'aspartate aminotransferase', 'Uric acid', 'HCO3-', 'calcium', 'Amino-terminal brain natriuretic peptide precursor(NT-proBNP)', 'Lactate dehydrogenase', 'platelet large cell ratio ', 'Interleukin 6', 'Fibrin degradation products', 'monocytes count', 'PLT distribution width', 'globulin', 'γ-glutamyl transpeptidase', 'International standard ratio', 'basophil count(#)', 'mean corpuscular hemoglobin ', 'Activation of partial thromboplastin time', 'Hypersensitive c-reactive protein', 'HIV antibody quantification', 'serum sodium', 'thrombocytocrit', 'ESR', 'glutamic-pyruvic transaminase', 'eGFR', 'creatinine']"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Set negative values to NaN"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Negative demographic / lab test values are replaced by NaN; all other columns are left untouched\n",
+    "value_columns = demographic_features + labtest_features\n",
+    "df[value_columns] = df[value_columns].mask(df[value_columns] < 0)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Merge by date"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Collapse all rows that share the same PatientID, RecordTime, AdmissionTime and DischargeTime\n",
+    "# into a single row holding the mean of every remaining column.\n",
+    "# dropna=True discards rows where any of the grouping keys is missing.\n",
+    "merge_keys = ['PatientID', 'RecordTime', 'AdmissionTime', 'DischargeTime']\n",
+    "df = df.groupby(merge_keys, dropna=True, as_index=False).mean()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Change the order of columns"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Identifiers first, then labels, demographics and lab tests; columns not listed here are dropped\n",
+    "ordered_columns = basic_records + target_features + demographic_features + labtest_features\n",
+    "df = df[ordered_columns]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Show the merged and reordered dataframe as a visual check before exporting\n",
+    "df"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Export data to files"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Make sure the output folder exists first: to_csv does not create missing directories\n",
+    "Path(\"./processed_data\").mkdir(parents=True, exist_ok=True)\n",
+    "df.to_csv(\"./processed_data/tjh_data_before_imputation.csv\", index=False)"
+   ]
+  },
+  {
+   "attachments": {},
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Stratified split dataset into train, validation and test sets\n",
+    "\n",
+    "- Also include (Imputation & Normalization & Outlier Filtering) steps\n",
+    "- The train, validation and test sets are saved in the `./processed_data` folder\n",
+    "- For the TJH dataset, use an 8:1:1 split with 10-fold cross-validation; for the CDSL dataset, use an 8:1:1 split with 5 random seed initializations (patient-level split)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "seed = 42\n",
+    "num_folds = 10\n",
+    "\n",
+    "# Columns passed to normalize_dataframe: age, lab test features, and LOS (loop-invariant, so built once)\n",
+    "normalize_features = ['Age'] + labtest_features + ['LOS']\n",
+    "\n",
+    "# One Outcome label per patient. The patient IDs are taken from the index of this Series so that\n",
+    "# `patients` and the stratification labels are aligned position by position.\n",
+    "patient_outcomes = df.groupby('PatientID')['Outcome'].first()\n",
+    "patients = patient_outcomes.index.to_numpy()\n",
+    "\n",
+    "# Split the patient IDs (not the individual records) into train/val/test sets\n",
+    "kf = StratifiedKFold(n_splits=num_folds, shuffle=True, random_state=seed)\n",
+    "\n",
+    "for fold, (train_val_index, test_index) in enumerate(kf.split(patients, patient_outcomes)):\n",
+    "    # Get the train+val / test patient IDs for the current fold\n",
+    "    train_val_patients, test_patients = patients[train_val_index], patients[test_index]\n",
+    "\n",
+    "    # Split the train_val_patients into train/val sets. 1/(num_folds-1) of the remaining patients\n",
+    "    # equals 1/num_folds of all patients, which yields the 8:1:1 ratio for 10 folds.\n",
+    "    train_patients, val_patients = train_test_split(\n",
+    "        train_val_patients,\n",
+    "        test_size=1/(num_folds-1),\n",
+    "        random_state=seed,\n",
+    "        stratify=patient_outcomes.iloc[train_val_index],\n",
+    "    )\n",
+    "\n",
+    "    # Create train, val, and test dataframes for the current fold\n",
+    "    # (.copy() so later processing works on independent frames rather than slices of `df`)\n",
+    "    train_df = df[df['PatientID'].isin(train_patients)].copy()\n",
+    "    val_df = df[df['PatientID'].isin(val_patients)].copy()\n",
+    "    test_df = df[df['PatientID'].isin(test_patients)].copy()\n",
+    "\n",
+    "    # Every record must land in exactly one of the three splits\n",
+    "    assert len(train_df) + len(val_df) + len(test_df) == len(df)\n",
+    "\n",
+    "    # Save the train, val, and test dataframes for the current fold to csv files\n",
+    "    fold_dir = Path(f\"./processed_data/fold_{fold}/\")\n",
+    "    fold_dir.mkdir(parents=True, exist_ok=True)\n",
+    "    train_df.to_csv(fold_dir / \"train_raw.csv\", index=False)\n",
+    "    val_df.to_csv(fold_dir / \"val_raw.csv\", index=False)\n",
+    "    test_df.to_csv(fold_dir / \"test_raw.csv\", index=False)\n",
+    "\n",
+    "    # Normalize the three splits with the helper imported from utils.tools.\n",
+    "    # Original note: the mean and std are calculated on the train set (age, lab test features, and LOS)\n",
+    "    # using the data in the 5% to 95% quantile range -- the helper is not shown here, TODO confirm.\n",
+    "    train_df, val_df, test_df, default_fill, LOS_info = normalize_dataframe(train_df, val_df, test_df, normalize_features)\n",
+    "\n",
+    "    # Notice: no explicit outlier filtering is done in this loop, since some of the `outliers`\n",
+    "    # are actually the real values.\n",
+    "\n",
+    "    # Drop rows if all features are recorded NaN\n",
+    "    train_df = train_df.dropna(axis=0, how='all', subset=normalize_features)\n",
+    "    val_df = val_df.dropna(axis=0, how='all', subset=normalize_features)\n",
+    "    test_df = test_df.dropna(axis=0, how='all', subset=normalize_features)\n",
+    "\n",
+    "    # Save the normalized train, val, and test dataframes for the current fold to csv files\n",
+    "    train_df.to_csv(fold_dir / \"train_after_zscore.csv\", index=False)\n",
+    "    val_df.to_csv(fold_dir / \"val_after_zscore.csv\", index=False)\n",
+    "    test_df.to_csv(fold_dir / \"test_after_zscore.csv\", index=False)\n",
+    "\n",
+    "    # Forward Imputation after grouped by PatientID\n",
+    "    # Notice: if a patient has never done certain lab test, the imputed value will be the median value calculated from train set\n",
+    "    train_x, train_y, train_pid = forward_fill_pipeline(train_df, default_fill, demographic_features, labtest_features)\n",
+    "    val_x, val_y, val_pid = forward_fill_pipeline(val_df, default_fill, demographic_features, labtest_features)\n",
+    "    test_x, test_y, test_pid = forward_fill_pipeline(test_df, default_fill, demographic_features, labtest_features)\n",
+    "\n",
+    "    # Save the imputed dataset to pickle files: <split>_x.pkl, <split>_y.pkl, <split>_pid.pkl\n",
+    "    imputed_splits = {\n",
+    "        'train': (train_x, train_y, train_pid),\n",
+    "        'val': (val_x, val_y, val_pid),\n",
+    "        'test': (test_x, test_y, test_pid),\n",
+    "    }\n",
+    "    for split_name, (split_x, split_y, split_pid) in imputed_splits.items():\n",
+    "        pd.to_pickle(split_x, fold_dir / f\"{split_name}_x.pkl\")\n",
+    "        pd.to_pickle(split_y, fold_dir / f\"{split_name}_y.pkl\")\n",
+    "        pd.to_pickle(split_pid, fold_dir / f\"{split_name}_pid.pkl\")\n",
+    "    pd.to_pickle(LOS_info, fold_dir / \"LOS_info.pkl\")\n"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "pytorch",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.5"
+  },
+  "orig_nbformat": 4
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}