[780764]: / src / preprocess / 05_data_split.ipynb

Download this file

538 lines (537 with data), 11.8 kB

{
 "cells": [
  {
   "cell_type": "code",
   "id": "debdace9",
   "metadata": {},
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "src_path = os.path.abspath(\"../..\")\n",
    "print(src_path)\n",
    "sys.path.append(src_path)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "6bad1e09",
   "metadata": {},
   "source": [
    "from src.utils import create_directory, raw_data_path, processed_data_path, set_seed"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "5d9bc78c",
   "metadata": {},
   "source": [
    "set_seed(seed=42)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "13d22a57",
   "metadata": {},
   "source": [
    "import pandas as pd"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "dd9852d5",
   "metadata": {},
   "source": [
    "mimic_iv_path = os.path.join(raw_data_path, \"physionet.org/files/mimiciv/2.2\")\n",
    "output_path = os.path.join(processed_data_path, \"mimic4\")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "b6a27998",
   "metadata": {},
   "source": [
    "cohort = pd.read_csv(os.path.join(output_path, \"cohort+len.csv\"))\n",
    "print(cohort.shape)\n",
    "cohort.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "9dd92e23",
   "metadata": {},
   "source": [
    "cohort[\"hadm_intime\"] = pd.to_datetime(cohort[\"hadm_intime\"])\n",
    "cohort[\"hadm_outtime\"] = pd.to_datetime(cohort[\"hadm_outtime\"])\n",
    "cohort[\"stay_intime\"] = pd.to_datetime(cohort[\"stay_intime\"])\n",
    "cohort[\"stay_outtime\"] = pd.to_datetime(cohort[\"stay_outtime\"])"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "8f55c793",
   "metadata": {},
   "source": [
    "hadm_ids = set(cohort.hadm_id.unique().tolist())\n",
    "len(hadm_ids)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "26459eda",
   "metadata": {},
   "source": [
    "import ast\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "def safe_literal_eval(s):\n",
    "    \"\"\"Parse a stringified Python literal read from a CSV cell.\n",
    "\n",
    "    Strings are parsed with ``ast.literal_eval``. Values that are already\n",
    "    parsed containers are returned unchanged so that re-running this cell\n",
    "    does not fail. Missing scalars (NaN / None) are returned as ``np.nan``.\n",
    "\n",
    "    Raises whatever ``ast.literal_eval`` raises for malformed input.\n",
    "    \"\"\"\n",
    "    if isinstance(s, str):\n",
    "        return ast.literal_eval(s)\n",
    "    if isinstance(s, (list, tuple, dict, set)):\n",
    "        # Already parsed (e.g. this cell was executed twice). pd.isna() on a\n",
    "        # container is element-wise, so it cannot be used as a scalar test here.\n",
    "        return s\n",
    "    if pd.isna(s):\n",
    "        return np.nan\n",
    "    return ast.literal_eval(s)\n",
    "\n",
    "\n",
    "cohort[\"label_diagnosis\"] = cohort[\"label_diagnosis\"].apply(safe_literal_eval)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "99d2b8c8",
   "metadata": {},
   "source": [
    "qa_note = pd.read_json(os.path.join(output_path, \"qa_note.jsonl\"), lines = True)\n",
    "qa_note"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "741b6ed7",
   "metadata": {},
   "source": [
    "qa_note.hadm_id.nunique()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "7d287db0",
   "metadata": {},
   "source": [
    "qa_event = pd.read_json(os.path.join(output_path, \"qa_event.jsonl\"), lines = True)\n",
    "qa_event"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "dbc2af95",
   "metadata": {},
   "source": [
    "qa_event.hadm_id.nunique()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "b4bd715e",
   "metadata": {},
   "source": [
    "qa_event.event_type.value_counts()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "5a9d60c6",
   "metadata": {},
   "source": [
    "## Helper imports (thread pool, progress bar, pandarallel)"
   ]
  },
  {
   "cell_type": "code",
   "id": "5171bbae",
   "metadata": {},
   "source": [
    "from concurrent.futures import ThreadPoolExecutor\n",
    "from tqdm import tqdm\n",
    "from pandarallel import pandarallel"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "5d6b9ce2",
   "metadata": {},
   "source": [
    "pandarallel.initialize(progress_bar=True)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "941b116b",
   "metadata": {},
   "source": [
    "## Cohort statistics and filtering"
   ]
  },
  {
   "cell_type": "code",
   "id": "503a9cbd",
   "metadata": {},
   "source": [
    "cohort = cohort[cohort.hadm_id.isin(qa_note.hadm_id.unique())]\n",
    "len(cohort)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "b4e602ba",
   "metadata": {},
   "source": [
    "cohort = cohort[cohort.hadm_id.isin(qa_event.hadm_id.unique())]\n",
    "len(cohort)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "5014c99f",
   "metadata": {
    "scrolled": false
   },
   "source": [
    "cohort.hadm_los.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "11f5935f",
   "metadata": {},
   "source": [
    "548.490833 / 24"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "87a1b947",
   "metadata": {},
   "source": [
    "cohort.stay_los.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "f774a307",
   "metadata": {},
   "source": [
    "265.649889 / 24"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "99d6a2f0",
   "metadata": {},
   "source": [
    "cohort.len_selected.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "622669a8",
   "metadata": {},
   "source": [
    "# Upper bound on len_selected for admissions kept in the split.\n",
    "# NOTE(review): the value looks copied from the len_selected percentile table\n",
    "# printed above - confirm which percentile it corresponds to.\n",
    "MAX_LEN_SELECTED = 1256.65\n",
    "\n",
    "cohort_filtered = cohort[cohort.len_selected <= MAX_LEN_SELECTED]\n",
    "cohort_filtered"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "75ddbce8",
   "metadata": {},
   "source": [
    "cohort_filtered.hadm_los.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "aaed6fb7",
   "metadata": {},
   "source": [
    "cohort_filtered.stay_los.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "e2cca302",
   "metadata": {},
   "source": [
    "cohort_filtered.len_selected.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "bc283fd9",
   "metadata": {},
   "source": [
    "all_patients = cohort_filtered.subject_id.unique()\n",
    "len(all_patients)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "18aa0954",
   "metadata": {},
   "source": [
    "from sklearn.model_selection import train_test_split\n",
    "\n",
    "\n",
    "# Patient-level split: hold out 10% of patients for test, then take the\n",
    "# validation set out of the remaining 90% (0.111 * 0.9 is ~10% of all patients).\n",
    "train_val_patients, test_patients = train_test_split(all_patients, test_size=0.1, random_state=42)\n",
    "# The second split must start from train_val_patients, not all_patients,\n",
    "# otherwise held-out test patients are drawn back into train/val.\n",
    "train_patients, val_patients = train_test_split(train_val_patients, test_size=0.111, random_state=42)\n",
    "\n",
    "# Guard against patient leakage between the three partitions.\n",
    "assert set(train_patients).isdisjoint(val_patients)\n",
    "assert set(train_patients).isdisjoint(test_patients)\n",
    "assert set(val_patients).isdisjoint(test_patients)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "d5d0a3ac",
   "metadata": {},
   "source": [
    "print(train_patients.shape)\n",
    "print(val_patients.shape)\n",
    "print(test_patients.shape)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "57dc6d8f",
   "metadata": {},
   "source": [
    "train = cohort_filtered[cohort_filtered.subject_id.isin(train_patients)].reset_index(drop=True)\n",
    "val = cohort_filtered[cohort_filtered.subject_id.isin(val_patients)].reset_index(drop=True)\n",
    "test = cohort_filtered[cohort_filtered.subject_id.isin(test_patients)].reset_index(drop=True)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "860d6a39",
   "metadata": {},
   "source": [
    "print(train.shape)\n",
    "print(val.shape)\n",
    "print(test.shape)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "ce3c7f18",
   "metadata": {},
   "source": [
    "train.to_csv(os.path.join(output_path, \"cohort_train.csv\"), index=False)\n",
    "val.to_csv(os.path.join(output_path, \"cohort_val.csv\"), index=False)\n",
    "test.to_csv(os.path.join(output_path, \"cohort_test.csv\"), index=False)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "7b0ea788",
   "metadata": {},
   "source": [
    "_, test_subset_patients = train_test_split(test_patients, test_size=100, random_state=42)\n",
    "len(test_subset_patients)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "5ba82c66",
   "metadata": {},
   "source": [
    "# Keep one randomly chosen admission per sampled test patient.\n",
    "# GroupBy.sample does not rely on groupby.apply handing the grouping column to\n",
    "# the callback (deprecated in pandas 2.2), and the explicit seed makes the draw\n",
    "# independent of the global RNG state, matching the seeded splits above.\n",
    "test_subset = (\n",
    "    test[test.subject_id.isin(test_subset_patients)]\n",
    "    .groupby(\"subject_id\")\n",
    "    .sample(n=1, random_state=42)\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "test_subset"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "metadata": {},
   "cell_type": "code",
   "source": "test_subset.len_selected.describe()",
   "id": "0bd73438",
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "a58143a2",
   "metadata": {},
   "source": [
    "print(test_subset.subject_id.nunique())\n",
    "print(test_subset.hadm_id.nunique())"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "745f82b5",
   "metadata": {},
   "source": [
    "test_subset.to_csv(os.path.join(output_path, \"cohort_test_subset.csv\"), index=False)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "7ba15669",
   "metadata": {},
   "source": [
    "qa_note_test_subset = qa_note[qa_note.hadm_id.isin(test_subset.hadm_id.unique())]\n",
    "qa_note_test_subset"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "040ce4fb",
   "metadata": {},
   "source": [
    "# Keep one randomly chosen event question per admission in the test subset.\n",
    "# GroupBy.sample does not rely on groupby.apply handing the grouping column to\n",
    "# the callback (deprecated in pandas 2.2), and the explicit seed makes the draw\n",
    "# independent of the global RNG state.\n",
    "qa_event_test_subset = (\n",
    "    qa_event[qa_event.hadm_id.isin(test_subset.hadm_id.unique())]\n",
    "    .groupby(\"hadm_id\")\n",
    "    .sample(n=1, random_state=42)\n",
    "    .reset_index(drop=True)\n",
    ")\n",
    "qa_event_test_subset"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "5455dbca",
   "metadata": {},
   "source": [
    "qa_event_test_subset.hadm_id.nunique()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "4c78aa45",
   "metadata": {},
   "source": [
    "qa_event_test_subset.event_type.value_counts()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "9942e684",
   "metadata": {},
   "source": [
    "qa_test_subset = pd.concat([qa_event_test_subset, qa_note_test_subset]).reset_index(drop=True)\n",
    "qa_test_subset"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "8366ae94",
   "metadata": {},
   "source": [
    "qa_test_subset.to_csv(os.path.join(output_path, \"qa_test_subset.csv\"), index=False)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "410a34c3",
   "metadata": {},
   "source": [],
   "outputs": [],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pytorch20",
   "language": "python",
   "name": "pytorch20"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}