[780764]: / src / preprocess / 01_cohort_selection.ipynb

Download this file

576 lines (575 with data), 12.5 kB

{
 "cells": [
  {
   "cell_type": "code",
   "id": "a2ccf6ab",
   "metadata": {},
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "src_path = os.path.abspath('../..')\n",
    "print(src_path)\n",
    "sys.path.append(src_path)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "3eb75526",
   "metadata": {},
   "source": [
    "from src.utils import create_directory, dump_pickle, raw_data_path, processed_data_path, set_seed"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "17fef7f7",
   "metadata": {},
   "source": [
    "set_seed(seed=42)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "51a59258",
   "metadata": {},
   "source": [
    "import pandas as pd"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "27ed0a87",
   "metadata": {},
   "source": [
    "mimic_iv_path = os.path.join(raw_data_path, \"physionet.org/files/mimiciv/2.2\")\n",
    "mimic_iv_note_path = os.path.join(raw_data_path, \"physionet.org/files/mimic-iv-note/2.2\")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "eebf095b",
   "metadata": {},
   "source": [
    "!ls {mimic_iv_path}"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "d1949918",
   "metadata": {},
   "source": [
    "!ls {mimic_iv_note_path}"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "4eef9634",
   "metadata": {},
   "source": [
    "patients = pd.read_csv(os.path.join(mimic_iv_path, \"hosp/patients.csv.gz\"))\n",
    "print(patients.shape)\n",
    "patients.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "8f740475",
   "metadata": {},
   "source": [
    "admissions = pd.read_csv(os.path.join(mimic_iv_path, \"hosp/admissions.csv.gz\"))\n",
    "print(admissions.shape)\n",
    "admissions.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "7314ceda",
   "metadata": {},
   "source": [
    "icustays = pd.read_csv(os.path.join(mimic_iv_path, \"icu/icustays.csv.gz\"))\n",
    "print(icustays.shape)\n",
    "icustays.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "3b9416e5",
   "metadata": {},
   "source": [
    "discharge = pd.read_csv(os.path.join(mimic_iv_note_path, \"note/discharge.csv.gz\"))\n",
    "print(discharge.shape)\n",
    "discharge.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "b6d66270",
   "metadata": {},
   "source": [
    "print(patients.subject_id.nunique())\n",
    "print(admissions.subject_id.nunique())\n",
    "print(icustays.subject_id.nunique())\n",
    "print(discharge.subject_id.nunique())"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "446ad1aa",
   "metadata": {},
   "source": [
    "print(admissions.hadm_id.nunique())\n",
    "print(icustays.hadm_id.nunique())\n",
    "print(discharge.hadm_id.nunique())"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "9ef85af8",
   "metadata": {},
   "source": [
    "print(icustays.stay_id.nunique())"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "3df1d4d6",
   "metadata": {},
   "source": [
    "admissions_hadm_ids = set(admissions.hadm_id.tolist())\n",
    "icustays_hadm_ids = set(icustays.hadm_id.tolist())\n",
    "discharge_hadm_ids = set(discharge.hadm_id.tolist())"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "0e3c3c84",
   "metadata": {},
   "source": [
    "from matplotlib_venn import venn3\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "\n",
    "# Visualise how hadm_id values overlap across the admissions, icustays and discharge tables.\n",
    "hadm_id_sets = [admissions_hadm_ids, icustays_hadm_ids, discharge_hadm_ids]\n",
    "plt.figure(figsize=(8, 8))\n",
    "venn3(hadm_id_sets, set_labels=('Hospital', 'ICU', 'Discharge'))\n",
    "plt.show()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "a26f90ee",
   "metadata": {},
   "source": [
    "starting cohort"
   ]
  },
  {
   "cell_type": "code",
   "id": "500460a5",
   "metadata": {},
   "source": [
    "# Print the number of distinct subject_id, hadm_id and stay_id values in the starting cohort.\n",
    "for id_col in (\"subject_id\", \"hadm_id\", \"stay_id\"):\n",
    "    print(icustays[id_col].nunique())"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "c0d8ef77",
   "metadata": {},
   "source": [
    "remove ICU stays without a matching hospital admission"
   ]
  },
  {
   "cell_type": "code",
   "id": "fc6d90ed",
   "metadata": {},
   "source": [
    "icustays = icustays[icustays.hadm_id.isin(admissions_hadm_ids)]"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "19efd63f",
   "metadata": {},
   "source": [
    "# Print the number of distinct subject_id, hadm_id and stay_id values left after the filter above.\n",
    "for id_col in (\"subject_id\", \"hadm_id\", \"stay_id\"):\n",
    "    print(icustays[id_col].nunique())"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "e9892dc6",
   "metadata": {},
   "source": [
    "remove ICU stays whose hospital admission has no discharge note"
   ]
  },
  {
   "cell_type": "code",
   "id": "ed4347a1",
   "metadata": {},
   "source": [
    "icustays = icustays[icustays.hadm_id.isin(discharge_hadm_ids)]"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "55c8af1c",
   "metadata": {},
   "source": [
    "# Print the number of distinct subject_id, hadm_id and stay_id values left after the filter above.\n",
    "for id_col in (\"subject_id\", \"hadm_id\", \"stay_id\"):\n",
    "    print(icustays[id_col].nunique())"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "9f6c9db1",
   "metadata": {},
   "source": [
    "remove admissions with more than one ICU stay (keep only admissions with exactly one ICU stay)"
   ]
  },
  {
   "cell_type": "code",
   "id": "05c2f25c",
   "metadata": {},
   "source": [
    "# Count the distinct stay_id values recorded under each hadm_id.\n",
    "stays_per_hadm = icustays.groupby(\"hadm_id\")[\"stay_id\"].nunique()\n",
    "# Keep only the hadm_id values that have exactly one stay_id, as a one-column frame for merging.\n",
    "to_keep = stays_per_hadm[stays_per_hadm == 1].index.to_frame(index=False)\n",
    "icustays = icustays.merge(to_keep, on=\"hadm_id\", how=\"inner\")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "1c990f33",
   "metadata": {},
   "source": [
    "# Print the number of distinct subject_id, hadm_id and stay_id values left after the filter above.\n",
    "for id_col in (\"subject_id\", \"hadm_id\", \"stay_id\"):\n",
    "    print(icustays[id_col].nunique())"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "0199c558",
   "metadata": {},
   "source": [
    "remove patients younger than 18 years old"
   ]
  },
  {
   "cell_type": "code",
   "id": "934d050a",
   "metadata": {},
   "source": [
    "# Parse intime as datetimes so its calendar year can be extracted below.\n",
    "icustays[\"intime\"] = pd.to_datetime(icustays[\"intime\"])\n",
    "# Attach each subject's anchor_age / anchor_year from the patients table.\n",
    "icustays = icustays.merge(\n",
    "    patients[[\"subject_id\", \"anchor_age\", \"anchor_year\"]],\n",
    "    how=\"inner\",\n",
    "    on=\"subject_id\",\n",
    ")\n",
    "# age = anchor_age shifted by the number of years between anchor_year and the year of intime.\n",
    "years_since_anchor = icustays[\"intime\"].dt.year - icustays[\"anchor_year\"]\n",
    "icustays[\"age\"] = years_since_anchor + icustays[\"anchor_age\"]\n",
    "icustays.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "a658c503",
   "metadata": {},
   "source": [
    "# Apply the under-18 exclusion explicitly; previously this step only displayed the\n",
    "# minimum age, so no rows were ever removed by it.\n",
    "icustays = icustays[icustays.age >= 18]\n",
    "# Sanity check: the minimum remaining age should now be at least 18.\n",
    "icustays.age.min()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "acd6bbdc",
   "metadata": {},
   "source": [
    "remove stays with a negative ICU or hospital length of stay (LOS)"
   ]
  },
  {
   "cell_type": "code",
   "id": "16829161",
   "metadata": {
    "scrolled": true
   },
   "source": [
    "icustays = icustays.merge(admissions[[\"hadm_id\", \"admittime\", \"dischtime\"]], on=\"hadm_id\", how=\"inner\")\n",
    "icustays = icustays.rename(columns={\n",
    "    \"admittime\": \"hadm_intime\",\n",
    "    \"dischtime\": \"hadm_outtime\",\n",
    "    \"intime\": \"stay_intime\",\n",
    "    \"outtime\": \"stay_outtime\",\n",
    "})\n",
    "icustays.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "59f29be8",
   "metadata": {},
   "source": [
    "# Convert all four admission / ICU-stay timestamp columns to datetimes.\n",
    "for time_col in (\"hadm_intime\", \"hadm_outtime\", \"stay_intime\", \"stay_outtime\"):\n",
    "    icustays[time_col] = pd.to_datetime(icustays[time_col])"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "f033b949",
   "metadata": {},
   "source": [
    "# Length of stay in hours for the ICU stay and for the hospital admission.\n",
    "for los_col, start_col, end_col in (\n",
    "    (\"stay_los\", \"stay_intime\", \"stay_outtime\"),\n",
    "    (\"hadm_los\", \"hadm_intime\", \"hadm_outtime\"),\n",
    "):\n",
    "    elapsed = icustays[end_col] - icustays[start_col]\n",
    "    icustays[los_col] = elapsed.dt.total_seconds() / 3600  # seconds -> hours"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "f347f40e",
   "metadata": {},
   "source": [
    "icustays.stay_los.min()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "0f41dd40",
   "metadata": {},
   "source": [
    "icustays.hadm_los.min()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "36aae19f",
   "metadata": {},
   "source": [
    "# Keep only rows whose stay_los and hadm_los are both non-negative.\n",
    "non_negative_los = (icustays[\"stay_los\"] >= 0) & (icustays[\"hadm_los\"] >= 0)\n",
    "icustays = icustays[non_negative_los]"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "452937c5",
   "metadata": {},
   "source": [
    "# Print the number of distinct subject_id, hadm_id and stay_id values left after the filter above.\n",
    "for id_col in (\"subject_id\", \"hadm_id\", \"stay_id\"):\n",
    "    print(icustays[id_col].nunique())"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "078c5947",
   "metadata": {},
   "source": [
    "statistics"
   ]
  },
  {
   "cell_type": "code",
   "id": "d1c4349d",
   "metadata": {},
   "source": [
    "icustays.groupby(\"subject_id\").hadm_id.nunique().value_counts()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "e9640ccb",
   "metadata": {},
   "source": [
    "icustays.groupby(\"subject_id\").hadm_id.nunique().describe()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "b1d67bba",
   "metadata": {},
   "source": [
    "icustays.groupby(\"hadm_id\").stay_id.nunique().value_counts()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "bc501162",
   "metadata": {},
   "source": [
    "icustays.groupby(\"hadm_id\").stay_id.nunique().describe()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "6242836b",
   "metadata": {},
   "source": [
    "icustays.stay_los.describe()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "4f21b790",
   "metadata": {},
   "source": [
    "icustays.hadm_los.describe()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "2870464f",
   "metadata": {},
   "source": [
    "save"
   ]
  },
  {
   "cell_type": "code",
   "id": "7692519c",
   "metadata": {},
   "source": [
    "icustays.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "af029ce5",
   "metadata": {},
   "source": [
    "icustays = icustays[[\"subject_id\", \"hadm_id\", \"stay_id\", \"hadm_intime\", \"hadm_outtime\", \"hadm_los\", \"stay_intime\", \"stay_outtime\", \"stay_los\"]]\n",
    "icustays.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "3122776d",
   "metadata": {},
   "source": [
    "output_path = os.path.join(processed_data_path, \"mimic4\")\n",
    "create_directory(output_path)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "3ca8bb7d",
   "metadata": {},
   "source": [
    "icustays.to_csv(os.path.join(output_path, 'cohort.csv'), index=False)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "806c6cc7",
   "metadata": {},
   "source": [],
   "outputs": [],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pytorch20",
   "language": "python",
   "name": "pytorch20"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}