[780764]: / src / preprocess / 02_event_icu_inputevents.ipynb

Download this file

426 lines (425 with data), 9.2 kB

{
 "cells": [
  {
   "cell_type": "code",
   "id": "debdace9",
   "metadata": {},
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "src_path = os.path.abspath(\"../..\")\n",
    "print(src_path)\n",
    "sys.path.append(src_path)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "6bad1e09",
   "metadata": {},
   "source": [
    "from src.utils import create_directory, raw_data_path, processed_data_path, set_seed"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "5d9bc78c",
   "metadata": {},
   "source": [
    "set_seed(seed=42)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "13d22a57",
   "metadata": {},
   "source": [
    "import pandas as pd"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "dd9852d5",
   "metadata": {},
   "source": [
    "mimic_iv_path = os.path.join(raw_data_path, \"physionet.org/files/mimiciv/2.2\")\n",
    "output_path = os.path.join(processed_data_path, \"mimic4\")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "b6a27998",
   "metadata": {},
   "source": [
    "cohort = pd.read_csv(os.path.join(output_path, \"cohort.csv\"))\n",
    "print(cohort.shape)\n",
    "cohort.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "9dd92e23",
   "metadata": {},
   "source": [
    "cohort[\"hadm_intime\"] = pd.to_datetime(cohort[\"hadm_intime\"])\n",
    "cohort[\"hadm_outtime\"] = pd.to_datetime(cohort[\"hadm_outtime\"])\n",
    "cohort[\"stay_intime\"] = pd.to_datetime(cohort[\"stay_intime\"])\n",
    "cohort[\"stay_outtime\"] = pd.to_datetime(cohort[\"stay_outtime\"])"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "8f55c793",
   "metadata": {},
   "source": [
    "hadm_ids = set(cohort.hadm_id.unique().tolist())\n",
    "len(hadm_ids)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "5a9d60c6",
   "metadata": {},
   "source": [
    "helper"
   ]
  },
  {
   "cell_type": "code",
   "id": "5171bbae",
   "metadata": {},
   "source": [
    "from concurrent.futures import ThreadPoolExecutor\n",
    "from tqdm import tqdm\n",
    "from pandarallel import pandarallel"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "5d6b9ce2",
   "metadata": {},
   "source": [
    "pandarallel.initialize(progress_bar=True)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "ab5525be",
   "metadata": {},
   "source": [
    "def save_group(group_df, hadm_id, event_type):\n",
    "    file_path = f\"{output_path}/event_{event_type}/event_{int(hadm_id)}.csv\"\n",
    "    group_df.to_csv(file_path, index=False)\n",
    "    return True"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "b5baa08b",
   "metadata": {},
   "source": [
    "dict"
   ]
  },
  {
   "cell_type": "code",
   "id": "13036c54",
   "metadata": {},
   "source": [
    "d_items = pd.read_csv(os.path.join(mimic_iv_path, \"icu/d_items.csv.gz\"))\n",
    "print(d_items.shape)\n",
    "d_items.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "35ad1847",
   "metadata": {},
   "source": [
    "d_labitems = pd.read_csv(os.path.join(mimic_iv_path, \"hosp/d_labitems.csv.gz\"))\n",
    "print(d_labitems.shape)\n",
    "d_labitems.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "e77f628d",
   "metadata": {},
   "source": [
    "## inputevents"
   ]
  },
  {
   "cell_type": "code",
   "id": "6c616069",
   "metadata": {},
   "source": [
    "event_type = \"inputevents\""
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "7ae555e1",
   "metadata": {},
   "source": [
    "!rm -r {output_path}/event_{event_type}"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "05d1f0e1",
   "metadata": {},
   "source": [
    "create_directory(f\"{output_path}/event_{event_type}\")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "7b2f1ecf",
   "metadata": {},
   "source": [
    "df_raw = pd.read_csv(os.path.join(mimic_iv_path, f\"icu/{event_type}.csv.gz\"))\n",
    "print(df_raw.shape)\n",
    "df_raw.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "5412327d",
   "metadata": {},
   "source": [
    "df = df_raw.merge(cohort[[\"hadm_id\", \"stay_id\", \"hadm_intime\"]], on=[\"hadm_id\", \"stay_id\"], how=\"inner\")\n",
    "print(df.shape)\n",
    "df.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "5faeca51",
   "metadata": {},
   "source": [
    "df[\"starttime\"] = pd.to_datetime(df[\"starttime\"])\n",
    "df[\"endtime\"] = pd.to_datetime(df[\"endtime\"])\n",
    "df[\"timestamp\"] = (df.starttime - df.hadm_intime).dt.total_seconds() / 3600\n",
    "df[\"duration\"] = (df.endtime - df.starttime).dt.total_seconds() / 3600\n",
    "df.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "856a830a",
   "metadata": {},
   "source": [
    "df = df.sort_values([\"subject_id\", \"hadm_id\", \"timestamp\"], ascending=True)\n",
    "df.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "d6347877",
   "metadata": {},
   "source": [
    "df = df.merge(d_items[[\"itemid\", \"label\"]], on=\"itemid\", how=\"left\")\n",
    "df.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "11441cef",
   "metadata": {},
   "source": [
    "df = df[df.duration > 0]\n",
    "print(df.shape)\n",
    "df.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "c858223c",
   "metadata": {},
   "source": [
    "df.isna().sum()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "20f9d1bc",
   "metadata": {},
   "source": [
    "def generate_event_value(x):\n",
    "    s = f\"{x.label} {x.amount:.2f} {x.amountuom} for {x.duration:.2f} hour\"\n",
    "    return s"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "21bcf6bf",
   "metadata": {},
   "source": [
    "meta_cols = [\"label\", \"amount\", \"amountuom\", \"duration\"]\n",
    "for c in meta_cols:\n",
    "    df[\"meta_\" + c] = df[c]\n",
    "meta_cols = [\"meta_\" + c for c in meta_cols]"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "354e1327",
   "metadata": {},
   "source": [
    "df[\"timestamp_avail\"] = df[\"timestamp\"]"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "0aba1577",
   "metadata": {},
   "source": [
    "generate_event_value(df.iloc[4444])"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "1ad10334",
   "metadata": {},
   "source": [
    "df[\"event_type\"] = event_type\n",
    "df[\"event_value\"] = df.parallel_apply(generate_event_value, axis=1)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "62024747",
   "metadata": {},
   "source": [
    "df[df.hadm_id == 29079034]"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "ea230f67",
   "metadata": {},
   "source": [
    "df.groupby(\"hadm_id\").timestamp.count().describe()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "7b798191",
   "metadata": {},
   "source": [
    "groups = df.groupby(\"hadm_id\")\n",
    "    \n",
    "with ThreadPoolExecutor(max_workers=4) as executor:\n",
    "    for hadm_id, group_df in tqdm(groups, total=groups.ngroups):\n",
    "        future = executor.submit(\n",
    "            save_group, \n",
    "            group_df[[\"hadm_id\", \"event_type\", \"timestamp\", \"event_value\", \"timestamp_avail\"] + meta_cols], \n",
    "            hadm_id, \n",
    "            event_type\n",
    "        )"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "6d69ca68",
   "metadata": {},
   "source": [
    "!ls -1 {output_path}/event_{event_type} | wc -l"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "6f596497",
   "metadata": {},
   "source": [],
   "outputs": [],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pytorch20",
   "language": "python",
   "name": "pytorch20"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}