Switch to side-by-side view

--- a
+++ b/src/preprocess/03_merge_events.ipynb
@@ -0,0 +1,366 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "id": "debdace9",
+   "metadata": {},
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "\n",
+    "# Make the repository root importable so `src.*` modules resolve.\n",
+    "src_path = os.path.abspath(\"../..\")\n",
+    "print(src_path)\n",
+    "# Guard the append so re-running this cell does not add duplicate entries.\n",
+    "if src_path not in sys.path:\n",
+    "    sys.path.append(src_path)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "6bad1e09",
+   "metadata": {},
+   "source": [
+    "from src.utils import create_directory, raw_data_path, processed_data_path, set_seed"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "5d9bc78c",
+   "metadata": {},
+   "source": [
+    "set_seed(seed=42)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "13d22a57",
+   "metadata": {},
+   "source": [
+    "import pandas as pd"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "dd9852d5",
+   "metadata": {},
+   "source": [
+    "mimic_iv_path = os.path.join(raw_data_path, \"physionet.org/files/mimiciv/2.2\")\n",
+    "output_path = os.path.join(processed_data_path, \"mimic4\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "b6a27998",
+   "metadata": {},
+   "source": [
+    "cohort = pd.read_csv(os.path.join(output_path, \"cohort.csv\"))\n",
+    "print(cohort.shape)\n",
+    "cohort.head()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "9dd92e23",
+   "metadata": {},
+   "source": [
+    "# Parse the four timestamp columns of the cohort table into datetimes.\n",
+    "for time_col in (\"hadm_intime\", \"hadm_outtime\", \"stay_intime\", \"stay_outtime\"):\n",
+    "    cohort[time_col] = pd.to_datetime(cohort[time_col])"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "8f55c793",
+   "metadata": {},
+   "source": [
+    "# Distinct admission ids present in the cohort table.\n",
+    "hadm_ids = set(cohort[\"hadm_id\"].unique().tolist())\n",
+    "len(hadm_ids)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "d03d447c",
+   "metadata": {},
+   "source": [
+    "import ast\n",
+    "import numpy as np\n",
+    "\n",
+    "\n",
+    "def safe_literal_eval(s):\n",
+    "    \"\"\"Parse a stringified Python literal, tolerating missing or already-parsed values.\n",
+    "\n",
+    "    Args:\n",
+    "        s: A string holding a Python literal, a missing value, or an object\n",
+    "            that has already been parsed.\n",
+    "\n",
+    "    Returns:\n",
+    "        ``ast.literal_eval(s)`` for strings, ``np.nan`` for missing scalars,\n",
+    "        and ``s`` unchanged for any other object.\n",
+    "    \"\"\"\n",
+    "    # Only strings need parsing. Passing other objects through keeps this cell\n",
+    "    # re-runnable: on a second run the column already holds parsed objects, and\n",
+    "    # `pd.isna` on a list returns an array whose truth value is ambiguous.\n",
+    "    if isinstance(s, str):\n",
+    "        return ast.literal_eval(s)\n",
+    "    if pd.api.types.is_scalar(s) and pd.isna(s):\n",
+    "        return np.nan\n",
+    "    return s\n",
+    "\n",
+    "\n",
+    "cohort[\"label_diagnosis\"] = cohort[\"label_diagnosis\"].apply(safe_literal_eval)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5a9d60c6",
+   "metadata": {},
+   "source": [
+    "## Helpers: parallel execution and progress reporting"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "5171bbae",
+   "metadata": {},
+   "source": [
+    "from concurrent.futures import ThreadPoolExecutor\n",
+    "from tqdm import tqdm\n",
+    "from pandarallel import pandarallel"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "5d6b9ce2",
+   "metadata": {},
+   "source": [
+    "pandarallel.initialize(progress_bar=True)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e77f628d",
+   "metadata": {},
+   "source": [
+    "## Merge the selected event files for each admission"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "f86a3633",
+   "metadata": {},
+   "source": [
+    "# Event tables whose per-admission files are merged, in this order.\n",
+    "events_selected = [\n",
+    "    \"labevents\",\n",
+    "    \"microbiologyevents\",\n",
+    "    \"prescriptions\",\n",
+    "    \"transfers\",\n",
+    "    \"procedureevents\",\n",
+    "]"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "7ae555e1",
+   "metadata": {},
+   "source": [
+    "def merge_and_save(events, hadm_id, folder_name):\n",
+    "    \"\"\"Merge one admission's per-event CSV files into a single CSV.\n",
+    "\n",
+    "    Reads ``event_<event>/event_<hadm_id>.csv`` under the notebook-level\n",
+    "    ``output_path`` for each name in ``events`` (missing files are skipped),\n",
+    "    sorts the combined rows by ``timestamp``, prepends the rows from\n",
+    "    ``event_patient_demographics`` and ``event_admission_info`` for the same\n",
+    "    admission, and writes the result to ``<folder_name>/event_<hadm_id>.csv``.\n",
+    "\n",
+    "    Args:\n",
+    "        events: Iterable of event-table names to merge.\n",
+    "        hadm_id: Admission id used to locate the input and output files.\n",
+    "        folder_name: Output sub-directory of ``output_path``.\n",
+    "\n",
+    "    Returns:\n",
+    "        True once the merged file has been written.\n",
+    "\n",
+    "    Raises:\n",
+    "        ValueError: If none of the requested event files exist for ``hadm_id``.\n",
+    "        FileNotFoundError: If the demographics or admission-info file is missing.\n",
+    "    \"\"\"\n",
+    "    columns = [\"hadm_id\", \"event_type\", \"timestamp\", \"event_value\", \"timestamp_avail\"]\n",
+    "\n",
+    "    frames = []\n",
+    "    for event in events:\n",
+    "        event_file = os.path.join(output_path, f\"event_{event}/event_{hadm_id}.csv\")\n",
+    "        try:\n",
+    "            frames.append(pd.read_csv(event_file, usecols=columns))\n",
+    "        except FileNotFoundError:\n",
+    "            # A missing file for one event type is tolerated; skip it.\n",
+    "            continue\n",
+    "\n",
+    "    # Explicit exception instead of `assert`, which is stripped under `python -O`.\n",
+    "    if not frames:\n",
+    "        raise ValueError(f\"no event files found for hadm_id {hadm_id}\")\n",
+    "\n",
+    "    df = pd.concat(frames)\n",
+    "    df[\"hadm_id\"] = df[\"hadm_id\"].astype(int)\n",
+    "    # Stable sort: rows sharing a timestamp keep the order given by `events`.\n",
+    "    df = df.sort_values(by=\"timestamp\", ascending=True, kind=\"mergesort\")\n",
+    "\n",
+    "    # Demographics and admission info go first and are not part of the sort.\n",
+    "    demographics = pd.read_csv(os.path.join(output_path, f\"event_patient_demographics/event_{hadm_id}.csv\"))\n",
+    "    admission_info = pd.read_csv(os.path.join(output_path, f\"event_admission_info/event_{hadm_id}.csv\"))\n",
+    "    df = pd.concat([demographics, admission_info, df])\n",
+    "\n",
+    "    df = df[columns]\n",
+    "\n",
+    "    file_path = os.path.join(output_path, f\"{folder_name}/event_{hadm_id}.csv\")\n",
+    "    df.to_csv(file_path, index=False)\n",
+    "\n",
+    "    return True"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "98067450",
+   "metadata": {},
+   "source": [
+    "import shutil\n",
+    "\n",
+    "# Start from a clean output folder. The existence check makes a first run\n",
+    "# (folder not created yet) a silent no-op instead of an `rm` error message.\n",
+    "event_selected_dir = os.path.join(output_path, \"event_selected\")\n",
+    "if os.path.isdir(event_selected_dir):\n",
+    "    shutil.rmtree(event_selected_dir)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "81096fa5",
+   "metadata": {},
+   "source": [
+    "create_directory(f\"{output_path}/event_selected\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "2858ec13",
+   "metadata": {},
+   "source": [
+    "from concurrent.futures import as_completed\n",
+    "\n",
+    "with ThreadPoolExecutor(max_workers=4) as executor:\n",
+    "    # Map each future back to its admission id so failures can be reported.\n",
+    "    future_to_hadm_id = {\n",
+    "        executor.submit(merge_and_save, events_selected, hadm_id, \"event_selected\"): hadm_id\n",
+    "        for hadm_id in hadm_ids\n",
+    "    }\n",
+    "    merge_failures = {}\n",
+    "    # Iterate in completion order so the bar tracks finished merges, not submissions.\n",
+    "    for future in tqdm(as_completed(future_to_hadm_id), total=len(future_to_hadm_id)):\n",
+    "        # Exceptions raised inside a worker are stored on the future; read them\n",
+    "        # here, otherwise a failed merge goes unnoticed.\n",
+    "        error = future.exception()\n",
+    "        if error is not None:\n",
+    "            merge_failures[future_to_hadm_id[future]] = error\n",
+    "\n",
+    "print(f\"{len(merge_failures)} of {len(future_to_hadm_id)} admissions failed to merge\")\n",
+    "for failed_id, error in list(merge_failures.items())[:10]:\n",
+    "    print(f\"  {failed_id}: {error!r}\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "993412bf",
+   "metadata": {},
+   "source": [
+    "## Statistics: merged event counts and lengths of stay"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "78ff0517",
+   "metadata": {},
+   "source": [
+    "from tqdm import tqdm"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "87ca202e",
+   "metadata": {},
+   "source": [
+    "# Number of rows in each admission's merged file; 0 when the file is missing.\n",
+    "hadm_id_to_len = {}\n",
+    "for hadm_id in tqdm(hadm_ids):\n",
+    "    merged_file = os.path.join(output_path, f\"event_selected/event_{hadm_id}.csv\")\n",
+    "    try:\n",
+    "        n_rows = len(pd.read_csv(merged_file))\n",
+    "    except FileNotFoundError:\n",
+    "        print(f\"{hadm_id} not found!\")\n",
+    "        n_rows = 0\n",
+    "    hadm_id_to_len[hadm_id] = n_rows"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "9e282998",
+   "metadata": {},
+   "source": [
+    "cohort[\"len_selected\"] = cohort.hadm_id.map(hadm_id_to_len)\n",
+    "cohort.head()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "4891f34e",
+   "metadata": {},
+   "source": [
+    "len(cohort)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "79d1e1f8",
+   "metadata": {},
+   "source": [
+    "cohort.hadm_los.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "4d89e2e7",
+   "metadata": {},
+   "source": [
+    "cohort.stay_los.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "defa6a7e",
+   "metadata": {},
+   "source": [
+    "cohort.len_selected.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "d8d8675e",
+   "metadata": {},
+   "source": "cohort.to_csv(os.path.join(output_path, 'cohort+len.csv'), index=False)",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "6a846dff",
+   "metadata": {},
+   "source": [],
+   "outputs": [],
+   "execution_count": null
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "pytorch20",
+   "language": "python",
+   "name": "pytorch20"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.19"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}