--- a +++ b/src/preprocess/03_merge_events.ipynb @@ -0,0 +1,366 @@ +{ + "cells": [ + { + "cell_type": "code", + "id": "debdace9", + "metadata": {}, + "source": [ + "import os\n", + "import sys\n", + "\n", + "src_path = os.path.abspath(\"../..\")\n", + "print(src_path)\n", + "sys.path.append(src_path)" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "6bad1e09", + "metadata": {}, + "source": [ + "from src.utils import create_directory, raw_data_path, processed_data_path, set_seed" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "5d9bc78c", + "metadata": {}, + "source": [ + "set_seed(seed=42)" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "13d22a57", + "metadata": {}, + "source": [ + "import pandas as pd" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "dd9852d5", + "metadata": {}, + "source": [ + "mimic_iv_path = os.path.join(raw_data_path, \"physionet.org/files/mimiciv/2.2\")\n", + "output_path = os.path.join(processed_data_path, \"mimic4\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "b6a27998", + "metadata": {}, + "source": [ + "cohort = pd.read_csv(os.path.join(output_path, \"cohort.csv\"))\n", + "print(cohort.shape)\n", + "cohort.head()" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "9dd92e23", + "metadata": {}, + "source": [ + "cohort[\"hadm_intime\"] = pd.to_datetime(cohort[\"hadm_intime\"])\n", + "cohort[\"hadm_outtime\"] = pd.to_datetime(cohort[\"hadm_outtime\"])\n", + "cohort[\"stay_intime\"] = pd.to_datetime(cohort[\"stay_intime\"])\n", + "cohort[\"stay_outtime\"] = pd.to_datetime(cohort[\"stay_outtime\"])" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "8f55c793", + "metadata": {}, + "source": [ + "hadm_ids = set(cohort.hadm_id.unique().tolist())\n", + "len(hadm_ids)" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "d03d447c", + "metadata": {}, + "source": [ + "import ast\n", + "import numpy as np\n", + "\n", + "\n", + "def safe_literal_eval(s):\n", + " if pd.isna(s):\n", + " return np.nan\n", + " return ast.literal_eval(s)\n", + "\n", + "\n", + "cohort.label_diagnosis = cohort.label_diagnosis.apply(safe_literal_eval)" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "5a9d60c6", + "metadata": {}, + "source": [ + "helper" + ] + }, + { + "cell_type": "code", + "id": "5171bbae", + "metadata": {}, + "source": [ + "from concurrent.futures import ThreadPoolExecutor\n", + "from tqdm import tqdm\n", + "from pandarallel import pandarallel" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "5d6b9ce2", + "metadata": {}, + "source": [ + "pandarallel.initialize(progress_bar=True)" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "e77f628d", + "metadata": {}, + "source": [ + "merge" + ] + }, + { + "cell_type": "code", + "id": "f86a3633", + "metadata": {}, + "source": [ + "events_selected = [ \n", + " \"labevents\", \n", + " \"microbiologyevents\",\n", + " \"prescriptions\",\n", + " \"transfers\",\n", + " \"procedureevents\",\n", + "]" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "7ae555e1", + "metadata": {}, + "source": [ + "def merge_and_save(events, hadm_id, folder_name):\n", + " \n", + " df = []\n", + " for event in events:\n", + " try:\n", + " tmp = pd.read_csv(os.path.join(output_path, f\"event_{event}/event_{hadm_id}.csv\"),\n", + " usecols=[\"hadm_id\", \"event_type\", \"timestamp\", \"event_value\", \"timestamp_avail\"])\n", + " df.append(tmp)\n", + " except FileNotFoundError:\n", + " continue\n", + " \n", + " assert len(df) > 0, hadm_id\n", + " df = pd.concat(df)\n", + " df.hadm_id = df.hadm_id.astype(int)\n", + " df = df.sort_values(by=\"timestamp\", ascending=True)\n", + " \n", + " tmp1 = pd.read_csv(os.path.join(output_path, f\"event_patient_demographics/event_{hadm_id}.csv\"))\n", + " tmp2 = pd.read_csv(os.path.join(output_path, f\"event_admission_info/event_{hadm_id}.csv\"))\n", + " df = pd.concat([tmp1, tmp2, df])\n", + " \n", + " df = df[[\"hadm_id\", \"event_type\", \"timestamp\", \"event_value\", \"timestamp_avail\"]]\n", + "\n", + " file_path = os.path.join(output_path, f\"{folder_name}/event_{hadm_id}.csv\")\n", + " df.to_csv(file_path, index=False)\n", + "\n", + " return True" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "98067450", + "metadata": {}, + "source": [ + "!rm -r {output_path}/event_selected" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "81096fa5", + "metadata": {}, + "source": [ + "create_directory(f\"{output_path}/event_selected\")" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "2858ec13", + "metadata": {}, + "source": [ + "with ThreadPoolExecutor(max_workers=4) as executor:\n", + " for hadm_id in tqdm(hadm_ids, total=len(hadm_ids)):\n", + " future = executor.submit(\n", + " merge_and_save, \n", + " events_selected, \n", + " hadm_id, \n", + " \"event_selected\"\n", + " )" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "markdown", + "id": "993412bf", + "metadata": {}, + "source": [ + "stat" + ] + }, + { + "cell_type": "code", + "id": "78ff0517", + "metadata": {}, + "source": [ + "from tqdm import tqdm" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "87ca202e", + "metadata": {}, + "source": [ + "hadm_id_to_len = {}\n", + "for hadm_id in tqdm(hadm_ids):\n", + " try:\n", + " df = pd.read_csv(os.path.join(output_path, f\"event_selected/event_{hadm_id}.csv\")) \n", + " hadm_id_to_len[hadm_id] = len(df)\n", + " del df\n", + " except FileNotFoundError:\n", + " print(f\"{hadm_id} not found!\")\n", + " hadm_id_to_len[hadm_id] = 0" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "9e282998", + "metadata": {}, + "source": [ + "cohort[\"len_selected\"] = cohort.hadm_id.map(hadm_id_to_len)\n", + "cohort.head()" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "4891f34e", + "metadata": {}, + "source": [ + "len(cohort)" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "79d1e1f8", + "metadata": {}, + "source": [ + "cohort.hadm_los.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "4d89e2e7", + "metadata": {}, + "source": [ + "cohort.stay_los.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "defa6a7e", + "metadata": {}, + "source": [ + "cohort.len_selected.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])" + ], + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "d8d8675e", + "metadata": {}, + "source": "cohort.to_csv(os.path.join(output_path, 'cohort+len.csv'), index=False)", + "outputs": [], + "execution_count": null + }, + { + "cell_type": "code", + "id": "6a846dff", + "metadata": {}, + "source": [], + "outputs": [], + "execution_count": null + } + ], + "metadata": { + "kernelspec": { + "display_name": "pytorch20", + "language": "python", + "name": "pytorch20" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.19" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}