[780764]: / src / preprocess / 03_merge_events.ipynb

Download this file

367 lines (366 with data), 8.4 kB

{
 "cells": [
  {
   "cell_type": "code",
   "id": "debdace9",
   "metadata": {},
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "# Resolve the directory two levels above the working directory and make it importable.\n",
    "src_path = os.path.abspath(\"../..\")\n",
    "print(src_path)\n",
    "# Guard the append so re-running this cell does not pile up duplicate entries.\n",
    "if src_path not in sys.path:\n",
    "    sys.path.append(src_path)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "6bad1e09",
   "metadata": {},
   "source": [
    "from src.utils import create_directory, raw_data_path, processed_data_path, set_seed"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "5d9bc78c",
   "metadata": {},
   "source": [
    "# Fix the random seed through the project helper (what it seeds is defined in src.utils).\n",
    "set_seed(seed=42)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "13d22a57",
   "metadata": {},
   "source": [
    "import pandas as pd"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "dd9852d5",
   "metadata": {},
   "source": [
    "# mimic_iv_path: the MIMIC-IV 2.2 folder under raw_data_path.\n",
    "# output_path: the mimic4 folder under processed_data_path; every file below is read from or written to it.\n",
    "mimic_iv_path = os.path.join(raw_data_path, \"physionet.org/files/mimiciv/2.2\")\n",
    "output_path = os.path.join(processed_data_path, \"mimic4\")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "b6a27998",
   "metadata": {},
   "source": [
    "# Load the cohort table from output_path and preview its shape and first rows.\n",
    "cohort = pd.read_csv(os.path.join(output_path, \"cohort.csv\"))\n",
    "print(cohort.shape)\n",
    "cohort.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "9dd92e23",
   "metadata": {},
   "source": [
    "# Parse the four *_intime / *_outtime columns into pandas datetimes.\n",
    "for time_col in (\"hadm_intime\", \"hadm_outtime\", \"stay_intime\", \"stay_outtime\"):\n",
    "    cohort[time_col] = pd.to_datetime(cohort[time_col])"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "8f55c793",
   "metadata": {},
   "source": [
    "# Distinct hadm_id values of the cohort, converted to plain Python values.\n",
    "hadm_ids = set(cohort[\"hadm_id\"].unique().tolist())\n",
    "len(hadm_ids)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "d03d447c",
   "metadata": {},
   "source": [
    "import ast\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "def safe_literal_eval(s):\n",
    "    \"\"\"Parse a string-encoded Python literal, tolerating missing or already-parsed values.\n",
    "\n",
    "    Args:\n",
    "        s: a string holding a Python literal, a missing value (None/NaN),\n",
    "            or a container that has already been parsed.\n",
    "\n",
    "    Returns:\n",
    "        The parsed literal for strings, ``np.nan`` for missing values, and\n",
    "        list/tuple/dict/set inputs unchanged.\n",
    "\n",
    "    Raises:\n",
    "        ValueError, SyntaxError: propagated from ``ast.literal_eval`` when a\n",
    "            string is not a valid literal.\n",
    "    \"\"\"\n",
    "    if isinstance(s, str):\n",
    "        return ast.literal_eval(s)\n",
    "    # Already-parsed containers pass through, so re-running the cell on a parsed\n",
    "    # column no longer fails on the ambiguous truth value of pd.isna(<list>).\n",
    "    if isinstance(s, (list, tuple, dict, set)):\n",
    "        return s\n",
    "    if pd.isna(s):\n",
    "        return np.nan\n",
    "    return ast.literal_eval(s)\n",
    "\n",
    "\n",
    "# Turn the string-encoded label_diagnosis entries back into Python objects.\n",
    "cohort[\"label_diagnosis\"] = cohort[\"label_diagnosis\"].apply(safe_literal_eval)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "5a9d60c6",
   "metadata": {},
   "source": [
    "## Helpers: thread pool, progress bars and pandarallel"
   ]
  },
  {
   "cell_type": "code",
   "id": "5171bbae",
   "metadata": {},
   "source": [
    "from concurrent.futures import ThreadPoolExecutor\n",
    "from tqdm import tqdm\n",
    "from pandarallel import pandarallel"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "5d6b9ce2",
   "metadata": {},
   "source": [
    "# Initialise pandarallel with progress bars enabled.\n",
    "# NOTE(review): no parallel_apply/parallel_map call is visible in this notebook — confirm this is still needed.\n",
    "pandarallel.initialize(progress_bar=True)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "e77f628d",
   "metadata": {},
   "source": [
    "## Merge the selected event tables into one CSV per admission"
   ]
  },
  {
   "cell_type": "code",
   "id": "f86a3633",
   "metadata": {},
   "source": [
    "# Event tables whose per-admission CSVs are merged below.\n",
    "events_selected = [\n",
    "    \"labevents\",\n",
    "    \"microbiologyevents\",\n",
    "    \"prescriptions\",\n",
    "    \"transfers\",\n",
    "    \"procedureevents\",\n",
    "]"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "7ae555e1",
   "metadata": {},
   "source": [
    "def merge_and_save(events, hadm_id, folder_name):\n",
    "    \"\"\"Combine one admission's per-type event CSVs into a single CSV.\n",
    "\n",
    "    For each name in ``events`` the file ``event_<name>/event_<hadm_id>.csv``\n",
    "    under ``output_path`` is read; event types whose file is missing are\n",
    "    skipped. The rows are sorted by ``timestamp``, placed after the\n",
    "    admission's ``event_patient_demographics`` and ``event_admission_info``\n",
    "    rows, and written to ``<folder_name>/event_<hadm_id>.csv``.\n",
    "\n",
    "    Args:\n",
    "        events: iterable of event-table names to merge.\n",
    "        hadm_id: admission identifier used in the file names.\n",
    "        folder_name: sub-folder of ``output_path`` that receives the result.\n",
    "\n",
    "    Returns:\n",
    "        True once the merged CSV has been written.\n",
    "\n",
    "    Raises:\n",
    "        AssertionError: none of the requested event files exists.\n",
    "        FileNotFoundError: the demographics or admission-info file is missing.\n",
    "    \"\"\"\n",
    "    columns = [\"hadm_id\", \"event_type\", \"timestamp\", \"event_value\", \"timestamp_avail\"]\n",
    "\n",
    "    frames = []\n",
    "    for event in events:\n",
    "        event_file = os.path.join(output_path, f\"event_{event}/event_{hadm_id}.csv\")\n",
    "        try:\n",
    "            frames.append(pd.read_csv(event_file, usecols=columns))\n",
    "        except FileNotFoundError:\n",
    "            # No file of this event type for the admission: skip it.\n",
    "            continue\n",
    "\n",
    "    assert len(frames) > 0, hadm_id\n",
    "    timeline = pd.concat(frames)\n",
    "    timeline.hadm_id = timeline.hadm_id.astype(int)\n",
    "    timeline = timeline.sort_values(by=\"timestamp\", ascending=True)\n",
    "\n",
    "    # Demographics and admission info are prepended unsorted, ahead of the timeline.\n",
    "    demographics = pd.read_csv(os.path.join(output_path, f\"event_patient_demographics/event_{hadm_id}.csv\"))\n",
    "    admission_info = pd.read_csv(os.path.join(output_path, f\"event_admission_info/event_{hadm_id}.csv\"))\n",
    "    merged = pd.concat([demographics, admission_info, timeline])[columns]\n",
    "\n",
    "    merged.to_csv(os.path.join(output_path, f\"{folder_name}/event_{hadm_id}.csv\"), index=False)\n",
    "    return True"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "98067450",
   "metadata": {},
   "source": [
    "import shutil\n",
    "\n",
    "# Remove any previous merge output. Unlike the interpolated shell `rm -r`, this is\n",
    "# safe for paths containing spaces and stays quiet when the folder does not exist yet.\n",
    "shutil.rmtree(os.path.join(output_path, \"event_selected\"), ignore_errors=True)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "81096fa5",
   "metadata": {},
   "source": [
    "# Folder that merge_and_save writes into; create_directory is a project helper from\n",
    "# src.utils (presumably creates the folder if it is missing — confirm).\n",
    "create_directory(f\"{output_path}/event_selected\")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "2858ec13",
   "metadata": {},
   "source": [
    "# Submit every admission, then wait on each future so that an exception raised\n",
    "# inside merge_and_save is recorded instead of being silently dropped.\n",
    "with ThreadPoolExecutor(max_workers=4) as executor:\n",
    "    future_to_hadm_id = {\n",
    "        executor.submit(merge_and_save, events_selected, hadm_id, \"event_selected\"): hadm_id\n",
    "        for hadm_id in hadm_ids\n",
    "    }\n",
    "    failed_hadm_ids = {}\n",
    "    # The bar advances as results are collected, not as tasks are queued.\n",
    "    for future, hadm_id in tqdm(future_to_hadm_id.items(), total=len(future_to_hadm_id)):\n",
    "        try:\n",
    "            future.result()\n",
    "        except Exception as exc:\n",
    "            # Best-effort as before: keep going, but remember what failed and why.\n",
    "            failed_hadm_ids[hadm_id] = repr(exc)\n",
    "\n",
    "print(f\"{len(failed_hadm_ids)} of {len(hadm_ids)} admissions failed to merge\")\n",
    "# Show only a few failures so the output stays readable.\n",
    "list(failed_hadm_ids.items())[:10]"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "993412bf",
   "metadata": {},
   "source": [
    "## Statistics: merged events per admission and length of stay"
   ]
  },
  {
   "cell_type": "code",
   "id": "78ff0517",
   "metadata": {},
   "source": [
    "from tqdm import tqdm"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "87ca202e",
   "metadata": {},
   "source": [
    "# Count the rows of every merged per-admission file; admissions without a file get 0.\n",
    "hadm_id_to_len = {}\n",
    "for hadm_id in tqdm(hadm_ids):\n",
    "    merged_file = os.path.join(output_path, f\"event_selected/event_{hadm_id}.csv\")\n",
    "    try:\n",
    "        n_events = len(pd.read_csv(merged_file))\n",
    "    except FileNotFoundError:\n",
    "        print(f\"{hadm_id} not found!\")\n",
    "        n_events = 0\n",
    "    hadm_id_to_len[hadm_id] = n_events"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "9e282998",
   "metadata": {},
   "source": [
    "# Attach each admission's merged-event row count to the cohort via its hadm_id.\n",
    "cohort[\"len_selected\"] = cohort.hadm_id.map(hadm_id_to_len)\n",
    "cohort.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "4891f34e",
   "metadata": {},
   "source": [
    "# Number of rows in the cohort table.\n",
    "len(cohort)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "79d1e1f8",
   "metadata": {},
   "source": [
    "# Summary statistics of the hadm_los column, including the upper-tail percentiles.\n",
    "cohort.hadm_los.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "4d89e2e7",
   "metadata": {},
   "source": [
    "# Summary statistics of the stay_los column, including the upper-tail percentiles.\n",
    "cohort.stay_los.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "defa6a7e",
   "metadata": {},
   "source": [
    "# Summary statistics of the merged-event row counts per admission.\n",
    "cohort.len_selected.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "d8d8675e",
   "metadata": {},
   "source": "# Save the cohort, now including the len_selected column, next to the original cohort file.\ncohort.to_csv(os.path.join(output_path, 'cohort+len.csv'), index=False)",
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "6a846dff",
   "metadata": {},
   "source": [],
   "outputs": [],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pytorch20",
   "language": "python",
   "name": "pytorch20"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}