{
"cells": [
{
"cell_type": "code",
"id": "debdace9",
"metadata": {},
"source": [
"import os\n",
"import sys\n",
"\n",
"src_path = os.path.abspath(\"../..\")\n",
"print(src_path)\n",
"sys.path.append(src_path)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "6bad1e09",
"metadata": {},
"source": [
"from src.utils import create_directory, raw_data_path, processed_data_path, set_seed"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "5d9bc78c",
"metadata": {},
"source": [
"set_seed(seed=42)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "13d22a57",
"metadata": {},
"source": [
"import pandas as pd"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "dd9852d5",
"metadata": {},
"source": [
"mimic_iv_path = os.path.join(raw_data_path, \"physionet.org/files/mimiciv/2.2\")\n",
"output_path = os.path.join(processed_data_path, \"mimic4\")"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "b6a27998",
"metadata": {},
"source": [
"cohort = pd.read_csv(os.path.join(output_path, \"cohort.csv\"))\n",
"print(cohort.shape)\n",
"cohort.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "9dd92e23",
"metadata": {},
"source": [
"cohort[\"hadm_intime\"] = pd.to_datetime(cohort[\"hadm_intime\"])\n",
"cohort[\"hadm_outtime\"] = pd.to_datetime(cohort[\"hadm_outtime\"])\n",
"cohort[\"stay_intime\"] = pd.to_datetime(cohort[\"stay_intime\"])\n",
"cohort[\"stay_outtime\"] = pd.to_datetime(cohort[\"stay_outtime\"])"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "8f55c793",
"metadata": {},
"source": [
"hadm_ids = set(cohort.hadm_id.unique().tolist())\n",
"len(hadm_ids)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "d03d447c",
"metadata": {},
"source": [
"import ast\n",
"import numpy as np\n",
"\n",
"\n",
"def safe_literal_eval(s):\n",
" if pd.isna(s):\n",
" return np.nan\n",
" return ast.literal_eval(s)\n",
"\n",
"\n",
"cohort.label_diagnosis = cohort.label_diagnosis.apply(safe_literal_eval)"
],
"outputs": [],
"execution_count": null
},
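{
"cell_type": "markdown",
"id": "c0a1b2d3",
"metadata": {},
"source": [
"Optional sanity check (an added sketch): confirm that `label_diagnosis` now holds parsed Python objects rather than raw strings, assuming at least one non-missing value exists."
]
},
{
"cell_type": "code",
"id": "f4e5d6c7",
"metadata": {},
"source": [
"# Sanity check: the first non-missing entry should be a parsed object (e.g. a list), not a str\n",
"cohort.label_diagnosis.dropna().iloc[0]"
],
"outputs": [],
"execution_count": null
},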
{
"cell_type": "markdown",
"id": "5a9d60c6",
"metadata": {},
"source": [
"helper"
]
},
{
"cell_type": "code",
"id": "5171bbae",
"metadata": {},
"source": [
"from concurrent.futures import ThreadPoolExecutor\n",
"from tqdm import tqdm\n",
"from pandarallel import pandarallel"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "5d6b9ce2",
"metadata": {},
"source": [
"pandarallel.initialize(progress_bar=True)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
"id": "e77f628d",
"metadata": {},
"source": [
"merge"
]
},
{
"cell_type": "code",
"id": "f86a3633",
"metadata": {},
"source": [
"events_selected = [ \n",
" \"labevents\", \n",
" \"microbiologyevents\",\n",
" \"prescriptions\",\n",
" \"transfers\",\n",
" \"procedureevents\",\n",
"]"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "7ae555e1",
"metadata": {},
"source": [
"def merge_and_save(events, hadm_id, folder_name):\n",
" \n",
" df = []\n",
" for event in events:\n",
" try:\n",
" tmp = pd.read_csv(os.path.join(output_path, f\"event_{event}/event_{hadm_id}.csv\"),\n",
" usecols=[\"hadm_id\", \"event_type\", \"timestamp\", \"event_value\", \"timestamp_avail\"])\n",
" df.append(tmp)\n",
" except FileNotFoundError:\n",
" continue\n",
" \n",
" assert len(df) > 0, hadm_id\n",
" df = pd.concat(df)\n",
" df.hadm_id = df.hadm_id.astype(int)\n",
" df = df.sort_values(by=\"timestamp\", ascending=True)\n",
" \n",
" tmp1 = pd.read_csv(os.path.join(output_path, f\"event_patient_demographics/event_{hadm_id}.csv\"))\n",
" tmp2 = pd.read_csv(os.path.join(output_path, f\"event_admission_info/event_{hadm_id}.csv\"))\n",
" df = pd.concat([tmp1, tmp2, df])\n",
" \n",
" df = df[[\"hadm_id\", \"event_type\", \"timestamp\", \"event_value\", \"timestamp_avail\"]]\n",
"\n",
" file_path = os.path.join(output_path, f\"{folder_name}/event_{hadm_id}.csv\")\n",
" df.to_csv(file_path, index=False)\n",
"\n",
" return True"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "98067450",
"metadata": {},
"source": [
"!rm -r {output_path}/event_selected"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "81096fa5",
"metadata": {},
"source": [
"create_directory(f\"{output_path}/event_selected\")"
],
"outputs": [],
"execution_count": null
},
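{
"cell_type": "markdown",
"id": "b7c8d9e0",
"metadata": {},
"source": [
"Before the full parallel run, a small smoke test of `merge_and_save` on a single admission (a sketch; it assumes `hadm_ids` is non-empty, and `sample_hadm_id` is just an illustrative name)."
]
},
{
"cell_type": "code",
"id": "a9b8c7d6",
"metadata": {},
"source": [
"# Smoke test: merge one arbitrary admission and peek at the result; the cell below handles all of them\n",
"sample_hadm_id = next(iter(hadm_ids))\n",
"merge_and_save(events_selected, sample_hadm_id, \"event_selected\")\n",
"pd.read_csv(os.path.join(output_path, f\"event_selected/event_{sample_hadm_id}.csv\")).head()"
],
"outputs": [],
"execution_count": null
},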
{
"cell_type": "code",
"id": "2858ec13",
"metadata": {},
"source": [
"with ThreadPoolExecutor(max_workers=4) as executor:\n",
" for hadm_id in tqdm(hadm_ids, total=len(hadm_ids)):\n",
" future = executor.submit(\n",
" merge_and_save, \n",
" events_selected, \n",
" hadm_id, \n",
" \"event_selected\"\n",
" )"
],
"outputs": [],
"execution_count": null
},
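{
"cell_type": "markdown",
"id": "d1e2f3a4",
"metadata": {},
"source": [
"A quick completeness check (an added sketch): the number of merged files in `event_selected` should match the number of admissions in the cohort."
]
},
{
"cell_type": "code",
"id": "e2f3a4b5",
"metadata": {},
"source": [
"# Each admission should have produced exactly one merged file\n",
"n_merged = len(os.listdir(os.path.join(output_path, \"event_selected\")))\n",
"print(n_merged, len(hadm_ids))"
],
"outputs": [],
"execution_count": null
},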
{
"cell_type": "markdown",
"id": "993412bf",
"metadata": {},
"source": [
"stat"
]
},
{
"cell_type": "code",
"id": "78ff0517",
"metadata": {},
"source": [
"from tqdm import tqdm"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "87ca202e",
"metadata": {},
"source": [
"hadm_id_to_len = {}\n",
"for hadm_id in tqdm(hadm_ids):\n",
" try:\n",
" df = pd.read_csv(os.path.join(output_path, f\"event_selected/event_{hadm_id}.csv\")) \n",
" hadm_id_to_len[hadm_id] = len(df)\n",
" del df\n",
" except FileNotFoundError:\n",
" print(f\"{hadm_id} not found!\")\n",
" hadm_id_to_len[hadm_id] = 0"
],
"outputs": [],
"execution_count": null
},
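{
"cell_type": "markdown",
"id": "f5a6b7c8",
"metadata": {},
"source": [
"The loop above is serial; since `pandarallel` was initialized earlier, the same counts could also be computed with `parallel_apply`. A minimal sketch, assuming the merged files exist (`count_events` and `len_selected_parallel` are illustrative names)."
]
},
{
"cell_type": "code",
"id": "a7b8c9d0",
"metadata": {},
"source": [
"# Optional parallel alternative to the loop above; it produces the same per-admission counts\n",
"def count_events(hadm_id):\n",
"    path = os.path.join(output_path, f\"event_selected/event_{hadm_id}.csv\")\n",
"    try:\n",
"        return len(pd.read_csv(path))\n",
"    except FileNotFoundError:\n",
"        return 0\n",
"\n",
"\n",
"len_selected_parallel = cohort.hadm_id.parallel_apply(count_events)\n",
"len_selected_parallel.describe()"
],
"outputs": [],
"execution_count": null
},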
{
"cell_type": "code",
"id": "9e282998",
"metadata": {},
"source": [
"cohort[\"len_selected\"] = cohort.hadm_id.map(hadm_id_to_len)\n",
"cohort.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "4891f34e",
"metadata": {},
"source": [
"len(cohort)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "79d1e1f8",
"metadata": {},
"source": [
"cohort.hadm_los.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "4d89e2e7",
"metadata": {},
"source": [
"cohort.stay_los.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "defa6a7e",
"metadata": {},
"source": [
"cohort.len_selected.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "d8d8675e",
"metadata": {},
"source": "cohort.to_csv(os.path.join(output_path, 'cohort+len.csv'), index=False)",
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "6a846dff",
"metadata": {},
"source": [],
"outputs": [],
"execution_count": null
}
],
"metadata": {
"kernelspec": {
"display_name": "pytorch20",
"language": "python",
"name": "pytorch20"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 5
}