459 lines (458 with data), 10.0 kB
{
"cells": [
{
"cell_type": "code",
"id": "debdace9",
"metadata": {},
"source": [
"import os\n",
"import sys\n",
"\n",
"src_path = os.path.abspath(\"../..\")\n",
"print(src_path)\n",
"sys.path.append(src_path)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "6bad1e09",
"metadata": {},
"source": [
"from src.utils import create_directory, raw_data_path, processed_data_path, set_seed"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "5d9bc78c",
"metadata": {},
"source": [
"set_seed(seed=42)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "13d22a57",
"metadata": {},
"source": [
"import pandas as pd"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "dd9852d5",
"metadata": {},
"source": [
"mimic_iv_path = os.path.join(raw_data_path, \"physionet.org/files/mimiciv/2.2\")\n",
"output_path = os.path.join(processed_data_path, \"mimic4\")"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "b6a27998",
"metadata": {},
"source": [
"cohort = pd.read_csv(os.path.join(output_path, \"cohort.csv\"))\n",
"print(cohort.shape)\n",
"cohort.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "9dd92e23",
"metadata": {},
"source": [
"cohort[\"hadm_intime\"] = pd.to_datetime(cohort[\"hadm_intime\"])\n",
"cohort[\"hadm_outtime\"] = pd.to_datetime(cohort[\"hadm_outtime\"])\n",
"cohort[\"stay_intime\"] = pd.to_datetime(cohort[\"stay_intime\"])\n",
"cohort[\"stay_outtime\"] = pd.to_datetime(cohort[\"stay_outtime\"])"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "8f55c793",
"metadata": {},
"source": [
"hadm_ids = set(cohort.hadm_id.unique().tolist())\n",
"len(hadm_ids)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
"id": "5a9d60c6",
"metadata": {},
"source": [
"helper"
]
},
{
"cell_type": "code",
"id": "5171bbae",
"metadata": {},
"source": [
"from concurrent.futures import ThreadPoolExecutor\n",
"from tqdm import tqdm\n",
"from pandarallel import pandarallel"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "5d6b9ce2",
"metadata": {},
"source": [
"pandarallel.initialize(progress_bar=True)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "ab5525be",
"metadata": {},
"source": [
"def save_group(group_df, hadm_id, event_type):\n",
" file_path = f\"{output_path}/event_{event_type}/event_{int(hadm_id)}.csv\"\n",
" group_df.to_csv(file_path, index=False)\n",
" return True"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
"id": "80a7a430",
"metadata": {},
"source": [
"dict"
]
},
{
"cell_type": "code",
"id": "38fbe144",
"metadata": {},
"source": [
"d_items = pd.read_csv(os.path.join(mimic_iv_path, \"icu/d_items.csv.gz\"))\n",
"print(d_items.shape)\n",
"d_items.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "28b698c5",
"metadata": {},
"source": [
"d_labitems = pd.read_csv(os.path.join(mimic_iv_path, \"hosp/d_labitems.csv.gz\"))\n",
"print(d_labitems.shape)\n",
"d_labitems.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
"id": "8388eb65",
"metadata": {},
"source": [
"## labevents"
]
},
{
"cell_type": "code",
"id": "e075f2db",
"metadata": {},
"source": [
"event_type = \"labevents\""
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "841e4c08",
"metadata": {},
"source": [
"!rm -r {output_path}/event_{event_type}"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "369b9488",
"metadata": {},
"source": [
"create_directory(f\"{output_path}/event_{event_type}\")"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "3141e899",
"metadata": {},
"source": [
"df_raw = pd.read_csv(os.path.join(mimic_iv_path, f\"hosp/{event_type}.csv.gz\"))\n",
"print(df_raw.shape)\n",
"df_raw.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "f4a241a6",
"metadata": {},
"source": [
"df = df_raw.merge(cohort[[\"hadm_id\", \"hadm_intime\"]], on=[\"hadm_id\"], how=\"inner\")\n",
"print(df.shape)\n",
"df.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "8d76dad0",
"metadata": {},
"source": [
"df[\"charttime\"] = pd.to_datetime(df[\"charttime\"])\n",
"df[\"storetime\"] = pd.to_datetime(df[\"storetime\"])\n",
"df[\"timestamp\"] = (df.charttime - df.hadm_intime).dt.total_seconds() / 3600\n",
"df[\"timestamp_avail\"] = (df.storetime - df.hadm_intime).dt.total_seconds() / 3600\n",
"df.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "e1b5e814",
"metadata": {},
"source": [
"df = df.sort_values([\"subject_id\", \"hadm_id\", \"timestamp\", \"specimen_id\"], ascending=True)\n",
"df.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "bf9217d5",
"metadata": {},
"source": [
"df = df.merge(d_labitems, on=\"itemid\", how=\"left\")\n",
"df.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "93251944",
"metadata": {},
"source": [
"import numpy as np\n",
"\n",
"\n",
"df.value = df.value.replace(\"___\", np.NaN)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "b578f760",
"metadata": {},
"source": [
"df.isna().sum()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "f78c499c",
"metadata": {},
"source": [
"df.value = df.value.fillna(df.valuenum.apply(lambda x: f\"{x:.2f}\" if not pd.isna(x) else np.nan))"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "f858190c",
"metadata": {
"scrolled": true
},
"source": [
"df.isna().sum()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "3bc1208a",
"metadata": {},
"source": [
"df = df.dropna(subset=[\"value\", \"timestamp_avail\"])\n",
"print(df.shape)\n",
"df.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "4f149a6f",
"metadata": {},
"source": [
"def generate_event_value(x):\n",
" s = \"\"\n",
" if pd.isna(x.valueuom):\n",
" s += f\"{x.fluid} {x.label} {x.category}: {x.value}\"\n",
" else:\n",
" s += f\"{x.fluid} {x.label} {x.category}: {x.value} {x.valueuom}\"\n",
" if pd.isna(x.flag):\n",
" s += \" (normal)\"\n",
" else:\n",
" s += \" (abnormal)\"\n",
" return s"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "803193c3",
"metadata": {},
"source": [
"meta_cols = [\"fluid\", \"label\", \"category\", \"value\", \"valueuom\", \"flag\"]\n",
"for c in meta_cols:\n",
" df[\"meta_\" + c] = df[c]\n",
"meta_cols = [\"meta_\" + c for c in meta_cols]"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "ab18f0e2",
"metadata": {},
"source": [
"generate_event_value(df.iloc[8])"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "e9fb38f1",
"metadata": {},
"source": [
"df[\"event_type\"] = event_type\n",
"df[\"event_value\"] = df.parallel_apply(generate_event_value, axis=1)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "9babf2a4",
"metadata": {},
"source": [
"df[df.hadm_id == 29079034]"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "ade4facf",
"metadata": {},
"source": [
"df.groupby(\"hadm_id\").timestamp.count().describe()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "96874ba9",
"metadata": {},
"source": [
"groups = df.groupby(\"hadm_id\")\n",
" \n",
"with ThreadPoolExecutor(max_workers=4) as executor:\n",
" for hadm_id, group_df in tqdm(groups, total=groups.ngroups):\n",
" future = executor.submit(\n",
" save_group, \n",
" group_df[[\"hadm_id\", \"event_type\", \"timestamp\", \"event_value\", \"timestamp_avail\"] + meta_cols], \n",
" hadm_id, \n",
" event_type\n",
" )"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "b5318d52",
"metadata": {},
"source": [
"!ls -1 {output_path}/event_{event_type} | wc -l"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "6fbc0ea1",
"metadata": {},
"source": [],
"outputs": [],
"execution_count": null
}
],
"metadata": {
"kernelspec": {
"display_name": "pytorch20",
"language": "python",
"name": "pytorch20"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 5
}