576 lines (575 with data), 12.5 kB
{
"cells": [
{
"cell_type": "code",
"id": "a2ccf6ab",
"metadata": {},
"source": [
"import os\n",
"import sys\n",
"\n",
"src_path = os.path.abspath('../..')\n",
"print(src_path)\n",
"sys.path.append(src_path)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "3eb75526",
"metadata": {},
"source": [
"from src.utils import create_directory, dump_pickle, raw_data_path, processed_data_path, set_seed"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "17fef7f7",
"metadata": {},
"source": [
"set_seed(seed=42)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "51a59258",
"metadata": {},
"source": [
"import pandas as pd"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "27ed0a87",
"metadata": {},
"source": [
"mimic_iv_path = os.path.join(raw_data_path, \"physionet.org/files/mimiciv/2.2\")\n",
"mimic_iv_note_path = os.path.join(raw_data_path, \"physionet.org/files/mimic-iv-note/2.2\")"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "eebf095b",
"metadata": {},
"source": [
"!ls {mimic_iv_path}"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "d1949918",
"metadata": {},
"source": [
"!ls {mimic_iv_note_path}"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "4eef9634",
"metadata": {},
"source": [
"patients = pd.read_csv(os.path.join(mimic_iv_path, \"hosp/patients.csv.gz\"))\n",
"print(patients.shape)\n",
"patients.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "8f740475",
"metadata": {},
"source": [
"admissions = pd.read_csv(os.path.join(mimic_iv_path, \"hosp/admissions.csv.gz\"))\n",
"print(admissions.shape)\n",
"admissions.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "7314ceda",
"metadata": {},
"source": [
"icustays = pd.read_csv(os.path.join(mimic_iv_path, \"icu/icustays.csv.gz\"))\n",
"print(icustays.shape)\n",
"icustays.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "3b9416e5",
"metadata": {},
"source": [
"discharge = pd.read_csv(os.path.join(mimic_iv_note_path, \"note/discharge.csv.gz\"))\n",
"print(discharge.shape)\n",
"discharge.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "b6d66270",
"metadata": {},
"source": [
"print(patients.subject_id.nunique())\n",
"print(admissions.subject_id.nunique())\n",
"print(icustays.subject_id.nunique())\n",
"print(discharge.subject_id.nunique())"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "446ad1aa",
"metadata": {},
"source": [
"print(admissions.hadm_id.nunique())\n",
"print(icustays.hadm_id.nunique())\n",
"print(discharge.hadm_id.nunique())"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "9ef85af8",
"metadata": {},
"source": [
"print(icustays.stay_id.nunique())"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "3df1d4d6",
"metadata": {},
"source": [
"admissions_hadm_ids = set(admissions.hadm_id.tolist())\n",
"icustays_hadm_ids = set(icustays.hadm_id.tolist())\n",
"discharge_hadm_ids = set(discharge.hadm_id.tolist())"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "0e3c3c84",
"metadata": {},
"source": [
"from matplotlib_venn import venn3\n",
"import matplotlib.pyplot as plt\n",
"\n",
"\n",
"plt.figure(figsize=(8, 8))\n",
"venn3([admissions_hadm_ids, icustays_hadm_ids, discharge_hadm_ids], ('Hospital', 'ICU', 'Discharge'))\n",
"plt.show()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
"id": "a26f90ee",
"metadata": {},
"source": [
"starting cohort"
]
},
{
"cell_type": "code",
"id": "500460a5",
"metadata": {},
"source": [
"print(icustays.subject_id.nunique())\n",
"print(icustays.hadm_id.nunique())\n",
"print(icustays.stay_id.nunique())"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
"id": "c0d8ef77",
"metadata": {},
"source": [
"remove no hospital admission"
]
},
{
"cell_type": "code",
"id": "fc6d90ed",
"metadata": {},
"source": [
"icustays = icustays[icustays.hadm_id.isin(admissions_hadm_ids)]"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "19efd63f",
"metadata": {},
"source": [
"print(icustays.subject_id.nunique())\n",
"print(icustays.hadm_id.nunique())\n",
"print(icustays.stay_id.nunique())"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
"id": "e9892dc6",
"metadata": {},
"source": [
"remove no discharge note"
]
},
{
"cell_type": "code",
"id": "ed4347a1",
"metadata": {},
"source": [
"icustays = icustays[icustays.hadm_id.isin(discharge_hadm_ids)]"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "55c8af1c",
"metadata": {},
"source": [
"print(icustays.subject_id.nunique())\n",
"print(icustays.hadm_id.nunique())\n",
"print(icustays.stay_id.nunique())"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
"id": "9f6c9db1",
"metadata": {},
"source": [
"remove >2 ICU stays per admission"
]
},
{
"cell_type": "code",
"id": "05c2f25c",
"metadata": {},
"source": [
"to_keep = icustays.groupby(\"hadm_id\").stay_id.nunique().reset_index()\n",
"to_keep = to_keep[to_keep.stay_id == 1][[\"hadm_id\"]]\n",
"icustays = icustays.merge(to_keep, how=\"inner\", on=\"hadm_id\")"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "1c990f33",
"metadata": {},
"source": [
"print(icustays.subject_id.nunique())\n",
"print(icustays.hadm_id.nunique())\n",
"print(icustays.stay_id.nunique())"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
"id": "0199c558",
"metadata": {},
"source": [
"remove < 18 years old patients"
]
},
{
"cell_type": "code",
"id": "934d050a",
"metadata": {},
"source": [
"icustays.intime = pd.to_datetime(icustays.intime)\n",
"icustays = icustays.merge(patients[[\"subject_id\", \"anchor_age\", \"anchor_year\"]], on=\"subject_id\", how=\"inner\")\n",
"icustays[\"age\"] = icustays.intime.dt.year - icustays.anchor_year + icustays.anchor_age\n",
"icustays.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "a658c503",
"metadata": {},
"source": [
"icustays.age.min()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
"id": "acd6bbdc",
"metadata": {},
"source": [
"remove negative los"
]
},
{
"cell_type": "code",
"id": "16829161",
"metadata": {
"scrolled": true
},
"source": [
"icustays = icustays.merge(admissions[[\"hadm_id\", \"admittime\", \"dischtime\"]], on=\"hadm_id\", how=\"inner\")\n",
"icustays = icustays.rename(columns={\n",
" \"admittime\": \"hadm_intime\",\n",
" \"dischtime\": \"hadm_outtime\",\n",
" \"intime\": \"stay_intime\",\n",
" \"outtime\": \"stay_outtime\",\n",
"})\n",
"icustays.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "59f29be8",
"metadata": {},
"source": [
"icustays['hadm_intime'] = pd.to_datetime(icustays['hadm_intime'])\n",
"icustays['hadm_outtime'] = pd.to_datetime(icustays['hadm_outtime'])\n",
"icustays['stay_intime'] = pd.to_datetime(icustays['stay_intime'])\n",
"icustays['stay_outtime'] = pd.to_datetime(icustays['stay_outtime'])"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "f033b949",
"metadata": {},
"source": [
"icustays['stay_los'] = (icustays['stay_outtime'] - icustays['stay_intime']).dt.total_seconds() / 3600\n",
"icustays['hadm_los'] = (icustays['hadm_outtime'] - icustays['hadm_intime']).dt.total_seconds() / 3600"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "f347f40e",
"metadata": {},
"source": [
"icustays.stay_los.min()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "0f41dd40",
"metadata": {},
"source": [
"icustays.hadm_los.min()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "36aae19f",
"metadata": {},
"source": [
"icustays = icustays[icustays.stay_los >= 0]\n",
"icustays = icustays[icustays.hadm_los >= 0]"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "452937c5",
"metadata": {},
"source": [
"print(icustays.subject_id.nunique())\n",
"print(icustays.hadm_id.nunique())\n",
"print(icustays.stay_id.nunique())"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
"id": "078c5947",
"metadata": {},
"source": [
"statistics"
]
},
{
"cell_type": "code",
"id": "d1c4349d",
"metadata": {},
"source": [
"icustays.groupby(\"subject_id\").hadm_id.nunique().value_counts()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "e9640ccb",
"metadata": {},
"source": [
"icustays.groupby(\"subject_id\").hadm_id.nunique().describe()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "b1d67bba",
"metadata": {},
"source": [
"icustays.groupby(\"hadm_id\").stay_id.nunique().value_counts()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "bc501162",
"metadata": {},
"source": [
"icustays.groupby(\"hadm_id\").stay_id.nunique().describe()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "6242836b",
"metadata": {},
"source": [
"icustays.stay_los.describe()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "4f21b790",
"metadata": {},
"source": [
"icustays.hadm_los.describe()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
"id": "2870464f",
"metadata": {},
"source": [
"save"
]
},
{
"cell_type": "code",
"id": "7692519c",
"metadata": {},
"source": [
"icustays.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "af029ce5",
"metadata": {},
"source": [
"icustays = icustays[[\"subject_id\", \"hadm_id\", \"stay_id\", \"hadm_intime\", \"hadm_outtime\", \"hadm_los\", \"stay_intime\", \"stay_outtime\", \"stay_los\"]]\n",
"icustays.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "3122776d",
"metadata": {},
"source": [
"output_path = os.path.join(processed_data_path, \"mimic4\")\n",
"create_directory(output_path)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "3ca8bb7d",
"metadata": {},
"source": [
"icustays.to_csv(os.path.join(output_path, 'cohort.csv'), index=False)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "806c6cc7",
"metadata": {},
"source": [],
"outputs": [],
"execution_count": null
}
],
"metadata": {
"kernelspec": {
"display_name": "pytorch20",
"language": "python",
"name": "pytorch20"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 5
}