735 lines (734 with data), 16.4 kB
{
"cells": [
{
"cell_type": "code",
"id": "bf6469fe",
"metadata": {},
"source": [
"import os\n",
"import sys\n",
"\n",
"src_path = os.path.abspath('../..')\n",
"print(src_path)\n",
"sys.path.append(src_path)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "fcd74d2c",
"metadata": {},
"source": [
"from src.utils import create_directory, raw_data_path, processed_data_path, set_seed"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "4cae42b3",
"metadata": {},
"source": [
"set_seed(seed=42)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "57e86fc8",
"metadata": {},
"source": [
"import pandas as pd"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "d5a4a2f7",
"metadata": {},
"source": [
"mimic_iv_path = os.path.join(raw_data_path, \"physionet.org/files/mimiciv/2.2\")\n",
"mimic_iv_note_path = os.path.join(raw_data_path, \"physionet.org/files/mimic-iv-note/2.2\")\n",
"output_path = os.path.join(processed_data_path, \"mimic4\")"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "3d241540",
"metadata": {},
"source": [
"cohort = pd.read_csv(os.path.join(output_path, \"cohort.csv\"))\n",
"print(cohort.shape)\n",
"cohort.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "00486178",
"metadata": {},
"source": [
"# Parse the admission and stay boundary columns into pandas datetimes.\n",
"for time_col in [\"hadm_intime\", \"hadm_outtime\", \"stay_intime\", \"stay_outtime\"]:\n",
"    cohort[time_col] = pd.to_datetime(cohort[time_col])"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "64cc5546",
"metadata": {},
"source": [
"hadm_ids = set(cohort.hadm_id.unique().tolist())\n",
"len(hadm_ids)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
"id": "4774f5d5",
"metadata": {},
"source": [
"## helpers"
]
},
{
"cell_type": "code",
"id": "ff882410",
"metadata": {},
"source": [
"import shutil\n",
"from concurrent.futures import ThreadPoolExecutor, as_completed\n",
"\n",
"from tqdm import tqdm\n",
"from pandarallel import pandarallel"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "f841dfbb",
"metadata": {},
"source": [
"pandarallel.initialize(progress_bar=True)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "65b989bc",
"metadata": {},
"source": [
"def save_group(group_df, hadm_id, event_type):\n",
"    \"\"\"Write the rows of one admission to its own CSV file.\n",
"\n",
"    The file is written to `{output_path}/event_{event_type}/event_{hadm_id}.csv`,\n",
"    with `hadm_id` cast to int and the DataFrame index omitted.\n",
"\n",
"    Args:\n",
"        group_df: DataFrame holding the rows to write.\n",
"        hadm_id: Admission identifier used in the file name.\n",
"        event_type: Event type used in the directory name.\n",
"\n",
"    Returns:\n",
"        True once the file has been written.\n",
"    \"\"\"\n",
"    event_dir = f\"{output_path}/event_{event_type}\"\n",
"    csv_path = f\"{event_dir}/event_{int(hadm_id)}.csv\"\n",
"    group_df.to_csv(csv_path, index=False)\n",
"    return True"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
"id": "29ca0184",
"metadata": {},
"source": [
"## patients"
]
},
{
"cell_type": "code",
"id": "ce3d37a4",
"metadata": {},
"source": [
"patients = pd.read_csv(os.path.join(mimic_iv_path, \"hosp/patients.csv.gz\"))\n",
"print(patients.shape)\n",
"patients.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "96e9a2d4",
"metadata": {},
"source": [
"cohort = cohort.merge(patients[[\"subject_id\", \"gender\", \"anchor_age\", \"anchor_year\"]], on=\"subject_id\", how=\"inner\")\n",
"cohort[\"age\"] = cohort.hadm_intime.dt.year - cohort.anchor_year + cohort.anchor_age\n",
"print(cohort.shape)\n",
"cohort.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "7e56a347",
"metadata": {},
"source": [
"print(cohort.age.min())\n",
"print(cohort.age.max())\n",
"print(cohort.age.mean())\n",
"print(cohort.age.std())"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "a3dc3f7d",
"metadata": {},
"source": [
"cohort.gender.value_counts()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
"id": "685091a4",
"metadata": {},
"source": [
"## admissions"
]
},
{
"cell_type": "code",
"id": "20da413d",
"metadata": {},
"source": [
"admissions = pd.read_csv(os.path.join(mimic_iv_path, \"hosp/admissions.csv.gz\"))\n",
"print(admissions.shape)\n",
"admissions.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "e5796ed0",
"metadata": {},
"source": [
"cohort = cohort.merge(admissions[[\"subject_id\", \"hadm_id\", \"admission_type\", \"admission_location\", \"insurance\", \"language\", \"marital_status\", \"race\"]], on=[\"subject_id\", \"hadm_id\"], how=\"inner\")\n",
"print(cohort.shape)\n",
"cohort.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
"id": "3b1486b0",
"metadata": {},
"source": [
"## discharge"
]
},
{
"cell_type": "code",
"id": "f3c47a56",
"metadata": {},
"source": [
"discharge = pd.read_csv(os.path.join(mimic_iv_note_path, \"note/discharge.csv.gz\"))\n",
"print(discharge.shape)\n",
"discharge.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "695c5b5a",
"metadata": {},
"source": [
"import re\n",
"\n",
"def extract_chief_complaint(discharge_summary):\n",
"    \"\"\"Return the chief complaint text from a discharge summary, or None.\n",
"\n",
"    Searches for a `Chief Complaint:` header (or the `___ Complaint:` variant) and\n",
"    returns the first run of non-whitespace-led text after it, up to the end of\n",
"    that line, with surrounding whitespace removed.\n",
"\n",
"    Args:\n",
"        discharge_summary: Full note text. Non-string values (for example NaN or\n",
"            None for a missing note) are treated as having no chief complaint.\n",
"\n",
"    Returns:\n",
"        The complaint as a string, or None when the header is absent or no text\n",
"        follows it.\n",
"    \"\"\"\n",
"    # re.search raises TypeError on non-strings such as NaN, so bail out early.\n",
"    if not isinstance(discharge_summary, str):\n",
"        return None\n",
"\n",
"    # \\s* after the colon may span newlines, so a complaint written on the line below\n",
"    # the header is still captured; (?:\\n|$) also accepts a complaint that ends the text.\n",
"    pattern = r\"(?:Chief Complaint|___ Complaint):\\s*(.+?)\\s*(?:\\n|$)\"\n",
"    match = re.search(pattern, discharge_summary)\n",
"    return match.group(1).strip() if match else None"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "9aa30a39",
"metadata": {},
"source": [
"extract_chief_complaint(discharge.iloc[42332].text)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "7f32adac",
"metadata": {},
"source": [
"extract_chief_complaint(discharge.iloc[4332].text)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "8339a776",
"metadata": {},
"source": [
"discharge[\"chief_complaint\"] = discharge.text.parallel_apply(extract_chief_complaint)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "bb36888f",
"metadata": {},
"source": [
"discharge.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "c251a181",
"metadata": {},
"source": [
"discharge.isna().sum()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "88ceedbd",
"metadata": {},
"source": [
"cohort = cohort.merge(discharge[[\"subject_id\", \"hadm_id\", \"chief_complaint\"]], on=[\"subject_id\", \"hadm_id\"], how=\"inner\")\n",
"print(cohort.shape)\n",
"cohort.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "markdown",
"id": "2f2eefa1",
"metadata": {},
"source": [
"## post-process"
]
},
{
"cell_type": "code",
"id": "6f504122",
"metadata": {},
"source": [
"cohort = cohort.drop(columns=[\"anchor_age\", \"anchor_year\"])\n",
"cohort.head()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "048fdb57",
"metadata": {},
"source": [
"cohort.isna().sum()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "192fb0a1",
"metadata": {},
"source": [
"cohort.admission_type.unique()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "fd7ea73b",
"metadata": {},
"source": [
"cohort.admission_location.unique()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "2ef33de0",
"metadata": {},
"source": [
"cohort.insurance.unique()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "01d704cc",
"metadata": {},
"source": [
"cohort.language.unique()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "85d70f6f",
"metadata": {},
"source": [
"cohort.marital_status.unique()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "4a4047f3",
"metadata": {},
"source": [
"cohort.race.unique()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "de006e4c",
"metadata": {},
"source": "event_type = \"patient_demographics\"",
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "41fa6397",
"metadata": {},
"source": [
"def generate_event_value(x):\n",
"    \"\"\"Render one cohort row as a comma-separated demographics description.\n",
"\n",
"    Gender, age, race and insurance are always written; marital status is\n",
"    inserted before insurance only when it is not missing.\n",
"    \"\"\"\n",
"    fields = [f\"gender: {x.gender}\", f\"age: {x.age}\", f\"race: {x.race}\"]\n",
"    if pd.notna(x.marital_status):\n",
"        fields.append(f\"marital status: {x.marital_status}\")\n",
"    fields.append(f\"insurance: {x.insurance}\")\n",
"    return \", \".join(fields)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "3d53e74e",
"metadata": {},
"source": [
"meta_cols = [\"gender\", \"age\", \"race\", \"marital_status\", \"insurance\"]\n",
"for c in meta_cols:\n",
" cohort[\"meta_\" + c] = cohort[c]\n",
"meta_cols = [\"meta_\" + c for c in meta_cols]"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "1cf76546",
"metadata": {},
"source": [
"cohort[\"timestamp\"] = 0\n",
"cohort[\"timestamp_avail\"] = 0"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "88c506ec",
"metadata": {},
"source": [
"print(generate_event_value(cohort.iloc[5]))"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "eba10521",
"metadata": {},
"source": [
"print(generate_event_value(cohort.iloc[520]))"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "5d832292",
"metadata": {},
"source": [
"cohort[\"event_type\"] = event_type\n",
"cohort[\"event_value\"] = cohort.parallel_apply(generate_event_value, axis=1)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "3c46789d",
"metadata": {},
"source": [
"cohort[cohort.hadm_id == 29079034]"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "bee6b1f6",
"metadata": {},
"source": [
"cohort.groupby(\"hadm_id\").event_type.count().describe()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "56e27ab3",
"metadata": {},
"source": [
"# Remove any existing export directory for this event type before it is recreated below.\n",
"# Guarded so a first run, where the directory does not exist yet, is not an error.\n",
"event_dir = f\"{output_path}/event_{event_type}\"\n",
"if os.path.isdir(event_dir):\n",
"    shutil.rmtree(event_dir)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "30433d47",
"metadata": {},
"source": [
"create_directory(f\"{output_path}/event_{event_type}\")"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "0ad33569",
"metadata": {},
"source": [
"# save_group writes one CSV per hadm_id; the writes are dispatched to a small thread pool.\n",
"groups = cohort.groupby(\"hadm_id\")\n",
"export_cols = [\"hadm_id\", \"event_type\", \"timestamp\", \"event_value\", \"timestamp_avail\"] + meta_cols\n",
"\n",
"with ThreadPoolExecutor(max_workers=4) as executor:\n",
"    futures = [\n",
"        executor.submit(save_group, group_df[export_cols], hadm_id, event_type)\n",
"        for hadm_id, group_df in groups\n",
"    ]\n",
"    # result() re-raises any exception from save_group instead of dropping it silently,\n",
"    # and the progress bar now tracks completed writes rather than submissions.\n",
"    for future in tqdm(as_completed(futures), total=len(futures)):\n",
"        future.result()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "43e3f09a",
"metadata": {},
"source": [
"!ls -1 {output_path}/'event_{event_type}' | wc -l"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "5ec6d393",
"metadata": {},
"source": "event_type = \"admission_info\"",
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "73f5aa52",
"metadata": {},
"source": [
"# NOTE: this redefines the demographics generate_event_value from earlier in the\n",
"# notebook, so cells must be run in order.\n",
"def generate_event_value(x):\n",
"    \"\"\"Render one cohort row as a comma-separated admission description.\n",
"\n",
"    Admission type and location are always written; the chief complaint is\n",
"    appended only when it is not missing.\n",
"    \"\"\"\n",
"    parts = [f\"type: {x.admission_type}\", f\"location: {x.admission_location}\"]\n",
"    if pd.notna(x.chief_complaint):\n",
"        parts.append(f\"chief complaint: {x.chief_complaint}\")\n",
"    return \", \".join(parts)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "8c1d9ea5",
"metadata": {},
"source": [
"meta_cols = [\"admission_type\", \"admission_location\", \"chief_complaint\"]\n",
"for c in meta_cols:\n",
" cohort[\"meta_\" + c] = cohort[c]\n",
"meta_cols = [\"meta_\" + c for c in meta_cols]"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "814a4d14",
"metadata": {},
"source": [
"print(generate_event_value(cohort.iloc[5]))"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "a3041145",
"metadata": {},
"source": [
"print(generate_event_value(cohort.iloc[520]))"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "cf03657c",
"metadata": {},
"source": [
"cohort[\"event_type\"] = event_type\n",
"cohort[\"event_value\"] = cohort.parallel_apply(generate_event_value, axis=1)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "3eeaa8ae",
"metadata": {},
"source": [
"cohort[cohort.hadm_id == 29079034]"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "6af53072",
"metadata": {},
"source": [
"cohort.groupby(\"hadm_id\").event_type.count().describe()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "2aa672f8",
"metadata": {},
"source": [
"# Remove any existing export directory for this event type before it is recreated below.\n",
"# Guarded so a first run, where the directory does not exist yet, is not an error.\n",
"event_dir = f\"{output_path}/event_{event_type}\"\n",
"if os.path.isdir(event_dir):\n",
"    shutil.rmtree(event_dir)"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "161a75a7",
"metadata": {},
"source": [
"create_directory(f\"{output_path}/event_{event_type}\")"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "334a9676",
"metadata": {},
"source": [
"# save_group writes one CSV per hadm_id; the writes are dispatched to a small thread pool.\n",
"groups = cohort.groupby(\"hadm_id\")\n",
"export_cols = [\"hadm_id\", \"event_type\", \"timestamp\", \"event_value\", \"timestamp_avail\"] + meta_cols\n",
"\n",
"with ThreadPoolExecutor(max_workers=4) as executor:\n",
"    futures = [\n",
"        executor.submit(save_group, group_df[export_cols], hadm_id, event_type)\n",
"        for hadm_id, group_df in groups\n",
"    ]\n",
"    # result() re-raises any exception from save_group instead of dropping it silently,\n",
"    # and the progress bar now tracks completed writes rather than submissions.\n",
"    for future in tqdm(as_completed(futures), total=len(futures)):\n",
"        future.result()"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "318c6395",
"metadata": {},
"source": [
"!ls -1 {output_path}/'event_{event_type}' | wc -l"
],
"outputs": [],
"execution_count": null
},
{
"cell_type": "code",
"id": "a8e66a9f",
"metadata": {},
"source": [],
"outputs": [],
"execution_count": null
}
],
"metadata": {
"kernelspec": {
"display_name": "pytorch20",
"language": "python",
"name": "pytorch20"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 5
}