llemr / Git / Diff of /src/preprocess/02_event

Models:

philipB/

llemr

Downloads: 1

Diff of /src/preprocess/02_event_static.ipynb [000000] .. [780764]

Switch to unified view

 b/src/preprocess/02_event_static.ipynb
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "id": "bf6469fe",
+   "metadata": {},
+   "source": [
+    "import os\n",
+    "import sys\n",
+    "\n",
+    "src_path = os.path.abspath('../..')\n",
+    "print(src_path)\n",
+    "sys.path.append(src_path)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "fcd74d2c",
+   "metadata": {},
+   "source": [
+    "from src.utils import create_directory, raw_data_path, processed_data_path, set_seed"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "4cae42b3",
+   "metadata": {},
+   "source": [
+    "set_seed(seed=42)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "57e86fc8",
+   "metadata": {},
+   "source": [
+    "import pandas as pd"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "d5a4a2f7",
+   "metadata": {},
+   "source": [
+    "mimic_iv_path = os.path.join(raw_data_path, \"physionet.org/files/mimiciv/2.2\")\n",
+    "mimic_iv_note_path = os.path.join(raw_data_path, \"physionet.org/files/mimic-iv-note/2.2\")\n",
+    "output_path = os.path.join(processed_data_path, \"mimic4\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "3d241540",
+   "metadata": {},
+   "source": [
+    "cohort = pd.read_csv(os.path.join(output_path, \"cohort.csv\"))\n",
+    "print(cohort.shape)\n",
+    "cohort.head()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "00486178",
+   "metadata": {},
+   "source": [
+    "cohort[\"hadm_intime\"] = pd.to_datetime(cohort[\"hadm_intime\"])\n",
+    "cohort[\"hadm_outtime\"] = pd.to_datetime(cohort[\"hadm_outtime\"])\n",
+    "cohort[\"stay_intime\"] = pd.to_datetime(cohort[\"stay_intime\"])\n",
+    "cohort[\"stay_outtime\"] = pd.to_datetime(cohort[\"stay_outtime\"])"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "64cc5546",
+   "metadata": {},
+   "source": [
+    "hadm_ids = set(cohort.hadm_id.unique().tolist())\n",
+    "len(hadm_ids)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4774f5d5",
+   "metadata": {},
+   "source": [
+    "helper"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "ff882410",
+   "metadata": {},
+   "source": [
+    "from concurrent.futures import ThreadPoolExecutor\n",
+    "from tqdm import tqdm\n",
+    "from pandarallel import pandarallel"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "f841dfbb",
+   "metadata": {},
+   "source": [
+    "pandarallel.initialize(progress_bar=True)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "65b989bc",
+   "metadata": {},
+   "source": [
+    "def save_group(group_df, hadm_id, event_type):\n",
+    "    file_path = f\"{output_path}/event_{event_type}/event_{int(hadm_id)}.csv\"\n",
+    "    group_df.to_csv(file_path, index=False)\n",
+    "    return True"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "29ca0184",
+   "metadata": {},
+   "source": [
+    "## patients"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "ce3d37a4",
+   "metadata": {},
+   "source": [
+    "patients = pd.read_csv(os.path.join(mimic_iv_path, \"hosp/patients.csv.gz\"))\n",
+    "print(patients.shape)\n",
+    "patients.head()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "96e9a2d4",
+   "metadata": {},
+   "source": [
+    "cohort = cohort.merge(patients[[\"subject_id\", \"gender\", \"anchor_age\", \"anchor_year\"]], on=\"subject_id\", how=\"inner\")\n",
+    "cohort[\"age\"] = cohort.hadm_intime.dt.year - cohort.anchor_year + cohort.anchor_age\n",
+    "print(cohort.shape)\n",
+    "cohort.head()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "7e56a347",
+   "metadata": {},
+   "source": [
+    "print(cohort.age.min())\n",
+    "print(cohort.age.max())\n",
+    "print(cohort.age.mean())\n",
+    "print(cohort.age.std())"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "a3dc3f7d",
+   "metadata": {},
+   "source": [
+    "cohort.gender.value_counts()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "685091a4",
+   "metadata": {},
+   "source": [
+    "## admissions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "20da413d",
+   "metadata": {},
+   "source": [
+    "admissions = pd.read_csv(os.path.join(mimic_iv_path, \"hosp/admissions.csv.gz\"))\n",
+    "print(admissions.shape)\n",
+    "admissions.head()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "e5796ed0",
+   "metadata": {},
+   "source": [
+    "cohort = cohort.merge(admissions[[\"subject_id\", \"hadm_id\", \"admission_type\", \"admission_location\", \"insurance\", \"language\", \"marital_status\", \"race\"]], on=[\"subject_id\", \"hadm_id\"], how=\"inner\")\n",
+    "print(cohort.shape)\n",
+    "cohort.head()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "3b1486b0",
+   "metadata": {},
+   "source": [
+    "## discharge"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "f3c47a56",
+   "metadata": {},
+   "source": [
+    "discharge = pd.read_csv(os.path.join(mimic_iv_note_path, \"note/discharge.csv.gz\"))\n",
+    "print(discharge.shape)\n",
+    "discharge.head()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "695c5b5a",
+   "metadata": {},
+   "source": [
+    "import re\n",
+    "\n",
+    "def extract_chief_complaint(discharge_summary):\n",
+    "    # Define the regex pattern to capture the Chief Complaint text\n",
+    "    # The pattern looks for the literal string \"Chief Complaint:\" followed by any characters until the first newline\n",
+    "    pattern = r\"(Chief Complaint|___ Complaint):\\s*(.+?)\\s*\\n\"\n",
+    "    \n",
+    "    # Search for the pattern in the discharge summary\n",
+    "    match = re.search(pattern, discharge_summary)\n",
+    "    \n",
+    "    # If a match is found, return the captured group; otherwise, return None\n",
+    "    if match:\n",
+    "        return match.group(2).strip()  # Use strip to remove any extra whitespace\n",
+    "    else:\n",
+    "        return None"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "9aa30a39",
+   "metadata": {},
+   "source": [
+    "extract_chief_complaint(discharge.iloc[42332].text)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "7f32adac",
+   "metadata": {},
+   "source": [
+    "extract_chief_complaint(discharge.iloc[4332].text)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "8339a776",
+   "metadata": {},
+   "source": [
+    "discharge[\"chief_complaint\"] = discharge.text.parallel_apply(extract_chief_complaint)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "bb36888f",
+   "metadata": {},
+   "source": [
+    "discharge.head()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "c251a181",
+   "metadata": {},
+   "source": [
+    "discharge.isna().sum()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "88ceedbd",
+   "metadata": {},
+   "source": [
+    "cohort = cohort.merge(discharge[[\"subject_id\", \"hadm_id\", \"chief_complaint\"]], on=[\"subject_id\", \"hadm_id\"], how=\"inner\")\n",
+    "print(cohort.shape)\n",
+    "cohort.head()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2f2eefa1",
+   "metadata": {},
+   "source": [
+    "## post-process"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "id": "6f504122",
+   "metadata": {},
+   "source": [
+    "cohort = cohort.drop(columns=[\"anchor_age\", \"anchor_year\"])\n",
+    "cohort.head()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "048fdb57",
+   "metadata": {},
+   "source": [
+    "cohort.isna().sum()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "192fb0a1",
+   "metadata": {},
+   "source": [
+    "cohort.admission_type.unique()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "fd7ea73b",
+   "metadata": {},
+   "source": [
+    "cohort.admission_location.unique()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "2ef33de0",
+   "metadata": {},
+   "source": [
+    "cohort.insurance.unique()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "01d704cc",
+   "metadata": {},
+   "source": [
+    "cohort.language.unique()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "85d70f6f",
+   "metadata": {},
+   "source": [
+    "cohort.marital_status.unique()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "4a4047f3",
+   "metadata": {},
+   "source": [
+    "cohort.race.unique()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "de006e4c",
+   "metadata": {},
+   "source": "event_type = \"patient_demographics\"",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "41fa6397",
+   "metadata": {},
+   "source": [
+    "def generate_event_value(x):\n",
+    "    s = f\"gender: {x.gender}, age: {x.age}, race: {x.race}\"\n",
+    "    if not pd.isna(x.marital_status):\n",
+    "        s += f\", marital status: {x.marital_status}\"\n",
+    "    s += f\", insurance: {x.insurance}\"\n",
+    "    return s"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "3d53e74e",
+   "metadata": {},
+   "source": [
+    "meta_cols = [\"gender\", \"age\", \"race\", \"marital_status\", \"insurance\"]\n",
+    "for c in meta_cols:\n",
+    "    cohort[\"meta_\" + c] = cohort[c]\n",
+    "meta_cols = [\"meta_\" + c for c in meta_cols]"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "1cf76546",
+   "metadata": {},
+   "source": [
+    "cohort[\"timestamp\"] = 0\n",
+    "cohort[\"timestamp_avail\"] = 0"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "88c506ec",
+   "metadata": {},
+   "source": [
+    "print(generate_event_value(cohort.iloc[5]))"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "eba10521",
+   "metadata": {},
+   "source": [
+    "print(generate_event_value(cohort.iloc[520]))"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "5d832292",
+   "metadata": {},
+   "source": [
+    "cohort[\"event_type\"] = event_type\n",
+    "cohort[\"event_value\"] = cohort.parallel_apply(generate_event_value, axis=1)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "3c46789d",
+   "metadata": {},
+   "source": [
+    "cohort[cohort.hadm_id == 29079034]"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "bee6b1f6",
+   "metadata": {},
+   "source": [
+    "cohort.groupby(\"hadm_id\").event_type.count().describe()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "56e27ab3",
+   "metadata": {},
+   "source": [
+    "!rm -r {output_path}/'event_{event_type}'"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "30433d47",
+   "metadata": {},
+   "source": [
+    "create_directory(f\"{output_path}/event_{event_type}\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "0ad33569",
+   "metadata": {},
+   "source": [
+    "groups = cohort.groupby(\"hadm_id\")\n",
+    "    \n",
+    "with ThreadPoolExecutor(max_workers=4) as executor:\n",
+    "    for hadm_id, group_df in tqdm(groups, total=groups.ngroups):\n",
+    "        future = executor.submit(\n",
+    "            save_group, \n",
+    "            group_df[[\"hadm_id\", \"event_type\", \"timestamp\", \"event_value\", \"timestamp_avail\"] + meta_cols], \n",
+    "            hadm_id, \n",
+    "            event_type\n",
+    "        )"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "43e3f09a",
+   "metadata": {},
+   "source": [
+    "!ls -1 {output_path}/'event_{event_type}' | wc -l"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "5ec6d393",
+   "metadata": {},
+   "source": "event_type = \"admission_info\"",
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "73f5aa52",
+   "metadata": {},
+   "source": [
+    "def generate_event_value(x):\n",
+    "    s = f\"type: {x.admission_type}, location: {x.admission_location}\"\n",
+    "    if not pd.isna(x.chief_complaint):\n",
+    "        s += f\", chief complaint: {x.chief_complaint}\"\n",
+    "    return s"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "8c1d9ea5",
+   "metadata": {},
+   "source": [
+    "meta_cols = [\"admission_type\", \"admission_location\", \"chief_complaint\"]\n",
+    "for c in meta_cols:\n",
+    "    cohort[\"meta_\" + c] = cohort[c]\n",
+    "meta_cols = [\"meta_\" + c for c in meta_cols]"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "814a4d14",
+   "metadata": {},
+   "source": [
+    "print(generate_event_value(cohort.iloc[5]))"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "a3041145",
+   "metadata": {},
+   "source": [
+    "print(generate_event_value(cohort.iloc[520]))"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "cf03657c",
+   "metadata": {},
+   "source": [
+    "cohort[\"event_type\"] = event_type\n",
+    "cohort[\"event_value\"] = cohort.parallel_apply(generate_event_value, axis=1)"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "3eeaa8ae",
+   "metadata": {},
+   "source": [
+    "cohort[cohort.hadm_id == 29079034]"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "6af53072",
+   "metadata": {},
+   "source": [
+    "cohort.groupby(\"hadm_id\").event_type.count().describe()"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "2aa672f8",
+   "metadata": {},
+   "source": [
+    "!rm -r {output_path}/'event_{event_type}'"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "161a75a7",
+   "metadata": {},
+   "source": [
+    "create_directory(f\"{output_path}/event_{event_type}\")"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "334a9676",
+   "metadata": {},
+   "source": [
+    "groups = cohort.groupby(\"hadm_id\")\n",
+    "    \n",
+    "with ThreadPoolExecutor(max_workers=4) as executor:\n",
+    "    for hadm_id, group_df in tqdm(groups, total=groups.ngroups):\n",
+    "        future = executor.submit(\n",
+    "            save_group, \n",
+    "            group_df[[\"hadm_id\", \"event_type\", \"timestamp\", \"event_value\", \"timestamp_avail\"] + meta_cols], \n",
+    "            hadm_id, \n",
+    "            event_type\n",
+    "        )"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "318c6395",
+   "metadata": {},
+   "source": [
+    "!ls -1 {output_path}/'event_{event_type}' | wc -l"
+   ],
+   "outputs": [],
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "id": "a8e66a9f",
+   "metadata": {},
+   "source": [],
+   "outputs": [],
+   "execution_count": null
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "pytorch20",
+   "language": "python",
+   "name": "pytorch20"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.19"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}