[780764]: / src / preprocess / 02_event_static.ipynb

Download this file

735 lines (734 with data), 16.4 kB

{
 "cells": [
  {
   "cell_type": "code",
   "id": "bf6469fe",
   "metadata": {},
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "# Directory two levels above the kernel's working directory, added to sys.path so the\n",
    "# `src.*` imports below resolve. NOTE(review): assumes the kernel is started from this\n",
    "# notebook's own folder - confirm.\n",
    "src_path = os.path.abspath('../..')\n",
    "print(src_path)\n",
    "# Guarded so that re-running this cell does not stack duplicate sys.path entries.\n",
    "if src_path not in sys.path:\n",
    "    sys.path.append(src_path)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "fcd74d2c",
   "metadata": {},
   "source": [
    "from src.utils import create_directory, raw_data_path, processed_data_path, set_seed"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "4cae42b3",
   "metadata": {},
   "source": [
    "set_seed(seed=42)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "57e86fc8",
   "metadata": {},
   "source": [
    "import pandas as pd"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "d5a4a2f7",
   "metadata": {},
   "source": [
    "# Raw inputs: the MIMIC-IV 2.2 tables and the MIMIC-IV-Note 2.2 notes, both located\n",
    "# under raw_data_path in a physionet.org/files/... directory tree.\n",
    "mimic_iv_path = os.path.join(raw_data_path, \"physionet.org/files/mimiciv/2.2\")\n",
    "mimic_iv_note_path = os.path.join(raw_data_path, \"physionet.org/files/mimic-iv-note/2.2\")\n",
    "# Processed data: cohort.csv is read from here and the event_* folders are written here.\n",
    "output_path = os.path.join(processed_data_path, \"mimic4\")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "3d241540",
   "metadata": {},
   "source": [
    "cohort = pd.read_csv(os.path.join(output_path, \"cohort.csv\"))\n",
    "print(cohort.shape)\n",
    "cohort.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "00486178",
   "metadata": {},
   "source": [
    "# Convert the admission- and stay-level boundary columns to datetimes.\n",
    "for time_col in (\"hadm_intime\", \"hadm_outtime\", \"stay_intime\", \"stay_outtime\"):\n",
    "    cohort[time_col] = pd.to_datetime(cohort[time_col])"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "64cc5546",
   "metadata": {},
   "source": [
    "hadm_ids = set(cohort.hadm_id.unique().tolist())\n",
    "len(hadm_ids)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "4774f5d5",
   "metadata": {},
   "source": [
    "## helper"
   ]
  },
  {
   "cell_type": "code",
   "id": "ff882410",
   "metadata": {},
   "source": [
    "from concurrent.futures import ThreadPoolExecutor\n",
    "from tqdm import tqdm\n",
    "from pandarallel import pandarallel"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "f841dfbb",
   "metadata": {},
   "source": [
    "pandarallel.initialize(progress_bar=True)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "65b989bc",
   "metadata": {},
   "source": [
    "def save_group(group_df, hadm_id, event_type):\n",
    "    \"\"\"Write one admission's rows to a CSV file and return True.\n",
    "\n",
    "    The file is ``<output_path>/event_<event_type>/event_<hadm_id>.csv``, with\n",
    "    ``hadm_id`` cast to ``int`` for the file name. The DataFrame index is not written.\n",
    "    The target folder is not created here; errors from ``to_csv`` propagate.\n",
    "    \"\"\"\n",
    "    target_dir = f\"{output_path}/event_{event_type}\"\n",
    "    target_file = f\"{target_dir}/event_{int(hadm_id)}.csv\"\n",
    "    group_df.to_csv(target_file, index=False)\n",
    "    return True"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "29ca0184",
   "metadata": {},
   "source": [
    "## patients"
   ]
  },
  {
   "cell_type": "code",
   "id": "ce3d37a4",
   "metadata": {},
   "source": [
    "patients = pd.read_csv(os.path.join(mimic_iv_path, \"hosp/patients.csv.gz\"))\n",
    "print(patients.shape)\n",
    "patients.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "96e9a2d4",
   "metadata": {},
   "source": [
    "# Attach gender and the anchor fields; the inner join keeps only subjects found in `patients`.\n",
    "patient_fields = patients[[\"subject_id\", \"gender\", \"anchor_age\", \"anchor_year\"]]\n",
    "cohort = cohort.merge(patient_fields, on=\"subject_id\", how=\"inner\")\n",
    "\n",
    "# Age at admission = anchor_age shifted by the years between anchor_year and the admission year.\n",
    "years_since_anchor = cohort.hadm_intime.dt.year - cohort.anchor_year\n",
    "cohort[\"age\"] = years_since_anchor + cohort.anchor_age\n",
    "print(cohort.shape)\n",
    "cohort.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "7e56a347",
   "metadata": {},
   "source": [
    "print(cohort.age.min())\n",
    "print(cohort.age.max())\n",
    "print(cohort.age.mean())\n",
    "print(cohort.age.std())"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "a3dc3f7d",
   "metadata": {},
   "source": [
    "cohort.gender.value_counts()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "685091a4",
   "metadata": {},
   "source": [
    "## admissions"
   ]
  },
  {
   "cell_type": "code",
   "id": "20da413d",
   "metadata": {},
   "source": [
    "admissions = pd.read_csv(os.path.join(mimic_iv_path, \"hosp/admissions.csv.gz\"))\n",
    "print(admissions.shape)\n",
    "admissions.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "e5796ed0",
   "metadata": {},
   "source": [
    "# Admission-level attributes to attach, joined on (subject_id, hadm_id).\n",
    "admission_cols = [\n",
    "    \"subject_id\", \"hadm_id\", \"admission_type\", \"admission_location\",\n",
    "    \"insurance\", \"language\", \"marital_status\", \"race\",\n",
    "]\n",
    "join_keys = [\"subject_id\", \"hadm_id\"]\n",
    "cohort = cohort.merge(admissions[admission_cols], on=join_keys, how=\"inner\")\n",
    "print(cohort.shape)\n",
    "cohort.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "3b1486b0",
   "metadata": {},
   "source": [
    "## discharge"
   ]
  },
  {
   "cell_type": "code",
   "id": "f3c47a56",
   "metadata": {},
   "source": [
    "discharge = pd.read_csv(os.path.join(mimic_iv_note_path, \"note/discharge.csv.gz\"))\n",
    "print(discharge.shape)\n",
    "discharge.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "695c5b5a",
   "metadata": {},
   "source": [
    "import re\n",
    "\n",
    "def extract_chief_complaint(discharge_summary):\n",
    "    \"\"\"Return the chief-complaint text found in a discharge summary, or None.\n",
    "\n",
    "    The header is matched either as 'Chief Complaint:' or as '___ Complaint:'.\n",
    "    Whitespace after the colon, including line breaks, is skipped, so the complaint\n",
    "    may sit on the header line or on a following line. The capture runs to the end\n",
    "    of that line, or to the end of the text when there is no trailing newline.\n",
    "\n",
    "    Args:\n",
    "        discharge_summary: Note text. Anything that is not a ``str`` (None, NaN, ...)\n",
    "            is treated as having no complaint.\n",
    "\n",
    "    Returns:\n",
    "        The stripped complaint string, or None when the header is absent or the\n",
    "        captured text is empty.\n",
    "    \"\"\"\n",
    "    # Missing note text reaches this function as None/NaN; re.search would raise on it.\n",
    "    if not isinstance(discharge_summary, str):\n",
    "        return None\n",
    "\n",
    "    # `(?:\\n|\\Z)` also accepts a complaint that is the last line of the text.\n",
    "    pattern = r\"(Chief Complaint|___ Complaint):\\s*(.+?)\\s*(?:\\n|\\Z)\"\n",
    "    match = re.search(pattern, discharge_summary)\n",
    "    if match is None:\n",
    "        return None\n",
    "\n",
    "    # A whitespace-only capture carries no information; report it as missing.\n",
    "    complaint = match.group(2).strip()\n",
    "    return complaint or None"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "9aa30a39",
   "metadata": {},
   "source": [
    "extract_chief_complaint(discharge.iloc[42332].text)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "7f32adac",
   "metadata": {},
   "source": [
    "extract_chief_complaint(discharge.iloc[4332].text)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "8339a776",
   "metadata": {},
   "source": [
    "discharge[\"chief_complaint\"] = discharge.text.parallel_apply(extract_chief_complaint)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "bb36888f",
   "metadata": {},
   "source": [
    "discharge.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "c251a181",
   "metadata": {},
   "source": [
    "discharge.isna().sum()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "88ceedbd",
   "metadata": {},
   "source": [
    "# Bring the extracted chief complaint onto the cohort.\n",
    "# Inner join: only admissions that have a row in `discharge` are kept.\n",
    "note_cols = [\"subject_id\", \"hadm_id\", \"chief_complaint\"]\n",
    "cohort = cohort.merge(discharge[note_cols], on=[\"subject_id\", \"hadm_id\"], how=\"inner\")\n",
    "print(cohort.shape)\n",
    "cohort.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "2f2eefa1",
   "metadata": {},
   "source": [
    "## post-process"
   ]
  },
  {
   "cell_type": "code",
   "id": "6f504122",
   "metadata": {},
   "source": [
    "cohort = cohort.drop(columns=[\"anchor_age\", \"anchor_year\"])\n",
    "cohort.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "048fdb57",
   "metadata": {},
   "source": [
    "cohort.isna().sum()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "192fb0a1",
   "metadata": {},
   "source": [
    "cohort.admission_type.unique()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "fd7ea73b",
   "metadata": {},
   "source": [
    "cohort.admission_location.unique()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "2ef33de0",
   "metadata": {},
   "source": [
    "cohort.insurance.unique()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "01d704cc",
   "metadata": {},
   "source": [
    "cohort.language.unique()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "85d70f6f",
   "metadata": {},
   "source": [
    "cohort.marital_status.unique()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "4a4047f3",
   "metadata": {},
   "source": [
    "cohort.race.unique()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "de006e4c",
   "metadata": {},
   "source": "event_type = \"patient_demographics\"",
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "41fa6397",
   "metadata": {},
   "source": [
    "def generate_event_value(x):\n",
    "    \"\"\"Render one cohort row as a comma-separated demographics string.\n",
    "\n",
    "    Gender, age, race and insurance are always emitted; marital status is\n",
    "    included only when it is not missing.\n",
    "    \"\"\"\n",
    "    parts = [f\"gender: {x.gender}\", f\"age: {x.age}\", f\"race: {x.race}\"]\n",
    "    if pd.notna(x.marital_status):\n",
    "        parts.append(f\"marital status: {x.marital_status}\")\n",
    "    parts.append(f\"insurance: {x.insurance}\")\n",
    "    return \", \".join(parts)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "3d53e74e",
   "metadata": {},
   "source": [
    "# Duplicate each demographic column under a `meta_` prefix; `meta_cols` ends up\n",
    "# holding the prefixed names, which the export cell appends to the event columns.\n",
    "source_cols = [\"gender\", \"age\", \"race\", \"marital_status\", \"insurance\"]\n",
    "meta_cols = [\"meta_\" + name for name in source_cols]\n",
    "for source_col, meta_col in zip(source_cols, meta_cols):\n",
    "    cohort[meta_col] = cohort[source_col]"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "1cf76546",
   "metadata": {},
   "source": [
    "cohort[\"timestamp\"] = 0\n",
    "cohort[\"timestamp_avail\"] = 0"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "88c506ec",
   "metadata": {},
   "source": [
    "print(generate_event_value(cohort.iloc[5]))"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "eba10521",
   "metadata": {},
   "source": [
    "print(generate_event_value(cohort.iloc[520]))"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "5d832292",
   "metadata": {},
   "source": [
    "cohort[\"event_type\"] = event_type\n",
    "cohort[\"event_value\"] = cohort.parallel_apply(generate_event_value, axis=1)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "3c46789d",
   "metadata": {},
   "source": [
    "cohort[cohort.hadm_id == 29079034]"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "bee6b1f6",
   "metadata": {},
   "source": [
    "cohort.groupby(\"hadm_id\").event_type.count().describe()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "56e27ab3",
   "metadata": {},
   "source": [
    "import shutil\n",
    "\n",
    "# Remove any previous export for this event type before it is regenerated.\n",
    "# Done in Python rather than with `!rm -r`: the shell form passed output_path\n",
    "# unquoted (breaks on paths with spaces) and reported an error when the folder\n",
    "# did not exist yet, e.g. on the first run.\n",
    "event_dir = f\"{output_path}/event_{event_type}\"\n",
    "if os.path.isdir(event_dir):\n",
    "    shutil.rmtree(event_dir)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "30433d47",
   "metadata": {},
   "source": [
    "create_directory(f\"{output_path}/event_{event_type}\")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "0ad33569",
   "metadata": {},
   "source": [
    "groups = cohort.groupby(\"hadm_id\")\n",
    "event_cols = [\"hadm_id\", \"event_type\", \"timestamp\", \"event_value\", \"timestamp_avail\"] + meta_cols\n",
    "\n",
    "# One CSV per admission, written by a small thread pool.\n",
    "with ThreadPoolExecutor(max_workers=4) as executor:\n",
    "    futures = [\n",
    "        executor.submit(save_group, group_df[event_cols], hadm_id, event_type)\n",
    "        for hadm_id, group_df in tqdm(groups, total=groups.ngroups)\n",
    "    ]\n",
    "\n",
    "# All writes have finished once the `with` block exits. Ask every future for its\n",
    "# result so an exception raised inside save_group is re-raised here instead of\n",
    "# being silently dropped.\n",
    "for future in futures:\n",
    "    future.result()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "43e3f09a",
   "metadata": {},
   "source": [
    "!ls -1 {output_path}/'event_{event_type}' | wc -l"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "5ec6d393",
   "metadata": {},
   "source": "event_type = \"admission_info\"",
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "73f5aa52",
   "metadata": {},
   "source": [
    "def generate_event_value(x):\n",
    "    \"\"\"Render one cohort row as a comma-separated admission-info string.\n",
    "\n",
    "    Admission type and location are always emitted; the chief complaint is\n",
    "    appended only when it is not missing.\n",
    "    \"\"\"\n",
    "    parts = [f\"type: {x.admission_type}\", f\"location: {x.admission_location}\"]\n",
    "    if pd.notna(x.chief_complaint):\n",
    "        parts.append(f\"chief complaint: {x.chief_complaint}\")\n",
    "    return \", \".join(parts)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "8c1d9ea5",
   "metadata": {},
   "source": [
    "# Duplicate each admission column under a `meta_` prefix; `meta_cols` ends up\n",
    "# holding the prefixed names, which the export cell appends to the event columns.\n",
    "source_cols = [\"admission_type\", \"admission_location\", \"chief_complaint\"]\n",
    "meta_cols = [\"meta_\" + name for name in source_cols]\n",
    "for source_col, meta_col in zip(source_cols, meta_cols):\n",
    "    cohort[meta_col] = cohort[source_col]"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "814a4d14",
   "metadata": {},
   "source": [
    "print(generate_event_value(cohort.iloc[5]))"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "a3041145",
   "metadata": {},
   "source": [
    "print(generate_event_value(cohort.iloc[520]))"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "cf03657c",
   "metadata": {},
   "source": [
    "cohort[\"event_type\"] = event_type\n",
    "cohort[\"event_value\"] = cohort.parallel_apply(generate_event_value, axis=1)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "3eeaa8ae",
   "metadata": {},
   "source": [
    "cohort[cohort.hadm_id == 29079034]"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "6af53072",
   "metadata": {},
   "source": [
    "cohort.groupby(\"hadm_id\").event_type.count().describe()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "2aa672f8",
   "metadata": {},
   "source": [
    "import shutil\n",
    "\n",
    "# Remove any previous export for this event type before it is regenerated.\n",
    "# Done in Python rather than with `!rm -r`: the shell form passed output_path\n",
    "# unquoted (breaks on paths with spaces) and reported an error when the folder\n",
    "# did not exist yet, e.g. on the first run.\n",
    "event_dir = f\"{output_path}/event_{event_type}\"\n",
    "if os.path.isdir(event_dir):\n",
    "    shutil.rmtree(event_dir)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "161a75a7",
   "metadata": {},
   "source": [
    "create_directory(f\"{output_path}/event_{event_type}\")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "334a9676",
   "metadata": {},
   "source": [
    "groups = cohort.groupby(\"hadm_id\")\n",
    "event_cols = [\"hadm_id\", \"event_type\", \"timestamp\", \"event_value\", \"timestamp_avail\"] + meta_cols\n",
    "\n",
    "# One CSV per admission, written by a small thread pool.\n",
    "with ThreadPoolExecutor(max_workers=4) as executor:\n",
    "    futures = [\n",
    "        executor.submit(save_group, group_df[event_cols], hadm_id, event_type)\n",
    "        for hadm_id, group_df in tqdm(groups, total=groups.ngroups)\n",
    "    ]\n",
    "\n",
    "# All writes have finished once the `with` block exits. Ask every future for its\n",
    "# result so an exception raised inside save_group is re-raised here instead of\n",
    "# being silently dropped.\n",
    "for future in futures:\n",
    "    future.result()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "318c6395",
   "metadata": {},
   "source": [
    "!ls -1 {output_path}/'event_{event_type}' | wc -l"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "a8e66a9f",
   "metadata": {},
   "source": [],
   "outputs": [],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pytorch20",
   "language": "python",
   "name": "pytorch20"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}