llemr / Git / [780764] /src/preprocess/02_event_hosp

Models:
philipB/
llemr
Downloads: 1
[780764]: / src / preprocess / 02_event_hosp_microbiologyevents.ipynb
History
Download this file
428 lines (427 with data), 9.7 kB

{
 "cells": [
  {
   "cell_type": "code",
   "id": "debdace9",
   "metadata": {},
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "src_path = os.path.abspath(\"../..\")\n",
    "print(src_path)\n",
    "sys.path.append(src_path)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "6bad1e09",
   "metadata": {},
   "source": [
    "from src.utils import create_directory, raw_data_path, processed_data_path, set_seed"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "5d9bc78c",
   "metadata": {},
   "source": [
    "set_seed(seed=42)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "13d22a57",
   "metadata": {},
   "source": [
    "import pandas as pd"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "dd9852d5",
   "metadata": {},
   "source": [
    "mimic_iv_path = os.path.join(raw_data_path, \"physionet.org/files/mimiciv/2.2\")\n",
    "output_path = os.path.join(processed_data_path, \"mimic4\")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "b6a27998",
   "metadata": {},
   "source": [
    "cohort = pd.read_csv(os.path.join(output_path, \"cohort.csv\"))\n",
    "print(cohort.shape)\n",
    "cohort.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "9dd92e23",
   "metadata": {},
   "source": [
    "cohort[\"hadm_intime\"] = pd.to_datetime(cohort[\"hadm_intime\"])\n",
    "cohort[\"hadm_outtime\"] = pd.to_datetime(cohort[\"hadm_outtime\"])\n",
    "cohort[\"stay_intime\"] = pd.to_datetime(cohort[\"stay_intime\"])\n",
    "cohort[\"stay_outtime\"] = pd.to_datetime(cohort[\"stay_outtime\"])"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "8f55c793",
   "metadata": {},
   "source": [
    "hadm_ids = set(cohort.hadm_id.unique().tolist())\n",
    "len(hadm_ids)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "5a9d60c6",
   "metadata": {},
   "source": [
    "helper"
   ]
  },
  {
   "cell_type": "code",
   "id": "5171bbae",
   "metadata": {},
   "source": [
    "from concurrent.futures import ThreadPoolExecutor\n",
    "from tqdm import tqdm\n",
    "from pandarallel import pandarallel"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "5d6b9ce2",
   "metadata": {},
   "source": [
    "pandarallel.initialize(progress_bar=True)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "ab5525be",
   "metadata": {},
   "source": [
    "def save_group(group_df, hadm_id, event_type):\n",
    "    file_path = f\"{output_path}/event_{event_type}/event_{int(hadm_id)}.csv\"\n",
    "    group_df.to_csv(file_path, index=False)\n",
    "    return True"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "80a7a430",
   "metadata": {},
   "source": [
    "dict"
   ]
  },
  {
   "cell_type": "code",
   "id": "38fbe144",
   "metadata": {},
   "source": [
    "d_items = pd.read_csv(os.path.join(mimic_iv_path, \"icu/d_items.csv.gz\"))\n",
    "print(d_items.shape)\n",
    "d_items.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "28b698c5",
   "metadata": {},
   "source": [
    "d_labitems = pd.read_csv(os.path.join(mimic_iv_path, \"hosp/d_labitems.csv.gz\"))\n",
    "print(d_labitems.shape)\n",
    "d_labitems.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "markdown",
   "id": "8388eb65",
   "metadata": {},
   "source": [
    "## microbiologyevents"
   ]
  },
  {
   "cell_type": "code",
   "id": "e075f2db",
   "metadata": {},
   "source": [
    "event_type = \"microbiologyevents\""
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "841e4c08",
   "metadata": {},
   "source": [
    "!rm -r {output_path}/event_{event_type}"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "369b9488",
   "metadata": {},
   "source": [
    "create_directory(f\"{output_path}/event_{event_type}\")"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "3141e899",
   "metadata": {
    "scrolled": true
   },
   "source": [
    "df_raw = pd.read_csv(os.path.join(mimic_iv_path, f\"hosp/{event_type}.csv.gz\"))\n",
    "print(df_raw.shape)\n",
    "df_raw.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "f4a241a6",
   "metadata": {},
   "source": [
    "df = df_raw.merge(cohort[[\"hadm_id\", \"hadm_intime\"]], on=[\"hadm_id\"], how=\"inner\")\n",
    "print(df.shape)\n",
    "df.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "8d76dad0",
   "metadata": {},
   "source": [
    "df[\"charttime\"] = pd.to_datetime(df[\"charttime\"])\n",
    "df[\"storetime\"] = pd.to_datetime(df[\"storetime\"])\n",
    "df[\"timestamp\"] = (df.charttime - df.hadm_intime).dt.total_seconds() / 3600\n",
    "df[\"timestamp_avail\"] = (df.storetime - df.hadm_intime).dt.total_seconds() / 3600\n",
    "df.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "e1b5e814",
   "metadata": {},
   "source": [
    "df = df.sort_values([\"subject_id\", \"hadm_id\", \"timestamp\", \"spec_itemid\"], ascending=True)\n",
    "df.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "b578f760",
   "metadata": {},
   "source": [
    "df.isna().sum()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "3bc1208a",
   "metadata": {},
   "source": [
    "df = df.dropna(subset=[\"timestamp_avail\"])\n",
    "print(df.shape)\n",
    "df.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "f4b2e7d4",
   "metadata": {},
   "source": [
    "df.interpretation.unique()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "58b8535f",
   "metadata": {},
   "source": [
    "df.interpretation = df.interpretation.replace('S', 'sensitive')\n",
    "df.interpretation = df.interpretation.replace('R', 'resistant')\n",
    "df.interpretation = df.interpretation.replace('I', 'intermediate')\n",
    "df.head()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "4f149a6f",
   "metadata": {},
   "source": [
    "def generate_event_value(x):\n",
    "    s = f\"{x.test_name} on {x.spec_type_desc}\"\n",
    "    if not pd.isna(x.org_name):\n",
    "        s += f\", organism grew: {x.org_name}\"\n",
    "    if not pd.isna(x.ab_name):\n",
    "        s += f\", antibiotic tested: {x.ab_name}\"\n",
    "    if not pd.isna(x.interpretation):\n",
    "        s += f\", antibiotic sensitivity: {x.interpretation}\"\n",
    "    if not pd.isna(x.comments):\n",
    "        s += f\", comments: {x.comments}\"\n",
    "    return s"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "b30fca1c",
   "metadata": {},
   "source": [
    "meta_cols = [\"test_name\", \"spec_type_desc\", \"org_name\", \"ab_name\", \"interpretation\", \"comments\"]\n",
    "for c in meta_cols:\n",
    "    df[\"meta_\" + c] = df[c]\n",
    "meta_cols = [\"meta_\" + c for c in meta_cols]"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "e9fb38f1",
   "metadata": {},
   "source": [
    "df[\"event_type\"] = event_type\n",
    "df[\"event_value\"] = df.parallel_apply(generate_event_value, axis=1)"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "9babf2a4",
   "metadata": {},
   "source": [
    "df[df.hadm_id == 29079034]"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "ade4facf",
   "metadata": {},
   "source": [
    "df.groupby(\"hadm_id\").timestamp.count().describe()"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "96874ba9",
   "metadata": {},
   "source": [
    "groups = df.groupby(\"hadm_id\")\n",
    "    \n",
    "with ThreadPoolExecutor(max_workers=4) as executor:\n",
    "    for hadm_id, group_df in tqdm(groups, total=groups.ngroups):\n",
    "        future = executor.submit(\n",
    "            save_group, \n",
    "            group_df[[\"hadm_id\", \"event_type\", \"timestamp\", \"event_value\", \"timestamp_avail\"] + meta_cols], \n",
    "            hadm_id, \n",
    "            event_type\n",
    "        )"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "b5318d52",
   "metadata": {},
   "source": [
    "!ls -1 {output_path}/event_{event_type} | wc -l"
   ],
   "outputs": [],
   "execution_count": null
  },
  {
   "cell_type": "code",
   "id": "6fbc0ea1",
   "metadata": {},
   "source": [],
   "outputs": [],
   "execution_count": null
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "pytorch20",
   "language": "python",
   "name": "pytorch20"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}