a b/src/preprocess/03_merge_events.ipynb
1
{
2
 "cells": [
3
  {
   "cell_type": "code",
   "id": "debdace9",
   "metadata": {},
   "source": [
    "import os\n",
    "import sys\n",
    "\n",
    "# Make the directory two levels up importable (the next cell imports `src.utils`).\n",
    "src_path = os.path.abspath(\"../..\")\n",
    "print(src_path)\n",
    "# Guarded so that re-running this cell does not keep appending the same entry.\n",
    "if src_path not in sys.path:\n",
    "    sys.path.append(src_path)"
   ],
   "outputs": [],
   "execution_count": null
  },
18
  {
19
   "cell_type": "code",
20
   "id": "6bad1e09",
21
   "metadata": {},
22
   "source": [
23
    "from src.utils import create_directory, raw_data_path, processed_data_path, set_seed"
24
   ],
25
   "outputs": [],
26
   "execution_count": null
27
  },
28
  {
29
   "cell_type": "code",
30
   "id": "5d9bc78c",
31
   "metadata": {},
32
   "source": [
33
    "set_seed(seed=42)"
34
   ],
35
   "outputs": [],
36
   "execution_count": null
37
  },
38
  {
39
   "cell_type": "code",
40
   "id": "13d22a57",
41
   "metadata": {},
42
   "source": [
43
    "import pandas as pd"
44
   ],
45
   "outputs": [],
46
   "execution_count": null
47
  },
48
  {
49
   "cell_type": "code",
50
   "id": "dd9852d5",
51
   "metadata": {},
52
   "source": [
53
    "mimic_iv_path = os.path.join(raw_data_path, \"physionet.org/files/mimiciv/2.2\")\n",
54
    "output_path = os.path.join(processed_data_path, \"mimic4\")"
55
   ],
56
   "outputs": [],
57
   "execution_count": null
58
  },
59
  {
60
   "cell_type": "code",
61
   "id": "b6a27998",
62
   "metadata": {},
63
   "source": [
64
    "cohort = pd.read_csv(os.path.join(output_path, \"cohort.csv\"))\n",
65
    "print(cohort.shape)\n",
66
    "cohort.head()"
67
   ],
68
   "outputs": [],
69
   "execution_count": null
70
  },
71
  {
   "cell_type": "code",
   "id": "9dd92e23",
   "metadata": {},
   "source": [
    "# Run the four in/out time columns of the cohort through pd.to_datetime.\n",
    "for time_col in (\"hadm_intime\", \"hadm_outtime\", \"stay_intime\", \"stay_outtime\"):\n",
    "    cohort[time_col] = pd.to_datetime(cohort[time_col])"
   ],
   "outputs": [],
   "execution_count": null
  },
84
  {
85
   "cell_type": "code",
86
   "id": "8f55c793",
87
   "metadata": {},
88
   "source": [
89
    "hadm_ids = set(cohort.hadm_id.unique().tolist())\n",
90
    "len(hadm_ids)"
91
   ],
92
   "outputs": [],
93
   "execution_count": null
94
  },
95
  {
   "cell_type": "code",
   "id": "d03d447c",
   "metadata": {},
   "source": [
    "import ast\n",
    "import numpy as np\n",
    "\n",
    "\n",
    "def safe_literal_eval(s):\n",
    "    \"\"\"Parse a Python-literal string, tolerating missing and already-parsed values.\n",
    "\n",
    "    Args:\n",
    "        s: a string holding a Python literal, a missing value (NaN/None), or an\n",
    "            object that is not a string (e.g. the result of an earlier call).\n",
    "\n",
    "    Returns:\n",
    "        The evaluated literal for strings, `np.nan` for missing values, and any\n",
    "        other non-string object unchanged.\n",
    "\n",
    "    Raises:\n",
    "        ValueError, SyntaxError: `s` is a string that is not a valid literal.\n",
    "    \"\"\"\n",
    "    if isinstance(s, str):\n",
    "        return ast.literal_eval(s)\n",
    "    # pd.isna() on a list/array is element-wise and cannot be used in `if`, so only\n",
    "    # scalars are tested for missingness.\n",
    "    if pd.api.types.is_scalar(s) and pd.isna(s):\n",
    "        return np.nan\n",
    "    # Already parsed: pass through so that applying this function twice is harmless.\n",
    "    return s\n",
    "\n",
    "\n",
    "cohort.label_diagnosis = cohort.label_diagnosis.apply(safe_literal_eval)"
   ],
   "outputs": [],
   "execution_count": null
  },
115
  {
   "cell_type": "markdown",
   "id": "5a9d60c6",
   "metadata": {},
   "source": [
    "## Helper: thread pool, progress bar and pandarallel setup"
   ]
  },
123
  {
124
   "cell_type": "code",
125
   "id": "5171bbae",
126
   "metadata": {},
127
   "source": [
128
    "from concurrent.futures import ThreadPoolExecutor\n",
129
    "from tqdm import tqdm\n",
130
    "from pandarallel import pandarallel"
131
   ],
132
   "outputs": [],
133
   "execution_count": null
134
  },
135
  {
136
   "cell_type": "code",
137
   "id": "5d6b9ce2",
138
   "metadata": {},
139
   "source": [
140
    "pandarallel.initialize(progress_bar=True)"
141
   ],
142
   "outputs": [],
143
   "execution_count": null
144
  },
145
  {
   "cell_type": "markdown",
   "id": "e77f628d",
   "metadata": {},
   "source": [
    "## Merge: combine the selected per-admission event files"
   ]
  },
153
  {
   "cell_type": "code",
   "id": "f86a3633",
   "metadata": {},
   "source": [
    "# Event tables whose per-admission files are merged, in the order they are read.\n",
    "events_selected = [\n",
    "    \"labevents\", \"microbiologyevents\", \"prescriptions\",\n",
    "    \"transfers\", \"procedureevents\",\n",
    "]"
   ],
   "outputs": [],
   "execution_count": null
  },
169
  {
   "cell_type": "code",
   "id": "7ae555e1",
   "metadata": {},
   "source": [
    "def merge_and_save(events, hadm_id, folder_name):\n",
    "    \"\"\"Merge one admission's per-event CSV files into a single CSV.\n",
    "\n",
    "    For each name in `events`, `event_<name>/event_<hadm_id>.csv` under\n",
    "    `output_path` is read (missing files are skipped). The rows are stacked,\n",
    "    sorted by `timestamp`, and placed after the admission's\n",
    "    `event_patient_demographics` and `event_admission_info` rows. The result\n",
    "    is written to `<folder_name>/event_<hadm_id>.csv` under `output_path`.\n",
    "\n",
    "    Args:\n",
    "        events: names of the event tables to merge.\n",
    "        hadm_id: admission id that appears in the per-admission file names.\n",
    "        folder_name: sub-folder of `output_path` that receives the merged file.\n",
    "\n",
    "    Returns:\n",
    "        True after the merged file has been written.\n",
    "\n",
    "    Raises:\n",
    "        AssertionError: no event file exists for `hadm_id`.\n",
    "        FileNotFoundError: the demographics or admission-info file is missing.\n",
    "    \"\"\"\n",
    "    columns = [\"hadm_id\", \"event_type\", \"timestamp\", \"event_value\", \"timestamp_avail\"]\n",
    "\n",
    "    def csv_in(folder):\n",
    "        # Every per-admission file lives at <output_path>/<folder>/event_<hadm_id>.csv\n",
    "        return os.path.join(output_path, f\"{folder}/event_{hadm_id}.csv\")\n",
    "\n",
    "    frames = []\n",
    "    for event in events:\n",
    "        try:\n",
    "            frames.append(pd.read_csv(csv_in(f\"event_{event}\"), usecols=columns))\n",
    "        except FileNotFoundError:\n",
    "            # Skipped on purpose; presumably this admission has no events of that type.\n",
    "            continue\n",
    "\n",
    "    assert len(frames) > 0, hadm_id\n",
    "    merged = pd.concat(frames)\n",
    "    merged.hadm_id = merged.hadm_id.astype(int)\n",
    "    merged = merged.sort_values(by=\"timestamp\", ascending=True)\n",
    "\n",
    "    # Demographics and admission info go first and are not part of the timestamp sort.\n",
    "    demographics = pd.read_csv(csv_in(\"event_patient_demographics\"))\n",
    "    admission_info = pd.read_csv(csv_in(\"event_admission_info\"))\n",
    "    merged = pd.concat([demographics, admission_info, merged])[columns]\n",
    "\n",
    "    merged.to_csv(csv_in(folder_name), index=False)\n",
    "    return True"
   ],
   "outputs": [],
   "execution_count": null
  },
204
  {
   "cell_type": "code",
   "id": "98067450",
   "metadata": {},
   "source": [
    "import shutil\n",
    "\n",
    "# Remove any previous merge output; a missing folder (first run) is not an error.\n",
    "# Done in Python rather than through the shell so the path needs no quoting.\n",
    "shutil.rmtree(os.path.join(output_path, \"event_selected\"), ignore_errors=True)"
   ],
   "outputs": [],
   "execution_count": null
  },
214
  {
215
   "cell_type": "code",
216
   "id": "81096fa5",
217
   "metadata": {},
218
   "source": [
219
    "create_directory(f\"{output_path}/event_selected\")"
220
   ],
221
   "outputs": [],
222
   "execution_count": null
223
  },
224
  {
   "cell_type": "code",
   "id": "2858ec13",
   "metadata": {},
   "source": [
    "# Admission id -> exception raised while merging it (stays empty when all succeed).\n",
    "merge_failures = {}\n",
    "with ThreadPoolExecutor(max_workers=4) as executor:\n",
    "    # One merge task per admission; remember which id each future belongs to.\n",
    "    futures = {\n",
    "        executor.submit(merge_and_save, events_selected, hadm_id, \"event_selected\"): hadm_id\n",
    "        for hadm_id in hadm_ids\n",
    "    }\n",
    "    # Waiting on every future makes the bar track finished merges and surfaces worker\n",
    "    # exceptions, which are otherwise lost together with the discarded future.\n",
    "    for future, hadm_id in tqdm(futures.items(), total=len(futures)):\n",
    "        error = future.exception()\n",
    "        if error is not None:\n",
    "            merge_failures[hadm_id] = error\n",
    "\n",
    "print(f\"{len(merge_failures)} of {len(futures)} admissions could not be merged\")"
   ],
   "outputs": [],
   "execution_count": null
  },
241
  {
   "cell_type": "markdown",
   "id": "993412bf",
   "metadata": {},
   "source": [
    "## Stat: rows per merged file and `hadm_los` / `stay_los` / `len_selected` summaries"
   ]
  },
249
  {
250
   "cell_type": "code",
251
   "id": "78ff0517",
252
   "metadata": {},
253
   "source": [
254
    "from tqdm import tqdm"
255
   ],
256
   "outputs": [],
257
   "execution_count": null
258
  },
259
  {
   "cell_type": "code",
   "id": "87ca202e",
   "metadata": {},
   "source": [
    "# Row count of every merged per-admission file; admissions without a file count as 0.\n",
    "hadm_id_to_len = {}\n",
    "for hadm_id in tqdm(hadm_ids):\n",
    "    merged_csv = os.path.join(output_path, f\"event_selected/event_{hadm_id}.csv\")\n",
    "    try:\n",
    "        n_rows = len(pd.read_csv(merged_csv))\n",
    "    except FileNotFoundError:\n",
    "        print(f\"{hadm_id} not found!\")\n",
    "        n_rows = 0\n",
    "    hadm_id_to_len[hadm_id] = n_rows"
   ],
   "outputs": [],
   "execution_count": null
  },
277
  {
278
   "cell_type": "code",
279
   "id": "9e282998",
280
   "metadata": {},
281
   "source": [
282
    "cohort[\"len_selected\"] = cohort.hadm_id.map(hadm_id_to_len)\n",
283
    "cohort.head()"
284
   ],
285
   "outputs": [],
286
   "execution_count": null
287
  },
288
  {
289
   "cell_type": "code",
290
   "id": "4891f34e",
291
   "metadata": {},
292
   "source": [
293
    "len(cohort)"
294
   ],
295
   "outputs": [],
296
   "execution_count": null
297
  },
298
  {
299
   "cell_type": "code",
300
   "id": "79d1e1f8",
301
   "metadata": {},
302
   "source": [
303
    "cohort.hadm_los.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])"
304
   ],
305
   "outputs": [],
306
   "execution_count": null
307
  },
308
  {
309
   "cell_type": "code",
310
   "id": "4d89e2e7",
311
   "metadata": {},
312
   "source": [
313
    "cohort.stay_los.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])"
314
   ],
315
   "outputs": [],
316
   "execution_count": null
317
  },
318
  {
319
   "cell_type": "code",
320
   "id": "defa6a7e",
321
   "metadata": {},
322
   "source": [
323
    "cohort.len_selected.describe(percentiles=[.1, .25, .5, .75, .9, .95, .99])"
324
   ],
325
   "outputs": [],
326
   "execution_count": null
327
  },
328
  {
329
   "cell_type": "code",
330
   "id": "d8d8675e",
331
   "metadata": {},
332
   "source": "cohort.to_csv(os.path.join(output_path, 'cohort+len.csv'), index=False)",
333
   "outputs": [],
334
   "execution_count": null
335
  },
336
  {
337
   "cell_type": "code",
338
   "id": "6a846dff",
339
   "metadata": {},
340
   "source": [],
341
   "outputs": [],
342
   "execution_count": null
343
  }
344
 ],
345
 "metadata": {
346
  "kernelspec": {
347
   "display_name": "pytorch20",
348
   "language": "python",
349
   "name": "pytorch20"
350
  },
351
  "language_info": {
352
   "codemirror_mode": {
353
    "name": "ipython",
354
    "version": 3
355
   },
356
   "file_extension": ".py",
357
   "mimetype": "text/x-python",
358
   "name": "python",
359
   "nbconvert_exporter": "python",
360
   "pygments_lexer": "ipython3",
361
   "version": "3.9.19"
362
  }
363
 },
364
 "nbformat": 4,
365
 "nbformat_minor": 5
366
}