|
a |
|
b/mimicsql/evaluation/process_mimic_db.py |
|
|
1 |
import csv
import os
import shutil
import sqlite3
from datetime import datetime

# Used throughout this script; previously only available implicitly via the
# star import below. Make the dependency explicit.
import pandas

from process_mimic_db.utils import *
from process_mimic_db.process_tables import *
|
|
9 |
|
|
|
10 |
# Specify the path to the downloaded MIMIC III data
data_dir = '/home/nas1_userA/junwoopark/datasets/mimiciii/1.4/'
# Path to the generated mimic.db. No need to update.
out_dir = 'mimic_db'

# Ensure the output directory exists before sqlite3.connect tries to create
# the database file inside it. Unlike the original (commented-out)
# rmtree/mkdir workflow, this never deletes previously generated tables.
os.makedirs(out_dir, exist_ok=True)

# Generate five tables and the database with all admissions.
# Each build_* helper (from process_mimic_db.process_tables) reads the raw
# MIMIC III CSVs in data_dir, writes a <TABLE>.csv into out_dir, and loads
# the table into the mimic_all.db connection.
conn = sqlite3.connect(os.path.join(out_dir, 'mimic_all.db'))
build_demographic_table(data_dir, out_dir, conn)
build_diagnoses_table(data_dir, out_dir, conn)
build_procedures_table(data_dir, out_dir, conn)
build_prescriptions_table(data_dir, out_dir, conn)
build_lab_table(data_dir, out_dir, conn)
|
|
25 |
|
|
|
26 |
''' |
|
|
27 |
1. We did not enumerate all possible questions about MIMIC III.
|
|
28 |
MIMICSQL data is generated based on the patient information |
|
|
29 |
related to 100 randomly selected admissions. |
|
|
30 |
2. The following code is used for sampling the admissions
|
|
31 |
from the large database. |
|
|
32 |
3. The parameter 'random_state=0' in line 41 will provide you |
|
|
33 |
the same set of sampled admissions and the same database as we used. |
|
|
34 |
''' |
|
|
35 |
|
|
|
36 |
print('Begin sampling ...')


def _lowercase_strings(df):
    """Lower-case every str cell of *df* in place (non-strings untouched).

    Replaces the copy-pasted ``dtypes.items()`` loops; iterating the columns
    directly is equivalent since ``dtypes`` is indexed by column name.
    """
    for col in df.columns:
        df[col] = df[col].apply(lambda x: x.lower() if isinstance(x, str) else x)


def _filter_table(conn, table_name, sampled_id):
    """Restrict out_dir/<table_name>.csv to the sampled admissions.

    Reads the full table CSV, keeps the rows of each HADM_ID in *sampled_id*
    (in the sampled order, matching the original per-id concat), lower-cases
    string cells, and writes the result as table *table_name* into *conn*.
    """
    print('Processing ' + table_name)
    data_input = pandas.read_csv(os.path.join(out_dir, table_name + ".csv"))
    data_filter = []
    total = len(sampled_id)
    for cnt, itm in enumerate(sampled_id, start=1):
        # Boolean mask instead of the original string-built
        # data_input.query('HADM_ID==' + str(itm)) — same rows, no query
        # string construction.
        data_filter.append(data_input[data_input['HADM_ID'] == itm])
        show_progress(cnt, total)
    data_out = pandas.concat(data_filter, ignore_index=True)
    _lowercase_strings(data_out)
    data_out.to_sql(table_name, conn, if_exists='replace', index=False)


# DEMOGRAPHIC: sample 100 admissions; random_state=0 reproduces the exact
# admission set (and hence database) used by the MIMICSQL authors.
print('Processing DEMOGRAPHIC')
conn = sqlite3.connect(os.path.join(out_dir, 'mimic.db'))
data_demo = pandas.read_csv(os.path.join(out_dir, "DEMOGRAPHIC.csv"))
data_demo_sample = data_demo.sample(100, random_state=0)
_lowercase_strings(data_demo_sample)
data_demo_sample.to_sql('DEMOGRAPHIC', conn, if_exists='replace', index=False)
sampled_id = data_demo_sample['HADM_ID'].values

# The remaining four tables all follow the same filter-by-HADM_ID recipe.
for _table in ('DIAGNOSES', 'PROCEDURES', 'PRESCRIPTIONS', 'LAB'):
    _filter_table(conn, _table, sampled_id)

# to_sql commits each table; close the connection now that we are done.
conn.close()
print('Done!')