a b/mimicsql/evaluation/process_mimic_db.py
1
import os
2
import csv
3
import shutil
4
import sqlite3
5
from datetime import datetime
6
7
from process_mimic_db.utils import *
8
from process_mimic_db.process_tables import *
9
10
# Specify the path to the downloaded MIMIC III data
11
data_dir = '/home/nas1_userA/junwoopark/datasets/mimiciii/1.4/'
12
# Path to the generated mimic.db. No need to update.
13
out_dir = 'mimic_db'
14
15
# Generate five tables and the database with all admissions
16
# if os.path.exists(out_dir):
17
#     shutil.rmtree(out_dir)
18
# os.mkdir(out_dir)
19
conn = sqlite3.connect(os.path.join(out_dir, 'mimic_all.db'))
20
build_demographic_table(data_dir, out_dir, conn)
21
build_diagnoses_table(data_dir, out_dir, conn)
22
build_procedures_table(data_dir, out_dir, conn)
23
build_prescriptions_table(data_dir, out_dir, conn)
24
build_lab_table(data_dir, out_dir, conn)
25
26
'''
27
1. We did not emumerate all possible questions about MIMIC III.
28
MIMICSQL data is generated based on the patient information 
29
related to 100 randomly selected admissions.
30
2. The following codes are used for sampling the admissions 
31
from the large database. 
32
3. The parameter 'random_state=0' in line 41 will provide you 
33
the same set of sampled admissions and the same database as we used.
34
'''
35
36
print('Begin sampling ...')
37
# DEMOGRAPHIC
38
print('Processing DEMOGRAPHIC')
39
conn = sqlite3.connect(os.path.join(out_dir, 'mimic.db'))
40
data_demo = pandas.read_csv(os.path.join(out_dir, "DEMOGRAPHIC.csv"))
41
data_demo_sample = data_demo.sample(100, random_state=0)
42
for k, v in data_demo_sample.dtypes.items():
43
    data_demo_sample[k] = data_demo_sample[k].apply(lambda x: x.lower() if type(x) == str else x)
44
data_demo_sample.to_sql('DEMOGRAPHIC', conn, if_exists='replace', index=False)
45
sampled_id = data_demo_sample['HADM_ID'].values
46
47
# DIAGNOSES
48
print('Processing DIAGNOSES')
49
data_input = pandas.read_csv(os.path.join(out_dir, "DIAGNOSES.csv"))
50
data_filter = []
51
cnt = 0
52
for itm in sampled_id:
53
    msg = 'HADM_ID=='+str(itm)
54
    data_filter.append(data_input.query(msg))
55
    cnt += 1
56
    show_progress(cnt, len(sampled_id))
57
data_out = pandas.concat(data_filter, ignore_index=True)
58
for k, v in data_out.dtypes.items():
59
    data_out[k] = data_out[k].apply(lambda x: x.lower() if type(x) == str else x)
60
data_out.to_sql('DIAGNOSES', conn, if_exists='replace', index=False)
61
62
# PROCEDURES
63
print('Processing PROCEDURES')
64
data_input = pandas.read_csv(os.path.join(out_dir, "PROCEDURES.csv"))
65
data_filter = []
66
cnt = 0
67
for itm in sampled_id:
68
    msg = 'HADM_ID=='+str(itm)
69
    data_filter.append(data_input.query(msg))
70
    cnt += 1
71
    show_progress(cnt, len(sampled_id))
72
data_out = pandas.concat(data_filter, ignore_index=True)
73
for k, v in data_out.dtypes.items():
74
    data_out[k] = data_out[k].apply(lambda x: x.lower() if type(x) == str else x)
75
data_out.to_sql('PROCEDURES', conn, if_exists='replace', index=False)
76
77
# PRESCRIPTIONS
78
print('Processing PRESCRIPTIONS')
79
data_input = pandas.read_csv(os.path.join(out_dir, "PRESCRIPTIONS.csv"))
80
data_filter = []
81
cnt = 0
82
for itm in sampled_id:
83
    msg = 'HADM_ID=='+str(itm)
84
    data_filter.append(data_input.query(msg))
85
    cnt += 1
86
    show_progress(cnt, len(sampled_id))
87
data_out = pandas.concat(data_filter, ignore_index=True)
88
for k, v in data_out.dtypes.items():
89
    data_out[k] = data_out[k].apply(lambda x: x.lower() if type(x) == str else x)
90
data_out.to_sql('PRESCRIPTIONS', conn, if_exists='replace', index=False)
91
92
# LAB
93
print('Processing LAB')
94
data_input = pandas.read_csv(os.path.join(out_dir, "LAB.csv"))
95
data_filter = []
96
cnt = 0
97
for itm in sampled_id:
98
    msg = 'HADM_ID=='+str(itm)
99
    data_filter.append(data_input.query(msg))
100
    cnt += 1
101
    show_progress(cnt, len(sampled_id))
102
data_out = pandas.concat(data_filter, ignore_index=True)
103
for k, v in data_out.dtypes.items():
104
    data_out[k] = data_out[k].apply(lambda x: x.lower() if type(x) == str else x)
105
data_out.to_sql('LAB', conn, if_exists='replace', index=False)
106
print('Done!')