[ab27bc]: / mimicsql / evaluation / process_mimic_db.py

Download this file

106 lines (97 with data), 3.9 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
import os
import csv
import shutil
import sqlite3
from datetime import datetime
from process_mimic_db.utils import *
from process_mimic_db.process_tables import *
# Specify the path to the downloaded MIMIC III data
data_dir = '/home/nas1_userA/junwoopark/datasets/mimiciii/1.4/'
# Path to the generated mimic.db. No need to update.
out_dir = 'mimic_db'
# Generate five tables and the database with all admissions
# if os.path.exists(out_dir):
# shutil.rmtree(out_dir)
# os.mkdir(out_dir)
conn = sqlite3.connect(os.path.join(out_dir, 'mimic_all.db'))
build_demographic_table(data_dir, out_dir, conn)
build_diagnoses_table(data_dir, out_dir, conn)
build_procedures_table(data_dir, out_dir, conn)
build_prescriptions_table(data_dir, out_dir, conn)
build_lab_table(data_dir, out_dir, conn)
'''
1. We did not emumerate all possible questions about MIMIC III.
MIMICSQL data is generated based on the patient information
related to 100 randomly selected admissions.
2. The following codes are used for sampling the admissions
from the large database.
3. The parameter 'random_state=0' in line 41 will provide you
the same set of sampled admissions and the same database as we used.
'''
print('Begin sampling ...')
# DEMOGRAPHIC
print('Processing DEMOGRAPHIC')
conn = sqlite3.connect(os.path.join(out_dir, 'mimic.db'))
data_demo = pandas.read_csv(os.path.join(out_dir, "DEMOGRAPHIC.csv"))
data_demo_sample = data_demo.sample(100, random_state=0)
for k, v in data_demo_sample.dtypes.items():
data_demo_sample[k] = data_demo_sample[k].apply(lambda x: x.lower() if type(x) == str else x)
data_demo_sample.to_sql('DEMOGRAPHIC', conn, if_exists='replace', index=False)
sampled_id = data_demo_sample['HADM_ID'].values
# DIAGNOSES
print('Processing DIAGNOSES')
data_input = pandas.read_csv(os.path.join(out_dir, "DIAGNOSES.csv"))
data_filter = []
cnt = 0
for itm in sampled_id:
msg = 'HADM_ID=='+str(itm)
data_filter.append(data_input.query(msg))
cnt += 1
show_progress(cnt, len(sampled_id))
data_out = pandas.concat(data_filter, ignore_index=True)
for k, v in data_out.dtypes.items():
data_out[k] = data_out[k].apply(lambda x: x.lower() if type(x) == str else x)
data_out.to_sql('DIAGNOSES', conn, if_exists='replace', index=False)
# PROCEDURES
print('Processing PROCEDURES')
data_input = pandas.read_csv(os.path.join(out_dir, "PROCEDURES.csv"))
data_filter = []
cnt = 0
for itm in sampled_id:
msg = 'HADM_ID=='+str(itm)
data_filter.append(data_input.query(msg))
cnt += 1
show_progress(cnt, len(sampled_id))
data_out = pandas.concat(data_filter, ignore_index=True)
for k, v in data_out.dtypes.items():
data_out[k] = data_out[k].apply(lambda x: x.lower() if type(x) == str else x)
data_out.to_sql('PROCEDURES', conn, if_exists='replace', index=False)
# PRESCRIPTIONS
print('Processing PRESCRIPTIONS')
data_input = pandas.read_csv(os.path.join(out_dir, "PRESCRIPTIONS.csv"))
data_filter = []
cnt = 0
for itm in sampled_id:
msg = 'HADM_ID=='+str(itm)
data_filter.append(data_input.query(msg))
cnt += 1
show_progress(cnt, len(sampled_id))
data_out = pandas.concat(data_filter, ignore_index=True)
for k, v in data_out.dtypes.items():
data_out[k] = data_out[k].apply(lambda x: x.lower() if type(x) == str else x)
data_out.to_sql('PRESCRIPTIONS', conn, if_exists='replace', index=False)
# LAB
print('Processing LAB')
data_input = pandas.read_csv(os.path.join(out_dir, "LAB.csv"))
data_filter = []
cnt = 0
for itm in sampled_id:
msg = 'HADM_ID=='+str(itm)
data_filter.append(data_input.query(msg))
cnt += 1
show_progress(cnt, len(sampled_id))
data_out = pandas.concat(data_filter, ignore_index=True)
for k, v in data_out.dtypes.items():
data_out[k] = data_out[k].apply(lambda x: x.lower() if type(x) == str else x)
data_out.to_sql('LAB', conn, if_exists='replace', index=False)
print('Done!')