[dec218]: / dataset_builder / preprocess_db.py

Download this file

94 lines (71 with data), 4.3 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import os
import argparse
def config(argv=None):
    """Parse the command-line options of the database preprocessing script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, so the
            existing zero-argument call keeps working.

    Returns:
        argparse.Namespace: the parsed options.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--split", default="test", type=str, choices=["train", "valid", "test"], help="split")
    parser.add_argument("--mimic_iv_dir", required=True, type=str, help="mimic iv directory")
    parser.add_argument("--mimic_cxr_jpg_dir", required=True, type=str, help="mimic cxr jpg directory")
    parser.add_argument("--chest_imagenome_dir", required=True, type=str, help="chest imagenome directory")
    parser.add_argument(
        "--db_name",
        required=True,
        type=str,
        choices=["mimic_iii", "eicu", "mimic_iv", "mimic_iv_cxr"],
        # Help text lists every accepted choice (it previously omitted mimic_iv_cxr).
        help="choose between mimic_iii, eicu, mimic_iv, mimic_iv_cxr",
    )
    parser.add_argument("--out_dir", default="../dataset/ehrsql", type=str, help="output file directory")
    parser.add_argument("--sample_icu_patient_only", action="store_true", help="sample only patients who went to the ICU")
    parser.add_argument("--num_patient", default=1000, type=int, help="number of patients")
    parser.add_argument("--deid", action="store_true", help="do deidentification")
    parser.add_argument("--timeshift", action="store_true", help="do time shift")
    parser.add_argument("--start_year", default=None, type=int, help="start sampling year")
    # mimic_iii: 2001 - 2012 => 2100 - 2105 / eicu: 2014 - 2015 => 2100 - 2105 / mimic_iv: 2008 - 2019 => 2100 - 2105
    parser.add_argument("--time_span", default=None, type=int, help="time span starting from start_year")
    parser.add_argument("--cur_patient_ratio", default=0.0, type=float, help="ratio of inpatient")
    parser.add_argument("--current_time", default=None, type=str, help="any record past current_time is removed")
    return parser.parse_args(argv)
def main(args):
    """Run the preprocessing pipeline for the database named by ``args.db_name``.

    For ``mimic_iv_cxr`` this constructs a ``Build_MIMIC_IV_CXR`` writer from
    the parsed options, calls its table builders in a fixed order and then
    calls ``generate_db()``.

    Args:
        args: Namespace-like object exposing the options defined by the
            command-line parser (``timeshift``, ``db_name``, ...).

    Raises:
        AssertionError: ``args.timeshift`` is set but ``start_year``,
            ``time_span`` or ``current_time`` is ``None``.
        NotImplementedError: ``args.db_name`` is ``mimic_iv`` or ``mimic_iii``.
        ValueError: ``args.db_name`` is any other value.
    """
    if args.timeshift:
        # Explicit checks rather than ``assert`` statements so the validation
        # still runs under ``python -O``; AssertionError and the messages are
        # kept so the failure looks the same as before.
        for option in ("start_year", "time_span", "current_time"):
            if getattr(args, option) is None:
                raise AssertionError('To do a time shift, "{}" must be specified'.format(option))

    if args.db_name == "mimic_iv_cxr":
        # The builder is imported only on this branch.
        from preprocess_utils.preprocess_db_mimic_iv_cxr import Build_MIMIC_IV_CXR

        print(vars(args))
        mimic_writer = Build_MIMIC_IV_CXR(
            split=args.split,
            mimic_iv_dir=args.mimic_iv_dir,
            mimic_cxr_jpg_dir=args.mimic_cxr_jpg_dir,
            chest_imagenome_dir=args.chest_imagenome_dir,
            out_dir=args.out_dir,
            db_name=args.db_name,
            num_patient=args.num_patient,
            sample_icu_patient_only=args.sample_icu_patient_only,
            deid=args.deid,
            timeshift=args.timeshift,
            start_year=args.start_year,
            time_span=args.time_span,
            cur_patient_ratio=args.cur_patient_ratio,
            current_time=args.current_time,
        )
        # The call order below is preserved exactly from the original script.
        mimic_writer.build_tb_cxr_table(flag_for_plus=False)  # tb_cxr
        mimic_writer.build_tb_cxr_table(flag_for_plus=True)  # tb_cxr_plus
        mimic_writer.build_admission_table()  # patients, admissions, icustays, transfers
        mimic_writer.build_dictionary_table()  # d_icu_diagnoses, d_icu_procedures, d_items, d_labitems
        mimic_writer.build_diagnosis_table()  # diagnoses_icd
        mimic_writer.build_procedure_table()  # procedures_icd
        mimic_writer.build_labevent_table()  # labevents
        mimic_writer.build_prescriptions_table()  # prescriptions
        mimic_writer.build_cost_table()  # cost
        mimic_writer.build_chartevent_table()  # chartevents
        mimic_writer.build_inputevent_table()  # inputevents
        mimic_writer.build_outputevent_table()  # outputevents
        mimic_writer.build_microbiology_table()  # microbiologyevents
        mimic_writer.generate_db()
    elif args.db_name in ("mimic_iv", "mimic_iii"):
        raise NotImplementedError("In this repository, we only support MIMIC-IV-CXR dataset")
    else:
        # NOTE(review): "eicu" is accepted by the command-line choices but has
        # no branch here, so it ends up in this error as well.
        raise ValueError("Invalid db_name")
if __name__ == "__main__":
    # Script entry point: parse the command line, run the pipeline, then
    # report completion on stdout.
    main(config())
    print("Done!\n")