--- a
+++ b/dataset_builder/preprocess_db.py
@@ -0,0 +1,93 @@
+import os
+import argparse
+
+
+def config():
+    parser = argparse.ArgumentParser()
+
+    parser.add_argument("--split", default="test", type=str, choices=["train", "valid", "test"], help="split")
+
+    # parser.add_argument("--data_dir", required=True, type=str, help="csv file dir")
+    parser.add_argument("--mimic_iv_dir", required=True, type=str, help="mimic iv directory")
+    parser.add_argument("--mimic_cxr_jpg_dir", required=True, type=str, help="mimic cxr jpg directory")
+    parser.add_argument("--chest_imagenome_dir", required=True, type=str, help="chest imagenome directory")
+
+    parser.add_argument("--db_name", required=True, type=str, choices=["mimic_iii", "eicu", "mimic_iv", "mimic_iv_cxr"], help="choose between mimic_iii, eicu, mimic_iv")
+    parser.add_argument("--out_dir", default="../dataset/ehrsql", type=str, help="output file directory")
+
+    parser.add_argument("--sample_icu_patient_only", action="store_true", help="sample only patients who went to the ICU")
+    parser.add_argument("--num_patient", default=1000, type=int, help="number of patients")
+
+    parser.add_argument("--deid", action="store_true", help="do deidentification")
+
+    parser.add_argument("--timeshift", action="store_true", help="do time shift")
+    parser.add_argument("--start_year", default=None, type=int, help="start sampling year")
+    parser.add_argument(
+        "--time_span", default=None, type=int, help="time span starting from start_year"
+    )  # mimic_iii: 2001 - 2012 => 2100 - 2105 / eicu: 2014 - 2015 => 2100 - 2105  / mimic_iv: 2008 - 2019 => 2100 - 2105
+    parser.add_argument("--cur_patient_ratio", default=0.0, type=float, help="ratio of inpatient")
+    parser.add_argument("--current_time", default=None, type=str, help="any record past current_time is removed")
+
+    args = parser.parse_args()
+
+    return args
+
+
+def main(args):
+    if args.timeshift:
+        assert args.start_year is not None, 'To do a time shift, "start_year" must be specified'
+        assert args.time_span is not None, 'To do a time shift, "time_span" must be specified'
+        assert args.current_time is not None, 'To do a time shift, "current_time" must be specified'
+
+    if args.db_name == "mimic_iv_cxr":
+        from preprocess_utils.preprocess_db_mimic_iv_cxr import Build_MIMIC_IV_CXR
+
+        print(vars(args))
+
+        mimic_writer = Build_MIMIC_IV_CXR(
+            split=args.split,
+            mimic_iv_dir=args.mimic_iv_dir,
+            mimic_cxr_jpg_dir=args.mimic_cxr_jpg_dir,
+            chest_imagenome_dir=args.chest_imagenome_dir,
+            out_dir=args.out_dir,
+            db_name=args.db_name,
+            num_patient=args.num_patient,
+            sample_icu_patient_only=args.sample_icu_patient_only,
+            deid=args.deid,
+            timeshift=args.timeshift,
+            start_year=args.start_year,
+            time_span=args.time_span,
+            cur_patient_ratio=args.cur_patient_ratio,
+            current_time=args.current_time,
+        )
+
+        mimic_writer.build_tb_cxr_table(flag_for_plus=False)  # tb_cxr
+        mimic_writer.build_tb_cxr_table(flag_for_plus=True)  # tb_cxr_plus
+        mimic_writer.build_admission_table()  # patients, admissions, icustays, transfers
+        mimic_writer.build_dictionary_table()  # d_icu_diagnoses, d_icu_procedures, d_items, d_labitems
+        mimic_writer.build_diagnosis_table()  # diagnoses_icd
+        mimic_writer.build_procedure_table()  # procedures_icd
+        mimic_writer.build_labevent_table()  # labevents
+        mimic_writer.build_prescriptions_table()  # prescriptions
+        mimic_writer.build_cost_table()  # cost
+        mimic_writer.build_chartevent_table()  # chartevents
+        mimic_writer.build_inputevent_table()  # inputevents
+        mimic_writer.build_outputevent_table()  # outputevents
+        mimic_writer.build_microbiology_table()  # microbiologyevents
+
+        mimic_writer.generate_db()
+
+    elif args.db_name == "mimic_iv":
+        raise NotImplementedError("In this repository, we only support MIMIC-IV-CXR dataset")
+
+    elif args.db_name == "mimic_iii":
+        raise NotImplementedError("In this repository, we only support MIMIC-IV-CXR dataset")
+
+    else:
+        raise ValueError("Invalid db_name")
+
+
+if __name__ == "__main__":
+    args = config()
+    main(args)
+    print("Done!\n")