--- /dev/null
+++ b/dataset_builder/preprocess_cohort.py
@@ -0,0 +1,285 @@
import os
import argparse

import pandas as pd


def config():
    parser = argparse.ArgumentParser(description="preprocessing cohorts")

    # debug
    parser.add_argument("--debug", action="store_true", help="debug mode")
    parser.add_argument("--debug_nrows", default=100000, type=int, help="debug mode - nrows")

    # file directories
    parser.add_argument("--mimic_cxr_jpg_dir", default="../mimic-cxr-jpg/", type=str)
    parser.add_argument("--chest_imagenome_dir", default="../chest-imagenome/", type=str)
    parser.add_argument("--save_dir", default="./preprocessed_data", type=str)

    parser.add_argument("--max_study_order", default=20, type=int)

    args = parser.parse_args()

    return args


class CohortPreprocessor:
    def __init__(self, args):
        self.args = args

        # check debug
        self.nrows = args.debug_nrows if args.debug else None

        # load datasets
        self._load_mimic_cxr_metadata()
        self._load_bbox_objects_tabular()  # silver bbox
        self._load_attribute_relations_tabular()  # silver attribute
        self._load_gold_patient_ids()
        self._load_gold_1st_image_ids()

    def _load_mimic_cxr_metadata(self):
        # read
        cxr_meta = pd.read_csv(
            os.path.join(self.args.mimic_cxr_jpg_dir, "mimic-cxr-2.0.0-metadata.csv"),
            usecols=["dicom_id", "subject_id", "study_id", "ViewPosition", "Rows", "Columns", "StudyDate", "StudyTime"],
        )
        print("raw metadata:", cxr_meta.shape)

        # rename columns
        cxr_meta = cxr_meta.rename(columns={"dicom_id": "image_id"})

        # build a new column: StudyDateTime
        # StudyDate is YYYYMMDD and StudyTime is HHMMSS(.fff); "%010.3f" zero-pads the
        # time to a fixed width of 10 so that, e.g., 95930.5 becomes "095930.500".
        cxr_meta["StudyDateTime"] = pd.to_datetime(
            cxr_meta.StudyDate.astype(str).apply(lambda x: f"{x[:4]}-{x[4:6]}-{x[6:]}")
            + " "
            + cxr_meta.StudyTime.apply(lambda x: "%010.3f" % x)
        )

        # build a new column: StudyOrder (per-subject chronological rank of studies)
        # Studies of the same subject can share a StudyDateTime, so the study_id digits
        # are appended to the timestamp string (they act as extra fractional-second
        # digits after to_datetime) to make the dense rank deterministic.
        cxr_meta_ = cxr_meta.copy()
        cxr_meta_ = cxr_meta_.sort_values(by=["subject_id", "study_id", "StudyDateTime"])
        cxr_meta_ = cxr_meta_.drop_duplicates(subset=["subject_id", "study_id"], keep="first").copy()
        cxr_meta_["StudyDateTime_study_id"] = cxr_meta_["StudyDateTime"].astype(str) + cxr_meta_["study_id"].astype(str)
        cxr_meta_["StudyDateTime_study_id"] = pd.to_datetime(cxr_meta_["StudyDateTime_study_id"])
        cxr_meta_["StudyOrder"] = cxr_meta_.groupby(["subject_id"])["StudyDateTime_study_id"].rank(method="dense")
        cxr_meta["StudyOrder"] = cxr_meta["study_id"].map(cxr_meta_[["study_id", "StudyOrder"]].set_index("study_id")["StudyOrder"])

        # remove redundant columns
        del cxr_meta["StudyDate"]
        del cxr_meta["StudyTime"]

        # after base preprocessing, keep all data
        self.mimic_cxr_metadata = cxr_meta.copy()

        # Assumption: use only frontal images
        cxr_meta = cxr_meta[cxr_meta["ViewPosition"].isin(["AP", "PA"])].reset_index(drop=True)
        print("frontal images only:", cxr_meta.shape)

        # Assumption: given the same study_id, use only one image (StudyDateTime-first, then image_id-first)
        cxr_meta = cxr_meta.sort_values(["study_id", "StudyDateTime", "image_id"], ascending=[True, True, True])
        cxr_meta = cxr_meta[cxr_meta["image_id"].isin(cxr_meta.groupby(["study_id"])["image_id"].first().values)]
        print("one image per study:", cxr_meta.shape)

        assert cxr_meta.groupby(["study_id", "StudyDateTime"])["image_id"].nunique().value_counts().size == 1

        self.meta_data = cxr_meta.copy()
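
    # Illustrative sketch of the StudyOrder tiebreaker above (toy values, not from MIMIC):
    #
    #   subject_id  study_id  StudyDateTime            -> StudyOrder
    #   1           50000002  2150-01-01 08:00:00.500  -> 1.0
    #   1           53000001  2150-01-01 08:00:00.500  -> 2.0  (same timestamp; study_id
    #                                                           digits break the tie)
    #   1           50000009  2150-03-02 09:30:00.250  -> 3.0
    #
    # Caveat (assumption): pandas datetimes carry at most nanosecond precision, so
    # study_id digits beyond the ninth fractional digit cannot influence the ordering.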
"x1", + "y1", + "x2", + "y2", + "width", + "height", + "bbox_name", + # 'synsets', 'name', + # 'original_x1', 'original_y1', 'original_x2','original_y2', 'original_width', 'original_height', + ], + ) + bbox_objects_tabular["image_id"] = bbox_objects_tabular["object_id"].apply(lambda x: x.split("_")[0]) # add column + self.bbox_objects_tabular = bbox_objects_tabular + + def _load_attribute_relations_tabular(self): + attribute_relations_tabular = pd.read_csv( + os.path.join(self.args.chest_imagenome_dir, "silver_dataset/scene_tabular/attribute_relations_tabular.txt"), + sep="\t", + nrows=self.nrows, + usecols=["study_id", "image_id", "sent_loc", "row_id", "bbox", "categoryID", "label_name", "relation"], + ) + self.attribute_relations_tabular = attribute_relations_tabular + + def _load_gold_patient_ids(self): + gold_dataset = pd.read_csv( + os.path.join(self.args.chest_imagenome_dir, "gold_dataset/gold_attributes_relations_500pts_500studies1st.txt"), + sep="\t", + ) + gold_dataset = gold_dataset.rename(columns={"patient_id": "subject_id"}) + gold_dataset["image_id"] = gold_dataset["image_id"].str.replace(".dcm", "") + assert gold_dataset["subject_id"].nunique() == 500 + self.gold_pids = gold_dataset["subject_id"].unique() + + def _load_gold_1st_image_ids(self): + gold_dataset = pd.read_csv( + os.path.join(self.args.chest_imagenome_dir, "gold_dataset/gold_attributes_relations_500pts_500studies1st.txt"), + sep="\t", + ) + gold_dataset = gold_dataset.rename(columns={"patient_id": "subject_id"}) + gold_dataset["image_id"] = gold_dataset["image_id"].str.replace(".dcm", "") + assert gold_dataset["subject_id"].nunique() == 500 + self.gold_1st_iids = gold_dataset["image_id"].unique() + + def preprocessImage_bounding_box(self): + """ + 1) Remove frontal images where the number of the bounding box in each image less than 36 + 2) Remove frontal images whose width is more than 3 standard deviations. 

    def preprocessImage_bounding_box(self):
        """
        1) Remove frontal images that do not have exactly 36 unique bounding boxes.
        2) For each bbox_name, remove frontal images whose bbox width lies more than
           3 standard deviations from the silver-set mean (in the 224*224 image space).
        """
        # 0) work on copies
        meta_data = self.meta_data.copy()
        bbox_objects_tabular = self.bbox_objects_tabular.copy()

        # 1) keep only images with exactly 36 unique bounding boxes
        num_of_unique_bbox = bbox_objects_tabular.groupby(["image_id"])["bbox_name"].nunique()
        remove_iids = num_of_unique_bbox[num_of_unique_bbox != 36].index
        meta_data = meta_data[~meta_data["image_id"].isin(remove_iids)]

        # 2) per-bbox 3-sigma outlier filter on width
        def get_outlier_image_ids(bbox_name, tgt_data, src_data, measure_of_unit="width", n_std=3):
            tgt_array = tgt_data[tgt_data["bbox_name"] == bbox_name][measure_of_unit].copy()
            src_array = src_data[src_data["bbox_name"] == bbox_name][measure_of_unit].copy()

            mean, std = src_array.mean(), src_array.std()
            threshold_min = mean - n_std * std
            threshold_max = mean + n_std * std
            tgt_array_refined = tgt_array[(tgt_array < threshold_min) | (tgt_array > threshold_max)]

            outlier_image_ids = tgt_data.loc[tgt_array_refined.index]["image_id"].values
            return outlier_image_ids

        meta_bbox = bbox_objects_tabular[bbox_objects_tabular["image_id"].isin(meta_data.image_id.unique())].copy()
        assert len(bbox_objects_tabular.bbox_name.unique()) == 36
        for bbox_name in bbox_objects_tabular.bbox_name.unique():
            outlier_image_ids = get_outlier_image_ids(bbox_name=bbox_name, tgt_data=meta_bbox, src_data=bbox_objects_tabular)
            meta_data = meta_data[~meta_data["image_id"].isin(outlier_image_ids)]

        # -1) store the result
        self.meta_data = meta_data.reset_index(drop=True)
        print("preprocessImage_bounding_box: {}".format(self.meta_data.shape))

    def preprocessStudy_study_order(self, max_study_order=20):
        """
        Retain studies whose StudyOrder is <= max_study_order.
        """
        # 0
        meta_data = self.meta_data.copy()
        # 1
        meta_data = meta_data[meta_data["StudyOrder"] <= max_study_order]
        # -1
        self.meta_data = meta_data.reset_index(drop=True)
        print("preprocessStudy_study_order: {}".format(self.meta_data.shape))

    def preprocessPatient_gold_pids(self, flag="silver"):
        """
        flag="silver": remove the 500 gold patients; flag="gold": keep only them.
        """
        # 0
        meta_data = self.meta_data.copy()
        # 1
        if flag == "silver":
            meta_data = meta_data[~meta_data["subject_id"].isin(self.gold_pids)]
        elif flag == "gold":
            meta_data = meta_data[meta_data["subject_id"].isin(self.gold_pids)]
        else:
            raise ValueError("flag must be either 'silver' or 'gold'")
        # -1
        self.meta_data = meta_data.reset_index(drop=True)
        print("preprocessPatient_gold_pids ({}): {}".format(flag, self.meta_data.shape))

    def save_cohort_silver(self):
        # load
        meta_data = self.meta_data.copy()

        # arrange
        meta_data = meta_data[
            [
                "subject_id",
                "study_id",
                "image_id",
                "ViewPosition",
                "StudyDateTime",
                "StudyOrder",
            ]
        ]
        meta_data = meta_data.sort_values(by=["subject_id", "StudyOrder"])

        # save
        os.makedirs(self.args.save_dir, exist_ok=True)
        meta_data = meta_data.reset_index(drop=True)
        meta_data.to_csv(os.path.join(self.args.save_dir, "cohort_silver.csv"), index=False)

    def reset_meta_data(self):
        # reload the raw metadata so the gold pipeline starts from unfiltered data
        self._load_mimic_cxr_metadata()
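
    # Illustrative sketch of the first-study swap performed below (toy rows, not executed):
    #   before: gold subject -> [filtered 1st-study image,  2nd study, 3rd study]
    #   after : gold subject -> [gold-annotated 1st image,  2nd study, 3rd study]
    # The swap preserves the row count, which the assert inside the method verifies.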
"image_id", + "ViewPosition", + "StudyDateTime", + "StudyOrder", + ] + ] + meta_data = meta_data.sort_values(by=["subject_id", "StudyOrder"]) + + # save + os.makedirs(self.args.save_dir, exist_ok=True) + meta_data = meta_data.reset_index(drop=True) + meta_data.to_csv(os.path.join(self.args.save_dir, f"cohort_gold.csv"), index=False) + + +def main(args): + # load preprocessor + cohort_preproc = CohortPreprocessor(args) + + # NOTE: preprocessing for silver dataset + cohort_preproc.preprocessImage_bounding_box() + cohort_preproc.preprocessStudy_study_order(max_study_order=args.max_study_order) + cohort_preproc.preprocessPatient_gold_pids(flag="silver") + cohort_preproc.save_cohort_silver() + + # NOTE: preprocessing for gold dataset + cohort_preproc.reset_meta_data() + cohort_preproc.preprocessStudy_study_order(max_study_order=args.max_study_order) + cohort_preproc.preprocessPatient_gold_pids(flag="gold") + cohort_preproc.preprocessStudy_gold_1st() # NOTE: specialized pre-processing for gold dataset + cohort_preproc.save_cohort_gold() + + +if __name__ == "__main__": + args = config() + main(args) + print("Done")