--- /dev/null
+++ b/utils/convert_h5.py
@@ -0,0 +1,120 @@
+"""
+Utility to convert volumes into h5 datasets.
+Sample commands to create a new dataset:
+- python3 utils/convert_h5.py -dd /home/masterthesis/shayan/nas_drive/Data_Neuro/OASISchallenge/FS -ld /home/masterthesis/shayan/nas_drive/Data_Neuro/OASISchallenge -trv datasets/train_volumes.txt -tev datasets/test_volumes.txt -id MALC -rc Neo -o COR -df datasets/MALC/coronal
+- python3 utils/convert_h5.py -dd /home/masterthesis/shayan/nas_drive/Data_Neuro/IXI/IXI_FS -ld /home/masterthesis/shayan/nas_drive/Data_Neuro/IXI/IXI_FS -ds 98,2 -rc FS -o COR -df datasets/IXI/coronal
+"""
+
+import argparse
+import os
+
+import h5py
+import numpy as np
+
+import common_utils
+import data_utils as du
+import preprocessor
+
+
+def apply_split(data_split, data_dir, label_dir):
+    """Randomly split all volumes found in data_dir into train and test sets."""
+    file_paths = du.load_file_paths(data_dir, label_dir)
+    print("Total number of volumes to process: %d" % len(file_paths))
+    train_ratio, test_ratio = data_split.split(",")
+    train_len = int((int(train_ratio) / 100) * len(file_paths))
+    train_idx = np.random.choice(len(file_paths), train_len, replace=False)
+    test_idx = np.array([i for i in range(len(file_paths)) if i not in train_idx])
+    train_file_paths = [file_paths[i] for i in train_idx]
+    test_file_paths = [file_paths[i] for i in test_idx]
+    return train_file_paths, test_file_paths
+
+
+def _write_h5(data, label, class_weights, weights, f, mode):
+    """Write the stacked slices of one split (train or test) into its four h5 files."""
+    _, H, W = data[0].shape
+    with h5py.File(f[mode]['data'], "w") as data_handle:
+        data_handle.create_dataset("data", data=np.concatenate(data).reshape((-1, H, W)))
+    with h5py.File(f[mode]['label'], "w") as label_handle:
+        label_handle.create_dataset("label", data=np.concatenate(label).reshape((-1, H, W)))
+    with h5py.File(f[mode]['weights'], "w") as weights_handle:
+        weights_handle.create_dataset("weights", data=np.concatenate(weights))
+    with h5py.File(f[mode]['class_weights'], "w") as class_weights_handle:
+        class_weights_handle.create_dataset("class_weights",
+                                            data=np.concatenate(class_weights).reshape((-1, H, W)))
+
+
+def convert_h5(data_dir, label_dir, data_split, train_volumes, test_volumes, f, data_id, remap_config='Neo',
+               orientation=preprocessor.ORIENTATION['coronal']):
+    # Data splitting: either a random ratio split or explicit train/test volume lists.
+    if data_split:
+        train_file_paths, test_file_paths = apply_split(data_split, data_dir, label_dir)
+    elif train_volumes and test_volumes:
+        train_file_paths = du.load_file_paths(data_dir, label_dir, data_id, train_volumes)
+        test_file_paths = du.load_file_paths(data_dir, label_dir, data_id, test_volumes)
+    else:
+        raise ValueError('You must provide either a split ratio or both train and test volume lists.')
+
+    print("Train dataset size: %d, Test dataset size: %d" % (len(train_file_paths), len(test_file_paths)))
+    # Load, pre-process and write the train data
+    print("===Train data===")
+    data_train, label_train, class_weights_train, weights_train, _ = du.load_dataset(train_file_paths,
+                                                                                     orientation,
+                                                                                     remap_config=remap_config,
+                                                                                     return_weights=True,
+                                                                                     reduce_slices=True,
+                                                                                     remove_black=True)
+
+    _write_h5(data_train, label_train, class_weights_train, weights_train, f, mode='train')
+
+    # Load, pre-process and write the test data
+    print("===Test data===")
+    data_test, label_test, class_weights_test, weights_test, _ = du.load_dataset(test_file_paths,
+                                                                                 orientation,
+                                                                                 remap_config=remap_config,
+                                                                                 return_weights=True,
+                                                                                 reduce_slices=True,
+                                                                                 remove_black=True)
+
+    _write_h5(data_test, label_test, class_weights_test, weights_test, f, mode='test')
+
+
+if __name__ == "__main__":
+    print("* Start *")
*") + parser = argparse.ArgumentParser() + parser.add_argument('--data_dir', '-dd', required=True, + help='Base directory of the data folder. This folder should contain one folder per volume.') + parser.add_argument('--label_dir', '-ld', required=True, + help='Base directory of all the label files. This folder should have one file per volumn with same name as the corresponding volumn folder name inside data_dir') + parser.add_argument('--data_split', '-ds', required=False, + help='Ratio to split data randomly into train and test. input e.g. 80,20') + parser.add_argument('--train_volumes', '-trv', required=False, + help='Path to a text file containing the list of volumes to be used for training') + parser.add_argument('--test_volumes', '-tev', required=False, + help='Path to a text file containing the list of volumes to be used for testing') + parser.add_argument('--data_id', '-id', required=True, help='Valid options are "MALC", "ADNI", "CANDI" and "IBSR"') + parser.add_argument('--remap_config', '-rc', required=True, help='Valid options are "FS" and "Neo"') + parser.add_argument('--orientation', '-o', required=True, help='Valid options are COR, AXI, SAG') + parser.add_argument('--destination_folder', '-df', required=True, help='Path where to generate the h5 files') + + args = parser.parse_args() + + common_utils.create_if_not(args.destination_folder) + + f = { + 'train': { + "data": os.path.join(args.destination_folder, "Data_train.h5"), + "label": os.path.join(args.destination_folder, "Label_train.h5"), + "weights": os.path.join(args.destination_folder, "Weight_train.h5"), + "class_weights": os.path.join(args.destination_folder, "Class_Weight_train.h5"), + }, + 'test': { + "data": os.path.join(args.destination_folder, "Data_test.h5"), + "label": os.path.join(args.destination_folder, "Label_test.h5"), + "weights": os.path.join(args.destination_folder, "Weight_test.h5"), + "class_weights": os.path.join(args.destination_folder, "Class_Weight_test.h5") + } + } + + convert_h5(args.data_dir, args.label_dir, args.data_split, args.train_volumes, args.test_volumes, f, + args.data_id, + args.remap_config, + args.orientation) + print("* Finish *")