--- a +++ b/lungs/preprocess.py @@ -0,0 +1,318 @@ +import time +from tqdm import tqdm +import numpy as np +import pydicom as dicom +import os +from scipy import ndimage +import matplotlib.pyplot as plt +from pathlib import Path +from skimage import measure +from collections import defaultdict +from sys import argv +from random import shuffle + +from lungs.utils import apply_window + + +# The pixel size/coarseness of the scan differs from scan to scan (e.g. the distance between slices may differ), which can hurt performance of +# CNN approaches. We can deal with this by isomorphic resampling. +# Below is code to load a scan, which consists of multiple slices, which we simply save in a Python list. Every folder in the dataset is one +# scan (so one patient). One metadata field is missing, the pixel size in the Z direction, which is the slice thickness. +# Fortunately we can infer this, and we add this to the metadata. + +def is_dcm_file(path): + name, ext = os.path.splitext(path) + # DICOM file extension + if 'dcm' in ext: + return True + # NLST file format + if name.isdigit() and not ext: + return True + return False + +# Load a volume from the given folder path +def load_scan(path): + slices = [dicom.read_file(path + '/' + scan) for scan in os.listdir(path) if is_dcm_file(scan)] + slices.sort(key = lambda x: float(x.ImagePositionPatient[2])) + + if not slices[0].SliceThickness: + try: + slice_thickness = np.abs(slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2]) + except: + slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation) + + for s in slices: + s.SliceThickness = slice_thickness + return slices + + +# The unit of measurement in CT scans is the **Hounsfield Unit (HU)**, which is a measure of radiodensity. +# CT scanners are carefully calibrated to accurately measure this. From Wikipedia: +# By default however, the returned values are not in this unit. Let's fix this. +# Some scanners have cylindrical scanning bounds, but the output volume is square. +# The pixels that fall outside of these bounds get the fixed value -2000. The first step is setting these values to 0, which currently corresponds to air. +# Next, let's go back to HU units, by multiplying with the rescale slope and adding the intercept (which are conveniently stored in the metadata of the scans!). + +def get_pixels_hu(slices): + volume = np.stack([s.pixel_array for s in slices]) + # Convert to int16 (from sometimes int16), + # should be possible as values should always be low enough (<32k) + volume = volume.astype(np.int16) + + # Set outside-of-scan pixels to 0 + # The intercept is usually -1024, so air is approximately 0 + volume[volume == -2000] = 0 + + # Convert to Hounsfield units (HU) + for slice_number in range(len(slices)): + + intercept = slices[slice_number].RescaleIntercept + slope = slices[slice_number].RescaleSlope + + if slope != 1: + volume[slice_number] = slope * volume[slice_number].astype(np.float64) + volume[slice_number] = volume[slice_number].astype(np.int16) + + volume[slice_number] += np.int16(intercept) + + return np.array(volume, dtype=np.int16) + +# # Resampling +# A scan may have a pixel spacing of `[2.5, 0.5, 0.5]`, which means that the distance between slices is `2.5` millimeters. +# For a different scan this may be `[1.5, 0.725, 0.725]`, +# this can be problematic for automatic analysis (e.g. using ConvNets). +# A common method of dealing with this is resampling the full dataset to a certain isotropic resolution. +# If we choose to resample everything to 1.5mm*1.5mm*1.5mm pixels we can use 3D convnets without worrying about learning zoom/slice thickness invariance. +# Whilst this may seem like a very simple step, it has quite some edge cases due to rounding. Also, it takes quite a while. + +def resample(scan_hu, scan_file, scan, new_spacing, verbose=False): + # Determine current pixel spacing + spacing = np.array([scan_file[0].SliceThickness] + list(scan_file[0].PixelSpacing), dtype=np.float32) + if verbose: + print('Spacing:', spacing) + resize_factor = spacing / new_spacing + new_real_shape = scan_hu.shape * resize_factor + new_shape = np.round(new_real_shape) + real_resize_factor = new_shape / scan_hu.shape + new_spacing = spacing / real_resize_factor + + scan_hu = ndimage.interpolation.zoom(scan_hu, real_resize_factor, mode='nearest') + return scan_hu, new_spacing + +def largest_label_volume(im, bg=-1): + vals, counts = np.unique(im, return_counts=True) + counts = counts[vals != bg] + vals = vals[vals != bg] + if len(counts) > 0: + return vals[np.argmax(counts)] + else: + return None + +# # Lung segmentation +# In order to reduce the problem space, we segment the lungs (and usually some tissue around it). +# It consists of a series of applications of region growing and morphological operations. In this case, +# we will use only connected component analysis. +# +# The steps: +# * Threshold the volume (-320 HU is a good threshold, but it doesn't matter much for this approach) +# * Do connected components, determine label of air around person, fill this with 1s in the binary volume +# * Optionally: For every axial slice in the scan, determine the largest solid connected component +# (the body+air around the person), and set others to 0. This fills the structures in the lungs in the mask. +# * Keep only the largest air pocket (the human body has other pockets of air here and there). +def segment_lung_mask(volume, fill_lung_structures=True): + + # not actually binary, but 1 and 2. + # 0 is treated as background, which we do not want + binary_volume = np.array(volume > -320, dtype=np.int8)+1 + labels = measure.label(binary_volume) + + # Pick the pixel in the very corner to determine which label is air. + # Improvement: Pick multiple background labels from around the patient + # More resistant to "trays" on which the patient lays cutting the air + # around the person in half + background_label = labels[0,0,0] + + #Fill the air around the person + binary_volume[background_label == labels] = 2 + + # Method of filling the lung structures (that is superior to something like + # morphological closing) + if fill_lung_structures: + # For every slice we determine the largest solid structure + for i, axial_slice in enumerate(binary_volume): + axial_slice = axial_slice - 1 + labeling = measure.label(axial_slice) + l_max = largest_label_volume(labeling, bg=0) + if l_max is not None: # This slice contains some lung + binary_volume[i][labeling != l_max] = 1 + + binary_volume -= 1 # Make the volume actual binary + binary_volume = 1-binary_volume # Invert it, lungs are now 1 + + # Remove other air pockets insided body + labels = measure.label(binary_volume, background=0) + l_max = largest_label_volume(labels, bg=0) + if l_max is not None: # There are air pockets + binary_volume[labels != l_max] = 0 + return binary_volume + +def bbox2_3D(volume): + r = np.any(volume, axis=(1, 2)) + c = np.any(volume, axis=(0, 2)) + z = np.any(volume, axis=(0, 1)) + rmin, rmax = np.where(r)[0][[0, -1]] + cmin, cmax = np.where(c)[0][[0, -1]] + zmin, zmax = np.where(z)[0][[0, -1]] + return rmin, rmax, cmin, cmax, zmin, zmax + +def preprocess(scan, errors_map, num_slices=224, crop_size=224, voxel_size=1.5, windowing=False, sample_volume=True, verbose=True): + orig_scan = load_scan(scan) + num_orig_slices = len(orig_scan) + if num_orig_slices < 50: + errors_map['insufficient_slices'] += 1 + raise ValueError(scan[-4:] + ': Insufficient muber of slices (<50).') + orig_scan_np = np.stack([s.pixel_array for s in orig_scan]).astype(np.int16) + + scan_hu = get_pixels_hu(orig_scan) + + # Let's resample our patient's pixels to an isomorphic resolution + resampled_scan, _ = resample(scan_hu, orig_scan, orig_scan_np, [voxel_size, voxel_size, voxel_size], verbose=verbose) + if verbose: + print("Shape before resampling:", scan_hu.shape) + print("Shape after resampling:", resampled_scan.shape) + + if resampled_scan.shape[0] < 180: + errors_map['small_z'] += 1 + raise ValueError(scan[-4:] + ': Insufficient number of resampled slices (<200).') + + lung_mask = segment_lung_mask(resampled_scan, True) + + z_min, z_max, x_min, x_max, y_min, y_max = bbox2_3D(lung_mask) + box_size = (z_max - z_min, x_max - x_min, y_max - y_min) + if verbose: + print('Lung bounding box (min, max):', (z_min, z_max), (x_min, x_max), (y_min, y_max)) + print('Bounding box size:', box_size) + + for dim in box_size: + if dim < 100: + errors_map['seg_error'] += 1 + raise ValueError(scan[-4:] + ': Segmentation error.') + + lung_center = np.array([z_min + z_max, x_min + x_max, y_min + y_max]) // 2 + context = np.array([num_slices, crop_size, crop_size]) + + volume_starts = np.array([max(0, lung_center[i] - context[i] // 2) for i in range(3)]) + volume_ends = np.array([min(resampled_scan.shape[i], lung_center[i] + context[i] // 2) for i in range(3)]) + volume_size = volume_ends - volume_starts + + starts = context // 2 - volume_size // 2 + ends = starts + volume_size + + lungs_padded = np.zeros((num_slices, crop_size, crop_size)) + lungs_padded[starts[0]: ends[0], starts[1]: ends[1], starts[2]: ends[2]] = \ + resampled_scan[volume_starts[0]: volume_ends[0], volume_starts[1]: volume_ends[1], volume_starts[2]: volume_ends[2]] + + if verbose: + print("Final shape", lungs_padded.shape) + + if sample_volume: + # Generate an RGB slice for display + lungs_rgb = np.stack((lungs_padded, lungs_padded, lungs_padded), axis=3) + lungs_sample_slice = lungs_rgb[lungs_rgb.shape[0] // 2] + else: + lungs_sample_slice = None + + return lungs_padded, lungs_sample_slice + +def walk_dicom_dirs(base_in, base_out=None, print_dirs=True): + for root, _, files in os.walk(base_in): + path = root.split(os.sep) + if print_dirs: + print((len(path) - 1) * '---', os.path.basename(root)) + # sample_filename = os.path.splitext(files[0]) + if len(files) >= 50: # and (sample_filename[0].isdigit() or 'dcm' in sample_filename[1]): + if base_out: + yield root, base_out + os.path.relpath(root, base_in) + else: + yield root + +def walk_np_files(base_in, print_dirs=True): + pathlist = Path(base_in).glob('**/*.np*') + for path in pathlist: + np_path = str(path) + print(np_path) + yield np_path + +def preprocess_all(input_dir, overwrite=False, num_slices=224, crop_size=224, voxel_size=1.5): + start = time.time() + scans = os.listdir(input_dir) + scans.sort() + errors_map = defaultdict(int) + base_out = input_dir.rstrip('/') + '_preprocessed/' + valid_scans = 0 + + scans_num = len(list(walk_dicom_dirs(input_dir, base_out, False))) + for scan_dir_path, out_path in tqdm(walk_dicom_dirs(input_dir, base_out), total=scans_num): + try: + out_dir = os.path.dirname(out_path) + os.makedirs(out_dir, exist_ok=True) + + if overwrite or not os.path.exists(out_dir) or not os.listdir(out_dir): + preprocessed_scan, scan_rgb_sample = \ + preprocess(scan_dir_path, errors_map, num_slices, crop_size, voxel_size) + plt.imshow(scan_rgb_sample) + plt.savefig(out_path + '.png', bbox_inches='tight') + np.savez_compressed(out_path + '.npz', data=preprocessed_scan) + + valid_scans += 1 + print('\n++++++++++++++++++++++++\nDiagnostics:') + print(errors_map.items()) + + except FileExistsError as e: + valid_scans += 1 + print('Exists:', out_path) + + except ValueError as e: + print('\nERROR!') + print(e) + + print('Total scans: {}'.format(scans_num)) + print('Valid scans: {}'.format(valid_scans)) + print('Scans with insufficient slices: {}'.format(errors_map['insufficient_slices'])) + print('Scans with bad segmentation: {}'.format(errors_map['bad_seg'])) + print('Scans with small resampled z dimension: {}'.format(errors_map['small_z'])) + print((time.time() - start) / scans_num, 'sec/volume') + +def split(positives, negatives, lists_dir, print_dirs=False, split_ratio=0.7): + positive_paths = [] + negative_paths = [] + + for preprocessed_dir, path_list, label in (positives, positive_paths, '1'), (negatives, negative_paths, '0'): + for root, _, files in os.walk(preprocessed_dir): + path = root.split(os.sep) + if print_dirs: + print((len(path) - 1) * '---', os.path.basename(root)) + for f in files: + if '.np' in f: + path_list.append((root + '/' + f, label)) + print('\n INFO:', 'volumes with label', label, len(path_list)) + + train_list = [] + val_list = [] + shuffle(positive_paths) + split_pos = round(split_ratio * len(positive_paths)) + shuffle(negative_paths) + split_neg = round(split_ratio * len(negative_paths)) + train_list = positive_paths[:split_pos] + negative_paths[:split_neg] + val_list = positive_paths[split_pos:] + negative_paths[split_neg:] + shuffle(train_list) + shuffle(val_list) + + os.makedirs(lists_dir, exist_ok=True) + with open(lists_dir + '/val.list', 'w') as val_f: + for path, label in val_list: + val_f.write(path + ' ' + label + '\n') + + with open(lists_dir + '/train.list', 'w') as train_f: + for path, label in train_list: + train_f.write(path + ' ' + label + '\n')