[4f54f1]: / preprocess_dicoms.py

import os
import dicom
import multiprocessing
import concurrent.futures
import numpy as np # linear algebra
import matplotlib.pyplot as plt
import scipy.ndimage as spi
import lung_segmentation as ls
import config
from utils import store_patient_image, resize, normalize
NUM_PROCESSES = multiprocessing.cpu_count()
segmentation_algo = ls.get_segmentation_algorithm()
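
# Assumed configuration values (names taken from their use below; defined in config.py):
#   config.ALL_IMGS             root directory with one sub-directory of DICOM files per patient
#   config.OUT_SCAN             raw value used to pad out-of-scan pixels (commonly -2000 in CT data)
#   config.SEGMENTATION_ALGO,
#   config.BASELINE_PREPROCESS  switch between plain normalize + resize and full lung segmentation
#   config.SEGMENTED_LUNGS_DIR  output directory for the preprocessed volumes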

def load_scans(patient):
    """
    Loads the original scans in DICOM format for a patient.
    Loaded files are located under a directory, configurable
    via the config.ALL_IMGS variable. Scans are sorted by
    slice location, which is in fact the Z coordinate of
    the scan.
    """
    patient_dir = os.path.join(config.ALL_IMGS, patient)
    slices = [dicom.read_file(os.path.join(patient_dir, scan)) for scan in os.listdir(patient_dir)]
    # ImagePositionPatient[2] equals the slice location == Z coordinate of the scan
    slices.sort(key=lambda x: float(x.ImagePositionPatient[2]))
    # Slice thickness is often missing; it is currently used only in resampling,
    # which is skipped, so this fallback still needs to be verified.
    try:
        slice_thickness = np.abs(slices[0].ImagePositionPatient[2] - slices[1].ImagePositionPatient[2])
    except Exception:
        slice_thickness = np.abs(slices[0].SliceLocation - slices[1].SliceLocation)
    for s in slices:
        s.SliceThickness = slice_thickness
    return slices
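
# Example usage (hypothetical patient directory name under config.ALL_IMGS):
#   scans = load_scans("patient_0001")
#   volume_hu = get_pixels_hu(scans)  # list of DICOM datasets -> 3D int16 array in HU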

def get_pixels_hu(slices):
    """
    Converts pixels from the pixel_array into Hounsfield Units.
    """
    image = np.stack([s.pixel_array for s in slices])
    # Convert to int16 (from sometimes uint16),
    # should be possible as values should always be low enough (< 32k)
    image = image.astype(np.int16)
    # Set outside-of-scan pixels to 0.
    # The intercept is usually -1024, so air is approximately 0.
    image[image == config.OUT_SCAN] = 0
    # Convert to Hounsfield units (HU)
    for slice_number in range(len(slices)):
        intercept = slices[slice_number].RescaleIntercept
        slope = slices[slice_number].RescaleSlope
        if slope != 1:
            image[slice_number] = slope * image[slice_number].astype(np.float64)
            image[slice_number] = image[slice_number].astype(np.int16)
        image[slice_number] += np.int16(intercept)
    return np.array(image, dtype=np.int16)
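
# Reference for the conversion above (DICOM rescale): HU = RescaleSlope * stored_value + RescaleIntercept.
# For a typical CT with slope 1 and intercept -1024, a stored value of 0 maps to -1024 HU (air)
# and a stored value of 1024 maps to 0 HU (water).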

def resample(image, scan, new_spacing=[1, 1, 1]):
    """
    Resamples the volume to an isotropic resolution in order to remove
    spacing variance between scans.
    """
    # Determine current pixel spacing
    spacing = np.array([scan[0].SliceThickness] + list(scan[0].PixelSpacing),
                       dtype=np.float32)
    resize_factor = spacing / new_spacing
    new_real_shape = image.shape * resize_factor
    new_shape = np.round(new_real_shape)
    real_resize_factor = new_shape / image.shape
    new_spacing = spacing / real_resize_factor
    image = spi.interpolation.zoom(image, real_resize_factor,
                                   mode='nearest')
    return image, new_spacing
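
# Worked example of the arithmetic above: for spacing [2.5, 0.7, 0.7] mm and new_spacing [1, 1, 1],
# resize_factor is [2.5, 0.7, 0.7], so a (100, 512, 512) volume is zoomed to roughly (250, 358, 358)
# voxels, each covering about 1 mm in every direction.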

def process_patients_chunk(patients):
    """
    Executes the initial preprocessing operations on a list of patients.
    Before being serialized and compressed, scans are sorted by slice
    location, converted to Hounsfield units, and the lungs are segmented.
    """
    for patient in patients:
        try:
            scans = load_scans(patient)
            patient_imgs = get_pixels_hu(scans)
            if config.SEGMENTATION_ALGO == config.BASELINE_PREPROCESS:
                # Clean up unnecessary pixels outside the area of interest and normalize
                lungs = resize(normalize(patient_imgs))
            else:
                lungs = np.stack([segmentation_algo.get_segmented_lungs(image)
                                  for image in patient_imgs])
            store_patient_image(config.SEGMENTED_LUNGS_DIR, lungs, patient)
            print("====== Stored patient {} image ======".format(patient))
        except Exception as e:
            print("An error occurred while processing {}! {}".format(patient, e))

def process_dicom_set(input_dir):
    """
    Executes the initial processing on the DICOM images under the given directory.
    Patients are split into chunks and processed in separate threads
    to speed up execution.
    """
    patients = np.array(os.listdir(input_dir))
    chunked_data = np.array_split(patients, NUM_PROCESSES)
    print("Number of processes: {}, total chunks of data: {}!".format(NUM_PROCESSES, len(chunked_data)))
    executor = concurrent.futures.ThreadPoolExecutor(max_workers=NUM_PROCESSES)
    futures = []
    for i, data in enumerate(chunked_data):
        try:
            f = executor.submit(process_patients_chunk, data)
            print("Submitted batch {} to the executor!".format(i))
            futures.append(f)
        except Exception as e:
            print("An error occurred while processing data chunk with size {} on iteration {}: {}".format(
                len(data), str(i), e))
    print(concurrent.futures.wait(futures))  # By default, waits for all futures to complete
    print("Shutdown and wait for workers!")
    executor.shutdown(wait=True)
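
# Chunking behaviour reference: np.array_split keeps chunk sizes as even as possible, e.g.
# np.array_split(np.arange(10), 4) yields chunks of sizes [3, 3, 2, 2], so every worker
# receives roughly the same number of patients.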

if __name__ == '__main__':
    process_dicom_set(config.ALL_IMGS)