[d986f2]: / experiments / lidc_exp / preprocessing.py

Download this file

151 lines (120 with data), 6.3 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
#!/usr/bin/env python
# Copyright 2018 Division of Medical Image Computing, German Cancer Research Center (DKFZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================
'''
This preprocessing script loads nrrd files obtained by the data conversion tool: https://github.com/MIC-DKFZ/LIDC-IDRI-processing/tree/v1.0.1
After applying preprocessing, images are saved as numpy arrays and the meta information for the corresponding patient is stored
as a line in the dataframe saved as info_df.pickle.
'''
import os, sys
from pathlib import Path
import SimpleITK as sitk
import numpy as np
from multiprocessing import Pool
import pandas as pd
import numpy.testing as npt
from skimage.transform import resize
import subprocess
import pickle
PROJECT_ROOT = Path(__file__).absolute().parent.parent.parent
sys.path.append(str(PROJECT_ROOT))
import utils.exp_utils as utils
def resample_array(src_imgs, src_spacing, target_spacing):
    """Resample an array from its source spacing to a target spacing.

    The spacings are indexed in reverse order relative to the array axes
    (``spacing[::-1][axis]``). The callers in this file pass spacings from
    SimpleITK images, ordered (x, y, z), together with arrays from
    ``sitk.GetArrayFromImage``, ordered (z, y, x).

    :param src_imgs: numpy array to resample.
    :param src_spacing: spacing of ``src_imgs``, one entry per array axis, in reversed axis order.
        Rounded to 3 decimals before use.
    :param target_spacing: desired spacing, same ordering as ``src_spacing``.
    :return: resampled array of dtype float32 (``resize`` with order=1, edge padding, clipped
        to the input value range).
    :raises AssertionError: if any dimension of the computed target shape is not positive.
    """
    src_spacing = np.round(src_spacing, 3)
    src_reversed = src_spacing[::-1]
    target_reversed = target_spacing[::-1]
    # the new extent per axis is truncated (not rounded) to an integer number of voxels.
    target_shape = [
        int(dim * src_reversed[axis] / target_reversed[axis])
        for axis, dim in enumerate(src_imgs.shape)
    ]
    # explicit check instead of `assert`, so it is not stripped when running with `python -O`.
    # The exception type and arguments are the ones callers have always seen.
    if any(dim <= 0 for dim in target_shape):
        raise AssertionError("AssertionError:", src_imgs.shape, src_spacing, target_spacing)

    img = src_imgs.astype(float)
    resampled_img = resize(img, target_shape, order=1, clip=True, mode='edge').astype('float32')
    return resampled_img
def pp_patient(inputs):
    """Preprocess a single patient and write the results to ``cf.pp_dir``.

    Steps: load the CT scan, resample it to ``cf.target_spacing``, clip and normalise the
    intensities, then fuse the per-rater ROI segmentations into one label map by voting.

    Relies on the module-level config object ``cf`` (created in the ``__main__`` block) for
    ``target_spacing``, ``root_dir`` and ``pp_dir``.

    Files written to ``cf.pp_dir``:
      - ``<pid>_img.npy``: the preprocessed image.
      - ``<pid>_rois.npy``: uint8 label map, 0 = background, 1..n = retained ROIs.
      - ``meta_info_<pid>.pickle``: dict with keys 'pid', 'class_target', 'spacing', 'fg_slices'.
      - ``suppressed_rois.txt``: appended with one line per ROI that did not survive the vote.

    :param inputs: tuple ``(ix, path)``; ``ix`` is a running index (unused), ``path`` is the
        patient directory, whose last component is taken as the patient id.
    :raises AssertionError: if a resampled ROI does not match the image in shape or spacing,
        or if the number of labels does not match the number of ROIs in the label map.
    """
    _, path = inputs
    # basename/normpath instead of split('/'): also works with a trailing separator or on Windows.
    pid = os.path.basename(os.path.normpath(path))

    img = sitk.ReadImage(os.path.join(path, '{}_ct_scan.nrrd'.format(pid)))
    # sitk.GetArrayFromImage switches an image with shape (x,y,z) to (z,y,x)
    img_arr = sitk.GetArrayFromImage(img)
    img_spacing = img.GetSpacing()
    print('processing {}'.format(pid), img_spacing, img_arr.shape)

    img_arr = resample_array(img_arr, img_spacing, cf.target_spacing)
    # clip to a fixed intensity window (presumably Hounsfield units -- confirm) before normalising.
    img_arr = np.clip(img_arr, -1200, 600)
    img_arr = img_arr.astype(np.float32)
    # NOTE(review): the std is cast to float16 before dividing, which loses precision.
    # Left unchanged so the output stays identical to previously generated data.
    img_arr = (img_arr - np.mean(img_arr)) / np.std(img_arr).astype(np.float16)

    df = pd.read_csv(os.path.join(cf.root_dir, 'characteristics.csv'), sep=';')
    df = df[df.PatientID == pid]

    n_raters = 4  # rater labels and masks are padded up to this count before averaging.
    final_rois = np.zeros_like(img_arr, dtype=np.uint8)
    mal_labels = []

    # list the directory once; sort so that roi numbering is reproducible across runs.
    dir_entries = sorted(os.listdir(path))
    # the roi id is the last '_'-separated token before the first '.' of the file name.
    roi_ids = sorted({f.split('.')[0].split('_')[-1] for f in dir_entries if '.nii.gz' in f})

    rix = 1
    for rid in roi_ids:
        # require an exact roi id match on top of the substring test: a plain substring
        # test would let e.g. rid '1' also pick up the files of rid '11'.
        roi_id_paths = [
            f for f in dir_entries
            if '{}.nii'.format(rid) in f and f.split('.')[0].split('_')[-1] == rid
        ]
        # int() handles zero-padded ids itself; stripping zeros first broke on an all-zero id.
        nodule_ids = [int(f.split('_')[2]) for f in roi_id_paths]
        rater_labels = [df[df.NoduleID == nid].Malignancy.values[0] for nid in nodule_ids]
        # missing raters count as label 0; negative labels are excluded from the mean.
        rater_labels.extend([0] * (n_raters - len(rater_labels)))
        mal_label = np.mean([label for label in rater_labels if label > -1])

        roi_rater_list = []
        for rp in roi_id_paths:
            roi = sitk.ReadImage(os.path.join(path, rp))
            roi_spacing = roi.GetSpacing()
            roi_arr = sitk.GetArrayFromImage(roi).astype(np.uint8)
            roi_arr = resample_array(roi_arr, roi_spacing, cf.target_spacing)
            assert roi_arr.shape == img_arr.shape, [roi_arr.shape, img_arr.shape, pid, roi_spacing]
            for axis in range(len(img_arr.shape)):
                npt.assert_almost_equal(roi_spacing[axis], img_spacing[axis])
            roi_rater_list.append(roi_arr)

        # missing raters contribute an empty mask to the vote.
        roi_rater_list.extend([np.zeros_like(roi_rater_list[-1])] * (n_raters - len(roi_id_paths)))
        roi_raters = np.mean(np.array(roi_rater_list), axis=0)
        roi_raters[roi_raters < 0.5] = 0

        if np.sum(roi_raters) > 0:
            mal_labels.append(mal_label)
            final_rois[roi_raters >= 0.5] = rix
            rix += 1
        else:
            # indicate rois suppressed by majority voting of raters
            print('suppressed roi!', roi_id_paths)
            with open(os.path.join(cf.pp_dir, 'suppressed_rois.txt'), 'a') as handle:
                # one line per suppressed roi; without the newline successive entries ran together.
                handle.write(" ".join(roi_id_paths) + "\n")

    # indices along the first array axis that contain at least one foreground voxel.
    fg_slices = list(np.unique(np.argwhere(final_rois != 0)[:, 0]))
    mal_labels = np.array(mal_labels)
    # +1 accounts for the background label 0.
    assert len(mal_labels) + 1 == len(np.unique(final_rois)), [len(mal_labels), np.unique(final_rois), pid]

    np.save(os.path.join(cf.pp_dir, '{}_rois.npy'.format(pid)), final_rois)
    np.save(os.path.join(cf.pp_dir, '{}_img.npy'.format(pid)), img_arr)

    with open(os.path.join(cf.pp_dir, 'meta_info_{}.pickle'.format(pid)), 'wb') as handle:
        # 'spacing' is the spacing of the loaded scan (img.GetSpacing()), not cf.target_spacing.
        meta_info_dict = {'pid': pid, 'class_target': mal_labels, 'spacing': img_spacing, 'fg_slices': fg_slices}
        pickle.dump(meta_info_dict, handle)
def aggregate_meta_info(exp_dir):
    """Collect all per-patient meta info pickles in a directory into one dataframe.

    Every file in ``exp_dir`` whose name contains 'meta_info' is unpickled and becomes one
    row of a dataframe with the columns 'pid', 'class_target', 'spacing' and 'fg_slices'.
    The dataframe is saved as ``info_df.pickle`` in the same directory.

    :param exp_dir: directory holding the meta info pickles; also receives the output file.
    """
    # sorted: os.listdir returns entries in arbitrary order, which made the row order
    # differ between runs and machines.
    files = sorted(os.path.join(exp_dir, f) for f in os.listdir(exp_dir) if 'meta_info' in f)

    records = []
    for f in files:
        with open(f, 'rb') as handle:
            # NOTE: unpickling can execute arbitrary code; only use on files written by this script.
            records.append(pickle.load(handle))

    # build the frame once instead of growing it row by row.
    df = pd.DataFrame(records, columns=['pid', 'class_target', 'spacing', 'fg_slices'])
    df.to_pickle(os.path.join(exp_dir, 'info_df.pickle'))
    print("aggregated meta info to df with length", len(df))
if __name__ == "__main__":
    # local import: only this script entry point needs it.
    import shutil

    cf_file = utils.import_module("cf", "configs.py")
    # cf is deliberately a module-level global: pp_patient reads it in the worker processes.
    # NOTE(review): this presumably relies on workers inheriting module globals (fork start
    # method); under 'spawn' the workers would not see cf -- confirm on the target platform.
    cf = cf_file.configs()

    # every sub-directory of the raw data dir is one patient. Non-directory entries are
    # skipped, since pp_patient lists the contents of each path it is given.
    paths = sorted(
        os.path.join(cf.raw_data_dir, ii)
        for ii in os.listdir(cf.raw_data_dir)
        if os.path.isdir(os.path.join(cf.raw_data_dir, ii))
    )

    # exist_ok avoids the check-then-create race and also creates missing parent dirs.
    os.makedirs(cf.pp_dir, exist_ok=True)

    pool = Pool(processes=os.cpu_count())
    try:
        pool.map(pp_patient, enumerate(paths))
    finally:
        pool.close()
        pool.join()

    # for debugging, run serially instead of using the pool:
    # for i in enumerate(paths):
    #     pp_patient(i)

    aggregate_meta_info(cf.pp_dir)
    # keep a backup copy of the aggregated dataframe. shutil.copy instead of a `cp` shell
    # string: portable and safe for paths containing spaces or shell metacharacters.
    shutil.copy(os.path.join(cf.pp_dir, 'info_df.pickle'), os.path.join(cf.pp_dir, 'info_df_bk.pickle'))