# preprocess/lidc-preprocessing.py
"""
General data pre-processing, used in all experimental settings.
"""

# # Pre-process LIDC CT scans to extract labelled nodules

from tqdm import tqdm
import os, sys
import pickle
sys.path.append("./")
import numpy as np
import pandas as pd
import pylidc as pl
import png
from preprocessingutils import *
import concurrent.futures
import nibabel as nib
from PIL import Image
from pathlib import Path

# Note: this script assumes it is run from the local directory;
# if you want the data stored somewhere else, adapt the paths below,
# or create a symlink for `data`.

preprocessdir = Path.cwd()
homedir       = preprocessdir.parent
DATA_DIR = homedir / 'data'
if not DATA_DIR.exists():
    os.makedirs(DATA_DIR)
LOG_FILE = Path("log") / "lidc-preprocessing.log"
if not LOG_FILE.parent.exists():
    os.makedirs(LOG_FILE.parent)
RESOURCES_DIR = homedir / 'resources'
if not RESOURCES_DIR.exists():
    os.makedirs(RESOURCES_DIR)
MAX_WORKERS = 1
OUT_SIZE = (69, 69, 69)
OUT_SHAPE_2D = (180, 180)
SPACING_2D = .5
OUT_SIZE_MM_2D = tuple(np.array(OUT_SHAPE_2D) * SPACING_2D)  # (180, 180) px * 0.5 mm/px = (90.0, 90.0) mm
# MIN_MM2  = 5**2
MIN_MM2 = 0
WRITE_NORESAMP_NODULES = False  # export non-resampled nodules
DO_RESAMP = False  # resample and crop for 3D nodules
DO_SLICES = True   # generate 2D slices

TEST_MODE = False
print(f"test mode: {TEST_MODE}")

file = open(LOG_FILE, "w+")

# The LIDC database contains annotations from up to 4 radiologists per nodule.
# These annotations need to be combined. It turns out that 'nodule_id' does not refer
# to a physical nodule at all: ids of annotations of the same nodule do not overlap.
# Luckily, pylidc provides built-in functionality to cluster the annotations that
# belong to the same nodule.
#
# Extract annotations to a dataframe (note: using pd.read_sql_table might be better,
# but I couldn't figure out which connection to use)

# ## Load scans with pylidc
# Create a dataframe with scan information

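# For orientation (illustrative only, based on the pylidc docs): cluster_annotations()
# returns one cluster per physical nodule, each cluster being a list of
# pylidc.Annotation objects, e.g.
#   clusters = scan.cluster_annotations()
#   len(clusters)     # number of distinct nodules found in this scan
#   len(clusters[0])  # number of radiologists who annotated the first nodule (1-4)
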
scans = pl.query(pl.Scan).all()
scan_dict = {}
for scan in scans:
    patient_id = scan.patient_id[-4:]
    if patient_id in scan_dict:
        # a few patients have multiple scans; disambiguate by adding 2000 to the id
        print(f"patient with multiple scans: {patient_id}; ", end="")
        patient_id = str(int(patient_id) + 2000)
        print(f"new id: {patient_id}")
    scan_dict[patient_id] = scan
assert len(scan_dict.keys()) == 1018  # LIDC-IDRI contains 1018 scans


if not (RESOURCES_DIR / "scan_df.csv").exists():
74
    scan_df_dict = {}
75
    print("preparing scan dataframe")
76
    for patient_id, scan in tqdm(scan_dict.items()):   # TODO add scan-id here
77
        scan_df_dict[patient_id] = {
78
            'contrast_used':        scan.contrast_used,
79
            'id':                   scan.id,
80
            'is_from_initial':      scan.is_from_initial,
81
            'patient_id_lidc':      scan.patient_id,
82
            'pixel_spacing':        scan.pixel_spacing,
83
            'series_instance_uid':  scan.series_instance_uid,
84
            'slice_spacing':        scan.slice_spacing,
85
            'slice_thickness':      scan.slice_thickness,
86
            'spacing_x':            scan.spacings[0],
87
            'spacing_y':            scan.spacings[1],
88
            'spacing_z':            scan.spacings[2],
89
            'study_instance_uid':   scan.study_instance_uid
90
        }
91
    scan_df = pd.DataFrame.from_dict(scan_df_dict, orient="index")
92
    scan_df.index = ["{:04d}".format(int(x)) for x in scan_df.index.values]
93
    scan_df.to_csv(RESOURCES_DIR / 'scan_df.csv', index=True, index_label="patient_id")
94
else:
95
    scan_df = pd.read_csv(RESOURCES_DIR / 'scan_df.csv')
96
97
# Scans can contain multiple annotations. Each annotation has an id, and there are
# nodule ids, but these don't coincide across annotations, while in reality some
# annotations concern the same actual nodule. This information is combined in the
# 'nodule_number' column, which numbers the nodules for each patient.
# Add the patient number as a column to the DataFrame, and create an actual nodule id
# (e.g. "0001n02" for the second nodule of patient 0001) based on the patient number
# and the nodule number.

# Clustering takes a long time, so the results are stored and can be picked up here:

# cluster nodules

if not os.path.exists(os.path.join(DATA_DIR, "nodule-clusters")):
    os.makedirs(os.path.join(DATA_DIR, "nodule-clusters"))

clustered_annotations = {}

if TEST_MODE:
    scan_dict = {k: scan_dict[k] for k in list(scan_dict.keys())[:5]}

nodule_files = os.listdir(os.path.join(DATA_DIR, "nodule-clusters"))
patients_with_nodules = list(set([x.split("n")[0] for x in nodule_files]))

for patient_id, scan in scan_dict.items():
    if patient_id in patients_with_nodules:
        nodule_ids = [x for x in nodule_files if x.startswith(patient_id)]
        for nodule in nodule_ids:
            # note: str.rstrip strips a *character set*, not a suffix, so use splitext
            nodule_id = os.path.splitext(nodule)[0]
            with open(os.path.join(DATA_DIR, "nodule-clusters", nodule), "rb") as f:
                clustered_annotations[nodule_id] = pickle.load(f)
    else:
        print("")
        print("extracting nodules for patient {}".format(patient_id), end="")
        for i, clustered_annotation in enumerate(scan.cluster_annotations()):
            print(" n{:02d}".format(i+1), end="")
            if not isinstance(clustered_annotation, list):
                clustered_annotation = [clustered_annotation]
            nodule_id = "{}n{:02d}".format(patient_id, i+1)
            clustered_annotations[nodule_id] = clustered_annotation
            with open(os.path.join(DATA_DIR, "nodule-clusters", nodule_id + ".pkl"), "wb") as f:
                pickle.dump(clustered_annotation, f)

# export all annotations in a flat dict
# TODO: do this earlier for prettier looping
# TODO: sort keys on patient id, to actually benefit from loading scans only once per patient...
anns = {}
nodule_ids = list(clustered_annotations.keys())
nodule_ids.sort()
for nodule_id in nodule_ids:
    annotation_list = clustered_annotations[nodule_id]
    for i, ann in enumerate(annotation_list):
        annotation_id = "{}a{}".format(nodule_id, i+1)
        anns[annotation_id] = ann

if not (RESOURCES_DIR / "annotation_df.csv").exists():
149
    # annotation_dfs = {k: pd.concat([annotation_to_df(ann) for ann in cluster]) for k, cluster in clustered_annotations.items()}
150
    annotation_dfs = {}
151
    for nodule_id, cluster in clustered_annotations.items():
152
        try:
153
            annotation_dfs[nodule_id] = annotation_list_to_df(cluster)
154
        except:
155
            print("annotation to df failed for nodule {}".format(nodule_id))
156
            print("annotation to df failed for nodule {}".format(nodule_id), file=file)
157
    # annotation_dfs = {k: annotation_list_to_df(cluster) for k, cluster in clustered_annotations.items()}
158
    annotation_df = pd.concat(annotation_dfs)
159
    annotation_df.reset_index(inplace=True)
160
    annotation_df.rename(index=str, columns={'level_0': 'nodule_id'}, inplace=True)
161
    annotation_df = annotation_df.drop(["level_1"], axis="columns")
162
    annotation_df["annotation_id"] = annotation_df[["nodule_id", "annotation_idx"]].apply(lambda x: "{}a{}".format(*x), axis=1)
163
    annotation_df["nodule_idx"] = [x[:4]+x[5:] for x in annotation_df["nodule_id"]]
164
165
    annotation_df.to_csv(RESOURCES_DIR / "annotation_df.csv", index=False)
166
else:
167
    annotation_df = pd.read_csv(RESOURCES_DIR / "annotation_df.csv")
168
169
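# `annotation_list_to_df` is imported from preprocessingutils; as a rough sketch of the
# assumed behavior, the hypothetical helper below builds one row per annotation from
# pylidc's semantic features. It is illustrative only and not used by this script.
def _annotation_list_to_df_sketch(cluster):
    rows = []
    for i, ann in enumerate(cluster):
        rows.append({
            "annotation_idx": i + 1,         # matches the a<idx> suffix in annotation ids
            "subtlety":       ann.subtlety,
            "texture":        ann.texture,
            "malignancy":     ann.malignancy,
            "diameter":       ann.diameter,  # estimated nodule diameter in mm
        })
    return pd.DataFrame(rows)
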
# write out non-resampled nodules
# TODO load scan per patient id, not per annotation id (takes way longer)
if WRITE_NORESAMP_NODULES:
    print("saving non-resampled nodules and masks")
    for out_dir in ["nodules3d-noresamp", "masks3d-noresamp"]:
        os.makedirs(os.path.join(DATA_DIR, out_dir), exist_ok=True)
    for nodule_id, annotation_list in tqdm(clustered_annotations.items()):
        for i, ann in enumerate(annotation_list):
            annotation_id = "{}a{}".format(nodule_id, i+1)
            if not os.path.exists(os.path.join(DATA_DIR, "nodules3d-noresamp", "{}.npy".format(annotation_id))):
                try:
                    vol  = ann.scan.to_volume()
                    mask = ann.boolean_mask()
                    bbox = ann.bbox()
                    nodule = vol[bbox]
                    np.save(os.path.join(DATA_DIR, "nodules3d-noresamp", "{}.npy".format(annotation_id)), nodule)
                    np.save(os.path.join(DATA_DIR, "masks3d-noresamp", "{}.npy".format(annotation_id)), mask)
                except Exception:
                    print(f"annotation id {annotation_id} failed")


# ## Resample and crop
# CT scanners can have different intercepts and slopes for converting the raw voxel data
# to Hounsfield Units (HU), which represent radiodensity.
# This information can be extracted from the DICOM headers and used to get all images on
# a uniform scale.
#
# Adapted from https://www.kaggle.com/gzuidhof/full-preprocessing-tutorial
#
# Secondly, we pick a random segmentation of the nodule and extract a bounding box around
# the nodule from the scan, along with the actual segmentation, which is represented as a
# boolean mask. Set the seed to select the random annotation.

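# `get_intercept_and_slope` is defined in preprocessingutils. A minimal sketch of the
# assumed lookup (hypothetical name, illustrative only, not used by this script): read
# RescaleIntercept and RescaleSlope from the first DICOM slice via pylidc/pydicom, so
# that hu = raw_value * slope + intercept.
def _get_intercept_and_slope_sketch(scan):
    images = scan.load_all_dicom_images(verbose=False)  # pylidc Scan method
    return int(images[0].RescaleIntercept), int(images[0].RescaleSlope)
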
for out_dir in ["nodules3d", "nodules2d", "masks3d", "masks2d"]:
198
    if not os.path.exists(os.path.join(DATA_DIR, out_dir)):
199
        os.makedirs(os.path.join(DATA_DIR, out_dir))
200
#%%
201
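# `resample_and_crop_annotation` lives in preprocessingutils. As a rough, illustrative
# sketch of the assumed core step (hypothetical helper, not used by this script):
# resample the volume to isotropic spacing with scipy before cropping a cube of
# size_mm around the nodule.
from scipy import ndimage  # only needed for the illustrative sketch below

def _resample_to_isotropic_sketch(volume, spacings, new_spacing_mm=1.0):
    # zoom factor per axis = current spacing / target spacing
    zoom_factors = np.asarray(spacings, dtype=float) / new_spacing_mm
    return ndimage.zoom(volume, zoom_factors, order=1)
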
if DO_RESAMP:
    print('resampling')
    last_pid = ""
    for ann_id, ann in tqdm(anns.items()):
        current_pid = ann_id.split("n")[0]
        if current_pid != last_pid:
            try:
                # load the scan once per patient, not once per annotation
                scan = ann.scan
            except Exception:
                print(f"loading scan for patient id {current_pid}, annotation id {ann_id} failed")
                print(f"loading scan for patient id {current_pid}, annotation id {ann_id} failed", file=file)
                continue
            last_pid = current_pid
        if not os.path.exists(os.path.join(DATA_DIR, "nodules3d", ann_id+".npy")):
            resample_and_crop_annotation(ann_id, ann,
                os.path.join(DATA_DIR, "nodules3d"),
                os.path.join(DATA_DIR, "masks3d"),
                scan=scan,
                size_mm=OUT_SIZE[0])
        else:
            print(f"{ann_id}.npy already exists")

# make niftis, for radiomics extraction
for out_dir in ["nodules", "masks"]:
    if not os.path.exists(os.path.join(DATA_DIR, "niftis", out_dir)):
        os.makedirs(os.path.join(DATA_DIR, "niftis", out_dir))
nods = os.listdir(os.path.join(DATA_DIR, "nodules3d"))
print("converting nodule numpy arrays to niftis")
for nod in tqdm(nods):
    ann_id = os.path.splitext(nod)[0]
    out_name = ann_id+".nii.gz"
    if not os.path.exists(os.path.join(DATA_DIR, "niftis", "nodules", out_name)):
        nod_npy = np.load(os.path.join(DATA_DIR, "nodules3d", nod))
        # np.eye(4) is an identity affine: no spacing/orientation metadata is stored
        nii_img = nib.Nifti1Image(nod_npy.astype(np.float64), np.eye(4))
        nib.save(nii_img, os.path.join(DATA_DIR, "niftis", "nodules", out_name))

masks = os.listdir(os.path.join(DATA_DIR, "masks3d"))
print("converting mask numpy arrays to niftis")
for mask in tqdm(masks):
    ann_id = os.path.splitext(mask)[0]
    out_name = ann_id + ".nii.gz"
    if not os.path.exists(os.path.join(DATA_DIR, "niftis", "masks", out_name)):
        mask_npy = np.load(os.path.join(DATA_DIR, "masks3d", mask))
        nii_img = nib.Nifti1Image(mask_npy.astype(np.float64), np.eye(4))
        nib.save(nii_img, os.path.join(DATA_DIR, "niftis", "masks", out_name))

# ### Generate 2D slices based on the nodules
# Take all slices from the non-resampled nodules

for out_dir in ["imgs", "masks"]:
    if not os.path.exists(os.path.join(DATA_DIR, "nodules2d", out_dir)):
        os.makedirs(os.path.join(DATA_DIR, "nodules2d", out_dir))

existing_files = os.listdir(os.path.join(DATA_DIR, "nodules2d", "imgs"))
existing_slices  = list(set([x.split("s")[0] for x in existing_files]))
existing_nodules = list(set([x.split("a")[0] for x in existing_files]))

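# `normalize_to_8bit` and `crop_nodule_tight_z` come from preprocessingutils. The
# hypothetical sketch below shows the assumed windowing behavior of normalize_to_8bit:
# clip to [in_min, in_max], rescale linearly to [0, 255], and cast to uint8 (the
# `center` argument of the real helper is not modeled here). Illustrative only.
def _normalize_to_8bit_sketch(arr, in_min=-2200.0, in_max=1000.0):
    arr = np.clip(arr.astype(np.float64), in_min, in_max)
    arr = (arr - in_min) / (in_max - in_min) * 255.0
    return arr.astype(np.uint8)
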
if DO_SLICES:
    print('creating slices')
    last_pid = ""
    for ann_id, ann in tqdm(anns.items()):
        current_pid = ann_id.split("n")[0]
        current_nodule_id = ann_id.split("a")[0]
        if current_nodule_id in existing_nodules:
            continue
        if current_pid != last_pid:
            try:
                print(f"loading scan for patient {current_pid}")
                scan         = ann.scan
                volume       = scan.to_volume()
                scan_spacing = scan.pixel_spacing
                intercept, slope = get_intercept_and_slope(scan)

                # convert raw values to HU; slope and intercept are scanner settings
                # (the intercept can be 0 or -1024, which makes a big difference!)
                volume *= np.array(slope, dtype=np.int16)
                volume += np.array(intercept, dtype=np.int16)

            except Exception as e:
                print(f"loading scan for patient {current_pid}, annotation id {ann_id} failed: {e}")
                print(f"loading scan for patient {current_pid}, annotation id {ann_id} failed: {e}", file=file)
                continue
            last_pid = current_pid

        if ann_id not in existing_slices:
            print("slicing annotation {}".format(ann_id))

            # crop and normalize
            try:
                nodule, mask, zvals = crop_nodule_tight_z(ann, volume, scan_spacing=scan_spacing, out_size_cm=OUT_SIZE_MM_2D[0] / 10)
                if zvals.shape[0] < nodule.shape[2]:
                    print(f"length of zvals ({zvals.shape[0]}) smaller than z dimension of nodule ({nodule.shape})")
                    print(f"length of zvals ({zvals.shape[0]}) smaller than z dimension of nodule ({nodule.shape})", file=file)
                    # pad the z indices with fresh values beyond the current maximum
                    new_zvals = np.zeros((nodule.shape[2],))
                    new_zvals[:zvals.shape[0]] = zvals
                    new_zvals[zvals.shape[0]:] = zvals.max() + 1 + np.arange(len(new_zvals) - len(zvals))
                    zvals = new_zvals.astype(int)
            except Exception as e:
                print(f"cropping failed, skipping...: {e}")
                print(f"cropping failed, skipping...: {e}", file=file)
                continue

            nodule = normalize_to_8bit(nodule, in_min = -2200.0, in_max = 1000.0, center=0.0)
            mask   = normalize_to_8bit(mask,   in_min = 0.0, in_max = 1.0)

            num_slices = nodule.shape[2]
            j = 0
            # export as images
            for slice_index in range(num_slices):
                slice_i, mask_i, zval_i  = nodule[:,:,slice_index], mask[:,:,slice_index], zvals[slice_index]
                # skip (nearly) empty mask slices; with MIN_MM2 = 0, any nonzero mask passes
                if mask_i.sum() > (MIN_MM2 / (scan_spacing**2)):
                    j += 1
                    slice_id = "{}s{:03d}".format(ann_id, zval_i)
                    img_nod = Image.fromarray(slice_i, mode="L")
                    img_nod = img_nod.resize(OUT_SHAPE_2D[:2])
                    img_nod.save(os.path.join(DATA_DIR, "nodules2d", "imgs", slice_id+".png"))
                    img_mask = Image.fromarray(mask_i, mode="L")
                    img_mask = img_mask.resize(OUT_SHAPE_2D[:2])
                    img_mask.save(os.path.join(DATA_DIR, "nodules2d", "masks", slice_id+".png"))

file.close()