# preprocess/lidc-preprocessing.py
"""
General data pre-processing, used in all experimental settings.
"""

# # Pre-process LIDC CT scans to extract labelled nodules

from tqdm import tqdm
import os, sys
import pickle
sys.path.append("./")
import numpy as np
import pandas as pd
import pylidc as pl
import png
from preprocessingutils import *
import concurrent.futures
import nibabel as nib
from PIL import Image
from pathlib import Path

# Note: this script assumes it is run from the local directory;
# if you want the data stored somewhere else, adapt the paths below,
# or create a symlink for `data`.

preprocessdir = Path.cwd()
homedir       = preprocessdir.parent
DATA_DIR = homedir / 'data'
if not DATA_DIR.exists():
    os.makedirs(DATA_DIR)
LOG_FILE = Path("log") / "lidc-preprocessing.log"
if not LOG_FILE.parent.exists():
    os.makedirs(LOG_FILE.parent)
RESOURCES_DIR = homedir / 'resources'
if not RESOURCES_DIR.exists():
    os.makedirs(RESOURCES_DIR)
MAX_WORKERS = 1
OUT_SIZE = (69, 69, 69)
OUT_SHAPE_2D = (180, 180)
SPACING_2D = .5
OUT_SIZE_MM_2D = tuple(np.array(OUT_SHAPE_2D) * SPACING_2D)  # (180, 180) px * 0.5 mm/px = (90.0, 90.0) mm
# MIN_MM2  = 5**2
MIN_MM2 = 0
WRITE_NORESAMP_NODULES = False  # export non-resampled nodules
DO_RESAMP = False  # resample and crop for 3D nodules
DO_SLICES = True   # generate 2D slices

TEST_MODE = False
print(f"test mode: {TEST_MODE}")

file = open(LOG_FILE, "w+")

# The LIDC database contains annotations from up to 4 radiologists per nodule.
# These annotations need to be combined. It turns out that 'nodule_id' does not refer
# to a physical nodule at all: ids of annotations of the same nodule do not overlap.
# Luckily, pylidc provides built-in functionality to cluster the annotations that
# belong to the same nodule.
#
# Extract annotations to a dataframe (note: using pd.read_sql_table might be better,
# but I couldn't figure out which connection to use)

# ## Load scans with pylidc
# Create a dataframe with scan information

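# For orientation (illustrative only, based on the pylidc docs): cluster_annotations()
# returns one cluster per physical nodule, each cluster being a list of
# pylidc.Annotation objects, e.g.
#   clusters = scan.cluster_annotations()
#   len(clusters)     # number of distinct nodules found in this scan
#   len(clusters[0])  # number of radiologists who annotated the first nodule (1-4)
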
scans = pl.query(pl.Scan).all()
scan_dict = {}
for scan in scans:
    patient_id = scan.patient_id[-4:]
    if patient_id in scan_dict:
        # a few patients have multiple scans; disambiguate by adding 2000 to the id
        print(f"patient with multiple scans: {patient_id}; ", end="")
        patient_id = str(int(patient_id) + 2000)
        print(f"new id: {patient_id}")
    scan_dict[patient_id] = scan
assert len(scan_dict.keys()) == 1018  # LIDC-IDRI contains 1018 scans


if not (RESOURCES_DIR / "scan_df.csv").exists():
74
    scan_df_dict = {}
75
    print("preparing scan dataframe")
76
    for patient_id, scan in tqdm(scan_dict.items()):   # TODO add scan-id here
77
        scan_df_dict[patient_id] = {
78
            'contrast_used':        scan.contrast_used,
79
            'id':                   scan.id,
80
            'is_from_initial':      scan.is_from_initial,
81
            'patient_id_lidc':      scan.patient_id,
82
            'pixel_spacing':        scan.pixel_spacing,
83
            'series_instance_uid':  scan.series_instance_uid,
84
            'slice_spacing':        scan.slice_spacing,
85
            'slice_thickness':      scan.slice_thickness,
86
            'spacing_x':            scan.spacings[0],
87
            'spacing_y':            scan.spacings[1],
88
            'spacing_z':            scan.spacings[2],
89
            'study_instance_uid':   scan.study_instance_uid
90
        }
91
    scan_df = pd.DataFrame.from_dict(scan_df_dict, orient="index")
92
    scan_df.index = ["{:04d}".format(int(x)) for x in scan_df.index.values]
93
    scan_df.to_csv(RESOURCES_DIR / 'scan_df.csv', index=True, index_label="patient_id")
94
else:
95
    scan_df = pd.read_csv(RESOURCES_DIR / 'scan_df.csv')
96
97
# Scans can contain multiple annotations. Each annotation has an id, and there are
# nodule ids, but these don't coincide across annotations, while in reality some
# annotations concern the same actual nodule. This information is combined in the
# 'nodule_number' column, which numbers the nodules for each patient.
# Add the patient number as a column to the DataFrame, and create an actual nodule id
# (e.g. "0001n02" for the second nodule of patient 0001) based on the patient number
# and the nodule number.

# Clustering takes a long time, so the results are stored and can be picked up here:

# cluster nodules

if not os.path.exists(os.path.join(DATA_DIR, "nodule-clusters")):
    os.makedirs(os.path.join(DATA_DIR, "nodule-clusters"))

clustered_annotations = {}

if TEST_MODE:
    scan_dict = {k: scan_dict[k] for k in list(scan_dict.keys())[:5]}

nodule_files = os.listdir(os.path.join(DATA_DIR, "nodule-clusters"))
patients_with_nodules = list(set([x.split("n")[0] for x in nodule_files]))

for patient_id, scan in scan_dict.items():
    if patient_id in patients_with_nodules:
        nodule_ids = [x for x in nodule_files if x.startswith(patient_id)]
        for nodule in nodule_ids:
            # note: str.rstrip strips a *character set*, not a suffix, so use splitext
            nodule_id = os.path.splitext(nodule)[0]
            with open(os.path.join(DATA_DIR, "nodule-clusters", nodule), "rb") as f:
                clustered_annotations[nodule_id] = pickle.load(f)
    else:
        print("")
        print("extracting nodules for patient {}".format(patient_id), end="")
        for i, clustered_annotation in enumerate(scan.cluster_annotations()):
            print(" n{:02d}".format(i+1), end="")
            if not isinstance(clustered_annotation, list):
                clustered_annotation = [clustered_annotation]
            nodule_id = "{}n{:02d}".format(patient_id, i+1)
            clustered_annotations[nodule_id] = clustered_annotation
            with open(os.path.join(DATA_DIR, "nodule-clusters", nodule_id + ".pkl"), "wb") as f:
                pickle.dump(clustered_annotation, f)

# export all annotations in a flat dict
# TODO: do this earlier for prettier looping
# TODO: sort keys on patient id, to actually benefit from loading scans only once per patient...
anns = {}
nodule_ids = list(clustered_annotations.keys())
nodule_ids.sort()
for nodule_id in nodule_ids:
    annotation_list = clustered_annotations[nodule_id]
    for i, ann in enumerate(annotation_list):
        annotation_id = "{}a{}".format(nodule_id, i+1)
        anns[annotation_id] = ann

if not (RESOURCES_DIR / "annotation_df.csv").exists():
149
    # annotation_dfs = {k: pd.concat([annotation_to_df(ann) for ann in cluster]) for k, cluster in clustered_annotations.items()}
150
    annotation_dfs = {}
151
    for nodule_id, cluster in clustered_annotations.items():
152
        try:
153
            annotation_dfs[nodule_id] = annotation_list_to_df(cluster)
154
        except:
155
            print("annotation to df failed for nodule {}".format(nodule_id))
156
            print("annotation to df failed for nodule {}".format(nodule_id), file=file)
157
    # annotation_dfs = {k: annotation_list_to_df(cluster) for k, cluster in clustered_annotations.items()}
158
    annotation_df = pd.concat(annotation_dfs)
159
    annotation_df.reset_index(inplace=True)
160
    annotation_df.rename(index=str, columns={'level_0': 'nodule_id'}, inplace=True)
161
    annotation_df = annotation_df.drop(["level_1"], axis="columns")
162
    annotation_df["annotation_id"] = annotation_df[["nodule_id", "annotation_idx"]].apply(lambda x: "{}a{}".format(*x), axis=1)
163
    annotation_df["nodule_idx"] = [x[:4]+x[5:] for x in annotation_df["nodule_id"]]
164
165
    annotation_df.to_csv(RESOURCES_DIR / "annotation_df.csv", index=False)
166
else:
167
    annotation_df = pd.read_csv(RESOURCES_DIR / "annotation_df.csv")
168
169
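# `annotation_list_to_df` is imported from preprocessingutils; as a rough sketch of the
# assumed behavior, the hypothetical helper below builds one row per annotation from
# pylidc's semantic features. It is illustrative only and not used by this script.
def _annotation_list_to_df_sketch(cluster):
    rows = []
    for i, ann in enumerate(cluster):
        rows.append({
            "annotation_idx": i + 1,         # matches the a<idx> suffix in annotation ids
            "subtlety":       ann.subtlety,
            "texture":        ann.texture,
            "malignancy":     ann.malignancy,
            "diameter":       ann.diameter,  # estimated nodule diameter in mm
        })
    return pd.DataFrame(rows)
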
# write out non-resampled nodules
# TODO load scan per patient id, not per annotation id (takes way longer)
if WRITE_NORESAMP_NODULES:
    print("saving non-resampled nodules and masks")
    for out_dir in ["nodules3d-noresamp", "masks3d-noresamp"]:
        os.makedirs(os.path.join(DATA_DIR, out_dir), exist_ok=True)
    for nodule_id, annotation_list in tqdm(clustered_annotations.items()):
        for i, ann in enumerate(annotation_list):
            annotation_id = "{}a{}".format(nodule_id, i+1)
            if not os.path.exists(os.path.join(DATA_DIR, "nodules3d-noresamp", "{}.npy".format(annotation_id))):
                try:
                    vol  = ann.scan.to_volume()
                    mask = ann.boolean_mask()
                    bbox = ann.bbox()
                    nodule = vol[bbox]
                    np.save(os.path.join(DATA_DIR, "nodules3d-noresamp", "{}.npy".format(annotation_id)), nodule)
                    np.save(os.path.join(DATA_DIR, "masks3d-noresamp", "{}.npy".format(annotation_id)), mask)
                except Exception:
                    print(f"annotation id {annotation_id} failed")


# ## Resample and crop
# CT scanners can have different intercepts and slopes for converting the raw voxel data
# to Hounsfield Units (HU), which represent radiodensity.
# This information can be extracted from the DICOM headers and used to get all images on
# a uniform scale.
#
# Adapted from https://www.kaggle.com/gzuidhof/full-preprocessing-tutorial
#
# Secondly, we pick a random segmentation of the nodule and extract a bounding box around
# the nodule from the scan, along with the actual segmentation, which is represented as a
# boolean mask. Set the seed to select the random annotation.

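# `get_intercept_and_slope` is defined in preprocessingutils. A minimal sketch of the
# assumed lookup (hypothetical name, illustrative only, not used by this script): read
# RescaleIntercept and RescaleSlope from the first DICOM slice via pylidc/pydicom, so
# that hu = raw_value * slope + intercept.
def _get_intercept_and_slope_sketch(scan):
    images = scan.load_all_dicom_images(verbose=False)  # pylidc Scan method
    return int(images[0].RescaleIntercept), int(images[0].RescaleSlope)
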
for out_dir in ["nodules3d", "nodules2d", "masks3d", "masks2d"]:
198
    if not os.path.exists(os.path.join(DATA_DIR, out_dir)):
199
        os.makedirs(os.path.join(DATA_DIR, out_dir))
200
#%%
201
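# `resample_and_crop_annotation` lives in preprocessingutils. As a rough, illustrative
# sketch of the assumed core step (hypothetical helper, not used by this script):
# resample the volume to isotropic spacing with scipy before cropping a cube of
# size_mm around the nodule.
from scipy import ndimage  # only needed for the illustrative sketch below

def _resample_to_isotropic_sketch(volume, spacings, new_spacing_mm=1.0):
    # zoom factor per axis = current spacing / target spacing
    zoom_factors = np.asarray(spacings, dtype=float) / new_spacing_mm
    return ndimage.zoom(volume, zoom_factors, order=1)
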
if DO_RESAMP:
    print('resampling')
    last_pid = ""
    for ann_id, ann in tqdm(anns.items()):
        current_pid = ann_id.split("n")[0]
        if current_pid != last_pid:
            try:
                # load the scan once per patient, not once per annotation
                scan = ann.scan
            except Exception:
                print(f"loading scan for patient id {current_pid}, annotation id {ann_id} failed")
                print(f"loading scan for patient id {current_pid}, annotation id {ann_id} failed", file=file)
                continue
            last_pid = current_pid
        if not os.path.exists(os.path.join(DATA_DIR, "nodules3d", ann_id+".npy")):
            resample_and_crop_annotation(ann_id, ann,
                os.path.join(DATA_DIR, "nodules3d"),
                os.path.join(DATA_DIR, "masks3d"),
                scan=scan,
                size_mm=OUT_SIZE[0])
        else:
            print(f"{ann_id}.npy already exists")

# make niftis, for radiomics extraction
for out_dir in ["nodules", "masks"]:
    if not os.path.exists(os.path.join(DATA_DIR, "niftis", out_dir)):
        os.makedirs(os.path.join(DATA_DIR, "niftis", out_dir))
nods = os.listdir(os.path.join(DATA_DIR, "nodules3d"))
print("converting nodule numpy arrays to niftis")
for nod in tqdm(nods):
    ann_id = os.path.splitext(nod)[0]
    out_name = ann_id+".nii.gz"
    if not os.path.exists(os.path.join(DATA_DIR, "niftis", "nodules", out_name)):
        nod_npy = np.load(os.path.join(DATA_DIR, "nodules3d", nod))
        # np.eye(4) is an identity affine: no spacing/orientation metadata is stored
        nii_img = nib.Nifti1Image(nod_npy.astype(np.float64), np.eye(4))
        nib.save(nii_img, os.path.join(DATA_DIR, "niftis", "nodules", out_name))

masks = os.listdir(os.path.join(DATA_DIR, "masks3d"))
print("converting mask numpy arrays to niftis")
for mask in tqdm(masks):
    ann_id = os.path.splitext(mask)[0]
    out_name = ann_id + ".nii.gz"
    if not os.path.exists(os.path.join(DATA_DIR, "niftis", "masks", out_name)):
        mask_npy = np.load(os.path.join(DATA_DIR, "masks3d", mask))
        nii_img = nib.Nifti1Image(mask_npy.astype(np.float64), np.eye(4))
        nib.save(nii_img, os.path.join(DATA_DIR, "niftis", "masks", out_name))

# ### Generate 2D slices based on the nodules
# Take all slices from the non-resampled nodules

for out_dir in ["imgs", "masks"]:
    if not os.path.exists(os.path.join(DATA_DIR, "nodules2d", out_dir)):
        os.makedirs(os.path.join(DATA_DIR, "nodules2d", out_dir))

existing_files = os.listdir(os.path.join(DATA_DIR, "nodules2d", "imgs"))
existing_slices  = list(set([x.split("s")[0] for x in existing_files]))
existing_nodules = list(set([x.split("a")[0] for x in existing_files]))

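# `normalize_to_8bit` and `crop_nodule_tight_z` come from preprocessingutils. The
# hypothetical sketch below shows the assumed windowing behavior of normalize_to_8bit:
# clip to [in_min, in_max], rescale linearly to [0, 255], and cast to uint8 (the
# `center` argument of the real helper is not modeled here). Illustrative only.
def _normalize_to_8bit_sketch(arr, in_min=-2200.0, in_max=1000.0):
    arr = np.clip(arr.astype(np.float64), in_min, in_max)
    arr = (arr - in_min) / (in_max - in_min) * 255.0
    return arr.astype(np.uint8)
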
if DO_SLICES:
    print('creating slices')
    last_pid = ""
    for ann_id, ann in tqdm(anns.items()):
        current_pid = ann_id.split("n")[0]
        current_nodule_id = ann_id.split("a")[0]
        if current_nodule_id in existing_nodules:
            continue
        if current_pid != last_pid:
            try:
                print(f"loading scan for patient {current_pid}")
                scan         = ann.scan
                volume       = scan.to_volume()
                scan_spacing = scan.pixel_spacing
                intercept, slope = get_intercept_and_slope(scan)

                # convert raw values to HU; slope and intercept are scanner settings
                # (the intercept can be 0 or -1024, which makes a big difference!)
                volume *= np.array(slope, dtype=np.int16)
                volume += np.array(intercept, dtype=np.int16)

            except Exception as e:
                print(f"loading scan for patient {current_pid}, annotation id {ann_id} failed: {e}")
                print(f"loading scan for patient {current_pid}, annotation id {ann_id} failed: {e}", file=file)
                continue
            last_pid = current_pid

        if ann_id not in existing_slices:
            print("slicing annotation {}".format(ann_id))

            # crop and normalize
            try:
                nodule, mask, zvals = crop_nodule_tight_z(ann, volume, scan_spacing=scan_spacing, out_size_cm=OUT_SIZE_MM_2D[0] / 10)
                if zvals.shape[0] < nodule.shape[2]:
                    print(f"length of zvals ({zvals.shape[0]}) smaller than z dimension of nodule ({nodule.shape})")
                    print(f"length of zvals ({zvals.shape[0]}) smaller than z dimension of nodule ({nodule.shape})", file=file)
                    # pad the z indices with fresh values beyond the current maximum
                    new_zvals = np.zeros((nodule.shape[2],))
                    new_zvals[:zvals.shape[0]] = zvals
                    new_zvals[zvals.shape[0]:] = zvals.max() + 1 + np.arange(len(new_zvals) - len(zvals))
                    zvals = new_zvals.astype(int)
            except Exception as e:
                print(f"cropping failed, skipping...: {e}")
                print(f"cropping failed, skipping...: {e}", file=file)
                continue

            nodule = normalize_to_8bit(nodule, in_min = -2200.0, in_max = 1000.0, center=0.0)
            mask   = normalize_to_8bit(mask,   in_min = 0.0, in_max = 1.0)

            num_slices = nodule.shape[2]
            j = 0
            # export as images
            for slice_index in range(num_slices):
                slice_i, mask_i, zval_i  = nodule[:,:,slice_index], mask[:,:,slice_index], zvals[slice_index]
                # skip (nearly) empty mask slices; with MIN_MM2 = 0, any nonzero mask passes
                if mask_i.sum() > (MIN_MM2 / (scan_spacing**2)):
                    j += 1
                    slice_id = "{}s{:03d}".format(ann_id, zval_i)
                    img_nod = Image.fromarray(slice_i, mode="L")
                    img_nod = img_nod.resize(OUT_SHAPE_2D[:2])
                    img_nod.save(os.path.join(DATA_DIR, "nodules2d", "imgs", slice_id+".png"))
                    img_mask = Image.fromarray(mask_i, mode="L")
                    img_mask = img_mask.resize(OUT_SHAPE_2D[:2])
                    img_mask.save(os.path.join(DATA_DIR, "nodules2d", "masks", slice_id+".png"))

file.close()