--- a +++ b/preprocess/preprocessingutils.py @@ -0,0 +1,175 @@ +import os +import numpy as np +import pandas as pd +import pylidc as pl +import pydicom +import pickle +from sklearn.preprocessing import PowerTransformer + +def annotation_to_dict(ann): + ''' + Flattens annotations into a single row pandas DataFrame + ''' + + d = { + 'patient_id': ann.scan.patient_id, + 'nodule_id_lidc': ann._nodule_id, + 'annotation_id_lidc': ann.id, + 'scan_id': ann.scan_id, + 'volume': ann.volume, + 'surface_area': ann.surface_area, + 'diameter': ann.diameter + } + feature_names = ['sublety', 'internalstructure', 'calcification', + 'sphericity', 'margin', 'lobulation', 'spiculation', + 'texture', 'malignancy'] + for feature, value in zip(feature_names, ann.feature_vals()): + d[feature] = value + return d + +def annotation_to_df(ann): + try: + d = annotation_to_dict(ann) + d = {k: [v] for k, v in d.items()} + df = pd.DataFrame.from_dict(d) + except: + df = None + return df + +def annotation_list_to_df(anns): + assert isinstance(anns, list) + dfs = [] + for ann in anns: + dfs.append(annotation_to_df(ann)) + df = pd.concat(dfs, ignore_index=True) + df["annotation_idx"] = range(1, df.shape[0]+1) + return df + + +def get_intercept_and_slope(scan): + ''' + scan is the results of a pydicom query + returns the intercept and slope + adapted from https://www.kaggle.com/gzuidhof/full-preprocessing-tutorial + ''' + dcm_path = scan.get_path_to_dicom_files() + dcm_files = [x for x in os.listdir(dcm_path) if x.endswith(".dcm")] + slice0 = pydicom.read_file(os.path.join(dcm_path, dcm_files[0]), stop_before_pixels=True) + intercept = slice0.RescaleIntercept + slope = slice0.RescaleSlope + return intercept, slope + +def resample_and_crop_annotation(ann_id, ann, nodule_path, mask_path=None, scan=None, size_mm = 50, export_mask = True): + ''' + take an annotation, crop and resample + size is the length of the sides of the resulting cube in millimeters + ''' + if scan is None: + scan = ann.scan + intercept, slope = get_intercept_and_slope(scan) + try: + vol, mask = ann.uniform_cubic_resample(side_length = size_mm, verbose = True) + if slope != 1: + vol = slope * vol.astype(np.float64) + + vol = vol.astype(np.int16) + vol += np.int16(intercept) + + np.save(os.path.join(nodule_path, ann_id+".npy"), vol) + if export_mask: + assert mask_path != None + np.save(os.path.join(mask_path, ann_id+".npy"), mask) + print("") + except: + print("-failed") + + +def crop_nodule_tight_z(ann, volume=None, scan=None, scan_spacing=None, out_size_cm = 5): + """ + Get nodule cropped tightly in z direction, but of minimum dimension in xy plane + """ + # print(f"trying to crop") + if volume is None: + if scan is None: + scan = ann.scan + volume = scan.to_volume() + if scan_spacing is None: + scan_spacing = scan.pixel_spacing + # print(f"scan_spacing: {scan_spacing}") + padding = get_padding_tight_z(ann, scan_spacing=scan_spacing, out_size_cm=out_size_cm) + # print(f"padding: {padding}") + + mask = ann.boolean_mask(pad=padding) + bbox = ann.bbox(pad=padding) + zvals= ann.contour_slice_indices + arr = volume[bbox] + + return arr, mask, zvals + +def get_padding_tight_z(ann, scan=None, scan_spacing=None, out_size_cm = None): + """ + Get bbox dimensions base on a minimal size, restricting to no padding in z direction + """ + if scan_spacing is None: + if scan is None: + scan_spacing = ann.scan.pixel_spacing + else: + scan_spacing = scan.pixel_spacing + # return tight bounding box + if out_size_cm is None: + padding = [(int(0), int(0))] * 3 + else: + # if len(out_size_cm == 3): + # if out_size_cm[2] is None: + # padding_z = (0,0) + out_shape = (np.ceil((out_size_cm * 10) / scan_spacing) * np.ones((2,))).astype(int) + + bb_mat = ann.bbox_matrix() + bb_shape = bb_mat[:,1] - bb_mat[:,0] + + paddings = out_shape - bb_shape[:2] + + # print(f"paddings: {paddings}") + + padding_x = (int(np.ceil(paddings[0] / 2)), int(np.floor(paddings[0] / 2))) + padding_y = (int(np.ceil(paddings[1] / 2)), int(np.floor(paddings[1] / 2))) + + padding = [padding_x, padding_y, (int(0),int(0))] + + return padding + + + +def normalize(x, window = None, level = None, in_min = -1000.0, in_max = 600.0, center=0.0): + """ + Normalize array to values between 0 and 1, possibly clipping + """ + assert type(x) is np.ndarray + + if (not window is None) & (not level is None) : + in_min = level - (window / 2) + in_max = level + (window / 2) + + x = x - in_min # add zero point + x = x / (in_max - in_min) # scale to unit + x = x + center # adjust white-balance + x = np.clip(x, 0.0, 1.0) # clip to (0,1) + return x + +def normalized_to_8bit(x): + assert ((x.min() >= 0) & (x.max() <= 1)) + x = (255 * x) + return x.astype(np.uint8) + +def normalize_to_8bit(x, *args, **kwargs): + return normalized_to_8bit(normalize(x, *args, **kwargs)) + +def pwr_transform(x, train_ids=None): + x = np.array(x).reshape(-1,1) + pt = PowerTransformer(method="yeo-johnson") + if train_ids is None: + pt.fit(x) + else: + pt.fit(x[train_ids]) + y = pt.transform(x) + return np.squeeze(y)