--- a
+++ b/code/preprocessing.py
@@ -0,0 +1,189 @@
+"""
+    Preprocessing for U-net
+    Thresholding and mask the lung part
+    Use annotation to mask the nodules 
+"""
+
+import numpy as np
+import pandas as pd
+import os
+
+from skimage.segmentation import clear_border
+from skimage.measure import label,regionprops, perimeter
+from skimage.morphology import ball, disk, dilation, binary_erosion, remove_small_objects, erosion, closing, reconstruction, binary_closing
+from skimage.filters import roberts, sobel
+from scipy import ndimage as ndi
+
+from glob import glob
+from tqdm import tqdm
+
+import SimpleITK as sitk
+import scipy.misc
+import matplotlib.pyplot as plt
+
+
+def get_segmented_lungs(im, plot=False):
+    binary = im < 604
+    cleared = clear_border(binary)
+    label_image = label(cleared)
+    areas = [r.area for r in regionprops(label_image)]
+    areas.sort()
+    if len(areas) > 2:
+        for region in regionprops(label_image):
+            if region.area < areas[-2]:
+                for coordinates in region.coords:                
+                       label_image[coordinates[0], coordinates[1]] = 0
+    binary = label_image > 0
+    selem = disk(2)
+    binary = binary_erosion(binary, selem)
+    selem = disk(10)
+    binary = binary_closing(binary, selem)
+    edges = roberts(binary)
+    binary = ndi.binary_fill_holes(edges)
+    get_high_vals = binary == 0
+    im[get_high_vals] = 0
+    return im
+
+def segment_lung_from_ct_scan(ct_scan):
+    return np.asarray([get_segmented_lungs(slice) for slice in ct_scan])
+
+def load_itk(filename):
+    itkimage = sitk.ReadImage(filename)
+    ct_scan = sitk.GetArrayFromImage(itkimage)
+    origin = np.array(list(reversed(itkimage.GetOrigin())))
+    spacing = np.array(list(reversed(itkimage.GetSpacing())))
+    return ct_scan, origin, spacing
+
+def world_2_voxel(world_coordinates, origin, spacing):
+    stretched_voxel_coordinates = np.absolute(world_coordinates - origin)
+    voxel_coordinates = stretched_voxel_coordinates / spacing
+    return voxel_coordinates
+
+def voxel_2_world(voxel_coordinates, origin, spacing):
+    stretched_voxel_coordinates = voxel_coordinates * spacing
+    world_coordinates = stretched_voxel_coordinates + origin
+    return world_coordinates
+
+def seq(start, stop, step=1):
+    n = int(round((stop - start)/float(step)))
+    if n > 1:
+        return([start + step*i for i in range(n+1)])
+    else:
+        return([])
+
+def draw_circles(image,cands,origin,spacing):
+    #make empty matrix, which will be filled with the mask
+    RESIZE_SPACING = [1, 1, 1]
+    image_mask = np.zeros(image.shape)
+
+    #run over all the nodules in the lungs
+    for ca in cands.values:
+        #get middel x-,y-, and z-worldcoordinate of the nodule
+        radius = np.ceil(ca[4])/2
+        coord_x = ca[1]
+        coord_y = ca[2]
+        coord_z = ca[3]
+        image_coord = np.array((coord_z,coord_y,coord_x))
+
+        #determine voxel coordinate given the worldcoordinate
+        image_coord = world_2_voxel(image_coord,origin,spacing)
+
+        #determine the range of the nodule
+        noduleRange = seq(-radius, radius, RESIZE_SPACING[0])
+
+        #create the mask
+        for x in noduleRange:
+            for y in noduleRange:
+                for z in noduleRange:
+                    coords = world_2_voxel(np.array((coord_z+z,coord_y+y,coord_x+x)),origin,spacing)
+                    if (np.linalg.norm(image_coord-coords) * RESIZE_SPACING[0]) < radius:
+                        image_mask[int(np.round(coords[0])),int(np.round(coords[1])),int(np.round(coords[2]))] = int(1)
+    
+    return image_mask
+
+def create_nodule_mask(imagePath, cands, fcount, subsetnum,final_lung_mask,final_nodule_mask):
+    #if os.path.isfile(imagePath.replace('original',SAVE_FOLDER_image)) == False:
+    img, origin, spacing = load_itk(imagePath)
+    #calculate resize factor
+    RESIZE_SPACING = [1, 1, 1]
+    resize_factor = spacing / RESIZE_SPACING
+    new_real_shape = img.shape * resize_factor
+    new_shape = np.round(new_real_shape)
+    real_resize = new_shape / img.shape
+    new_spacing = spacing / real_resize
+    
+    #resize image
+    lung_img = scipy.ndimage.interpolation.zoom(img, real_resize)
+    
+    # Segment the lung structure
+    lung_img = lung_img + 1024
+    lung_mask = segment_lung_from_ct_scan(lung_img)
+    lung_img = lung_img - 1024
+
+    #create nodule mask
+    nodule_mask = draw_circles(lung_img,cands,origin,new_spacing)
+
+    lung_img_512, lung_mask_512, nodule_mask_512 = np.zeros((lung_img.shape[0], 512, 512)), np.zeros((lung_mask.shape[0], 512, 512)), np.zeros((nodule_mask.shape[0], 512, 512))
+
+    original_shape = lung_img.shape	
+    i_start = 0
+    i_end = 0
+    flag = 0
+    for z in range(lung_img.shape[0]):
+        offset = (512 - original_shape[1])
+        upper_offset = int(np.round(offset/2))
+        lower_offset = int(offset - upper_offset)
+
+        new_origin = voxel_2_world([-upper_offset,-lower_offset,0],origin,new_spacing)
+
+        lung_mask_512[z, upper_offset:-lower_offset,upper_offset:-lower_offset] = lung_mask[z,:,:]
+        nodule_mask_512[z, upper_offset:-lower_offset,upper_offset:-lower_offset] = nodule_mask[z,:,:]
+
+    # return final_lung_mask,final_nodule_mask
+    # save images.
+    np.save(os.path.join(OUTPUT_PATH,"lung_mask_%04d_%04d.npy" % (subsetnum, fcount)),lung_mask_512)
+    np.save(os.path.join(OUTPUT_PATH,"nodule_mask_%04d_%04d.npy" % (subsetnum, fcount)),nodule_mask_512)
+
+
+
+# Helper function to get rows in data frame associated with each file
+def get_filename(file_list, case):
+    for f in file_list:
+        if case in f:
+            return(f)
+
+# Getting list of image files
+LUNA_DATA_PATH = '/home/marshallee/Documents/lung/'
+OUTPUT_PATH = '/home/marshallee/Documents/lung/output'
+
+final_lung_mask = np.zeros((1,512,512))
+final_nodule_mask = np.zeros((1,512,512))
+
+# create a list of subsets, which are lists of file paths
+FILE_LIST = []
+for i in range(0, 9):
+    LUNA_SUBSET_PATH = LUNA_DATA_PATH + 'subset'+str(i)+'/'
+    FILE_LIST.append(glob(LUNA_SUBSET_PATH + '*.mhd'))
+
+
+for subsetnum, subsetlist in enumerate(FILE_LIST):
+    # The locations of the nodes
+    df_node = pd.read_csv(LUNA_DATA_PATH + "mask-generate/CSVFILES/annotations.csv")
+    #df_node = pd.read_csv(LUNA_DATA_PATH + "CSVFILES/annotations.csv")
+    df_node["file"] = df_node["seriesuid"].map(lambda file_name: get_filename(subsetlist, file_name))
+    df_node = df_node.dropna()
+
+    # Looping over the image files
+    for fcount, img_file in enumerate(tqdm(subsetlist)):
+        mini_df = df_node[df_node["file"]==img_file] # get all nodules associate with file
+        if mini_df.shape[0]>0: # some files may not have a nodule--skipping those
+            # feeding mini_df to the function will work for "cands"
+            final_lung_mask, final_nodule_mask =create_nodule_mask(img_file, mini_df, fcount, subsetnum,final_lung_mask,final_nodule_mask)
+
+final_lung_mask = final_lung_mask[1:]
+final_nodule_mask = final_nodule_mask[1:]
+print(final_lung_mask.shape)
+print(final_nodule_mask.shape)
+np.save(os.path.join(OUTPUT_PATH,'final_lung_mask.npy'),final_lung_mask)
+np.save(os.path.join(OUTPUT_PATH,'final_nodule_mask.npy'),final_nodule_mask)
+