a b/code/vowel-extract.py
1
"""
2
    Utility for vowel extraction
3
"""
4
5
import numpy as np
6
import pandas as pd
7
import os
8
import csv
9
from skimage.segmentation import clear_border
10
from skimage.measure import label,regionprops, perimeter
11
from skimage.morphology import ball, disk, dilation, binary_erosion, remove_small_objects, erosion, closing, reconstruction, binary_closing
12
from skimage.filters import roberts, sobel
13
from scipy import ndimage as ndi
14
import pandas as pd
15
from glob import glob
16
from tqdm import tqdm
17
import SimpleITK as sitk
18
import scipy.misc
19
import matplotlib.pyplot as plt
20
import traceback
21
import random
22
from PIL import Image
23
from project_config import *
24
from mpl_toolkits.mplot3d.art3d import Poly3DCollection
25
from skimage import measure, feature
26
import time
27
28
def draw_circles(image,cands,origin,spacing):
29
    #make empty matrix, which will be filled with the mask
30
    RESIZE_SPACING = [1, 1, 1]
31
    image_mask = np.zeros(image.shape)
32
33
    #run over all the nodules in the lungs
34
    for ca in cands.values:
35
        #get middel x-,y-, and z-worldcoordinate of the nodule
36
        radius = np.ceil(ca[4])/2
37
        coord_x = ca[1]
38
        coord_y = ca[2]
39
        coord_z = ca[3]
40
        image_coord = np.array((coord_z,coord_y,coord_x))
41
42
        #determine voxel coordinate given the worldcoordinate
43
        image_coord = world_2_voxel(image_coord,origin,spacing)
44
45
        #determine the range of the nodule
46
        noduleRange = seq(-radius, radius, RESIZE_SPACING[0])
47
48
        #create the mask
49
        for x in noduleRange:
50
            for y in noduleRange:
51
                for z in noduleRange:
52
                    coords = world_2_voxel(np.array((coord_z+z,coord_y+y,coord_x+x)),origin,spacing)
53
                    if (np.linalg.norm(image_coord-coords) * RESIZE_SPACING[0]) < radius:
54
                        image_mask[int(np.round(coords[0])),int(np.round(coords[1])),int(np.round(coords[2]))] = int(1)
55
    
56
    return image_mask
57
58
def get_segmented_lungs(im, plot=False):
59
    binary = im < 604
60
    cleared = clear_border(binary)
61
    label_image = label(cleared)
62
    areas = [r.area for r in regionprops(label_image)]
63
    areas.sort()
64
    if len(areas) > 2:
65
        for region in regionprops(label_image):
66
            if region.area < areas[-2]:
67
                for coordinates in region.coords:                
68
                       label_image[coordinates[0], coordinates[1]] = 0
69
    binary = label_image > 0
70
    selem = disk(2)
71
    binary = binary_erosion(binary, selem)
72
    selem = disk(10)
73
    binary = binary_closing(binary, selem)
74
    edges = roberts(binary)
75
    binary = ndi.binary_fill_holes(edges)
76
    get_high_vals = binary == 0
77
    im[get_high_vals] = 0
78
    return im
79
80
def segment_lung_from_ct_scan(ct_scan):
81
    return np.asarray([get_segmented_lungs(slice) for slice in ct_scan])
82
83
def load_itk(filename):
84
    itkimage = sitk.ReadImage(filename)
85
    ct_scan = sitk.GetArrayFromImage(itkimage)
86
    origin = np.array(list(reversed(itkimage.GetOrigin())))
87
    spacing = np.array(list(reversed(itkimage.GetSpacing())))
88
    return ct_scan, origin, spacing
89
90
def world_2_voxel(world_coordinates, origin, spacing):
91
    stretched_voxel_coordinates = np.absolute(world_coordinates - origin)
92
    voxel_coordinates = stretched_voxel_coordinates / spacing
93
    return voxel_coordinates
94
95
def voxel_2_world(voxel_coordinates, origin, spacing):
96
    stretched_voxel_coordinates = voxel_coordinates * spacing
97
    world_coordinates = stretched_voxel_coordinates + origin
98
    return world_coordinates
99
100
def seq(start, stop, step=1):
101
    n = int(round((stop - start)/float(step)))
102
    if n > 1:
103
        return([start + step*i for i in range(n+1)])
104
    else:
105
        return([])
106
107
def truncate_hu(image_array):
108
    image_array[image_array > 400] = 0
109
    image_array[image_array <-1000] = 0
110
111
# LUNA2016 data prepare ,second step: normalzation the HU
112
def normalazation(image_array):
113
    max = image_array.max()
114
    min = image_array.min()
115
    image_array = (image_array-min)/(max-min)  # float cannot apply the compute,or array error will occur
116
    avg = image_array.mean()
117
    image_array = image_array-avg
118
    return image_array   # a bug here, a array must be returned,directly appling function did't work
119
120
def extract(imagePath, cands, anno, fcount, normalization_output_path):
121
122
    print('file %d pre-processing starts...' % fcount)
123
    img, origin, spacing = load_itk(imagePath)
124
    print('origin: ', origin)
125
    print('spacing: ', spacing)
126
127
    #calculate resize factor
128
    RESIZE_SPACING = [1, 1, 1]
129
    resize_factor = spacing / RESIZE_SPACING
130
    new_real_shape = img.shape * resize_factor
131
    new_shape = np.round(new_real_shape)
132
    real_resize = new_shape / img.shape
133
    new_spacing = spacing / real_resize
134
    
135
    #resize image
136
    lung_img = scipy.ndimage.interpolation.zoom(img, real_resize)
137
    
138
    # Segment the lung structure
139
    lung_img = lung_img + 1024
140
    lung_mask = segment_lung_from_ct_scan(lung_img)
141
    lung_img = lung_img - 1024
142
143
    nodule_mask = draw_circles(lung_img,anno,origin,new_spacing)
144
145
    #create nodule mask
146
    lung_mask_512 = np.zeros((lung_mask.shape[0], 512, 512))
147
    nodule_mask_512 = np.zeros((nodule_mask.shape[0], 512, 512))
148
149
    original_shape = lung_img.shape 
150
    for z in range(lung_img.shape[0]):
151
        offset = (512 - original_shape[1])
152
        upper_offset = int(np.round(offset/2))
153
        lower_offset = int(offset - upper_offset)
154
        new_origin = voxel_2_world([-upper_offset,-lower_offset,0],origin,new_spacing)
155
156
        lung_mask_512[z, upper_offset:-lower_offset,upper_offset:-lower_offset] = lung_mask[z,:,:] * nodule_mask[z,:,:]
157
158
    print('file %d pre-processing over...' % fcount)
159
    print('file %d cubic extract strats...' % fcount)
160
161
    for node_idx, cur_row in cands.iterrows():
162
        node_x = cur_row["coordX"]
163
        node_y = cur_row["coordY"]
164
        node_z = cur_row["coordZ"]
165
        nodule_class = cur_row['class']
166
        center = np.array([node_x, node_y, node_z])  # nodule center
167
        v_center = world_2_voxel(center,origin,spacing)  # nodule center in voxel space ( x,y,z ordering)
168
169
        # every nodules saved into size of 20x20x6,30x30x10,40x40x26
170
        if nodule_class == 0:
171
            imgs_fake1 = np.zeros([26, 40, 40], dtype=np.float32)
172
            try:
173
                # these following imgs saves for plot
174
                imgs_fake1 = lung_mask_512[int(v_center[0]-13):int(v_center[0]+13),int(v_center[1]-20):int(v_center[1]+20),int(v_center[2]-20):int(v_center[2]+20)]
175
                
176
                if(np.max(imgs_fake1) != 0.0):
177
                    np.save(os.path.join(normalization_output_path, "%d_fake_size40x40.npy" % node_idx), imgs_fake1)
178
                    print('file %d nodule %d fake saves...' % (fcount, node_idx))
179
180
            except Exception as e:
181
                print('*')
182
        
183
        elif nodule_class == 1:
184
            imgs_true1 = np.zeros([26, 40, 40], dtype=np.float32)
185
            try:
186
                # these following imgs saves for plot
187
                imgs_true1 = lung_mask_512[int(v_center[0]-13):int(v_center[0]+13),int(v_center[1]-20):int(v_center[1]+20),int(v_center[2]-20):int(v_center[2]+20)]
188
                
189
                if(np.max(imgs_true1) != 0.0):
190
                    np.save(os.path.join(normalization_output_path, "%d_true_size40x40.npy" % node_idx), imgs_true1)
191
                    print('file %d nodule %d true saves...' % (fcount, node_idx))
192
193
            except Exception as e:
194
                print('*')
195
                
196
197
def get_filename(file_list, case):
198
    for f in file_list:
199
        if case in f:
200
            return(f)
201
202
if __name__ =='__main__':
203
204
    '''
205
    base_dir = '/home/ubuntu/data/'
206
    annatation_file = '/home/ubuntu/data/CSVFILES/annotations.csv'
207
    candidate_file = '/home/ubuntu/data/CSVFILES/candidates.csv'
208
    plot_output_path = '/home/ubuntu/data/output'
209
    normalization_output_path = '/home/ubuntu/data/train-3d'
210
    test_path = '/home/ubuntu/data/test-3d'
211
    '''
212
    
213
    print('test set starts processing...')
214
    for i in range(9, 10):
215
        print('subset %d preprocessing starts...' % i)
216
        LUNA_SUBSET_PATH = base_dir + 'subset'+str(i)+'/'
217
        FILE_LIST = glob(LUNA_SUBSET_PATH + '*.mhd')
218
219
        cand_df_node = pd.read_csv(candidate_file)
220
        cand_df_node["file"] = cand_df_node["seriesuid"].map(lambda file_name: get_filename(FILE_LIST, file_name))
221
        cand_df_node = cand_df_node.dropna()
222
223
        anno_df_node = pd.read_csv(annatation_file)
224
        anno_df_node["file"] = anno_df_node["seriesuid"].map(lambda file_name: get_filename(FILE_LIST, file_name))
225
        anno_df_node = anno_df_node.dropna()
226
227
        # Looping over the image files
228
        for fcount, img_file in enumerate(tqdm(FILE_LIST)):
229
            cand_mini_df = cand_df_node[cand_df_node["file"]==img_file] # get all nodules associate with file
230
            anno_mini_df = anno_df_node[anno_df_node["file"]==img_file]
231
            if cand_mini_df.shape[0]>0 and anno_mini_df.shape[0]>0: # some files may not have a nodule--skipping those
232
                extract(img_file, cand_mini_df, anno_mini_df, fcount, test_path)
233
        print('subset %d preprocessing ends...' % i)
234
        print('*'*20)
235
    print('test set ends processing...')
236
    print('-'*25)
237
    
238
239
    print('train set starts processing...')
240
    for i in range(0, 9):
241
        print('subset %d preprocessing starts...' % i)
242
        LUNA_SUBSET_PATH = base_dir + 'subset'+str(i)+'/'
243
        FILE_LIST = glob(LUNA_SUBSET_PATH + '*.mhd')
244
245
        cand_df_node = pd.read_csv(candidate_file)
246
        cand_df_node["file"] = cand_df_node["seriesuid"].map(lambda file_name: get_filename(FILE_LIST, file_name))
247
        cand_df_node = cand_df_node.dropna()
248
249
        anno_df_node = pd.read_csv(annatation_file)
250
        anno_df_node["file"] = anno_df_node["seriesuid"].map(lambda file_name: get_filename(FILE_LIST, file_name))
251
        anno_df_node = anno_df_node.dropna()
252
253
        # Looping over the image files
254
        for fcount, img_file in enumerate(tqdm(FILE_LIST)):
255
            cand_mini_df = cand_df_node[cand_df_node["file"]==img_file] # get all nodules associate with file
256
            anno_mini_df = anno_df_node[anno_df_node["file"]==img_file]
257
            if cand_mini_df.shape[0]>0 and anno_mini_df.shape[0]>0: # some files may not have a nodule--skipping those
258
                extract(img_file, cand_mini_df, anno_mini_df, fcount, normalization_output_path)
259
        print('subset %d preprocessing ends...' % i)
260
        print('*'*20)
261
    print('train set ends processing...')
262
    print('-'*25)
263
    
264