b/code/voxel-extract.py
"""
Utility for voxel extraction: preprocess the LUNA16 CT scans and save candidate nodule cubes.
"""

import numpy as np
import pandas as pd
import os
import csv
from skimage.segmentation import clear_border
from skimage.measure import label, regionprops, perimeter
from skimage.morphology import ball, disk, dilation, binary_erosion, remove_small_objects, erosion, closing, reconstruction, binary_closing
from skimage.filters import roberts, sobel
from scipy import ndimage as ndi
from glob import glob
from tqdm import tqdm
import SimpleITK as sitk
import matplotlib.pyplot as plt
import traceback
import random
from PIL import Image
from project_config import *  # expected to provide base_dir, annatation_file, candidate_file, normalization_output_path and test_path
from mpl_toolkits.mplot3d.art3d import Poly3DCollection
from skimage import measure, feature
import time

def draw_circles(image, cands, origin, spacing):
    # make an empty volume that will be filled with the nodule mask
    RESIZE_SPACING = [1, 1, 1]
    image_mask = np.zeros(image.shape)

    # run over all annotated nodules in the scan
    for ca in cands.values:
        # get the world coordinates of the nodule centre and its radius
        radius = np.ceil(ca[4]) / 2
        coord_x = ca[1]
        coord_y = ca[2]
        coord_z = ca[3]
        image_coord = np.array((coord_z, coord_y, coord_x))

        # determine the voxel coordinate of the centre from the world coordinate
        image_coord = world_2_voxel(image_coord, origin, spacing)

        # determine the range of the nodule
        noduleRange = seq(-radius, radius, RESIZE_SPACING[0])

        # create the mask: mark every voxel that lies within the nodule radius
        for x in noduleRange:
            for y in noduleRange:
                for z in noduleRange:
                    coords = world_2_voxel(np.array((coord_z + z, coord_y + y, coord_x + x)), origin, spacing)
                    if (np.linalg.norm(image_coord - coords) * RESIZE_SPACING[0]) < radius:
                        image_mask[int(np.round(coords[0])), int(np.round(coords[1])), int(np.round(coords[2]))] = 1

    return image_mask

def get_segmented_lungs(im, plot=False):
    # threshold the (HU + 1024) slice; 604 corresponds to roughly -420 HU
    binary = im < 604
    # remove artefacts connected to the image border
    cleared = clear_border(binary)
    # keep only the two largest connected regions (the lungs)
    label_image = label(cleared)
    areas = [r.area for r in regionprops(label_image)]
    areas.sort()
    if len(areas) > 2:
        for region in regionprops(label_image):
            if region.area < areas[-2]:
                for coordinates in region.coords:
                    label_image[coordinates[0], coordinates[1]] = 0
    binary = label_image > 0
    # erode to separate vessels attached to the lung wall
    selem = disk(2)
    binary = binary_erosion(binary, selem)
    # close to keep nodules attached to the lung wall inside the mask
    selem = disk(10)
    binary = binary_closing(binary, selem)
    # fill holes enclosed by the lung boundary
    edges = roberts(binary)
    binary = ndi.binary_fill_holes(edges)
    # zero out everything outside the lung mask
    get_high_vals = binary == 0
    im[get_high_vals] = 0
    return im


def segment_lung_from_ct_scan(ct_scan):
    return np.asarray([get_segmented_lungs(ct_slice) for ct_slice in ct_scan])

def load_itk(filename):
    # read the .mhd scan and return the image array plus its origin and spacing in (z, y, x) order
    itkimage = sitk.ReadImage(filename)
    ct_scan = sitk.GetArrayFromImage(itkimage)
    origin = np.array(list(reversed(itkimage.GetOrigin())))
    spacing = np.array(list(reversed(itkimage.GetSpacing())))
    return ct_scan, origin, spacing


def world_2_voxel(world_coordinates, origin, spacing):
    stretched_voxel_coordinates = np.absolute(world_coordinates - origin)
    voxel_coordinates = stretched_voxel_coordinates / spacing
    return voxel_coordinates


def voxel_2_world(voxel_coordinates, origin, spacing):
    stretched_voxel_coordinates = voxel_coordinates * spacing
    world_coordinates = stretched_voxel_coordinates + origin
    return world_coordinates


def seq(start, stop, step=1):
    # inclusive range of floats from start to stop with the given step
    n = int(round((stop - start) / float(step)))
    if n > 1:
        return [start + step * i for i in range(n + 1)]
    else:
        return []

def truncate_hu(image_array):
    # zero out HU values outside the [-1000, 400] window
    image_array[image_array > 400] = 0
    image_array[image_array < -1000] = 0


# LUNA16 data preparation, second step: normalise the HU values
def normalazation(image_array):
    max_value = image_array.max()
    min_value = image_array.min()
    image_array = (image_array - min_value) / (max_value - min_value)  # scale to [0, 1]
    avg = image_array.mean()
    image_array = image_array - avg  # zero-centre
    return image_array  # note: the normalised array must be returned; the input array is not modified in place

def extract(imagePath, cands, anno, fcount, normalization_output_path):

    print('file %d pre-processing starts...' % fcount)
    img, origin, spacing = load_itk(imagePath)
    print('origin: ', origin)
    print('spacing: ', spacing)

    # calculate the resize factor for resampling to 1x1x1 mm voxels
    RESIZE_SPACING = [1, 1, 1]
    resize_factor = spacing / RESIZE_SPACING
    new_real_shape = img.shape * resize_factor
    new_shape = np.round(new_real_shape)
    real_resize = new_shape / img.shape
    new_spacing = spacing / real_resize

    # resample the image
    lung_img = ndi.zoom(img, real_resize)

    # segment the lung structure (shift HU by +1024 so the threshold in get_segmented_lungs applies)
    lung_img = lung_img + 1024
    lung_mask = segment_lung_from_ct_scan(lung_img)
    lung_img = lung_img - 1024

    # build the nodule mask from the annotations
    nodule_mask = draw_circles(lung_img, anno, origin, new_spacing)

    # pad the masked slices into 512x512 planes
    lung_mask_512 = np.zeros((lung_mask.shape[0], 512, 512))
    nodule_mask_512 = np.zeros((nodule_mask.shape[0], 512, 512))

    original_shape = lung_img.shape
    offset = 512 - original_shape[1]
    upper_offset = int(np.round(offset / 2))
    lower_offset = int(offset - upper_offset)
    new_origin = voxel_2_world([-upper_offset, -lower_offset, 0], origin, new_spacing)

    for z in range(lung_img.shape[0]):
        lung_mask_512[z, upper_offset:upper_offset + original_shape[1], upper_offset:upper_offset + original_shape[2]] = lung_mask[z, :, :] * nodule_mask[z, :, :]

    print('file %d pre-processing over...' % fcount)
    print('file %d cube extraction starts...' % fcount)

    for node_idx, cur_row in cands.iterrows():
        node_x = cur_row["coordX"]
        node_y = cur_row["coordY"]
        node_z = cur_row["coordZ"]
        nodule_class = cur_row['class']
        # nodule centre in (z, y, x) order, matching the origin/spacing returned by load_itk
        center = np.array([node_z, node_y, node_x])
        # nodule centre in voxel coordinates of the resampled volume
        v_center = world_2_voxel(center, origin, new_spacing)

        # nodule cubes can be saved at 20x20x6, 30x30x10 or 40x40x26; this script extracts 26x40x40 (z, y, x) cubes
        if nodule_class == 0:
            imgs_fake1 = np.zeros([26, 40, 40], dtype=np.float32)
            try:
                # crop the cube around the candidate centre; the in-plane indices are shifted by
                # upper_offset to account for the padding of the slices into 512x512 planes
                imgs_fake1 = lung_mask_512[int(v_center[0] - 13):int(v_center[0] + 13),
                                           int(v_center[1] - 20) + upper_offset:int(v_center[1] + 20) + upper_offset,
                                           int(v_center[2] - 20) + upper_offset:int(v_center[2] + 20) + upper_offset]

                if np.max(imgs_fake1) != 0.0:
                    np.save(os.path.join(normalization_output_path, "%d_fake_size40x40.npy" % node_idx), imgs_fake1)
                    print('file %d nodule %d fake saves...' % (fcount, node_idx))

            except Exception as e:
                print('*')

        elif nodule_class == 1:
            imgs_true1 = np.zeros([26, 40, 40], dtype=np.float32)
            try:
                # crop the cube around the candidate centre (same offsets as above)
                imgs_true1 = lung_mask_512[int(v_center[0] - 13):int(v_center[0] + 13),
                                           int(v_center[1] - 20) + upper_offset:int(v_center[1] + 20) + upper_offset,
                                           int(v_center[2] - 20) + upper_offset:int(v_center[2] + 20) + upper_offset]

                if np.max(imgs_true1) != 0.0:
                    np.save(os.path.join(normalization_output_path, "%d_true_size40x40.npy" % node_idx), imgs_true1)
                    print('file %d nodule %d true saves...' % (fcount, node_idx))

            except Exception as e:
                print('*')

def get_filename(file_list, case):
    # return the path in file_list whose name contains the given series uid
    for f in file_list:
        if case in f:
            return f

if __name__ == '__main__':

    # example values for the names imported from project_config:
    '''
    base_dir = '/home/ubuntu/data/'
    annatation_file = '/home/ubuntu/data/CSVFILES/annotations.csv'
    candidate_file = '/home/ubuntu/data/CSVFILES/candidates.csv'
    plot_output_path = '/home/ubuntu/data/output'
    normalization_output_path = '/home/ubuntu/data/train-3d'
    test_path = '/home/ubuntu/data/test-3d'
    '''

    # subset 9 is used as the test set
    print('test set starts processing...')
    for i in range(9, 10):
        print('subset %d preprocessing starts...' % i)
        LUNA_SUBSET_PATH = base_dir + 'subset' + str(i) + '/'
        FILE_LIST = glob(LUNA_SUBSET_PATH + '*.mhd')

        cand_df_node = pd.read_csv(candidate_file)
        cand_df_node["file"] = cand_df_node["seriesuid"].map(lambda file_name: get_filename(FILE_LIST, file_name))
        cand_df_node = cand_df_node.dropna()

        anno_df_node = pd.read_csv(annatation_file)
        anno_df_node["file"] = anno_df_node["seriesuid"].map(lambda file_name: get_filename(FILE_LIST, file_name))
        anno_df_node = anno_df_node.dropna()

        # loop over the image files
        for fcount, img_file in enumerate(tqdm(FILE_LIST)):
            cand_mini_df = cand_df_node[cand_df_node["file"] == img_file]  # all candidates associated with this file
            anno_mini_df = anno_df_node[anno_df_node["file"] == img_file]
            if cand_mini_df.shape[0] > 0 and anno_mini_df.shape[0] > 0:  # some files have no nodule, so skip those
                extract(img_file, cand_mini_df, anno_mini_df, fcount, test_path)
        print('subset %d preprocessing ends...' % i)
        print('*' * 20)
    print('test set ends processing...')
    print('-' * 25)

    # subsets 0-8 are used as the training set
    print('train set starts processing...')
    for i in range(0, 9):
        print('subset %d preprocessing starts...' % i)
        LUNA_SUBSET_PATH = base_dir + 'subset' + str(i) + '/'
        FILE_LIST = glob(LUNA_SUBSET_PATH + '*.mhd')

        cand_df_node = pd.read_csv(candidate_file)
        cand_df_node["file"] = cand_df_node["seriesuid"].map(lambda file_name: get_filename(FILE_LIST, file_name))
        cand_df_node = cand_df_node.dropna()

        anno_df_node = pd.read_csv(annatation_file)
        anno_df_node["file"] = anno_df_node["seriesuid"].map(lambda file_name: get_filename(FILE_LIST, file_name))
        anno_df_node = anno_df_node.dropna()

        # loop over the image files
        for fcount, img_file in enumerate(tqdm(FILE_LIST)):
            cand_mini_df = cand_df_node[cand_df_node["file"] == img_file]  # all candidates associated with this file
            anno_mini_df = anno_df_node[anno_df_node["file"] == img_file]
            if cand_mini_df.shape[0] > 0 and anno_mini_df.shape[0] > 0:  # some files have no nodule, so skip those
                extract(img_file, cand_mini_df, anno_mini_df, fcount, normalization_output_path)
        print('subset %d preprocessing ends...' % i)
        print('*' * 20)
    print('train set ends processing...')
    print('-' * 25)
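
The script pulls its paths from project_config via the wildcard import at the top; that module is not shown here. As a minimal sketch only, assuming the commented-out example values above reflect the intended layout, a project_config.py could look like the following; adjust the paths to your own environment.

# project_config.py -- hypothetical minimal configuration for voxel-extract.py
# (names match the wildcard import in the script; the paths are placeholders
#  taken from the commented-out example block and should be changed as needed)
base_dir = '/home/ubuntu/data/'
annatation_file = '/home/ubuntu/data/CSVFILES/annotations.csv'  # name kept as spelled in the script
candidate_file = '/home/ubuntu/data/CSVFILES/candidates.csv'
plot_output_path = '/home/ubuntu/data/output'
normalization_output_path = '/home/ubuntu/data/train-3d'        # training cubes are written here
test_path = '/home/ubuntu/data/test-3d'                         # subset 9 cubes are written here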