# -*- coding:utf-8 -*-
"""
Prepare data for the 3D CNN, plus some utility functions:
(1) Extract positive and negative samples from .mhd scans based on the annotation CSVs
(2) Check for NaN values in the generated .npy files
(3) 2D and 3D visualization utilities
(4) Normalization and HU-thresholding utilities
(5) Build train and test batches
(A hedged end-to-end usage sketch is included at the bottom of the file.)
"""
import SimpleITK as sitk
import numpy as np
import csv
from glob import glob
import pandas as pd
import os
import matplotlib
import matplotlib.pyplot as plt
import traceback
import random
from PIL import Image
from project_config import *

def get_filename(file_list, case):
    # return the first path in file_list whose name contains the series uid
    for f in file_list:
        if case in f:
            return f

def extract_real_cubic_from_mhd(dcim_path, annatation_file, normalization_output_path):
    '''
    :param dcim_path: the path containing all .mhd files
    :param annatation_file: the annotation CSV file with every nodule's coordinates
    :param normalization_output_path: where the extracted cubes of size 20x20x6,
        30x30x10 and 40x40x26 are saved as .npy files (after normalization)
    '''
    file_list = glob(dcim_path + "*.mhd")
    # the locations of the nodules
    df_node = pd.read_csv(annatation_file)
    df_node["file"] = df_node["seriesuid"].map(lambda file_name: get_filename(file_list, file_name))
    df_node = df_node.dropna()
    for img_file in file_list:
        mini_df = df_node[df_node["file"] == img_file]  # all nodules associated with this file
        file_name = os.path.basename(img_file)
        if mini_df.shape[0] > 0:  # some files have no nodule -- skip those
            # load the scan once
            itk_img = sitk.ReadImage(img_file)
            img_array = sitk.GetArrayFromImage(itk_img)  # indices are z,y,x (note the ordering)
            num_z, height, width = img_array.shape  # height x width constitutes the transverse plane
            origin = np.array(itk_img.GetOrigin())  # x,y,z origin in world coordinates (mm)
            spacing = np.array(itk_img.GetSpacing())  # voxel spacing in world coordinates (mm)
            # go through all nodules
            print("begin to process real nodules...")
            img_array = img_array.transpose(2, 1, 0)  # reorder axes to x,y,z to match v_center
            print(img_array.shape)
            for node_idx, cur_row in mini_df.iterrows():
                node_x = cur_row["coordX"]
                node_y = cur_row["coordY"]
                node_z = cur_row["coordZ"]
                nodule_pos_str = str(node_x) + "_" + str(node_y) + "_" + str(node_z)
                # each nodule is saved at sizes 20x20x6, 30x30x10 and 40x40x26
                imgs1 = np.ndarray([20, 20, 6], dtype=np.float32)
                imgs2 = np.ndarray([30, 30, 10], dtype=np.float32)
                imgs3 = np.ndarray([40, 40, 26], dtype=np.float32)
                center = np.array([node_x, node_y, node_z])  # nodule center
                v_center = np.rint((center - origin) / spacing)  # nodule center in voxel space (still x,y,z ordering)
                print(v_center[0], v_center[1], v_center[2])
                try:
                    # crop the three cubes around the nodule center
                    imgs1[:, :, :] = img_array[int(v_center[0]-10):int(v_center[0]+10), int(v_center[1]-10):int(v_center[1]+10), int(v_center[2]-3):int(v_center[2]+3)]
                    imgs2[:, :, :] = img_array[int(v_center[0]-15):int(v_center[0]+15), int(v_center[1]-15):int(v_center[1]+15), int(v_center[2]-5):int(v_center[2]+5)]
                    imgs3[:, :, :] = img_array[int(v_center[0]-20):int(v_center[0]+20), int(v_center[1]-20):int(v_center[1]+20), int(v_center[2]-13):int(v_center[2]+13)]
                    # the following produce the standard data used as CNN input
                    truncate_hu(imgs1)
                    truncate_hu(imgs2)
                    truncate_hu(imgs3)
                    imgs1 = normalazation(imgs1)
                    imgs2 = normalazation(imgs2)
                    imgs3 = normalazation(imgs3)
                    np.save(os.path.join(normalization_output_path, "%d_real_size20x20.npy" % node_idx), imgs1)
                    np.save(os.path.join(normalization_output_path, "%d_real_size30x30.npy" % node_idx), imgs2)
                    np.save(os.path.join(normalization_output_path, "%d_real_size40x40.npy" % node_idx), imgs3)
                    print("save real %d finished!..." % node_idx)
                except Exception as e:
                    print("process image %s error..." % str(file_name))
                    print(Exception, ":", e)
                    traceback.print_exc()
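
# A hedged worked example (not from the original file) of the world-to-voxel
# conversion used above: with origin = (-200.0, -195.0, -378.0) mm and
# spacing = (0.7, 0.7, 2.5) mm, a nodule at world coordinates
# (-100.0, -160.0, -303.0) mm maps to
# v_center = rint(((-100+200)/0.7, (-160+195)/0.7, (-303+378)/2.5)) = (143, 50, 30).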

def extract_fake_cubic_from_mhd(dcim_path, candidate_file, normalization_output_path):
    '''
    :param dcim_path: the path containing all .mhd files
    :param candidate_file: the candidate CSV file with every **fake** nodule's coordinates
    :param normalization_output_path: where the extracted cubes of size 20x20x6,
        30x30x10 and 40x40x26 are saved as .npy files (after normalization)
    '''
    file_list = glob(dcim_path + "*.mhd")
    # the locations of the candidates
    df_node = pd.read_csv(candidate_file)
    df_node["file"] = df_node["seriesuid"].map(lambda file_name: get_filename(file_list, file_name))
    df_node = df_node.dropna()
    num = 0  # counts saved fake nodules; initialized once (not per file) so the 10000 cap below can actually apply
    for img_file in file_list:
        mini_df = df_node[df_node["file"] == img_file]  # all candidates associated with this file
        file_name = os.path.basename(img_file)
        if mini_df.shape[0] > 0 and num < 10000:  # some files have no candidate -- skip those
            # load the scan once
            itk_img = sitk.ReadImage(img_file)
            img_array = sitk.GetArrayFromImage(itk_img)  # indices are z,y,x (note the ordering)
            num_z, height, width = img_array.shape  # height x width constitutes the transverse plane
            origin = np.array(itk_img.GetOrigin())  # x,y,z origin in world coordinates (mm)
            spacing = np.array(itk_img.GetSpacing())  # voxel spacing in world coordinates (mm)
            # go through all candidates
            print("begin to process fake nodules...")
            img_array = img_array.transpose(2, 1, 0)  # reorder axes to x,y,z to match v_center
            print(img_array.shape)
            for node_idx, cur_row in mini_df.iterrows():
                node_x = cur_row["coordX"]
                node_y = cur_row["coordY"]
                node_z = cur_row["coordZ"]
                nodule_pos_str = str(node_x) + "_" + str(node_y) + "_" + str(node_z)
                center = np.array([node_x, node_y, node_z])  # candidate center
                v_center = np.rint((center - origin) / spacing)  # candidate center in voxel space (x,y,z ordering)
                # false nodule: each candidate is saved at sizes 20x20x6, 30x30x10 and 40x40x26
                imgs_fake1 = np.ndarray([20, 20, 6], dtype=np.float32)
                imgs_fake2 = np.ndarray([30, 30, 10], dtype=np.float32)
                imgs_fake3 = np.ndarray([40, 40, 26], dtype=np.float32)
                try:
                    # crop the three cubes around the candidate center
                    imgs_fake1[:, :, :] = img_array[int(v_center[0]-10):int(v_center[0]+10), int(v_center[1]-10):int(v_center[1]+10), int(v_center[2]-3):int(v_center[2]+3)]
                    imgs_fake2[:, :, :] = img_array[int(v_center[0]-15):int(v_center[0]+15), int(v_center[1]-15):int(v_center[1]+15), int(v_center[2]-5):int(v_center[2]+5)]
                    imgs_fake3[:, :, :] = img_array[int(v_center[0]-20):int(v_center[0]+20), int(v_center[1]-20):int(v_center[1]+20), int(v_center[2]-13):int(v_center[2]+13)]
                    # normalize and save the fake nodule
                    truncate_hu(imgs_fake1)
                    truncate_hu(imgs_fake2)
                    truncate_hu(imgs_fake3)
                    imgs_fake1 = normalazation(imgs_fake1)
                    imgs_fake2 = normalazation(imgs_fake2)
                    imgs_fake3 = normalazation(imgs_fake3)
                    np.save(os.path.join(normalization_output_path, "%d_fake_size20x20.npy" % node_idx), imgs_fake1)
                    np.save(os.path.join(normalization_output_path, "%d_fake_size30x30.npy" % node_idx), imgs_fake2)
                    np.save(os.path.join(normalization_output_path, "%d_fake_size40x40.npy" % node_idx), imgs_fake3)
                    num = num + 1
                    print("save fake %d finished!..." % node_idx)
                except Exception as e:
                    print("process image %s error..." % str(file_name))
                    print(Exception, ":", e)
                    traceback.print_exc()

def check_nan(path):
    '''
    Check whether any .npy file under `path` contains NaN values.
    :param path: directory of .npy files
    :return:
    '''
    for file in os.listdir(path):
        array = np.load(os.path.join(path, file))
        a = array[np.isnan(array)]
        if len(a) > 0:
            print("file has nan: ", file)
            print(a)

def plot_cubic(npy_file):
    '''
    Plot the cube slice by slice.
    :param npy_file: path of the .npy cube
    :return:
    '''
    cubic_array = np.load(npy_file)
    rows = int(np.ceil(cubic_array.shape[2] / 3.0))  # ceil, so every slice gets a subplot
    f, plots = plt.subplots(rows, 3, figsize=(50, 50))
    for i in range(cubic_array.shape[2]):  # 0-based, so the last slice stays in range
        plots[i // 3, i % 3].axis('off')
        plots[i // 3, i % 3].imshow(cubic_array[:, :, i], cmap=plt.cm.bone)

def plot_3d_cubic(image):
    '''
    Plot the 3D cube.
    :param image: path of the .npy file the cube is saved in
    :return:
    '''
    from skimage import measure
    from mpl_toolkits.mplot3d.art3d import Poly3DCollection
    image = np.load(image)
    # recent scikit-image returns (verts, faces, normals, values); older
    # versions returned only (verts, faces)
    verts, faces, _, _ = measure.marching_cubes(image, 0)
    fig = plt.figure(figsize=(40, 40))
    ax = fig.add_subplot(111, projection='3d')
    # fancy indexing: `verts[faces]` generates a collection of triangles
    mesh = Poly3DCollection(verts[faces], alpha=0.1)
    face_color = [0.5, 0.5, 1]
    mesh.set_facecolor(face_color)
    ax.add_collection3d(mesh)
    ax.set_xlim(0, image.shape[0])
    ax.set_ylim(0, image.shape[1])
    ax.set_zlim(0, image.shape[2])
    plt.show()

# LUNA2016 data prep, first step: truncate HU to the range [-1000, 400]
def truncate_hu(image_array):
    # values outside the range are zeroed out, in place
    image_array[image_array > 400] = 0
    image_array[image_array < -1000] = 0

# LUNA2016 data prep, second step: normalize the HU values
def normalazation(image_array):
    max_val = image_array.max()
    min_val = image_array.min()
    image_array = (image_array - min_val) / (max_val - min_val)  # min-max scale to [0, 1]
    avg = image_array.mean()
    image_array = image_array - avg  # zero-center
    return image_array  # the array must be returned: the reassignments above create new arrays, so purely in-place use did not work
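
# Minimal usage sketch for the two preprocessing steps above (the file name
# "nodule.npy" is a hypothetical placeholder, not part of the pipeline):
#
#     cube = np.load("nodule.npy")   # raw HU values
#     truncate_hu(cube)              # in place: zero out HU outside [-1000, 400]
#     cube = normalazation(cube)     # min-max scale, then zero-center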

def search(path, word):
    '''
    Find file paths under `path` whose names contain `word`.
    :param path: directory to search from
    :param word: keyword the filename must contain
    :return: list of matching file paths
    '''
    filelist = []
    for filename in os.listdir(path):
        fp = os.path.join(path, filename)
        if os.path.isfile(fp) and word in filename:
            filelist.append(fp)
        elif os.path.isdir(fp):
            filelist.extend(search(fp, word))  # the recursive result was previously discarded
    return filelist

def get_all_filename(path, size):
    list_real = search(path, 'real_size' + str(size) + "x" + str(size))
    list_fake = search(path, 'fake_size' + str(size) + "x" + str(size))
    return list_real + list_fake

def get_test_batch(path):
    '''
    Prepare the data and labels of one batch (test); takes up to 32 .npy files from `path`.
    :param path: directory of .npy files
    :return: (batch_array, batch_label) as numpy arrays
    '''
    files = [os.path.join(path, f) for f in os.listdir(path)]
    batch_array = []
    batch_label = []
    for npy in files:
        try:
            if len(batch_label) < 32:
                arr = np.load(npy)
                batch_array.append(arr)
                if 'real_' in os.path.basename(npy):
                    batch_label.append([0, 1])
                elif 'fake_' in os.path.basename(npy):
                    batch_label.append([1, 0])
        except Exception as e:
            print("file not exists! %s" % npy)
            if batch_array:
                # extraction may have failed for some nodules, leaving an unreadable
                # file; reuse the previous cube so the batch size is preserved, and
                # still derive the label from the filename so data and labels stay aligned
                batch_array.append(batch_array[-1])
                if 'real_' in os.path.basename(npy):
                    batch_label.append([0, 1])
                elif 'fake_' in os.path.basename(npy):
                    batch_label.append([1, 0])
            print(e)
    return np.array(batch_array), np.array(batch_label)

def get_train_batch(batch_filename):
    '''
    Prepare the data and labels of one batch (train).
    :param batch_filename: list of .npy file paths for this batch
    :return: (batch_array, batch_label) as numpy arrays
    '''
    batch_array = []
    batch_label = []
    for npy in batch_filename:
        try:
            arr = np.load(npy)
            batch_array.append(arr)
            if 'real_' in os.path.basename(npy):
                batch_label.append([0, 1])
            elif 'fake_' in os.path.basename(npy):
                batch_label.append([1, 0])
        except Exception as e:
            print("file not exists! %s" % npy)
            if batch_array:
                # extraction may have failed for some nodules, leaving a missing
                # file; reuse the previous cube and derive the label from the
                # filename so data and labels stay aligned
                batch_array.append(batch_array[-1])
                if 'real_' in os.path.basename(npy):
                    batch_label.append([0, 1])
                elif 'fake_' in os.path.basename(npy):
                    batch_label.append([1, 0])
    return np.array(batch_array), np.array(batch_label)
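
if __name__ == "__main__":
    # Hedged end-to-end usage sketch (not part of the original file). All paths
    # below are hypothetical placeholders; the real values presumably come from
    # project_config.
    dcim_path = "./LUNA16/subset0/"                # hypothetical scan directory
    annotation_csv = "./CSVFILES/annotations.csv"  # hypothetical LUNA16 annotations
    candidate_csv = "./CSVFILES/candidates.csv"    # hypothetical LUNA16 candidates
    out_path = "./npy/"                            # hypothetical output directory

    extract_real_cubic_from_mhd(dcim_path, annotation_csv, out_path)
    extract_fake_cubic_from_mhd(dcim_path, candidate_csv, out_path)
    check_nan(out_path)

    # build one shuffled training batch of 32 cubes of size 20x20x6
    train_files = get_all_filename(out_path, 20)
    random.shuffle(train_files)
    data, labels = get_train_batch(train_files[:32])
    print(data.shape, labels.shape)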