--- /dev/null
+++ b/code/data_prepare.py
@@ -0,0 +1,288 @@
+# -*- coding:utf-8 -*-
+"""
+    Prepare data for the 3D CNN, plus some utility functions:
+    (1) Extract positive and negative samples from mhd files based on the annotations
+    (2) Check for NaN values in the data
+    (3) 2D and 3D visualization utilities
+    (4) Normalization and thresholding utilities
+    (5) Get train and test batches
+"""
+
+import SimpleITK as sitk
+import numpy as np
+import csv
+from glob import glob
+import pandas as pd
+import os
+import matplotlib
+import matplotlib.pyplot as plt
+import traceback
+import random
+from PIL import Image
+
+
+from project_config import *
+
+def get_filename(file_list, case):
+    for f in file_list:
+        if case in f:
+            return f
+
+def extract_real_cubic_from_mhd(dcim_path, annatation_file, normalization_output_path):
+    '''
+    @param dcim_path: the path that contains all mhd files
+    @param annatation_file: the annotation csv file, containing every nodule's coordinates
+    @param normalization_output_path: where to save the extracted cubes of size
+           20x20x6, 30x30x10 and 40x40x26 as npy files (after normalization)
+    '''
+    file_list = glob(dcim_path + "*.mhd")
+    # the locations of the nodules
+    df_node = pd.read_csv(annatation_file)
+    df_node["file"] = df_node["seriesuid"].map(lambda file_name: get_filename(file_list, file_name))
+    df_node = df_node.dropna()
+
+    for img_file in file_list:
+        mini_df = df_node[df_node["file"] == img_file]  # get all nodules associated with this file
+        file_name = os.path.basename(img_file)
+        if mini_df.shape[0] > 0:  # some files may not have a nodule -- skip those
+            # load the data once
+            itk_img = sitk.ReadImage(img_file)
+            img_array = sitk.GetArrayFromImage(itk_img)  # indexes are z,y,x (note the ordering)
+            num_z, height, width = img_array.shape  # height x width constitutes the transverse plane
+            origin = np.array(itk_img.GetOrigin())  # x,y,z origin in world coordinates (mm)
+            spacing = np.array(itk_img.GetSpacing())  # voxel spacing in world coordinates (mm)
+            # go through all nodules
+            print("begin to process real nodules...")
+            img_array = img_array.transpose(2, 1, 0)  # transpose to x,y,z so it matches v_center's axis order
+            print(img_array.shape)
+            for node_idx, cur_row in mini_df.iterrows():
+                node_x = cur_row["coordX"]
+                node_y = cur_row["coordY"]
+                node_z = cur_row["coordZ"]
+                nodule_pos_str = str(node_x) + "_" + str(node_y) + "_" + str(node_z)
+                # every nodule is saved at sizes 20x20x6, 30x30x10 and 40x40x26
+                imgs1 = np.ndarray([20, 20, 6], dtype=np.float32)
+                imgs2 = np.ndarray([30, 30, 10], dtype=np.float32)
+                imgs3 = np.ndarray([40, 40, 26], dtype=np.float32)
+                center = np.array([node_x, node_y, node_z])  # nodule center
+                v_center = np.rint((center - origin) / spacing)  # nodule center in voxel space (still x,y,z ordering)
+                print(v_center[0], v_center[1], v_center[2])
+                try:
+                    # crop three cubes around the nodule center (also used for plotting)
+                    imgs1[:, :, :] = img_array[int(v_center[0]-10):int(v_center[0]+10), int(v_center[1]-10):int(v_center[1]+10), int(v_center[2]-3):int(v_center[2]+3)]
+                    imgs2[:, :, :] = img_array[int(v_center[0]-15):int(v_center[0]+15), int(v_center[1]-15):int(v_center[1]+15), int(v_center[2]-5):int(v_center[2]+5)]
+                    imgs3[:, :, :] = img_array[int(v_center[0]-20):int(v_center[0]+20), int(v_center[1]-20):int(v_center[1]+20), int(v_center[2]-13):int(v_center[2]+13)]
+
+                    # the following are the standard data used as CNN input
+                    truncate_hu(imgs1)
+                    truncate_hu(imgs2)
+                    truncate_hu(imgs3)
+                    imgs1 = normalazation(imgs1)
+                    imgs2 = normalazation(imgs2)
+                    imgs3 = normalazation(imgs3)
+                    np.save(os.path.join(normalization_output_path, "%d_real_size20x20.npy" % node_idx), imgs1)
+                    np.save(os.path.join(normalization_output_path, "%d_real_size30x30.npy" % node_idx), imgs2)
+                    np.save(os.path.join(normalization_output_path, "%d_real_size40x40.npy" % node_idx), imgs3)
+                    print("save real %d finished!..." % node_idx)
+
+                except Exception as e:
+                    print("process images %s error..." % str(file_name))
+                    print(Exception, ":", e)
+                    traceback.print_exc()
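+
+# Illustrative sketch (added for clarity; not part of the original pipeline):
+# the world-mm -> voxel-index conversion used above, with invented values.
+def _example_world_to_voxel():
+    origin = np.array([-195.0, -195.0, -378.0])      # mm, as from itk_img.GetOrigin()
+    spacing = np.array([0.76, 0.76, 2.5])            # mm per voxel, as from itk_img.GetSpacing()
+    center = np.array([-100.0, 50.0, -200.0])        # a nodule center in world coordinates (mm)
+    v_center = np.rint((center - origin) / spacing)  # nearest voxel index, x,y,z ordering
+    return v_center                                  # array([125., 322., 71.])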
+
+def extract_fake_cubic_from_mhd(dcim_path, candidate_file, normalization_output_path):
+    '''
+    @param dcim_path: the path that contains all mhd files
+    @param candidate_file: the candidate csv file, containing every **fake** nodule's coordinates
+    @param normalization_output_path: where to save the extracted cubes of size
+           20x20x6, 30x30x10 and 40x40x26 as npy files (after normalization)
+    '''
+    file_list = glob(dcim_path + "*.mhd")
+    # the locations of the candidates
+    df_node = pd.read_csv(candidate_file)
+    df_node["file"] = df_node["seriesuid"].map(lambda file_name: get_filename(file_list, file_name))
+    df_node = df_node.dropna()
+
+    for img_file in file_list:
+        mini_df = df_node[df_node["file"] == img_file]  # get all candidates associated with this file
+        file_name = os.path.basename(img_file)
+        num = 0
+        if mini_df.shape[0] > 0:  # some files may not have a candidate -- skip those
+            # load the data once
+            itk_img = sitk.ReadImage(img_file)
+            img_array = sitk.GetArrayFromImage(itk_img)  # indexes are z,y,x (note the ordering)
+            num_z, height, width = img_array.shape  # height x width constitutes the transverse plane
+            origin = np.array(itk_img.GetOrigin())  # x,y,z origin in world coordinates (mm)
+            spacing = np.array(itk_img.GetSpacing())  # voxel spacing in world coordinates (mm)
+            # go through all candidates
+            print("begin to process fake nodules...")
+            img_array = img_array.transpose(2, 1, 0)  # transpose to x,y,z so it matches v_center's axis order
+            print(img_array.shape)
+            for node_idx, cur_row in mini_df.iterrows():
+                if num >= 10000:  # cap the number of fake cubes extracted per file
+                    break
+                node_x = cur_row["coordX"]
+                node_y = cur_row["coordY"]
+                node_z = cur_row["coordZ"]
+                nodule_pos_str = str(node_x) + "_" + str(node_y) + "_" + str(node_z)
+                center = np.array([node_x, node_y, node_z])  # candidate center
+                v_center = np.rint((center - origin) / spacing)  # candidate center in voxel space (x,y,z ordering)
+                # false nodule: every candidate is saved at sizes 20x20x6, 30x30x10 and 40x40x26
+                imgs_fake1 = np.ndarray([20, 20, 6], dtype=np.float32)
+                imgs_fake2 = np.ndarray([30, 30, 10], dtype=np.float32)
+                imgs_fake3 = np.ndarray([40, 40, 26], dtype=np.float32)
+                try:
+                    # crop three cubes around the candidate center
+                    imgs_fake1[:, :, :] = img_array[int(v_center[0]-10):int(v_center[0]+10), int(v_center[1]-10):int(v_center[1]+10), int(v_center[2]-3):int(v_center[2]+3)]
+                    imgs_fake2[:, :, :] = img_array[int(v_center[0]-15):int(v_center[0]+15), int(v_center[1]-15):int(v_center[1]+15), int(v_center[2]-5):int(v_center[2]+5)]
+                    imgs_fake3[:, :, :] = img_array[int(v_center[0]-20):int(v_center[0]+20), int(v_center[1]-20):int(v_center[1]+20), int(v_center[2]-13):int(v_center[2]+13)]
+
+                    # save the fake nodule
+                    truncate_hu(imgs_fake1)
+                    truncate_hu(imgs_fake2)
+                    truncate_hu(imgs_fake3)
+                    imgs_fake1 = normalazation(imgs_fake1)
+                    imgs_fake2 = normalazation(imgs_fake2)
+                    imgs_fake3 = normalazation(imgs_fake3)
+                    np.save(os.path.join(normalization_output_path, "%d_fake_size20x20.npy" % node_idx), imgs_fake1)
+                    np.save(os.path.join(normalization_output_path, "%d_fake_size30x30.npy" % node_idx), imgs_fake2)
+                    np.save(os.path.join(normalization_output_path, "%d_fake_size40x40.npy" % node_idx), imgs_fake3)
+
+                    num = num + 1
+
+                    print("save fake %d finished!..." % node_idx)
+                except Exception as e:
+                    print("process images %s error..." % str(file_name))
+                    print(Exception, ":", e)
+                    traceback.print_exc()
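+
+# Illustrative sketch (added for clarity; not original code): why the try/except
+# above is needed. Crops that cross the volume border come back smaller than the
+# target shape, so the broadcast assignment raises ValueError and the sample is skipped.
+def _example_border_crop_fails():
+    img_array = np.zeros((512, 512, 120), dtype=np.float32)
+    v_center = np.array([505, 256, 60])  # x is too close to the upper border
+    crop = img_array[int(v_center[0]-10):int(v_center[0]+10),
+                     int(v_center[1]-10):int(v_center[1]+10),
+                     int(v_center[2]-3):int(v_center[2]+3)]
+    return crop.shape  # (17, 20, 6) -- the x slice 495:515 is clipped at 512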
+
+def check_nan(path):
+    '''
+    check whether any npy file under the given path contains NaN values
+    :param path:
+    :return:
+    '''
+    for file in os.listdir(path):
+        array = np.load(os.path.join(path, file))
+        a = array[np.isnan(array)]
+        if len(a) > 0:
+            print("file contains NaN: ", file)
+            print(a)
+
+def plot_cubic(npy_file):
+    '''
+    plot the cube slice by slice
+    :param npy_file:
+    :return:
+    '''
+    cubic_array = np.load(npy_file)
+    rows = int(np.ceil(cubic_array.shape[2] / 3))
+    f, plots = plt.subplots(rows, 3, figsize=(50, 50))
+    for i in range(cubic_array.shape[2]):  # 0-based, so the last slice stays in range
+        plots[i // 3, i % 3].axis('off')
+        plots[i // 3, i % 3].imshow(cubic_array[:, :, i], cmap=plt.cm.bone)
+
+def plot_3d_cubic(image):
+    '''
+    plot the 3D cube as a mesh
+    :param image: path of the npy file holding the cube
+    :return:
+    '''
+    from skimage import measure, morphology
+    from mpl_toolkits.mplot3d.art3d import Poly3DCollection
+    image = np.load(image)
+    # recent scikit-image returns (verts, faces, normals, values)
+    verts, faces, _, _ = measure.marching_cubes(image, 0)
+    fig = plt.figure(figsize=(40, 40))
+    ax = fig.add_subplot(111, projection='3d')
+    # fancy indexing: `verts[faces]` generates a collection of triangles
+    mesh = Poly3DCollection(verts[faces], alpha=0.1)
+    face_color = [0.5, 0.5, 1]
+    mesh.set_facecolor(face_color)
+    ax.add_collection3d(mesh)
+    ax.set_xlim(0, image.shape[0])
+    ax.set_ylim(0, image.shape[1])
+    ax.set_zlim(0, image.shape[2])
+    plt.show()
+
+# LUNA2016 data prepare, first step: truncate the HU range to [-1000, 400]
+def truncate_hu(image_array):
+    # values outside the window are zeroed in place (not clipped to the bounds)
+    image_array[image_array > 400] = 0
+    image_array[image_array < -1000] = 0
+
+# LUNA2016 data prepare, second step: normalize the HU values
+def normalazation(image_array):
+    max = image_array.max()
+    min = image_array.min()
+    image_array = (image_array - min) / (max - min)  # min-max scale to [0, 1]
+    avg = image_array.mean()
+    image_array = image_array - avg  # shift to zero mean
+    return image_array  # the array must be returned: these operations create new arrays rather than modifying in place
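+
+# Illustrative sketch (added for clarity; not original code): the two-step
+# preprocessing above on a random fake-HU cube. truncate_hu zeroes values
+# outside [-1000, 400]; normalazation min-max scales to [0, 1] and then shifts
+# to zero mean.
+def _example_preprocess():
+    cube = np.random.uniform(-2000, 3000, size=(20, 20, 6)).astype(np.float32)
+    truncate_hu(cube)           # in place
+    cube = normalazation(cube)  # returns a new array
+    print(cube.min(), cube.max(), cube.mean())  # mean is ~0 after centering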
+
+def search(path, word):
+    '''
+    find filenames matching a keyword, recursively under path
+    :param path: the path to search from
+    :param word: the keyword to match
+    :return: list of matching file paths
+    '''
+    filelist = []
+    for filename in os.listdir(path):
+        fp = os.path.join(path, filename)
+        if os.path.isfile(fp) and word in filename:
+            filelist.append(fp)
+        elif os.path.isdir(fp):
+            filelist.extend(search(fp, word))  # keep the matches found in subdirectories
+    return filelist
+
+
+def get_all_filename(path, size):
+    list_real = search(path, 'real_size' + str(size) + "x" + str(size))
+    list_fake = search(path, 'fake_size' + str(size) + "x" + str(size))
+    return list_real + list_fake
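+
+# Hedged usage sketch (added; the directory name is an assumption, not from the
+# original code): collect every 20x20 cube, real and fake, for training.
+def _example_collect_files():
+    files = get_all_filename("./nodule_cubes/", 20)
+    print(len(files), files[:3])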
%s" % npy) + batch_array.append(batch_array[-1]) # some nodule process error leading nonexistent of the file, using the last file copy to fill + print(e.message) + + return np.array(batch_array), np.array(batch_label) + +def get_train_batch(batch_filename): + ''' + prepare every batch file data and label 【train】 + :param batch_filename: + :return: + ''' + batch_array = [] + batch_label = [] + for npy in batch_filename: + try: + arr = np.load(npy) + batch_array.append(arr) + + if 'real_' in npy.split("/")[-1]: + batch_label.append([0,1]) + elif 'fake_' in npy.split("/")[-1]: + batch_label.append([1,0]) + except Exception as e: + print("file not exists! %s"%npy) + batch_array.append(batch_array[-1]) # some nodule process error leading nonexistent of the file, using the last file copy to fill + + return np.array(batch_array),np.array(batch_label) +