# -*- coding:utf-8 -*-
"""
Prepare data for the 3D CNN, plus some utility functions:
(1) Extract positive and negative samples from .mhd scans based on the annotation CSVs
(2) Check for NaN values in the generated .npy files
(3) 2D and 3D visualization utilities
(4) Normalization and HU-thresholding utilities
(5) Build train and test batches
(A hedged end-to-end usage sketch is included at the bottom of the file.)
"""
import SimpleITK as sitk
import numpy as np
import csv
from glob import glob
import pandas as pd
import os
import matplotlib
import matplotlib.pyplot as plt
import traceback
import random
from PIL import Image
from project_config import *

def get_filename(file_list, case):
    # return the first path in file_list whose name contains the series uid
    for f in file_list:
        if case in f:
            return f

def extract_real_cubic_from_mhd(dcim_path, annatation_file, normalization_output_path):
    '''
    :param dcim_path: the path containing all .mhd files
    :param annatation_file: the annotation CSV file with every nodule's coordinates
    :param normalization_output_path: where the extracted cubes of size 20x20x6,
        30x30x10 and 40x40x26 are saved as .npy files (after normalization)
    '''
    file_list = glob(dcim_path + "*.mhd")
    # the locations of the nodules
    df_node = pd.read_csv(annatation_file)
    df_node["file"] = df_node["seriesuid"].map(lambda file_name: get_filename(file_list, file_name))
    df_node = df_node.dropna()
    for img_file in file_list:
        mini_df = df_node[df_node["file"] == img_file]  # all nodules associated with this file
        file_name = os.path.basename(img_file)
        if mini_df.shape[0] > 0:  # some files have no nodule -- skip those
            # load the scan once
            itk_img = sitk.ReadImage(img_file)
            img_array = sitk.GetArrayFromImage(itk_img)  # indices are z,y,x (note the ordering)
            num_z, height, width = img_array.shape  # height x width constitutes the transverse plane
            origin = np.array(itk_img.GetOrigin())  # x,y,z origin in world coordinates (mm)
            spacing = np.array(itk_img.GetSpacing())  # voxel spacing in world coordinates (mm)
            # go through all nodules
            print("begin to process real nodules...")
            img_array = img_array.transpose(2, 1, 0)  # reorder axes to x,y,z to match v_center
            print(img_array.shape)
            for node_idx, cur_row in mini_df.iterrows():
                node_x = cur_row["coordX"]
                node_y = cur_row["coordY"]
                node_z = cur_row["coordZ"]
                nodule_pos_str = str(node_x) + "_" + str(node_y) + "_" + str(node_z)
                # each nodule is saved at sizes 20x20x6, 30x30x10 and 40x40x26
                imgs1 = np.ndarray([20, 20, 6], dtype=np.float32)
                imgs2 = np.ndarray([30, 30, 10], dtype=np.float32)
                imgs3 = np.ndarray([40, 40, 26], dtype=np.float32)
                center = np.array([node_x, node_y, node_z])  # nodule center
                v_center = np.rint((center - origin) / spacing)  # nodule center in voxel space (still x,y,z ordering)
                print(v_center[0], v_center[1], v_center[2])
                try:
                    # crop the three cubes around the nodule center
                    imgs1[:, :, :] = img_array[int(v_center[0]-10):int(v_center[0]+10), int(v_center[1]-10):int(v_center[1]+10), int(v_center[2]-3):int(v_center[2]+3)]
                    imgs2[:, :, :] = img_array[int(v_center[0]-15):int(v_center[0]+15), int(v_center[1]-15):int(v_center[1]+15), int(v_center[2]-5):int(v_center[2]+5)]
                    imgs3[:, :, :] = img_array[int(v_center[0]-20):int(v_center[0]+20), int(v_center[1]-20):int(v_center[1]+20), int(v_center[2]-13):int(v_center[2]+13)]
                    # the following produce the standard data used as CNN input
                    truncate_hu(imgs1)
                    truncate_hu(imgs2)
                    truncate_hu(imgs3)
                    imgs1 = normalazation(imgs1)
                    imgs2 = normalazation(imgs2)
                    imgs3 = normalazation(imgs3)
                    np.save(os.path.join(normalization_output_path, "%d_real_size20x20.npy" % node_idx), imgs1)
                    np.save(os.path.join(normalization_output_path, "%d_real_size30x30.npy" % node_idx), imgs2)
                    np.save(os.path.join(normalization_output_path, "%d_real_size40x40.npy" % node_idx), imgs3)
                    print("save real %d finished!..." % node_idx)
                except Exception as e:
                    print("process image %s error..." % str(file_name))
                    print(Exception, ":", e)
                    traceback.print_exc()
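
# A hedged worked example (not from the original file) of the world-to-voxel
# conversion used above: with origin = (-200.0, -195.0, -378.0) mm and
# spacing = (0.7, 0.7, 2.5) mm, a nodule at world coordinates
# (-100.0, -160.0, -303.0) mm maps to
# v_center = rint(((-100+200)/0.7, (-160+195)/0.7, (-303+378)/2.5)) = (143, 50, 30).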

def extract_fake_cubic_from_mhd(dcim_path, candidate_file, normalization_output_path):
    '''
    :param dcim_path: the path containing all .mhd files
    :param candidate_file: the candidate CSV file with every **fake** nodule's coordinates
    :param normalization_output_path: where the extracted cubes of size 20x20x6,
        30x30x10 and 40x40x26 are saved as .npy files (after normalization)
    '''
    file_list = glob(dcim_path + "*.mhd")
    # the locations of the candidates
    df_node = pd.read_csv(candidate_file)
    df_node["file"] = df_node["seriesuid"].map(lambda file_name: get_filename(file_list, file_name))
    df_node = df_node.dropna()
    num = 0  # counts saved fake nodules; initialized once (not per file) so the 10000 cap below can actually apply
    for img_file in file_list:
        mini_df = df_node[df_node["file"] == img_file]  # all candidates associated with this file
        file_name = os.path.basename(img_file)
        if mini_df.shape[0] > 0 and num < 10000:  # some files have no candidate -- skip those
            # load the scan once
            itk_img = sitk.ReadImage(img_file)
            img_array = sitk.GetArrayFromImage(itk_img)  # indices are z,y,x (note the ordering)
            num_z, height, width = img_array.shape  # height x width constitutes the transverse plane
            origin = np.array(itk_img.GetOrigin())  # x,y,z origin in world coordinates (mm)
            spacing = np.array(itk_img.GetSpacing())  # voxel spacing in world coordinates (mm)
            # go through all candidates
            print("begin to process fake nodules...")
            img_array = img_array.transpose(2, 1, 0)  # reorder axes to x,y,z to match v_center
            print(img_array.shape)
            for node_idx, cur_row in mini_df.iterrows():
                node_x = cur_row["coordX"]
                node_y = cur_row["coordY"]
                node_z = cur_row["coordZ"]
                nodule_pos_str = str(node_x) + "_" + str(node_y) + "_" + str(node_z)
                center = np.array([node_x, node_y, node_z])  # candidate center
                v_center = np.rint((center - origin) / spacing)  # candidate center in voxel space (x,y,z ordering)
                # false nodule: each candidate is saved at sizes 20x20x6, 30x30x10 and 40x40x26
                imgs_fake1 = np.ndarray([20, 20, 6], dtype=np.float32)
                imgs_fake2 = np.ndarray([30, 30, 10], dtype=np.float32)
                imgs_fake3 = np.ndarray([40, 40, 26], dtype=np.float32)
                try:
                    # crop the three cubes around the candidate center
                    imgs_fake1[:, :, :] = img_array[int(v_center[0]-10):int(v_center[0]+10), int(v_center[1]-10):int(v_center[1]+10), int(v_center[2]-3):int(v_center[2]+3)]
                    imgs_fake2[:, :, :] = img_array[int(v_center[0]-15):int(v_center[0]+15), int(v_center[1]-15):int(v_center[1]+15), int(v_center[2]-5):int(v_center[2]+5)]
                    imgs_fake3[:, :, :] = img_array[int(v_center[0]-20):int(v_center[0]+20), int(v_center[1]-20):int(v_center[1]+20), int(v_center[2]-13):int(v_center[2]+13)]
                    # normalize and save the fake nodule
                    truncate_hu(imgs_fake1)
                    truncate_hu(imgs_fake2)
                    truncate_hu(imgs_fake3)
                    imgs_fake1 = normalazation(imgs_fake1)
                    imgs_fake2 = normalazation(imgs_fake2)
                    imgs_fake3 = normalazation(imgs_fake3)
                    np.save(os.path.join(normalization_output_path, "%d_fake_size20x20.npy" % node_idx), imgs_fake1)
                    np.save(os.path.join(normalization_output_path, "%d_fake_size30x30.npy" % node_idx), imgs_fake2)
                    np.save(os.path.join(normalization_output_path, "%d_fake_size40x40.npy" % node_idx), imgs_fake3)
                    num = num + 1
                    print("save fake %d finished!..." % node_idx)
                except Exception as e:
                    print("process image %s error..." % str(file_name))
                    print(Exception, ":", e)
                    traceback.print_exc()

def check_nan(path):
    '''
    Check whether any .npy file under `path` contains NaN values.
    :param path: directory of .npy files
    :return:
    '''
    for file in os.listdir(path):
        array = np.load(os.path.join(path, file))
        a = array[np.isnan(array)]
        if len(a) > 0:
            print("file has nan: ", file)
            print(a)

def plot_cubic(npy_file):
    '''
    Plot the cube slice by slice.
    :param npy_file: path of the .npy cube
    :return:
    '''
    cubic_array = np.load(npy_file)
    rows = int(np.ceil(cubic_array.shape[2] / 3.0))  # ceil, so every slice gets a subplot
    f, plots = plt.subplots(rows, 3, figsize=(50, 50))
    for i in range(cubic_array.shape[2]):  # 0-based, so the last slice stays in range
        plots[i // 3, i % 3].axis('off')
        plots[i // 3, i % 3].imshow(cubic_array[:, :, i], cmap=plt.cm.bone)

def plot_3d_cubic(image):
    '''
    Plot the 3D cube.
    :param image: path of the .npy file the cube is saved in
    :return:
    '''
    from skimage import measure
    from mpl_toolkits.mplot3d.art3d import Poly3DCollection
    image = np.load(image)
    # recent scikit-image returns (verts, faces, normals, values); older
    # versions returned only (verts, faces)
    verts, faces, _, _ = measure.marching_cubes(image, 0)
    fig = plt.figure(figsize=(40, 40))
    ax = fig.add_subplot(111, projection='3d')
    # fancy indexing: `verts[faces]` generates a collection of triangles
    mesh = Poly3DCollection(verts[faces], alpha=0.1)
    face_color = [0.5, 0.5, 1]
    mesh.set_facecolor(face_color)
    ax.add_collection3d(mesh)
    ax.set_xlim(0, image.shape[0])
    ax.set_ylim(0, image.shape[1])
    ax.set_zlim(0, image.shape[2])
    plt.show()

# LUNA2016 data prep, first step: truncate HU to the range [-1000, 400]
def truncate_hu(image_array):
    # values outside the range are zeroed out, in place
    image_array[image_array > 400] = 0
    image_array[image_array < -1000] = 0

# LUNA2016 data prep, second step: normalize the HU values
def normalazation(image_array):
    max_val = image_array.max()
    min_val = image_array.min()
    image_array = (image_array - min_val) / (max_val - min_val)  # min-max scale to [0, 1]
    avg = image_array.mean()
    image_array = image_array - avg  # zero-center
    return image_array  # the array must be returned: the reassignments above create new arrays, so purely in-place use did not work
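
# Minimal usage sketch for the two preprocessing steps above (the file name
# "nodule.npy" is a hypothetical placeholder, not part of the pipeline):
#
#     cube = np.load("nodule.npy")   # raw HU values
#     truncate_hu(cube)              # in place: zero out HU outside [-1000, 400]
#     cube = normalazation(cube)     # min-max scale, then zero-center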

def search(path, word):
    '''
    Find file paths under `path` whose names contain `word`.
    :param path: directory to search from
    :param word: keyword the filename must contain
    :return: list of matching file paths
    '''
    filelist = []
    for filename in os.listdir(path):
        fp = os.path.join(path, filename)
        if os.path.isfile(fp) and word in filename:
            filelist.append(fp)
        elif os.path.isdir(fp):
            filelist.extend(search(fp, word))  # the recursive result was previously discarded
    return filelist

def get_all_filename(path, size):
    list_real = search(path, 'real_size' + str(size) + "x" + str(size))
    list_fake = search(path, 'fake_size' + str(size) + "x" + str(size))
    return list_real + list_fake

def get_test_batch(path):
    '''
    Prepare the data and labels of one batch (test); takes up to 32 .npy files from `path`.
    :param path: directory of .npy files
    :return: (batch_array, batch_label) as numpy arrays
    '''
    files = [os.path.join(path, f) for f in os.listdir(path)]
    batch_array = []
    batch_label = []
    for npy in files:
        try:
            if len(batch_label) < 32:
                arr = np.load(npy)
                batch_array.append(arr)
                if 'real_' in os.path.basename(npy):
                    batch_label.append([0, 1])
                elif 'fake_' in os.path.basename(npy):
                    batch_label.append([1, 0])
        except Exception as e:
            print("file not exists! %s" % npy)
            if batch_array:
                # extraction may have failed for some nodules, leaving an unreadable
                # file; reuse the previous cube so the batch size is preserved, and
                # still derive the label from the filename so data and labels stay aligned
                batch_array.append(batch_array[-1])
                if 'real_' in os.path.basename(npy):
                    batch_label.append([0, 1])
                elif 'fake_' in os.path.basename(npy):
                    batch_label.append([1, 0])
            print(e)
    return np.array(batch_array), np.array(batch_label)

def get_train_batch(batch_filename):
    '''
    Prepare the data and labels of one batch (train).
    :param batch_filename: list of .npy file paths for this batch
    :return: (batch_array, batch_label) as numpy arrays
    '''
    batch_array = []
    batch_label = []
    for npy in batch_filename:
        try:
            arr = np.load(npy)
            batch_array.append(arr)
            if 'real_' in os.path.basename(npy):
                batch_label.append([0, 1])
            elif 'fake_' in os.path.basename(npy):
                batch_label.append([1, 0])
        except Exception as e:
            print("file not exists! %s" % npy)
            if batch_array:
                # extraction may have failed for some nodules, leaving a missing
                # file; reuse the previous cube and derive the label from the
                # filename so data and labels stay aligned
                batch_array.append(batch_array[-1])
                if 'real_' in os.path.basename(npy):
                    batch_label.append([0, 1])
                elif 'fake_' in os.path.basename(npy):
                    batch_label.append([1, 0])
    return np.array(batch_array), np.array(batch_label)
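
if __name__ == "__main__":
    # Hedged end-to-end usage sketch (not part of the original file). All paths
    # below are hypothetical placeholders; the real values presumably come from
    # project_config.
    dcim_path = "./LUNA16/subset0/"                # hypothetical scan directory
    annotation_csv = "./CSVFILES/annotations.csv"  # hypothetical LUNA16 annotations
    candidate_csv = "./CSVFILES/candidates.csv"    # hypothetical LUNA16 candidates
    out_path = "./npy/"                            # hypothetical output directory

    extract_real_cubic_from_mhd(dcim_path, annotation_csv, out_path)
    extract_fake_cubic_from_mhd(dcim_path, candidate_csv, out_path)
    check_nan(out_path)

    # build one shuffled training batch of 32 cubes of size 20x20x6
    train_files = get_all_filename(out_path, 20)
    random.shuffle(train_files)
    data, labels = get_train_batch(train_files[:32])
    print(data.shape, labels.shape)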