# dsb3 / Git / [70b6b3] /data
#
# Models:
# RichardZick/
# dsb3
# Downloads: 1
# [70b6b3]: / data_iterators.py
# History
# Download this file
# 2250 lines (1790 with data), 101.2 kB

import numpy as np
import utils_lung
import pathfinder
import utils


# Piecewise-linear model of malignancy prevalence as a function of nodule
# diameter.  The anchor values come from the figures quoted in the original
# comments:
#   - "The prevalence of malignancy is 0% to 1% for nodules <5 mm,"
#   - "6% to 28% for nodules 5 to 10 mm,"
#   - "and 64% to 82% for nodules >20 mm in diameter"
#   - "For nodules more than 3 cm in diameter, 93% to 97% are malignant"
def _line_through(x0, y0, x1, y1):
    """Return (slope, offset) of the straight line through two points."""
    slope = (y1 - y0) / (x1 - x0)
    return slope, y0 - slope * x0


# Value reached at 5 mm: midpoint between the 1% and 6% figures.
prob5 = (0.01+0.06)/2.

# One (slope, offset) pair per segment; each pair is named after the
# diameter at which its segment ends.
slope10, offset10 = _line_through(5., prob5, 10., 0.28)
slope20, offset20 = _line_through(10., 0.28, 20., 0.64)
slope25, offset25 = _line_through(20., 0.64, 25., 0.82)
slope30, offset30 = _line_through(25., 0.82, 30., 0.93)
slope40, offset40 = _line_through(30., 0.93, 40., 0.97)


def diameter_to_prob(diam):
    """Map a scalar nodule diameter to a malignancy probability in [0, 1].

    Below 5 the value rises linearly from 0 to prob5; above that the
    segment whose upper bound is the first one exceeding diam is used.
    The last segment is extended past 40 and the result is clipped, so
    large diameters saturate at 1 and negative ones give 0.
    """
    if diam < 5:
        p = prob5*diam/5.
    else:
        # Default to the last segment; replaced below if an earlier one applies.
        p = slope40 * diam + offset40
        segments = ((10, slope10, offset10),
                    (20, slope20, offset20),
                    (25, slope25, offset25),
                    (30, slope30, offset30))
        for upper, slope, offset in segments:
            if diam < upper:
                p = slope*diam+offset
                break
    return np.clip(p, 0., 1.)


class LunaDataGenerator(object):
    """Iterate over LUNA scans stored as .mhd files, one patient per sample.

    Each scan is read with utils_lung.read_mhd and handed to data_prep_fun
    together with that patient's entry in the annotation table loaded from
    pathfinder.LUNA_LABELS_PATH.

    Arguments:
        data_path: directory holding the .mhd files.
        transform_params: stored on the instance; not used by this class.
        data_prep_fun: callable invoked with the keyword arguments data,
            pixel_spacing, luna_annotations and luna_origin; it must return
            (x, y, annotations, tf_matrix).
        rng: object providing shuffle(); only used when random is true.
        random: shuffle the patient order at the start of every pass.
        infinite: keep starting new passes instead of stopping after one.
        patient_ids: optional ids. When given, paths are built as
            data_path + '/' + id + '.mhd'; otherwise the listing returned by
            utils_lung.get_patient_data_paths is filtered on '.mhd'.
    """

    def __init__(self, data_path, transform_params, data_prep_fun, rng,
                 random, infinite, patient_ids=None, **kwargs):
        self.data_path = data_path
        self.transform_params = transform_params
        self.data_prep_fun = data_prep_fun
        self.rng = rng
        self.random = random
        self.infinite = infinite
        self.patient_ids = patient_ids

        if patient_ids:
            self.patient_paths = [data_path + '/' + pid + '.mhd' for pid in patient_ids]
        else:
            listed = utils_lung.get_patient_data_paths(data_path)
            self.patient_paths = [path for path in listed if '.mhd' in path]

        self.id2annotations = utils_lung.read_luna_annotations(pathfinder.LUNA_LABELS_PATH)
        self.nsamples = len(self.patient_paths)

    def generate(self):
        """Yield (x, y, None, annotations, tf_matrix, pid) per patient.

        x and y are converted to float32 and get two leading singleton
        axes in front of their three original axes.
        """
        keep_going = True
        while keep_going:
            order = np.arange(self.nsamples)
            if self.random:
                self.rng.shuffle(order)

            for idx in order:
                patient_path = self.patient_paths[idx]
                pid = utils_lung.extract_pid_filename(patient_path)

                img, origin, pixel_spacing = utils_lung.read_mhd(patient_path)
                prepared = self.data_prep_fun(data=img,
                                              pixel_spacing=pixel_spacing,
                                              luna_annotations=self.id2annotations[pid],
                                              luna_origin=origin)
                x, y, annotations, tf_matrix = prepared

                x = np.float32(x)[np.newaxis, np.newaxis, :, :, :]
                y = np.float32(y)[np.newaxis, np.newaxis, :, :, :]

                yield x, y, None, annotations, tf_matrix, pid

            # A single pass is enough unless the caller asked for an endless stream.
            keep_going = self.infinite



class LunaSimpleDataGenerator(object):
    """Single-pass generator yielding each raw scan as float32 with its id.

    The file type is chosen from data_path itself: if the string 'pkl'
    occurs anywhere in it the scans are read as '.pkl' files, otherwise as
    '.mhd' files.

    Arguments:
        data_path: directory holding the scans; printed once on construction.
        patient_ids: optional ids. When given, paths are built as
            data_path + '/' + id + extension; otherwise the listing returned
            by utils_lung.get_patient_data_paths is filtered on the extension.
    """

    def __init__(self, data_path, patient_ids=None, **kwargs):
        self.patient_ids = patient_ids
        self.data_path = data_path

        if 'pkl' in data_path:
            self.file_extension = '.pkl'
        else:
            self.file_extension = '.mhd'

        if patient_ids:
            self.patient_paths = [data_path + '/' + pid + self.file_extension
                                  for pid in patient_ids]
        else:
            listed = utils_lung.get_patient_data_paths(data_path)
            self.patient_paths = [path for path in listed if self.file_extension in path]

        self.nsamples = len(self.patient_paths)

        print(self.data_path)

    def generate(self):
        """Yield (scan as float32, patient id) for every path, in order."""
        for patient_path in self.patient_paths:
            pid = utils_lung.extract_pid_filename(patient_path)

            if self.file_extension == '.pkl':
                scan = utils_lung.read_pkl(patient_path)
            else:
                scan = utils_lung.read_mhd(patient_path)
            # Origin and pixel spacing are read but not passed on.
            img, origin, pixel_spacing = scan

            yield np.float32(img), pid




class LunaScanPositiveDataGenerator(LunaDataGenerator):
    """LunaDataGenerator restricted to patients present in the annotation table.

    After the parent constructor has collected the scan paths, every path
    whose patient id is not a key of self.id2annotations is dropped and
    nsamples is recomputed. The surviving paths are rebuilt as
    data_path + '/' + id + '.mhd'.
    """

    def __init__(self, data_path, transform_params, data_prep_fun, rng,
                 random, infinite, patient_ids=None, **kwargs):
        super(LunaScanPositiveDataGenerator, self).__init__(data_path, transform_params, data_prep_fun, rng,
                                                            random, infinite, patient_ids, **kwargs)
        patient_ids_all = [utils_lung.extract_pid_filename(p) for p in self.patient_paths]
        # Test membership on the mapping itself: under Python 2 `.keys()`
        # builds a fresh list for every patient and scans it linearly.
        patient_ids_pos = [pid for pid in patient_ids_all if pid in self.id2annotations]
        self.patient_paths = [data_path + '/' + p + '.mhd' for p in patient_ids_pos]
        self.nsamples = len(self.patient_paths)


class LunaScanPositiveLungMaskDataGenerator(LunaDataGenerator):
    """LunaDataGenerator variant whose data_prep_fun also returns a lung mask.

    batch_size and full_batch are accepted so the constructor signature
    lines up with the batch generators, but they are neither stored nor
    used: samples are yielded one patient at a time.
    """

    def __init__(self, data_path, batch_size, transform_params, data_prep_fun, rng,
                 full_batch, random, infinite, patient_ids=None, **kwargs):
        parent = super(LunaScanPositiveLungMaskDataGenerator, self)
        parent.__init__(data_path, transform_params, data_prep_fun, rng,
                        random, infinite, patient_ids, **kwargs)

    def generate(self):
        """Yield (x, y, lung_mask, annotations, tf_matrix, pid) per patient.

        x, y and lung_mask are converted to float32 and get two leading
        singleton axes in front of their three original axes.
        """
        keep_going = True
        while keep_going:
            order = np.arange(self.nsamples)
            if self.random:
                self.rng.shuffle(order)

            for idx in order:
                patient_path = self.patient_paths[idx]
                pid = utils_lung.extract_pid_filename(patient_path)

                img, origin, pixel_spacing = utils_lung.read_mhd(patient_path)
                prepared = self.data_prep_fun(data=img,
                                              pixel_spacing=pixel_spacing,
                                              luna_annotations=self.id2annotations[pid],
                                              luna_origin=origin)
                x, y, lung_mask, annotations, tf_matrix = prepared

                x = np.float32(x)[np.newaxis, np.newaxis, :, :, :]
                y = np.float32(y)[np.newaxis, np.newaxis, :, :, :]
                lung_mask = np.float32(lung_mask)[np.newaxis, np.newaxis, :, :, :]

                yield x, y, lung_mask, annotations, tf_matrix, pid

            # Stop after one pass unless an endless stream was requested.
            keep_going = self.infinite



class LunaScanMaskPositiveDataGenerator(LunaDataGenerator):
    """Yield CT scans together with a mask volume read from a second directory.

    For every scan path collected by the parent constructor a mask path
    seg_data_path + '/' + id + '.mhd' is built at the same list position.
    batch_size and full_batch are accepted but not used.
    """

    def __init__(self, data_path, seg_data_path, batch_size, transform_params, data_prep_fun, rng,
                 full_batch, random, infinite, patient_ids=None, **kwargs):
        super(LunaScanMaskPositiveDataGenerator, self).__init__(data_path, transform_params,
                                                                    data_prep_fun, rng,
                                                                    random, infinite, patient_ids, **kwargs)
        self.seg_data_path = seg_data_path
        # patient_ids defaults to None, in which case the parent takes the
        # scan paths from the directory listing. Derive the ids from those
        # paths then, so mask_paths exists and stays index-aligned with
        # patient_paths instead of failing on iteration over None.
        if self.patient_ids:
            mask_ids = self.patient_ids
        else:
            mask_ids = [utils_lung.extract_pid_filename(p) for p in self.patient_paths]
        self.mask_paths = [seg_data_path + '/' + p + '.mhd' for p in mask_ids]

    def generate(self):
        """Yield (ct, lung_mask, annotations, tf_matrix, pid) per patient.

        Asserts that scan and mask share origin and pixel spacing. ct and
        lung_mask are converted to float32 and get two leading singleton
        axes in front of their three original axes.
        """
        while True:
            rand_idxs = np.arange(self.nsamples)
            if self.random:
                self.rng.shuffle(rand_idxs)
            for pos in xrange(0, len(rand_idxs)):
                idx = rand_idxs[pos]

                ct_scan_path = self.patient_paths[idx]
                mask_path = self.mask_paths[idx]

                pid = utils_lung.extract_pid_filename(ct_scan_path)

                ct_scan, ct_origin, ct_pixel_spacing = utils_lung.read_mhd(ct_scan_path)
                mask, mask_origin, mask_pixel_spacing = utils_lung.read_mhd(mask_path)

                # Scan and mask must describe the same physical grid.
                assert(sum(abs(ct_origin-mask_origin)) < 1e-9)
                assert(sum(abs(ct_pixel_spacing-mask_pixel_spacing)) < 1e-9)

                ct, lung_mask, annotations, tf_matrix = self.data_prep_fun(ct_scan=ct_scan, mask=mask,
                                                                             pixel_spacing=ct_pixel_spacing,
                                                                             luna_annotations=
                                                                             self.id2annotations[pid],
                                                                             luna_origin=ct_origin)

                ct = np.float32(ct)[None, None, :, :, :]
                lung_mask = np.float32(lung_mask)[None, None, :, :, :]

                yield ct, lung_mask, annotations, tf_matrix, pid

            if not self.infinite:
                break


#for lung segmentation, does not work yet
class PatchLunaDataGenerator(object):
    def __init__(self, ct_data_path, seg_data_path, batch_size, transform_params, data_prep_fun, rng,
                 full_batch, random, infinite, patient_ids=None, **kwargs):

        if patient_ids:
            self.patient_ids = patient_ids
            #self.patient_paths = [data_path + '/' + p + '.mhd' for p in patient_ids]
        else:
            patient_paths = utils_lung.get_patient_data_paths(data_path)
            #self.patient_paths = [p for p in patient_paths if '.mhd' in p]
            self.patient_ids = [utils_lung.extract_pid_filename(p) for p in self.patient_paths]\

        self.nsamples = len(self.patient_ids)
        self.ct_data_path = ct_data_path
        self.seg_data_path = seg_data_path
        self.rng = rng
        self.random = random
        self.infinite = infinite
        self.data_prep_fun = data_prep_fun
        self.transform_params = transform_params
        self.batch_size = batch_size
        self.full_batch = full_batch

    def generate(self):
        while True:
            rand_idxs = np.arange(self.nsamples)
            if self.random:
                self.rng.shuffle(rand_idxs)
            for pos in xrange(0, len(rand_idxs), self.batch_size):
                idxs_batch = rand_idxs[pos:pos + self.batch_size]
                nb = len(idxs_batch)
                # allocate batches
                x_batch = np.zeros((nb, 1) + self.transform_params['patch_size'], dtype='float32')
                y_batch = np.zeros((nb, 1) + self.transform_params['patch_size'], dtype='float32')
                patients_ids = []

                for i, idx in enumerate(idxs_batch):
                    pid = self.patient_ids[idx]
                    ct_path = self.ct_data_path + pid + '.mhd'
                    seg_path = self.seg_data_path + pid + '.mhd'
                    patients_ids.append(pid)

                    ct_img, ct_origin, ct_pixel_spacing = utils_lung.read_mhd(ct_path)
                    seg_img, seg_origin, seg_pixel_spacing = utils_lung.read_mhd(seg_path)

                    assert(np.sum(ct_origin-seg_origin) <  1e-9)
                    assert(np.sum(ct_pixel_spacing-seg_pixel_spacing) <  1e-9)

                    print 'ct_img.shape', ct_img.shape
                    print 'seg_img.shape', seg_img.shape
                    w,h,d = self.transform_params['patch_size']
                    patch_center = [self.rng.randint(w/2, ct_img.shape[0]-w/2),
                                    self.rng.randint(h/2, ct_img.shape[1]-h/2),
                                    self.rng.randint(d/2, ct_img.shape[1]-d/2)]
                    print patch_center


                    x_batch[i, 0, :, :, :], y_batch[i, 0, :, :, :]  = self.data_prep_fun(ct_img=ct_img, seg_img=seg_img,
                                                                    patch_center=patch_center,
                                                                    pixel_spacing=ct_pixel_spacing,
                                                                    luna_origin=ct_origin)

                    # y_batch[i, 0, :, :, :],  = self.data_prep_fun(data=seg_img,
                    #                                                 patch_center=patch_center,
                    #                                                 pixel_spacing=seg_pixel_spacing,
                    #                                                 luna_origin=seg_origin)
                if self.full_batch:
                    if nb == self.batch_size:
                        yield x_batch, y_batch, patients_ids
                else:
                    yield x_batch, y_batch, patients_ids

            if not self.infinite:
                break

#works, tested
class LunaScanDataGenerator(object):
    def __init__(self, ct_data_path, seg_data_path, patient_ids=None, **kwargs):

        if patient_ids:
            self.patient_ids = patient_ids
            #self.patient_paths = [data_path + '/' + p + '.mhd' for p in patient_ids]
        else:
            patient_paths = utils_lung.get_patient_data_paths(ct_data_path)
            #self.patient_paths = [p for p in patient_paths if '.mhd' in p]
            self.patient_ids = [utils_lung.extract_pid_filename(p) for p in self.patient_paths]\

        self.nsamples = len(self.patient_ids)
        self.ct_data_path = ct_data_path
        self.seg_data_path = seg_data_path
        

    def generate(self):
        for pid in self.patient_ids:
            ct_path = self.ct_data_path + pid + '.mhd'
            seg_path = self.seg_data_path + pid + '.mhd'

            ct_img, ct_origin, ct_pixel_spacing = utils_lung.read_mhd(ct_path)
            seg_img, seg_origin, seg_pixel_spacing = utils_lung.read_mhd(seg_path)

            assert(np.sum(ct_origin-seg_origin) <  1e-9)
            assert(np.sum(ct_pixel_spacing-seg_pixel_spacing) <  1e-9)

            print 'ct_img.shape', ct_img.shape
            print 'seg_img.shape', seg_img.shape

            yield ct_img, seg_img, pid


class PatchPositiveLunaDataGenerator(object):
    """Batch generator of patches centred on a randomly chosen annotation.

    Only patients that are keys of the annotation table loaded from
    pathfinder.LUNA_LABELS_PATH are kept. For each patient in a batch one
    of its annotations is drawn with rng.randint and used as patch centre.

    Arguments:
        data_path: directory holding the .mhd files.
        batch_size: number of patients per batch.
        transform_params: mapping; its 'patch_size' entry (a 3-tuple) gives
            the patch shape.
        data_prep_fun: callable invoked with the keyword arguments data,
            patch_center, pixel_spacing, luna_annotations and luna_origin;
            it must return the (x, y) patch pair.
        rng: object providing shuffle() and randint(n).
        full_batch: when true, a trailing batch smaller than batch_size is
            dropped instead of yielded.
        random: shuffle the patient order at the start of every pass.
        infinite: keep starting new passes instead of stopping after one.
        patient_ids: optional ids; when omitted the listing returned by
            utils_lung.get_patient_data_paths is filtered on '.mhd'.
    """

    def __init__(self, data_path, batch_size, transform_params, data_prep_fun, rng,
                 full_batch, random, infinite, patient_ids=None, **kwargs):

        self.id2annotations = utils_lung.read_luna_annotations(pathfinder.LUNA_LABELS_PATH)

        if patient_ids:
            self.patient_paths = [data_path + '/' + p + '.mhd' for p in patient_ids]
        else:
            patient_paths = utils_lung.get_patient_data_paths(data_path)
            self.patient_paths = [p for p in patient_paths if '.mhd' in p]

        patient_ids_all = [utils_lung.extract_pid_filename(p) for p in self.patient_paths]
        # Test membership on the mapping itself: under Python 2 `.keys()`
        # builds a fresh list for every patient and scans it linearly.
        patient_ids_pos = [pid for pid in patient_ids_all if pid in self.id2annotations]
        self.patient_paths = [data_path + '/' + p + '.mhd' for p in patient_ids_pos]

        self.nsamples = len(self.patient_paths)
        self.data_path = data_path
        self.rng = rng
        self.random = random
        self.infinite = infinite
        self.data_prep_fun = data_prep_fun
        self.transform_params = transform_params
        self.batch_size = batch_size
        self.full_batch = full_batch

    def generate(self):
        """Yield (x_batch, y_batch, patient_ids) tuples.

        Both arrays are float32 with shape (nb, 1) + patch_size, where nb is
        the number of patients in the batch.
        """
        while True:
            rand_idxs = np.arange(self.nsamples)
            if self.random:
                self.rng.shuffle(rand_idxs)
            for pos in xrange(0, len(rand_idxs), self.batch_size):
                idxs_batch = rand_idxs[pos:pos + self.batch_size]
                nb = len(idxs_batch)
                # allocate batches
                x_batch = np.zeros((nb, 1) + self.transform_params['patch_size'], dtype='float32')
                y_batch = np.zeros((nb, 1) + self.transform_params['patch_size'], dtype='float32')
                patients_ids = []

                for i, idx in enumerate(idxs_batch):
                    patient_path = self.patient_paths[idx]
                    # Named `pid` rather than `id` so the builtin is not shadowed.
                    pid = utils_lung.extract_pid_filename(patient_path)
                    patients_ids.append(pid)
                    img, origin, pixel_spacing = utils_lung.read_mhd(patient_path)

                    patient_annotations = self.id2annotations[pid]
                    # Pick one annotation of this patient as the patch centre.
                    patch_center = patient_annotations[self.rng.randint(len(patient_annotations))]
                    x_batch[i, 0, :, :, :], y_batch[i, 0, :, :, :] = self.data_prep_fun(data=img,
                                                                                        patch_center=patch_center,
                                                                                        pixel_spacing=pixel_spacing,
                                                                                        luna_annotations=patient_annotations,
                                                                                        luna_origin=origin)
                if self.full_batch:
                    if nb == self.batch_size:
                        yield x_batch, y_batch, patients_ids
                else:
                    yield x_batch, y_batch, patients_ids

            if not self.infinite:
                break



class ValidPatchPositiveLunaDataGenerator(object):
    """Single-pass generator with one patch per annotation of each patient.

    Only ids from patient_ids that are present in the annotation table
    loaded from pathfinder.LUNA_LABELS_PATH are kept. nsamples is the total
    number of annotations over the kept patients.
    """

    def __init__(self, data_path, transform_params, patient_ids, data_prep_fun, **kwargs):
        self.data_path = data_path
        self.data_prep_fun = data_prep_fun
        self.transform_params = transform_params

        all_annotations = utils_lung.read_luna_annotations(pathfinder.LUNA_LABELS_PATH)

        self.id2positive_annotations = {}
        self.id2patient_path = {}
        total = 0
        for pid in patient_ids:
            if pid not in all_annotations:
                continue
            annotations = all_annotations[pid]
            self.id2positive_annotations[pid] = annotations
            self.id2patient_path[pid] = data_path + '/' + pid + '.mhd'
            total += len(annotations)

        self.nsamples = total

    def generate(self):
        """Yield (x_batch, y_batch, [pid]) for every annotation.

        The scan is read again for each annotation. Both arrays are float32
        with two leading singleton axes in front of the patch axes.
        """
        for pid in self.id2positive_annotations:
            patient_annotations = self.id2positive_annotations[pid]
            for patch_center in patient_annotations:
                img, origin, pixel_spacing = utils_lung.read_mhd(self.id2patient_path[pid])

                x_patch, y_patch = self.data_prep_fun(data=img,
                                                      patch_center=patch_center,
                                                      pixel_spacing=pixel_spacing,
                                                      luna_annotations=patient_annotations,
                                                      luna_origin=origin)
                x_batch = np.float32(x_patch)[np.newaxis, np.newaxis, :, :, :]
                y_batch = np.float32(y_patch)[np.newaxis, np.newaxis, :, :, :]
                yield x_batch, y_batch, [pid]


class CandidatesLunaDataGenerator(object):
    def __init__(self, data_path, batch_size, transform_params, patient_ids, data_prep_fun, rng,
                 full_batch, random, infinite, positive_proportion, **kwargs):

        id2positive_annotations = utils_lung.read_luna_annotations(pathfinder.LUNA_LABELS_PATH)
        id2negative_annotations = utils_lung.read_luna_negative_candidates(pathfinder.LUNA_CANDIDATES_PATH)

        self.file_extension = '.pkl' if 'pkl' in data_path else '.mhd'
        self.id2positive_annotations = {}
        self.id2negative_annotations = {}
        self.patient_paths = []
        n_positive, n_negative = 0, 0
        for pid in patient_ids:
            if pid in id2positive_annotations:
                self.id2positive_annotations[pid] = id2positive_annotations[pid]
                self.id2negative_annotations[pid] = id2negative_annotations[pid]
                self.patient_paths.append(data_path + '/' + pid + self.file_extension)
                n_positive += len(id2positive_annotations[pid])
                n_negative += len(id2negative_annotations[pid])

        print 'n positive', n_positive
        print 'n negative', n_negative

        self.nsamples = len(self.patient_paths)

        print 'n patients', self.nsamples
        self.data_path = data_path
        self.batch_size = batch_size
        self.rng = rng
        self.full_batch = full_batch
        self.random = random
        self.infinite = infinite
        self.data_prep_fun = data_prep_fun
        self.transform_params = transform_params
        self.positive_proportion = positive_proportion

    def generate(self):
        while True:
            rand_idxs = np.arange(self.nsamples)
            if self.random:
                self.rng.shuffle(rand_idxs)
            for pos in xrange(0, len(rand_idxs), self.batch_size):
                idxs_batch = rand_idxs[pos:pos + self.batch_size]
                nb = len(idxs_batch)
                # allocate batches
                x_batch = np.zeros((nb, 1) + self.transform_params['patch_size'], dtype='float32')
                y_batch = np.zeros((nb, 1), dtype='float32')
                patients_ids = []

                for i, idx in enumerate(idxs_batch):
                    patient_path = self.patient_paths[idx]

                    id = utils_lung.extract_pid_filename(patient_path, self.file_extension)
                    patients_ids.append(id)

                    img, origin, pixel_spacing = utils_lung.read_pkl(patient_path) \
                        if self.file_extension == '.pkl' else utils_lung.read_mhd(patient_path)
                    if i < np.rint(self.batch_size * self.positive_proportion):
                        patient_annotations = self.id2positive_annotations[id]
                    else:
                        patient_annotations = self.id2negative_annotations[id]

                    patch_center = patient_annotations[self.rng.randint(len(patient_annotations))]

                    y_batch[i] = float(patch_center[-1] > 0)
                    x_batch[i, 0, :, :, :] = self.data_prep_fun(data=img,
                                                                patch_center=patch_center,
                                                                pixel_spacing=pixel_spacing,
                                                                luna_origin=origin)

                if self.full_batch:
                    if nb == self.batch_size:
                        yield x_batch, y_batch, patients_ids
                else:
                    yield x_batch, y_batch, patients_ids

            if not self.infinite:
                break



class CandidatesLunaDataGenerator(object):
    def __init__(self, data_path, batch_size, transform_params, patient_ids, data_prep_fun, rng,
                 full_batch, random, infinite, positive_proportion, return_malignancy=False, **kwargs):

        id2positive_annotations = utils_lung.read_luna_annotations(pathfinder.LUNA_LABELS_PATH)
        id2negative_annotations = utils_lung.read_luna_negative_candidates(pathfinder.LUNA_CANDIDATES_PATH)

        self.file_extension = '.pkl' if 'pkl' in data_path else '.mhd'
        self.id2positive_annotations = {}
        self.id2negative_annotations = {}
        self.patient_paths = []
        n_positive, n_negative = 0, 0
        for pid in patient_ids:
            if pid in id2positive_annotations:
                self.id2positive_annotations[pid] = id2positive_annotations[pid]
                self.id2negative_annotations[pid] = id2negative_annotations[pid]
                self.patient_paths.append(data_path + '/' + pid + self.file_extension)
                n_positive += len(id2positive_annotations[pid])
                n_negative += len(id2negative_annotations[pid])

        print 'n positive', n_positive
        print 'n negative', n_negative

        self.nsamples = len(self.patient_paths)

        print 'n patients', self.nsamples
        self.data_path = data_path
        self.batch_size = batch_size
        self.rng = rng
        self.full_batch = full_batch
        self.random = random
        self.infinite = infinite
        self.data_prep_fun = data_prep_fun
        self.transform_params = transform_params
        self.positive_proportion = positive_proportion
        self.return_malignancy = return_malignancy

    def generate(self):
        while True:
            rand_idxs = np.arange(self.nsamples)
            if self.random:
                self.rng.shuffle(rand_idxs)
            for pos in xrange(0, len(rand_idxs), self.batch_size):
                idxs_batch = rand_idxs[pos:pos + self.batch_size]
                nb = len(idxs_batch)
                # allocate batches
                x_batch = np.zeros((nb,) + self.transform_params['patch_size'], dtype='float32')
                y_batch = np.zeros((nb,), dtype='float32')
                patients_ids = []

                for i, idx in enumerate(idxs_batch):
                    patient_path = self.patient_paths[idx]

                    id = utils_lung.extract_pid_filename(patient_path, self.file_extension)
                    patients_ids.append(id)

                    img, origin, pixel_spacing = utils_lung.read_pkl(patient_path) \
                        if self.file_extension == '.pkl' else utils_lung.read_mhd(patient_path)
                    if i < np.rint(self.batch_size * self.positive_proportion):
                        patient_annotations = self.id2positive_annotations[id]
                    else:
                        patient_annotations = self.id2negative_annotations[id]

                    patch_center = patient_annotations[self.rng.randint(len(patient_annotations))]

                    if self.return_malignancy:
                        y_batch[i] = np.float32(diameter_to_prob(patch_center[-1]))
                    else:
                        y_batch[i] = float(patch_center[-1] > 0) 
                    x_batch[i, :, :, :] = self.data_prep_fun(data=img,
                                                                patch_center=patch_center,
                                                                pixel_spacing=pixel_spacing,
                                                                luna_origin=origin)

                if self.full_batch:
                    if nb == self.batch_size:
                        yield x_batch, y_batch, patients_ids
                else:
                    yield x_batch, y_batch, patients_ids

            if not self.infinite:
                break


class CandidatesLunaValidDataGenerator(object):
    def __init__(self, data_path, transform_params, patient_ids, data_prep_fun, return_malignancy=False, **kwargs):
        rng = np.random.RandomState(42)  # do not change this!!!

        id2positive_annotations = utils_lung.read_luna_annotations(pathfinder.LUNA_LABELS_PATH)
        id2negative_annotations = utils_lung.read_luna_negative_candidates(pathfinder.LUNA_CANDIDATES_PATH)

        self.file_extension = '.pkl' if 'pkl' in data_path else '.mhd'
        self.id2positive_annotations = {}
        self.id2negative_annotations = {}
        self.id2patient_path = {}
        n_positive, n_negative = 0, 0
        for pid in patient_ids:
            if pid in id2positive_annotations:
                self.id2positive_annotations[pid] = id2positive_annotations[pid]
                negative_annotations = id2negative_annotations[pid]
                n_pos = len(id2positive_annotations[pid])
                n_neg = len(id2negative_annotations[pid])
                neg_idxs = rng.choice(n_neg, size=n_pos, replace=False)
                negative_annotations_selected = []
                for i in neg_idxs:
                    negative_annotations_selected.append(negative_annotations[i])
                self.id2negative_annotations[pid] = negative_annotations_selected

                self.id2patient_path[pid] = data_path + '/' + pid + self.file_extension
                n_positive += n_pos
                n_negative += n_pos

        print 'n positive', n_positive
        print 'n negative', n_negative

        self.nsamples = len(self.id2patient_path)
        self.data_path = data_path
        self.rng = rng
        self.data_prep_fun = data_prep_fun
        self.transform_params = transform_params
        self.return_malignancy = return_malignancy

    def generate(self):

        for pid in self.id2positive_annotations.iterkeys():
            for patch_center in self.id2positive_annotations[pid]:
                patient_path = self.id2patient_path[pid]

                img, origin, pixel_spacing = utils_lung.read_pkl(patient_path) \
                    if self.file_extension == '.pkl' else utils_lung.read_mhd(patient_path)
                
                if self.return_malignancy:
                    y_batch = np.array([diameter_to_prob(patch_center[-1])], dtype='float32')
                else:
                    y_batch = np.array([1.], dtype='float32')

                x_batch = np.float32(self.data_prep_fun(data=img,
                                                        patch_center=patch_center,
                                                        pixel_spacing=pixel_spacing,
                                                        luna_origin=origin))[None, :, :, :]

                yield x_batch, y_batch, [pid]

            for patch_center in self.id2negative_annotations[pid]:
                patient_path = self.id2patient_path[pid]

                img, origin, pixel_spacing = utils_lung.read_pkl(patient_path) \
                    if self.file_extension == '.pkl' else utils_lung.read_mhd(patient_path)
                y_batch = np.array([0.], dtype='float32')
                x_batch = np.float32(self.data_prep_fun(data=img,
                                                        patch_center=patch_center,
                                                        pixel_spacing=pixel_spacing,
                                                        luna_origin=origin))[None, :, :, :]

                yield x_batch, y_batch, [pid]


class FixedCandidatesLunaDataGenerator(object):
    def __init__(self, data_path, transform_params, id2candidates_path, data_prep_fun, top_n=None):

        self.file_extension = '.pkl' if 'pkl' in data_path else '.mhd'
        self.id2candidates_path = id2candidates_path
        self.id2patient_path = {}
        for pid in id2candidates_path.keys():
            self.id2patient_path[pid] = data_path + '/' + pid + self.file_extension

        self.nsamples = len(self.id2patient_path)
        self.data_path = data_path
        self.data_prep_fun = data_prep_fun
        self.transform_params = transform_params
        self.top_n = top_n

    def generate(self):

        for pid in self.id2candidates_path.iterkeys():
            patient_path = self.id2patient_path[pid]
            print 'PATIENT', pid
            candidates = utils.load_pkl(self.id2candidates_path[pid])
            if self.top_n is not None:
                candidates = candidates[:self.top_n]
                print candidates
            print 'n blobs', len(candidates)

            img, origin, pixel_spacing = utils_lung.read_pkl(patient_path) \
                if self.file_extension == '.pkl' else utils_lung.read_mhd(patient_path)

            for candidate in candidates:
                y_batch = np.array(candidate, dtype='float32')
                patch_center = candidate[:3]
                x_batch = np.float32(self.data_prep_fun(data=img,
                                                        patch_center=patch_center,
                                                        pixel_spacing=pixel_spacing,
                                                        luna_origin=origin))[None, None, :, :, :]

                yield x_batch, y_batch, [pid]



class CandidatesLunaSizeDataGenerator(object):
    def __init__(self, data_path, batch_size, transform_params, patient_ids, data_prep_fun, rng,
                 full_batch, random, infinite, positive_proportion, **kwargs):

        id2positive_annotations = utils_lung.read_luna_annotations(pathfinder.LUNA_LABELS_PATH)
        id2negative_annotations = utils_lung.read_luna_negative_candidates(pathfinder.LUNA_CANDIDATES_PATH)

        self.file_extension = '.pkl' if 'pkl' in data_path else '.mhd'
        self.id2positive_annotations = {}
        self.id2negative_annotations = {}
        self.patient_paths = []
        n_positive, n_negative = 0, 0
        for pid in patient_ids:
            if pid in id2positive_annotations:
                self.id2positive_annotations[pid] = id2positive_annotations[pid]
                self.id2negative_annotations[pid] = id2negative_annotations[pid]
                self.patient_paths.append(data_path + '/' + pid + self.file_extension)
                n_positive += len(id2positive_annotations[pid])
                n_negative += len(id2negative_annotations[pid])

        print 'n positive', n_positive
        print 'n negative', n_negative

        self.nsamples = len(self.patient_paths)

        print 'n patients', self.nsamples
        self.data_path = data_path
        self.batch_size = batch_size
        self.rng = rng
        self.full_batch = full_batch
        self.random = random
        self.infinite = infinite
        self.data_prep_fun = data_prep_fun
        self.transform_params = transform_params
        self.positive_proportion = positive_proportion

    def generate(self):
        while True:
            rand_idxs = np.arange(self.nsamples)
            if self.random:
                self.rng.shuffle(rand_idxs)
            for pos in xrange(0, len(rand_idxs), self.batch_size):
                idxs_batch = rand_idxs[pos:pos + self.batch_size]
                nb = len(idxs_batch)
                # allocate batches
                x_batch = np.zeros((nb, 1) + self.transform_params['patch_size'], dtype='float32')
                y_batch = np.zeros((nb, 1), dtype='float32')
                patients_ids = []

                for i, idx in enumerate(idxs_batch):
                    patient_path = self.patient_paths[idx]

                    id = utils_lung.extract_pid_filename(patient_path, self.file_extension)
                    patients_ids.append(id)

                    img, origin, pixel_spacing = utils_lung.read_pkl(patient_path) \
                        if self.file_extension == '.pkl' else utils_lung.read_mhd(patient_path)
                    if i < np.rint(self.batch_size * self.positive_proportion):
                        patient_annotations = self.id2positive_annotations[id]
                    else:
                        patient_annotations = self.id2negative_annotations[id]

                    patch_center = patient_annotations[self.rng.randint(len(patient_annotations))]

                    y_batch[i] = float(patch_center[-1])
                    x_batch[i, 0, :, :, :] = self.data_prep_fun(data=img,
                                                                patch_center=patch_center,
                                                                pixel_spacing=pixel_spacing,
                                                                luna_origin=origin)

                if self.full_batch:
                    if nb == self.batch_size:
                        yield x_batch, y_batch, patients_ids
                else:
                    yield x_batch, y_batch, patients_ids

            if not self.infinite:
                break

class CandidatesLunaSizeValidDataGenerator(object):
    def __init__(self, data_path, transform_params, patient_ids, data_prep_fun, **kwargs):
        rng = np.random.RandomState(42)  # do not change this!!!

        id2positive_annotations = utils_lung.read_luna_annotations(pathfinder.LUNA_LABELS_PATH)
        id2negative_annotations = utils_lung.read_luna_negative_candidates(pathfinder.LUNA_CANDIDATES_PATH)

        self.file_extension = '.pkl' if 'pkl' in data_path else '.mhd'
        self.id2positive_annotations = {}
        self.id2negative_annotations = {}
        self.id2patient_path = {}
        n_positive, n_negative = 0, 0
        for pid in patient_ids:
            if pid in id2positive_annotations:
                self.id2positive_annotations[pid] = id2positive_annotations[pid]
                negative_annotations = id2negative_annotations[pid]
                n_pos = len(id2positive_annotations[pid])
                n_neg = len(id2negative_annotations[pid])
                neg_idxs = rng.choice(n_neg, size=n_pos, replace=False)
                negative_annotations_selected = []
                for i in neg_idxs:
                    negative_annotations_selected.append(negative_annotations[i])
                self.id2negative_annotations[pid] = negative_annotations_selected

                self.id2patient_path[pid] = data_path + '/' + pid + self.file_extension
                n_positive += n_pos
                n_negative += n_pos

        print 'n positive', n_positive
        print 'n negative', n_negative

        self.nsamples = len(self.id2patient_path)
        self.data_path = data_path
        self.rng = rng
        self.data_prep_fun = data_prep_fun
        self.transform_params = transform_params

    def generate(self):

        for pid in self.id2positive_annotations.iterkeys():
            for patch_center in self.id2positive_annotations[pid]:
                patient_path = self.id2patient_path[pid]

                img, origin, pixel_spacing = utils_lung.read_pkl(patient_path) \
                    if self.file_extension == '.pkl' else utils_lung.read_mhd(patient_path)
                y_batch = np.array([[float(patch_center[-1])]], dtype='float32')
                x_batch = np.float32(self.data_prep_fun(data=img,
                                                        patch_center=patch_center,
                                                        pixel_spacing=pixel_spacing,
                                                        luna_origin=origin))[None, None, :, :, :]

                yield x_batch, y_batch, [pid]

            for patch_center in self.id2negative_annotations[pid]:
                patient_path = self.id2patient_path[pid]

                img, origin, pixel_spacing = utils_lung.read_pkl(patient_path) \
                    if self.file_extension == '.pkl' else utils_lung.read_mhd(patient_path)
                y_batch = np.array([[0.]], dtype='float32')
                x_batch = np.float32(self.data_prep_fun(data=img,
                                                        patch_center=patch_center,
                                                        pixel_spacing=pixel_spacing,
                                                        luna_origin=origin))[None, None, :, :, :]

                yield x_batch, y_batch, [pid]



class CandidatesLunaSizeBinDataGenerator(object):
    def __init__(self, data_path, batch_size, transform_params, patient_ids, data_prep_fun, rng,
                 full_batch, random, infinite, positive_proportion, bin_borders = [4,8,20,50], **kwargs):

        id2positive_annotations = utils_lung.read_luna_annotations(pathfinder.LUNA_LABELS_PATH)
        id2negative_annotations = utils_lung.read_luna_negative_candidates(pathfinder.LUNA_CANDIDATES_PATH)

        self.file_extension = '.pkl' if 'pkl' in data_path else '.mhd'
        self.id2positive_annotations = {}
        self.id2negative_annotations = {}
        self.patient_paths = []
        n_positive, n_negative = 0, 0
        for pid in patient_ids:
            if pid in id2positive_annotations:
                self.id2positive_annotations[pid] = id2positive_annotations[pid]
                self.id2negative_annotations[pid] = id2negative_annotations[pid]
                self.patient_paths.append(data_path + '/' + pid + self.file_extension)
                n_positive += len(id2positive_annotations[pid])
                n_negative += len(id2negative_annotations[pid])

        print 'n positive', n_positive
        print 'n negative', n_negative

        self.nsamples = len(self.patient_paths)

        print 'n patients', self.nsamples
        self.data_path = data_path
        self.batch_size = batch_size
        self.rng = rng
        self.full_batch = full_batch
        self.random = random
        self.infinite = infinite
        self.data_prep_fun = data_prep_fun
        self.transform_params = transform_params
        self.positive_proportion = positive_proportion
        self.bin_borders = bin_borders

    def generate(self):
        while True:
            rand_idxs = np.arange(self.nsamples)
            if self.random:
                self.rng.shuffle(rand_idxs)
            for pos in xrange(0, len(rand_idxs), self.batch_size):
                idxs_batch = rand_idxs[pos:pos + self.batch_size]
                nb = len(idxs_batch)
                # allocate batches
                x_batch = np.zeros((nb,) + self.transform_params['patch_size'], dtype='float32')
                y_batch = np.zeros((nb,), dtype='float32')
                patients_ids = []

                for i, idx in enumerate(idxs_batch):
                    patient_path = self.patient_paths[idx]

                    id = utils_lung.extract_pid_filename(patient_path, self.file_extension)
                    patients_ids.append(id)

                    img, origin, pixel_spacing = utils_lung.read_pkl(patient_path) \
                        if self.file_extension == '.pkl' else utils_lung.read_mhd(patient_path)
                    if i < np.rint(self.batch_size * self.positive_proportion):
                        patient_annotations = self.id2positive_annotations[id]
                    else:
                        patient_annotations = self.id2negative_annotations[id]

                    patch_center = patient_annotations[self.rng.randint(len(patient_annotations))]

                    diameter = patch_center[-1]
                    if diameter > 0.:
                        ybin = 0
                        for idx, border in enumerate(self.bin_borders):
                            if diameter<border:
                                ybin = idx
                                break                            
                        y_batch[i] = 1. + ybin
                    else:
                        y_batch[i] = 0. 
                    #print 'y_batch[i]', y_batch[i], 'diameter', diameter

                    x_batch[i, :, :, :] = self.data_prep_fun(data=img,
                                                                patch_center=patch_center,
                                                                pixel_spacing=pixel_spacing,
                                                                luna_origin=origin)

                if self.full_batch:
                    if nb == self.batch_size:
                        yield x_batch, y_batch, patients_ids
                else:
                    yield x_batch, y_batch, patients_ids

            if not self.infinite:
                break

class CandidatesLunaSizeBinValidDataGenerator(object):
    def __init__(self, data_path, transform_params, patient_ids, data_prep_fun, bin_borders = [4,8,20,50], **kwargs):
        rng = np.random.RandomState(42)  # do not change this!!!

        id2positive_annotations = utils_lung.read_luna_annotations(pathfinder.LUNA_LABELS_PATH)
        id2negative_annotations = utils_lung.read_luna_negative_candidates(pathfinder.LUNA_CANDIDATES_PATH)

        self.file_extension = '.pkl' if 'pkl' in data_path else '.mhd'
        self.id2positive_annotations = {}
        self.id2negative_annotations = {}
        self.id2patient_path = {}
        n_positive, n_negative = 0, 0
        for pid in patient_ids:
            if pid in id2positive_annotations:
                self.id2positive_annotations[pid] = id2positive_annotations[pid]
                negative_annotations = id2negative_annotations[pid]
                n_pos = len(id2positive_annotations[pid])
                n_neg = len(id2negative_annotations[pid])
                neg_idxs = rng.choice(n_neg, size=n_pos, replace=False)
                negative_annotations_selected = []
                for i in neg_idxs:
                    negative_annotations_selected.append(negative_annotations[i])
                self.id2negative_annotations[pid] = negative_annotations_selected

                self.id2patient_path[pid] = data_path + '/' + pid + self.file_extension
                n_positive += n_pos
                n_negative += n_pos

        print 'n positive', n_positive
        print 'n negative', n_negative

        self.nsamples = len(self.id2patient_path)
        self.data_path = data_path
        self.rng = rng
        self.data_prep_fun = data_prep_fun
        self.transform_params = transform_params
        self.bin_borders = bin_borders

    def generate(self):

        for pid in self.id2positive_annotations.iterkeys():
            for patch_center in self.id2positive_annotations[pid]:
                patient_path = self.id2patient_path[pid]

                img, origin, pixel_spacing = utils_lung.read_pkl(patient_path) \
                    if self.file_extension == '.pkl' else utils_lung.read_mhd(patient_path)

                diameter = patch_center[3]                        
                ybin = 0
                for idx, border in enumerate(self.bin_borders):
                    if diameter<border:
                        ybin = idx
                        break  

                y_batch = np.array([1. + ybin], dtype='float32')
                x_batch = np.float32(self.data_prep_fun(data=img,
                                                        patch_center=patch_center,
                                                        pixel_spacing=pixel_spacing,
                                                        luna_origin=origin))[None, :, :, :]

                yield x_batch, y_batch, [pid]

            for patch_center in self.id2negative_annotations[pid]:
                patient_path = self.id2patient_path[pid]

                img, origin, pixel_spacing = utils_lung.read_pkl(patient_path) \
                    if self.file_extension == '.pkl' else utils_lung.read_mhd(patient_path)
                y_batch = np.array([0.], dtype='float32')
                x_batch = np.float32(self.data_prep_fun(data=img,
                                                        patch_center=patch_center,
                                                        pixel_spacing=pixel_spacing,
                                                        luna_origin=origin))[None, :, :, :]

                yield x_batch, y_batch, [pid]



class CandidatesLunaPropsDataGenerator(object):
    def __init__(self, data_path, batch_size, transform_params, patient_ids, data_prep_fun, rng,
                 full_batch, random, infinite, 
                 positive_proportion,
                 order_objectives,
                 property_type,
                 property_bin_borders = None,
                 return_enable_target_vector = False, **kwargs):

        id2positive_annotations = utils_lung.read_luna_annotations(pathfinder.LUNA_LABELS_PATH)
        id2negative_annotations = utils_lung.read_luna_negative_candidates(pathfinder.LUNA_CANDIDATES_PATH)

        self.file_extension = '.pkl' if 'pkl' in data_path else '.mhd'
        self.id2positive_annotations = {}
        self.id2negative_annotations = {}
        self.all_pids = patient_ids
        self.pos_pids = []
        self.neg_pids = []
        n_positive, n_negative = 0, 0
        for pid in patient_ids:
            if pid in id2positive_annotations:
                self.id2positive_annotations[pid] = id2positive_annotations[pid]
                self.id2negative_annotations[pid] = id2negative_annotations[pid]
                self.pos_pids.append(pid)
                n_positive += len(id2positive_annotations[pid])
                n_negative += len(id2negative_annotations[pid])
            elif pid in id2negative_annotations:
                self.id2negative_annotations[pid] = id2negative_annotations[pid]
                self.neg_pids.append(pid)
                n_negative += len(id2negative_annotations[pid])
            else:
                print 'WARNING something weird happens'

        print 'n positive', n_positive
        print 'n negative', n_negative

        self.n_neg_cans = n_negative
        self.n_pos_cans = n_positive

        self.n_pos_pids = len(self.pos_pids)
        self.n_neg_pids = len(self.neg_pids)

        self.nsamples = len(self.all_pids)
        print 'n patients', self.nsamples
        self.data_path = data_path
        self.batch_size = batch_size
        self.rng = rng
        self.full_batch = full_batch
        self.random = random
        self.infinite = infinite
        self.data_prep_fun = data_prep_fun
        self.transform_params = transform_params
        self.positive_proportion = positive_proportion

        self.order_objectives = order_objectives
        self.property_bin_borders = property_bin_borders
	self.property_type = property_type
        #self.return_enable_target_vector = return_enable_target_vector

    def L2(self, a,b):
        return ((a[0]-b[0])**2 + (a[1]-b[1])**2 + (a[2]-b[2])**2)**(0.5)

    def build_ground_truth_vector(self, pid, patch_center):
        properties={}
        feature_vector = np.zeros((len(self.order_objectives)), dtype='float32')
        enable_target_vector = np.zeros((len(self.order_objectives)), dtype='float32')
        diameter = patch_center[-1]
        is_nodule  = diameter>0.01
        properties['nodule'] = np.float32(is_nodule)
        if is_nodule:
            if 'size' in self.property_bin_borders:
                properties['size'] = np.digitize(diameter, self.property_bin_borders['size'])
            else:
                properties['size'] = diameter
            
            patient = utils_lung.read_patient_annotations_luna(pid, pathfinder.LUNA_NODULE_ANNOTATIONS_PATH)

            #find the nodules in the doctor's annotations
            nodule_characteristics = []
            for doctor in patient:
                for nodule in doctor:
                    if "centroid_xyz" in nodule:
                        dist = self.L2(patch_center[:3],nodule["centroid_xyz"][::-1])
                        if  dist < 5:
                            #print 'found a very close nodule at', dist, ': ', patch_center[:3]
                            nodule_characteristics.append(nodule['characteristics'])

            if len(nodule_characteristics)==0:
                print 'WARNING: no nodule found in doctor annotations for ', patch_center
            else:
                #calculate the median property values
                for prop in nodule_characteristics[0]:
                    if prop in self.order_objectives:
                        prop_values = []
                        for nchar in nodule_characteristics:
                            prop_values.append(float(nchar[prop]))
                            random_value = self.rng.choice(np.array(prop_values))
                            if prop in self.property_bin_borders:
                                properties[prop] = np.digitize(random_value, self.property_bin_borders[prop])
                            else:      
                                if self.property_type:
                                    if self.property_type[prop] == 'bounded_continuous':
                                        properties[prop] = (random_value-1) / 4.
                                    else:
                                        properties[prop] = random_value-1
                                else:
                                    raise

        for idx, prop in enumerate(self.order_objectives):
            if prop in properties:
                feature_vector[idx] = properties[prop]
                enable_target_vector[idx] = 1.
            
        return feature_vector, enable_target_vector

    def generate(self):
        while True:
            # Construct pid set with
            rand_pos_idxs = np.arange(self.n_pos_pids)
            rand_neg_idxs = np.arange(self.n_neg_pids)
            ptr_pos_idcs = 0
            ptr_neg_idcs = 0

            if self.random:
                self.rng.shuffle(rand_pos_idxs)
                self.rng.shuffle(rand_neg_idxs)

            n_pos_batch = int(np.rint(self.batch_size * self.positive_proportion))
            n_neg_batch = self.batch_size - n_pos_batch
            for _idx, pos_pos in enumerate(xrange(0, len(rand_pos_idxs), n_pos_batch)):
                pos_idxs_batch = rand_pos_idxs[pos_pos:pos_pos + n_pos_batch]
                neg_idxs_batch = rand_neg_idxs[_idx * n_neg_batch:(_idx+1) * n_neg_batch]

                nb = len(pos_idxs_batch) + len(neg_idxs_batch)
                # allocate batches
                x_batch = np.zeros((nb,) + self.transform_params['patch_size'], dtype='float32')
                y_batch = np.zeros((nb, len(self.order_objectives)), dtype='float32')
                z_batch = np.zeros((nb, len(self.order_objectives)), dtype='float32')
                patients_ids = []

                batch_ptr = 0
                for idx in pos_idxs_batch:
                    pid  = self.pos_pids[idx]
                    patient_path = self.data_path + '/' + pid + self.file_extension
                    patients_ids.append(pid)

                    img, origin, pixel_spacing = utils_lung.read_pkl(patient_path) \
                        if self.file_extension == '.pkl' else utils_lung.read_mhd(patient_path)

                    patient_annotations = self.id2positive_annotations[pid]
                    patch_center = patient_annotations[self.rng.randint(len(patient_annotations))]

                    y_batch[batch_ptr], z_batch[batch_ptr] = self.build_ground_truth_vector(pid, patch_center)
                    x_batch[batch_ptr, :, :, :] = self.data_prep_fun(data=img,
                                                                patch_center=patch_center,
                                                                pixel_spacing=pixel_spacing,
                                                                luna_origin=origin)
                    batch_ptr += 1

                for idx in neg_idxs_batch:
                    pid  = self.neg_pids[idx]
                    patient_path = self.data_path + '/' + pid + self.file_extension
                    patients_ids.append(pid)

                    img, origin, pixel_spacing = utils_lung.read_pkl(patient_path) \
                        if self.file_extension == '.pkl' else utils_lung.read_mhd(patient_path)

                    patient_annotations = self.id2negative_annotations[pid]
                    patch_center = patient_annotations[self.rng.randint(len(patient_annotations))]

                    y_batch[batch_ptr], z_batch[batch_ptr] = self.build_ground_truth_vector(pid, patch_center)
                    x_batch[batch_ptr, :, :, :] = self.data_prep_fun(data=img,
                                                                patch_center=patch_center,
                                                                pixel_spacing=pixel_spacing,
                                                                luna_origin=origin)
                    batch_ptr += 1


                if self.full_batch:
                    if nb == self.batch_size:
                        yield x_batch, y_batch, z_batch, patients_ids
                else:
                    yield x_batch, y_batch, z_batch, patients_ids

            if not self.infinite:
                break


class CandidatesLunaPropsValidDataGenerator(object):
    def __init__(self, data_path, transform_params, patient_ids, data_prep_fun, 
                    order_objectives, property_type, property_bin_borders=None, **kwargs):
        rng = np.random.RandomState(42)  # do not change this!!!

        id2positive_annotations = utils_lung.read_luna_annotations(pathfinder.LUNA_LABELS_PATH)
        id2negative_annotations = utils_lung.read_luna_negative_candidates(pathfinder.LUNA_CANDIDATES_PATH)

        self.file_extension = '.pkl' if 'pkl' in data_path else '.mhd'
        self.id2positive_annotations = {}
        self.id2negative_annotations = {}
        self.id2patient_path = {}
        n_positive, n_negative = 0, 0
        for pid in patient_ids:
            if pid in id2positive_annotations:
                self.id2positive_annotations[pid] = id2positive_annotations[pid]
                negative_annotations = id2negative_annotations[pid]
                n_pos = len(id2positive_annotations[pid])
                n_neg = len(id2negative_annotations[pid])
                neg_idxs = rng.choice(n_neg, size=n_pos, replace=False)
                negative_annotations_selected = []
                for i in neg_idxs:
                    negative_annotations_selected.append(negative_annotations[i])
                self.id2negative_annotations[pid] = negative_annotations_selected

                self.id2patient_path[pid] = data_path + '/' + pid + self.file_extension
                n_positive += n_pos
                n_negative += n_pos

        print 'n positive', n_positive
        print 'n negative', n_negative

        self.nsamples = len(self.id2patient_path)
        self.data_path = data_path
        self.rng = rng
        self.data_prep_fun = data_prep_fun
        self.transform_params = transform_params

        self.order_objectives = order_objectives
        self.property_bin_borders = property_bin_borders
        self.property_type = property_type
    

    def L2(self, a,b):
        return ((a[0]-b[0])**2 + (a[1]-b[1])**2 + (a[2]-b[2])**2)**(0.5)


    def build_ground_truth_vector(self, pid, patch_center):
        properties={}
        feature_vector = np.zeros((len(self.order_objectives)), dtype='float32')
        enable_target_vector = np.zeros((len(self.order_objectives)), dtype='float32')
        diameter = patch_center[-1]
        is_nodule  = diameter>0.01
        properties['nodule'] = np.float32(is_nodule)
        if is_nodule:
            if 'size' in self.property_bin_borders:
                properties['size'] = np.digitize(diameter, self.property_bin_borders['size'])
            else:
                properties['size'] = diameter
            
            patient = utils_lung.read_patient_annotations_luna(pid, pathfinder.LUNA_NODULE_ANNOTATIONS_PATH)

            #find the nodules in the doctor's annotations
            nodule_characteristics = []
            for doctor in patient:
                for nodule in doctor:
                    if "centroid_xyz" in nodule:
                        dist = self.L2(patch_center[:3],nodule["centroid_xyz"][::-1])
                        if  dist < 5:
                            #print 'found a very close nodule at', dist, ': ', patch_center[:3]
                            nodule_characteristics.append(nodule['characteristics'])

            if len(nodule_characteristics)==0:
                print 'WARNING: no nodule found in doctor annotations for ', patch_center
            else:
                #calculate the median property values
                for prop in nodule_characteristics[0]:
                    if prop in self.order_objectives:
                        prop_values = []
                        for nchar in nodule_characteristics:
                            prop_values.append(float(nchar[prop]))
                        if prop in self.property_bin_borders:
                            median_value = np.median(np.array(prop_values))
                            properties[prop] = np.digitize(median_value, self.property_bin_borders[prop])
                        else:
                            mean_value = np.mean(np.array(prop_values))
                            if self.property_type:
                                if self.property_type[prop] == 'bounded_continuous':
                                    properties[prop] = (mean_value-1) / 4.
                                else:
                                    properties[prop] = mean_value-1
                            else:
                                raise

        for idx, prop in enumerate(self.order_objectives):
            if prop in properties:
                feature_vector[idx] = properties[prop]
                enable_target_vector[idx] = 1.
            
        return feature_vector, enable_target_vector


    def generate(self):

        for pid in self.id2positive_annotations.iterkeys():
            for patch_center in self.id2positive_annotations[pid]:
                patient_path = self.id2patient_path[pid]

                img, origin, pixel_spacing = utils_lung.read_pkl(patient_path) \
                    if self.file_extension == '.pkl' else utils_lung.read_mhd(patient_path)

                x_batch = np.float32(self.data_prep_fun(data=img,
                                                        patch_center=patch_center,
                                                        pixel_spacing=pixel_spacing,
                                                        luna_origin=origin))[None, :, :, :]

                feature_vector, enable_target_vector = self.build_ground_truth_vector(pid, patch_center)
                y_batch = np.array([feature_vector], dtype='float32')
                z_batch = np.array([enable_target_vector], dtype='float32')

                yield x_batch, y_batch, z_batch, [pid]

            for patch_center in self.id2negative_annotations[pid]:
                patient_path = self.id2patient_path[pid]

                img, origin, pixel_spacing = utils_lung.read_pkl(patient_path) \
                    if self.file_extension == '.pkl' else utils_lung.read_mhd(patient_path)

                x_batch = np.float32(self.data_prep_fun(data=img,
                                                        patch_center=patch_center,
                                                        pixel_spacing=pixel_spacing,
                                                        luna_origin=origin))[None, :, :, :]

                feature_vector, enable_target_vector = self.build_ground_truth_vector(pid, patch_center)
                y_batch = np.array([feature_vector], dtype='float32')
                z_batch = np.array([enable_target_vector], dtype='float32')

                yield x_batch, y_batch, z_batch, [pid]


class DSBScanDataGenerator(object):
    """Yield every patient scan found under ``data_path`` as one prepared volume."""

    def __init__(self, data_path, transform_params, data_prep_fun, **kwargs):
        """
        :param data_path: directory handed to ``utils_lung.get_patient_data_paths``
        :param transform_params: stored on the instance, not used by this class
        :param data_prep_fun: callable taking data and pixel_spacing keyword
            arguments and returning ``(volume, tf_matrix)``
        """
        self.data_path = data_path
        self.transform_params = transform_params
        self.data_prep_fun = data_prep_fun
        self.patient_paths = utils_lung.get_patient_data_paths(data_path)
        self.nsamples = len(self.patient_paths)

    def generate(self):
        """Yield ``(x, None, tf_matrix, pid)`` per patient directory.

        ``x`` is the prepared volume as float32 with two leading singleton axes.
        """
        for patient_dir in self.patient_paths:
            pid = utils_lung.extract_pid_dir(patient_dir)

            scan, spacing = utils_lung.read_dicom_scan(patient_dir)
            volume, tf_matrix = self.data_prep_fun(data=scan, pixel_spacing=spacing)

            yield np.float32(volume)[None, None, :, :, :], None, tf_matrix, pid


class DSBScanLungMaskDataGenerator(object):
    """Single pass over patient scans, yielding the scan together with a lung mask.

    The discovered patient paths are first cut down to part ``part_out_of[0]``
    of ``part_out_of[1]`` equally sized parts (the last part also takes the
    remainder), then ``exclude_pids`` are removed. When ``include_pids`` is
    given it replaces the resulting list entirely.
    """

    def __init__(self, data_path, transform_params, data_prep_fun, exclude_pids=None,
                 include_pids=None, part_out_of=(1, 1)):

        discovered = utils_lung.get_patient_data_paths(data_path)

        this_part = part_out_of[0]
        all_parts = part_out_of[1]
        part_length = int(len(discovered) / all_parts)

        first = part_length * (this_part - 1)
        # An open-ended slice for the final part picks up the leftover paths.
        last = None if this_part == all_parts else part_length * this_part
        self.patient_paths = discovered[first:last]

        if exclude_pids is not None:
            for excluded in exclude_pids:
                # Only the first path containing the id (substring match) is dropped.
                for position, path in enumerate(self.patient_paths):
                    if excluded in path:
                        del self.patient_paths[position]
                        break

        if include_pids is not None:
            # An explicit include list overrides partitioning and exclusion.
            self.patient_paths = [data_path + '/' + pid for pid in include_pids]

        self.nsamples = len(self.patient_paths)
        self.data_path = data_path
        self.data_prep_fun = data_prep_fun
        self.transform_params = transform_params

    def generate(self):
        """Yield ``(x, lung_mask, tf_matrix, pid)`` for each selected patient."""
        for scan_path in self.patient_paths:
            patient_id = utils_lung.extract_pid_dir(scan_path)
            volume, spacing = utils_lung.read_dicom_scan(scan_path)

            prepared, mask, tf_matrix = self.data_prep_fun(data=volume, pixel_spacing=spacing)

            # Both arrays get two leading singleton axes.
            scan_batch = np.float32(prepared)[None, None, :, :, :]
            mask_batch = np.float32(mask)[None, None, :, :, :]
            yield scan_batch, mask_batch, tf_matrix, patient_id


class CandidatesDSBDataGenerator(object):
    """Yield one patch per nodule candidate, for every patient with a candidates file.

    Args:
        data_path: directory holding one scan directory per patient id.
        transform_params: stored on the instance; not used by this class itself.
        id2candidates_path: mapping patient id -> path of the pickled candidates.
        data_prep_fun: callable invoked as
            ``data_prep_fun(data=..., patch_center=..., pixel_spacing=...)``.
        exclude_pids: optional iterable of patient ids to leave out.

    The caller's ``id2candidates_path`` is never modified: exclusions are
    applied to a copy (the mapping is stored as-is when nothing is excluded).
    """

    def __init__(self, data_path, transform_params, id2candidates_path, data_prep_fun, exclude_pids=None):
        if exclude_pids is not None:
            # Copy first so popping excluded ids does not leak into the caller's dict.
            id2candidates_path = id2candidates_path.copy()
            for p in exclude_pids:
                id2candidates_path.pop(p, None)

        self.id2candidates_path = id2candidates_path
        self.id2patient_path = {pid: data_path + '/' + pid for pid in id2candidates_path}

        self.nsamples = len(self.id2patient_path)
        self.data_path = data_path
        self.data_prep_fun = data_prep_fun
        self.transform_params = transform_params

    def generate(self):
        """Yield ``(x_batch, y_batch, [pid])`` for every candidate of every patient.

        ``y_batch`` is the candidate row as float32; its first three entries
        are passed to ``data_prep_fun`` as the patch center.
        """
        for pid in self.id2candidates_path:
            patient_path = self.id2patient_path[pid]
            print('%s %s' % (pid, patient_path))
            img, pixel_spacing = utils_lung.read_dicom_scan(patient_path)

            print(self.id2candidates_path[pid])
            candidates = utils.load_pkl(self.id2candidates_path[pid])
            print(candidates.shape)
            for candidate in candidates:
                y_batch = np.array(candidate, dtype='float32')
                patch_center = candidate[:3]
                # Prepend a singleton axis to the prepared patch.
                x_batch = np.float32(self.data_prep_fun(data=img,
                                                        patch_center=patch_center,
                                                        pixel_spacing=pixel_spacing))[None, :, :, :]

                yield x_batch, y_batch, [pid]




class CandidatesDSBDataGeneratorTTA(object):
    """Per-candidate patch generator that prepares each candidate ``tta`` times.

    Args:
        data_path: directory holding one scan directory per patient id.
        transform_params: stored on the instance; not used by this class itself.
        id2candidates_path: mapping patient id -> path of the pickled candidates.
        data_prep_fun: callable invoked as
            ``data_prep_fun(data=..., patch_center=..., pixel_spacing=...)``.
        exclude_pids: optional iterable of patient ids to leave out.
        tta: number of times ``data_prep_fun`` is called per candidate
            (presumably test-time augmentation -- confirm with the prep function).

    The caller's ``id2candidates_path`` is never modified: exclusions are
    applied to a copy (the mapping is stored as-is when nothing is excluded).
    """

    def __init__(self, data_path, transform_params, id2candidates_path, data_prep_fun, exclude_pids=None, tta=64):
        if exclude_pids is not None:
            # Copy first so popping excluded ids does not leak into the caller's dict.
            id2candidates_path = id2candidates_path.copy()
            for p in exclude_pids:
                id2candidates_path.pop(p, None)

        self.id2candidates_path = id2candidates_path
        self.id2patient_path = {pid: data_path + '/' + pid for pid in id2candidates_path}

        self.nsamples = len(self.id2patient_path)
        self.data_path = data_path
        self.data_prep_fun = data_prep_fun
        self.transform_params = transform_params
        self.tta = tta

    def generate(self):
        """Yield ``(x_batch, y_batch, [pid])`` per candidate.

        ``x_batch`` stacks the ``tta`` prepared patches along a new first axis;
        ``y_batch`` is the candidate row as float32.
        """
        for pid in self.id2candidates_path:
            patient_path = self.id2patient_path[pid]
            print('%s %s' % (pid, patient_path))
            img, pixel_spacing = utils_lung.read_dicom_scan(patient_path)

            print(self.id2candidates_path[pid])
            candidates = utils.load_pkl(self.id2candidates_path[pid])
            print(candidates.shape)
            for candidate in candidates:
                y_batch = np.array(candidate, dtype='float32')
                patch_center = candidate[:3]
                batch = [np.float32(self.data_prep_fun(data=img,
                                                       patch_center=patch_center,
                                                       pixel_spacing=pixel_spacing))
                         for _ in range(self.tta)]
                x_batch = np.stack(batch)
                print(x_batch.shape)

                yield x_batch, y_batch, [pid]


class DSBFeatureDataGenerator(object):
    """Batches of precomputed per-patient features loaded from ``<data_path>/<pid>.pkl``.

    Args:
        data_path: directory containing one ``<pid>.pkl`` feature file per patient.
        batch_size: number of patients per yielded batch.
        p_features: dict with key ``'output_shape'`` (per-patient feature shape)
            and optional keys ``'reshape'`` / ``'swapaxes'`` applied, in that
            order, to the loaded features.
        rng: random generator whose ``shuffle`` is used when ``random`` is true.
        random: shuffle the patient order at the start of every pass.
        infinite: keep cycling over the data instead of stopping after one pass.
        patient_ids: required iterable of patient ids.

    Raises:
        ValueError: if ``patient_ids`` is None.
    """

    def __init__(self, data_path, batch_size, p_features,
                 rng, random, infinite, patient_ids=None):

        print('init DSBFeatureDataGenerator')

        self.id2label = utils_lung.read_labels(pathfinder.LABELS_PATH)
        if patient_ids is None:
            raise ValueError('provide patient ids')
        self.patient_paths = [data_path + '/' + pid for pid in patient_ids]

        self.nsamples = len(self.patient_paths)
        self.data_path = data_path
        self.batch_size = batch_size

        self.p_features = p_features
        self.rng = rng
        self.random = random
        self.infinite = infinite

    def generate(self):
        """Yield ``(x_batch, y_batch, pids_batch)`` for every full batch.

        Only full batches are yielded; an incomplete trailing batch is skipped
        before any feature file is read.
        """
        while True:
            rand_idxs = np.arange(self.nsamples)
            if self.random:
                self.rng.shuffle(rand_idxs)

            for pos in range(0, len(rand_idxs), self.batch_size):
                idxs_batch = rand_idxs[pos:pos + self.batch_size]
                if len(idxs_batch) != self.batch_size:
                    # Never yielded, so do not waste time loading it.
                    continue

                x_batch = np.zeros((self.batch_size,)
                                   + self.p_features['output_shape'], dtype='float32')
                y_batch = np.zeros((self.batch_size,), dtype='float32')
                pids_batch = []

                for i, idx in enumerate(idxs_batch):
                    patient_path = self.patient_paths[idx]
                    pid = utils_lung.extract_pid_dir(patient_path)

                    t_features = utils.load_pkl(patient_path + '.pkl')
                    if 'reshape' in self.p_features:
                        t_features = np.reshape(t_features, self.p_features['reshape'])
                    if 'swapaxes' in self.p_features:
                        t_features = np.swapaxes(t_features, *self.p_features['swapaxes'])

                    x_batch[i] = t_features
                    # .get() yields None for an id without a label.
                    y_batch[i] = self.id2label.get(pid)
                    pids_batch.append(pid)

                yield x_batch, y_batch, pids_batch

            if not self.infinite:
                break

class DSBPatientsDataGenerator(object):
    """Patient-level batches built from each patient's top nodule candidates.

    Only patients that have an entry in ``id2candidates_path`` are kept.
    ``generate`` yields ``(x_batch, y_batch, pids_batch)`` or, when
    ``return_patch_locs`` is set, ``(x_batch, x_loc_batch, y_batch, pids_batch)``.
    Batches that cannot be filled completely are built but not yielded.

    Raises:
        ValueError: if ``patient_ids`` is None.
    """

    def __init__(self, data_path, batch_size, transform_params, id2candidates_path, id2label, data_prep_fun,
                 n_candidates_per_patient, rng, random, infinite, candidates_prep_fun, return_patch_locs=False, shuffle_top_n=False, patient_ids=None):

        self.id2label = id2label
        self.id2candidates_path = id2candidates_path
        if patient_ids is None:
            raise ValueError('provide patient ids')

        # TODO: the membership filter should be redundant if fpr and segmentation are correctly generated
        self.patient_paths = [data_path + '/' + pid
                              for pid in patient_ids
                              if pid in self.id2candidates_path]
        self.nsamples = len(self.patient_paths)

        self.data_path = data_path
        self.batch_size = batch_size
        self.transform_params = transform_params
        self.data_prep_fun = data_prep_fun
        self.candidates_prep_fun = candidates_prep_fun
        self.n_candidates_per_patient = n_candidates_per_patient
        self.rng = rng
        self.random = random
        self.infinite = infinite
        self.shuffle_top_n = shuffle_top_n
        self.return_patch_locs = return_patch_locs

    def generate(self):
        """Yield full batches, one pass per loop iteration (endless if ``infinite``)."""
        while True:
            order = np.arange(self.nsamples)
            if self.random:
                self.rng.shuffle(order)

            for start in range(0, len(order), self.batch_size):
                chunk = order[start:start + self.batch_size]

                x_batch = np.zeros((self.batch_size, self.n_candidates_per_patient,)
                                   + self.transform_params['patch_size'], dtype='float32')
                if self.return_patch_locs:
                    x_loc_batch = np.zeros((self.batch_size, self.n_candidates_per_patient, 3),
                                           dtype='float32')
                y_batch = np.zeros((self.batch_size,), dtype='float32')
                pids_batch = []

                for slot, patient_idx in enumerate(chunk):
                    scan_path = self.patient_paths[patient_idx]
                    pid = utils_lung.extract_pid_dir(scan_path)
                    img, pixel_spacing = utils_lung.read_dicom_scan(scan_path)
                    all_candidates = utils.load_pkl(self.id2candidates_path[pid])

                    if self.candidates_prep_fun:
                        chosen = self.candidates_prep_fun(all_candidates, self.n_candidates_per_patient)
                    else:
                        chosen = all_candidates[:self.n_candidates_per_patient]
                        if self.shuffle_top_n:
                            self.rng.shuffle(chosen)

                    if self.return_patch_locs:
                        # TODO move the normalization to the config file
                        x_loc_batch[slot] = np.float32(chosen[:, :3]) / 512.

                    patches = self.data_prep_fun(data=img, pid=pid,
                                                 patch_centers=chosen,
                                                 pixel_spacing=pixel_spacing)
                    x_batch[slot] = np.float32(patches)[:, :, :, :]
                    y_batch[slot] = self.id2label.get(pid)
                    pids_batch.append(pid)

                # A short trailing chunk is dropped rather than yielded.
                if len(chunk) != self.batch_size:
                    continue
                if self.return_patch_locs:
                    yield x_batch, x_loc_batch, y_batch, pids_batch
                else:
                    yield x_batch, y_batch, pids_batch

            if not self.infinite:
                break



class DSBPatientsDataGeneratorTTA(object):
    """Per-patient generator that prepares the same candidates ``tta`` times.

    Only patients with an entry in ``id2candidates_path`` are kept. Each
    yielded item is ``(x_batch, y_batch, pid)`` with ``tta`` entries along the
    first axis of both arrays.

    Raises:
        ValueError: if ``patient_ids`` is None.
    """

    def __init__(self, data_path, transform_params, id2candidates_path, id2label, data_prep_fun, candidates_prep_fun,
                 n_candidates_per_patient, patient_ids, tta=1):

        self.id2label = id2label
        self.id2candidates_path = id2candidates_path
        if patient_ids is None:
            raise ValueError('provide patient ids')

        # TODO: the membership filter should be redundant if fpr and segmentation are correctly generated
        self.patient_paths = [data_path + '/' + pid
                              for pid in patient_ids
                              if pid in self.id2candidates_path]
        self.nsamples = len(self.patient_paths)

        self.data_path = data_path
        self.transform_params = transform_params
        self.data_prep_fun = data_prep_fun
        self.candidates_prep_fun = candidates_prep_fun
        self.n_candidates_per_patient = n_candidates_per_patient
        self.tta = tta

    def generate(self):
        """Yield one ``(x_batch, y_batch, pid)`` triple per patient, in stored order."""
        # Bare Python 2 print statement kept from the original: emits an empty line.
        print
        for patient_idx in range(self.nsamples):
            x_batch = np.zeros((self.tta, self.n_candidates_per_patient,)
                               + self.transform_params['patch_size'], dtype='float32')
            y_batch = np.zeros((self.tta,), dtype='float32')

            scan_path = self.patient_paths[patient_idx]
            pid = utils_lung.extract_pid_dir(scan_path)
            img, pixel_spacing = utils_lung.read_dicom_scan(scan_path)

            all_candidates = utils.load_pkl(self.id2candidates_path[pid])
            if self.candidates_prep_fun:
                chosen = self.candidates_prep_fun(all_candidates, self.n_candidates_per_patient)
            else:
                chosen = all_candidates[:self.n_candidates_per_patient]

            for repeat in range(self.tta):
                # data_prep_fun is re-run for every repeat on the same candidates.
                patches = self.data_prep_fun(data=img,
                                             patch_centers=chosen,
                                             pixel_spacing=pixel_spacing)
                x_batch[repeat] = np.float32(patches)[:, :, :, :]
                y_batch[repeat] = self.id2label.get(pid)

            yield x_batch, y_batch, pid




class DSBPixelSpacingsGenerator(object):
    """Yield ``(pid, pixel_spacing)`` for each patient that has a candidates file.

    Raises:
        ValueError: if ``patient_ids`` is None.
    """

    def __init__(self, data_path, id2candidates_path, patient_ids):

        self.id2candidates_path = id2candidates_path
        if patient_ids is None:
            raise ValueError('provide patient ids')

        # TODO: the membership filter should be redundant if fpr and segmentation are correctly generated
        self.patient_paths = [data_path + '/' + pid
                              for pid in patient_ids
                              if pid in self.id2candidates_path]
        self.nsamples = len(self.patient_paths)
        self.data_path = data_path

    def generate(self):
        """Read every scan and yield only its id and pixel spacing."""
        for patient_idx in range(self.nsamples):
            scan_path = self.patient_paths[patient_idx]
            pid = utils_lung.extract_pid_dir(scan_path)

            # The image itself is read but not used here.
            _, pixel_spacing = utils_lung.read_dicom_scan(scan_path)

            yield pid, pixel_spacing


class DSBPatientsDataGenerator_only_heatmap(object):
    """Patient-level batches where each sample is a heatmap built from candidates.

    Labels are read from ``pathfinder.LABELS_PATH``. Only patients with an
    entry in ``id2candidates_path`` are kept. ``return_patch_locs`` is accepted
    but not used by this class. Batches that cannot be filled completely are
    built but not yielded.

    Raises:
        ValueError: if ``patient_ids`` is None.
    """

    def __init__(self, data_path, batch_size, transform_params, id2candidates_path, data_prep_fun, 
                 n_candidates_per_patient, rng, random, infinite, candidates_prep_fun, return_patch_locs=False, shuffle_top_n=False, patient_ids=None):

        self.id2label = utils_lung.read_labels(pathfinder.LABELS_PATH)
        self.id2candidates_path = id2candidates_path
        if patient_ids is None:
            raise ValueError('provide patient ids')

        # TODO: the membership filter should be redundant if fpr and segmentation are correctly generated
        self.patient_paths = [data_path + '/' + pid
                              for pid in patient_ids
                              if pid in self.id2candidates_path]
        self.nsamples = len(self.patient_paths)

        self.data_path = data_path
        self.batch_size = batch_size
        self.transform_params = transform_params
        self.data_prep_fun = data_prep_fun
        self.candidates_prep_fun = candidates_prep_fun
        self.n_candidates_per_patient = n_candidates_per_patient
        self.rng = rng
        self.random = random
        self.infinite = infinite
        self.shuffle_top_n = shuffle_top_n

    def generate(self):
        """Yield ``(x_batch, y_batch, pids_batch)`` for every full batch."""
        while True:
            order = np.arange(self.nsamples)
            if self.random:
                self.rng.shuffle(order)

            for start in range(0, len(order), self.batch_size):
                chunk = order[start:start + self.batch_size]

                x_batch = np.zeros((self.batch_size,)
                                   + self.transform_params['heatmap_size'], dtype='float32')
                y_batch = np.zeros((self.batch_size,), dtype='float32')
                pids_batch = []

                for slot, patient_idx in enumerate(chunk):
                    scan_path = self.patient_paths[patient_idx]
                    pid = utils_lung.extract_pid_dir(scan_path)
                    img, pixel_spacing = utils_lung.read_dicom_scan(scan_path)

                    all_candidates = utils.load_pkl(self.id2candidates_path[pid])
                    candidates_w_value = self.candidates_prep_fun(all_candidates)

                    heatmap = self.data_prep_fun(data=img,
                                                 candidates=candidates_w_value,
                                                 pixel_spacing=pixel_spacing)
                    x_batch[slot] = np.float32(heatmap)
                    y_batch[slot] = self.id2label.get(pid)
                    pids_batch.append(pid)

                # A short trailing chunk is dropped rather than yielded.
                if len(chunk) == self.batch_size:
                    yield x_batch, y_batch, pids_batch

            if not self.infinite:
                break


class DSBPatientsDataGeneratorRandomSelectionNonCancerous(object):
    """Patient-level batches with randomised candidate choice for non-cancer patients.

    For a patient with a truthy label the first ``n_candidates_per_patient``
    candidates are used. Otherwise ``n_candidates_per_patient`` candidates are
    drawn at random from the first ``top_false`` candidates. ``top_true`` is
    stored but not used by this class. Labels are read from
    ``pathfinder.LABELS_PATH``. Batches that cannot be filled completely are
    built but not yielded.

    Raises:
        ValueError: if ``patient_ids`` is None.
    """

    def __init__(self, data_path, batch_size, transform_params, id2candidates_path, data_prep_fun,
                 n_candidates_per_patient, rng, random, infinite, top_true=10, top_false=16, shuffle_top_n=False, patient_ids=None):

        self.id2label = utils_lung.read_labels(pathfinder.LABELS_PATH)
        self.id2candidates_path = id2candidates_path
        self.patient_paths = []
        if patient_ids is not None:
            for pid in patient_ids:
                if pid in self.id2candidates_path:  # TODO: this should be redundant if fpr and segmentation are correctly generated
                    self.patient_paths.append(data_path + '/' + pid)
        else:
            raise ValueError('provide patient ids')

        self.nsamples = len(self.patient_paths)
        self.data_path = data_path
        self.data_prep_fun = data_prep_fun
        self.batch_size = batch_size
        self.transform_params = transform_params
        self.n_candidates_per_patient = n_candidates_per_patient
        self.rng = rng
        self.random = random
        self.infinite = infinite
        self.shuffle_top_n = shuffle_top_n
        self.top_true = top_true
        self.top_false = top_false

    def _select_candidates(self, all_candidates, label):
        """Pick the candidates to feed to ``data_prep_fun`` for one patient."""
        if label:
            top_candidates = all_candidates[:self.n_candidates_per_patient]
        else:
            # Clamp the pool to the candidates that actually exist; indexing with
            # a permutation of arange(top_false) fails when there are fewer.
            pool_size = min(self.top_false, len(all_candidates))
            selection = np.arange(pool_size)
            self.rng.shuffle(selection)
            selection = selection[:self.n_candidates_per_patient]
            top_candidates = all_candidates[selection]

        if self.shuffle_top_n:
            self.rng.shuffle(top_candidates)
        return top_candidates

    def generate(self):
        """Yield ``(x_batch, y_batch, pids_batch)`` for every full batch."""
        while True:
            rand_idxs = np.arange(self.nsamples)
            if self.random:
                self.rng.shuffle(rand_idxs)

            for pos in range(0, len(rand_idxs), self.batch_size):
                idxs_batch = rand_idxs[pos:pos + self.batch_size]

                x_batch = np.zeros((self.batch_size, self.n_candidates_per_patient, 1,)
                                   + self.transform_params['patch_size'], dtype='float32')
                y_batch = np.zeros((self.batch_size,), dtype='float32')
                pids_batch = []

                for i, idx in enumerate(idxs_batch):
                    patient_path = self.patient_paths[idx]
                    pid = utils_lung.extract_pid_dir(patient_path)

                    img, pixel_spacing = utils_lung.read_dicom_scan(patient_path)
                    all_candidates = utils.load_pkl(self.id2candidates_path[pid])

                    # A missing label (None) is falsy and takes the random-selection path.
                    label = self.id2label.get(pid)
                    top_candidates = self._select_candidates(all_candidates, label)

                    # Insert a singleton axis after the candidate axis.
                    x_batch[i] = np.float32(self.data_prep_fun(data=img,
                                                               patch_centers=top_candidates,
                                                               pixel_spacing=pixel_spacing))[:, None, :, :, :]
                    y_batch[i] = label
                    pids_batch.append(pid)

                if len(idxs_batch) == self.batch_size:
                    yield x_batch, y_batch, pids_batch

            if not self.infinite:
                break

# Balance batches between patients with and without cancer.
class BalancedDSBPatientsDataGenerator(object):
    """Patient-level batches mixing cancer and non-cancer patients by coin flip.

    Every pass walks once through the non-cancer patients; before each draw a
    coin flip decides whether to insert a randomly chosen cancer patient
    instead. Labels are read from ``pathfinder.LABELS_PATH``.

    Only patients with an entry in ``id2candidates_path`` are placed in
    ``pos_ids`` / ``neg_ids``, so ``prepare_batch`` never looks up a missing
    candidates file. ``all_pids`` and ``nsamples`` still reflect every id in
    ``patient_ids``, including skipped ones.

    Raises:
        ValueError: if ``patient_ids`` is None.
    """

    def __init__(self, data_path, batch_size, transform_params, id2candidates_path, data_prep_fun,
                 n_candidates_per_patient, rng, random, infinite, shuffle_top_n=False, patient_ids=None):

        self.id2label = utils_lung.read_labels(pathfinder.LABELS_PATH)
        self.id2candidates_path = id2candidates_path
        if patient_ids is None:
            raise ValueError('provide patient ids')

        self.patient_paths = []
        self.pos_ids = []
        self.neg_ids = []
        for pid in patient_ids:
            # TODO: this should be redundant if fpr and segmentation are correctly generated
            if pid not in self.id2candidates_path:
                continue
            self.patient_paths.append(data_path + '/' + pid)
            if self.id2label[pid]:
                self.pos_ids.append(pid)
            else:
                self.neg_ids.append(pid)
        self.n_pos_ids = len(self.pos_ids)
        self.n_neg_ids = len(self.neg_ids)
        print('n positive ids %s' % self.n_pos_ids)
        print('n negative ids %s' % self.n_neg_ids)
        self.all_pids = patient_ids
        self.nsamples = len(self.all_pids)

        self.data_path = data_path
        self.data_prep_fun = data_prep_fun
        self.batch_size = batch_size
        self.transform_params = transform_params
        self.n_candidates_per_patient = n_candidates_per_patient
        self.rng = rng
        self.random = random
        self.infinite = infinite
        self.shuffle_top_n = shuffle_top_n

    def generate(self):
        """Yield prepared batches; the last batch of a pass may be smaller."""
        while True:
            neg_rand_idxs = np.arange(self.n_neg_ids)
            if self.random:
                self.rng.shuffle(neg_rand_idxs)
            neg_rand_idxs_ptr = 0
            batch_pids = []
            # A pass ends once every non-cancer patient has been used once.
            while neg_rand_idxs_ptr < self.n_neg_ids:
                if self.rng.randint(2):
                    # take a cancerous patient
                    batch_pids.append(self.rng.choice(self.pos_ids))
                else:
                    batch_pids.append(self.neg_ids[neg_rand_idxs[neg_rand_idxs_ptr]])
                    neg_rand_idxs_ptr += 1
                if len(batch_pids) == self.batch_size:
                    yield self.prepare_batch(batch_pids)
                    batch_pids = []
            # yield the half filled batch
            if len(batch_pids) > 0:
                yield self.prepare_batch(batch_pids)

            if not self.infinite:
                break

    def prepare_batch(self, batch_pids):
        """Load scans and candidates for ``batch_pids``.

        Returns:
            ``(x_batch, y_batch, batch_pids)`` with one entry per given id.
        """
        x_batch = np.zeros((len(batch_pids), self.n_candidates_per_patient, 1,)
                           + self.transform_params['patch_size'], dtype='float32')
        y_batch = np.zeros((len(batch_pids),), dtype='float32')
        for i, pid in enumerate(batch_pids):
            patient_path = self.data_path + '/' + str(pid)
            img, pixel_spacing = utils_lung.read_dicom_scan(patient_path)
            all_candidates = utils.load_pkl(self.id2candidates_path[pid])
            top_candidates = all_candidates[:self.n_candidates_per_patient]
            if self.shuffle_top_n:
                self.rng.shuffle(top_candidates)
            # Insert a singleton axis after the candidate axis.
            x_batch[i] = np.float32(self.data_prep_fun(data=img,
                                                       patch_centers=top_candidates,
                                                       pixel_spacing=pixel_spacing))[:, None, :, :, :]
            y_batch[i] = self.id2label.get(pid)
        return x_batch, y_batch, batch_pids

class DSBDataGenerator(object):
    """Single pass over DSB patient scans, yielding ``(x, pid)``.

    Args:
        data_path: directory holding one scan directory per patient id.
        transform_params: stored on the instance; not used by this class itself.
        data_prep_fun: optional callable invoked as
            ``data_prep_fun(data=..., pixel_spacing=...)`` and expected to
            return a pair whose first element is the prepared scan. When
            omitted, the raw scan is yielded.
        patient_pids: optional iterable of patient ids. When omitted, every
            patient found under ``data_path`` is used.

    Extra keyword arguments are accepted and ignored.
    """

    def __init__(self, data_path, transform_params=None, data_prep_fun=None, patient_pids=None, **kwargs):
        if patient_pids is None:
            # No explicit selection: fall back to everything found on disk.
            self.patient_paths = utils_lung.get_patient_data_paths(data_path)
        else:
            self.patient_paths = [data_path + '/' + p for p in patient_pids]

        self.nsamples = len(self.patient_paths)
        self.data_path = data_path
        self.data_prep_fun = data_prep_fun
        self.transform_params = transform_params

    def generate(self):
        """Yield the float32 scan and patient id for each stored path."""
        for p in self.patient_paths:
            pid = utils_lung.extract_pid_dir(p)

            img, pixel_spacing = utils_lung.read_dicom_scan(p)

            if self.data_prep_fun:
                # The second return value of data_prep_fun is not needed here.
                x, _ = self.data_prep_fun(data=img, pixel_spacing=pixel_spacing)
            else:
                x = img

            yield np.float32(x), pid




class CandidatesPropertiesLunaDataGenerator(object):
    """LUNA patch batches with per-nodule property labels.

    Positive samples come from ``pathfinder.LUNA_PROPERTIES_PATH``, negatives
    from ``pathfinder.LUNA_CANDIDATES_PATH``; both are restricted to
    ``patient_ids``. The number of samples is derived from the number of
    positives and ``positive_proportion``.

    Args:
        data_path: directory of patient files; a path containing ``'pkl'``
            selects ``.pkl`` files, anything else ``.mhd``.
        batch_size: maximum number of patches per batch.
        transform_params: dict; must contain ``'patch_size'`` and a
            ``'pixel_spacing'`` equal to ``(1., 1., 1.)``.
        label_prep_fun: called as ``label_prep_fun(annotation, properties_included)``.
        nproperties: label vector length when ``properties_included`` is empty.
        patient_ids: iterable of patient ids to use.
        data_prep_fun: callable producing one patch per annotation.
        rng: random generator providing ``choice``, ``randint`` and ``shuffle``.
        full_batch: when true, only batches of exactly ``batch_size`` are yielded.
        random: shuffle the sample order on every pass.
        infinite: keep cycling instead of stopping after one pass.
        positive_proportion: target fraction of positive samples.
        properties_included: optional list of properties; when non-empty its
            length replaces ``nproperties``. Defaults to an empty list.
        random_negative_samples: when true, negatives are re-drawn at
            generation time instead of being fixed in the constructor.
    """

    def __init__(self, data_path, batch_size, transform_params, label_prep_fun,
                 nproperties,  patient_ids, data_prep_fun, rng,
                 full_batch, random, infinite, positive_proportion, properties_included=None,
                 random_negative_samples=False, **kwargs):

        # None instead of a shared mutable default list.
        if properties_included is None:
            properties_included = []

        id2positive_annotations = utils_lung.read_luna_properties(pathfinder.LUNA_PROPERTIES_PATH)
        id2negative_annotations = utils_lung.read_luna_negative_candidates(pathfinder.LUNA_CANDIDATES_PATH)

        self.file_extension = '.pkl' if 'pkl' in data_path else '.mhd'
        self.id2positive_annotations = {}
        self.id2negative_annotations = {}
        self.pid2patient_path = {}
        n_positive = 0
        for pid in patient_ids:
            self.pid2patient_path[pid] = data_path + '/' + pid + self.file_extension
            if pid in id2positive_annotations:
                self.id2positive_annotations[pid] = id2positive_annotations[pid]
                n_positive += len(id2positive_annotations[pid])
            if pid in id2negative_annotations:
                self.id2negative_annotations[pid] = id2negative_annotations[pid]

        self.nsamples = int(n_positive + (1. - positive_proportion) / positive_proportion * n_positive)
        print('n samples %s' % self.nsamples)
        self.idx2pid_annotation = {}
        i = 0
        for pid, annotations in self.id2positive_annotations.items():
            for a in annotations:
                self.idx2pid_annotation[i] = (pid, a)
                i += 1
        print('n positive %s' % len(self.idx2pid_annotation))

        if random_negative_samples:
            # Placeholders: the actual negative is drawn in generate().
            while i < self.nsamples:
                self.idx2pid_annotation[i] = (None, None)
                i += 1
        else:
            negative_pids = list(self.id2negative_annotations.keys())
            while i < self.nsamples:
                pid = rng.choice(negative_pids)
                patient_annotations = self.id2negative_annotations[pid]
                a = patient_annotations[rng.randint(len(patient_annotations))]
                self.idx2pid_annotation[i] = (pid, a)
                i += 1
        assert len(self.idx2pid_annotation) == self.nsamples

        self.data_path = data_path
        self.batch_size = batch_size
        self.rng = rng
        self.full_batch = full_batch
        self.random = random
        self.infinite = infinite
        self.data_prep_fun = data_prep_fun
        self.transform_params = transform_params
        self.positive_proportion = positive_proportion
        self.label_prep_fun = label_prep_fun
        self.nlabels = nproperties

        if len(properties_included) > 0:
            self.nlabels = len(properties_included)
        self.properties_included = properties_included

        assert self.transform_params['pixel_spacing'] == (1., 1., 1.)

    def generate(self):
        """Yield ``(x_batch, y_batch, patients_ids)``.

        With ``full_batch`` set, a shorter trailing batch is built but not yielded.
        """
        while True:
            rand_idxs = np.arange(self.nsamples)
            if self.random:
                self.rng.shuffle(rand_idxs)
            # Built once per pass instead of once per negative sample.
            negative_pids = list(self.id2negative_annotations.keys())
            for pos in range(0, len(rand_idxs), self.batch_size):
                idxs_batch = rand_idxs[pos:pos + self.batch_size]
                nb = len(idxs_batch)
                # allocate batches
                x_batch = np.zeros((nb,) + self.transform_params['patch_size'], dtype='float32')
                y_batch = np.zeros((nb, self.nlabels), dtype='float32')
                patients_ids = []

                for i, idx in enumerate(idxs_batch):
                    pid, patch_annotation = self.idx2pid_annotation[idx]

                    if pid is None:
                        # Placeholder entry: draw a fresh random negative candidate.
                        pid = self.rng.choice(negative_pids)
                        patient_annotations = self.id2negative_annotations[pid]
                        patch_annotation = patient_annotations[self.rng.randint(len(patient_annotations))]

                    patient_path = self.pid2patient_path[pid]
                    patients_ids.append(pid)

                    y_batch[i] = self.label_prep_fun(patch_annotation, self.properties_included)

                    img, origin, pixel_spacing = utils_lung.read_pkl(patient_path) \
                        if self.file_extension == '.pkl' else utils_lung.read_mhd(patient_path)

                    # First four annotation entries (named zyxd in the original code).
                    patch_zyxd = patch_annotation[:4]
                    x_batch[i, :, :, :] = self.data_prep_fun(data=img, pid=pid,
                                                             patch_center=patch_zyxd,
                                                             pixel_spacing=pixel_spacing,
                                                             luna_origin=origin)
                if self.full_batch:
                    if nb == self.batch_size:
                        yield x_batch, y_batch, patients_ids
                else:
                    yield x_batch, y_batch, patients_ids

            if not self.infinite:
                break


class CandidatesPropertiesLunaDataGenerator2(object):
    """Batch generator of LUNA candidate patches labelled with nodule properties.

    Positive annotations come from
    ``utils_lung.read_luna_properties(pathfinder.LUNA_PROPERTIES_PATH)`` and
    negative candidates from
    ``utils_lung.read_luna_negative_candidates(pathfinder.LUNA_CANDIDATES_PATH)``;
    both are restricted to ``patient_ids``.  Every positive annotation gets one
    sample index and the remaining indices (up to ``nsamples``) are negatives.

    Parameters
    ----------
    data_path : str
        Directory holding one scan per patient.  Files are read as ``.pkl`` when
        the substring ``'pkl'`` occurs in ``data_path``, otherwise as ``.mhd``.
    batch_size : int
        Number of samples per yielded batch.
    transform_params : dict
        Must contain ``'patch_size'`` (tuple used as the per-sample shape of the
        input batch) and ``'pixel_spacing'``, which has to equal
        ``(1., 1., 1.)``.
    label_prep_fun : callable
        Called as ``label_prep_fun(annotation, properties_included)``; its
        result is stored as the label of the sample.
    nproperties : int
        Number of labels per sample when ``properties_included`` is empty.
    patient_ids : iterable
        Ids of the patients to draw samples from.
    data_prep_fun : callable
        Called with keywords ``data``, ``pid``, ``patch_center``,
        ``pixel_spacing`` and ``luna_origin``; its result is written into one
        row of the input batch.
    rng : object
        Random generator providing ``choice``, ``randint`` and ``shuffle``
        (e.g. ``np.random.RandomState``).
    full_batch : bool
        If true, only batches holding exactly ``batch_size`` samples are
        yielded.
    random : bool
        If true, the sample order is reshuffled at the start of every pass.
    infinite : bool
        If true, ``generate`` keeps starting new passes forever.
    positive_proportion : float
        Target fraction of positive samples; determines ``nsamples``.
    properties_included : list, optional
        Forwarded to ``label_prep_fun``.  When non-empty, its length replaces
        ``nproperties`` as the number of labels.  Defaults to an empty list.
    random_negative_samples : bool
        If true, negative samples are drawn anew every time they are visited in
        ``generate``; otherwise they are drawn once, here in the constructor.
    **kwargs
        Accepted and ignored.
    """

    def __init__(self, data_path, batch_size, transform_params, label_prep_fun,
                 nproperties, patient_ids, data_prep_fun, rng,
                 full_batch, random, infinite, positive_proportion, properties_included=None,
                 random_negative_samples=False, **kwargs):
        # A ``None`` default avoids sharing one list object between instances.
        if properties_included is None:
            properties_included = []

        id2positive_annotations = utils_lung.read_luna_properties(pathfinder.LUNA_PROPERTIES_PATH)
        id2negative_annotations = utils_lung.read_luna_negative_candidates(pathfinder.LUNA_CANDIDATES_PATH)

        self.file_extension = '.pkl' if 'pkl' in data_path else '.mhd'
        self.id2positive_annotations = {}
        self.id2negative_annotations = {}
        self.pid2patient_path = {}
        n_positive = 0
        for pid in patient_ids:
            self.pid2patient_path[pid] = data_path + '/' + pid + self.file_extension
            if pid in id2positive_annotations:
                self.id2positive_annotations[pid] = id2positive_annotations[pid]
                n_positive += len(id2positive_annotations[pid])
            if pid in id2negative_annotations:
                self.id2negative_annotations[pid] = id2negative_annotations[pid]

        # Total sample count such that positives make up `positive_proportion`.
        self.nsamples = int(n_positive + (1. - positive_proportion) / positive_proportion * n_positive)
        print('n samples %d' % self.nsamples)

        # Sample index -> (pid, annotation); positives occupy the first indices.
        self.idx2pid_annotation = {}
        i = 0
        for pid, annotations in self.id2positive_annotations.items():
            for a in annotations:
                self.idx2pid_annotation[i] = (pid, a)
                i += 1
        print('n positive %d' % len(self.idx2pid_annotation))

        if random_negative_samples:
            # Placeholders: the actual negative is drawn in generate().
            while i < self.nsamples:
                self.idx2pid_annotation[i] = (None, None)
                i += 1
        else:
            # Fix the negatives once: random patient, then random candidate.
            negative_pids = list(self.id2negative_annotations.keys())
            while i < self.nsamples:
                pid = rng.choice(negative_pids)
                patient_annotations = self.id2negative_annotations[pid]
                a = patient_annotations[rng.randint(len(patient_annotations))]
                self.idx2pid_annotation[i] = (pid, a)
                i += 1
        assert len(self.idx2pid_annotation) == self.nsamples

        self.data_path = data_path
        self.batch_size = batch_size
        self.rng = rng
        self.full_batch = full_batch
        self.random = random
        self.infinite = infinite
        self.data_prep_fun = data_prep_fun
        self.transform_params = transform_params
        self.positive_proportion = positive_proportion
        self.label_prep_fun = label_prep_fun
        self.nlabels = nproperties

        if len(properties_included) > 0:
            self.nlabels = len(properties_included)
        self.properties_included = properties_included

        assert self.transform_params['pixel_spacing'] == (1., 1., 1.)

    def generate(self):
        """Yield ``(x_batch, y_batch, patient_ids)`` tuples.

        ``x_batch`` is a float32 array of shape ``(nb,) + patch_size``.
        ``y_batch`` is a float32 array of shape ``(nb,)`` when there is a single
        label and ``(nb, nlabels)`` otherwise.  ``patient_ids`` lists the
        patient id of every sample in the batch.

        One pass visits every sample index once; passes repeat while
        ``self.infinite`` is true.  When ``self.full_batch`` is true a short
        trailing batch is skipped *before* any scan is read, so no disk I/O or
        pre-processing is spent on data that would be discarded (and the rng is
        not advanced for it).
        """
        # Built once; list() also keeps rng.choice working when keys() is a view.
        negative_pids = list(self.id2negative_annotations.keys())

        while True:
            rand_idxs = np.arange(self.nsamples)
            if self.random:
                self.rng.shuffle(rand_idxs)

            for pos in range(0, len(rand_idxs), self.batch_size):
                idxs_batch = rand_idxs[pos:pos + self.batch_size]
                nb = len(idxs_batch)
                if self.full_batch and nb != self.batch_size:
                    # Incomplete batch would never be yielded: skip it early.
                    continue

                # allocate batches
                x_batch = np.zeros((nb,) + self.transform_params['patch_size'], dtype='float32')
                if self.nlabels == 1:
                    y_batch = np.zeros((nb,), dtype='float32')
                else:
                    y_batch = np.zeros((nb, self.nlabels), dtype='float32')
                patients_ids = []

                for i, idx in enumerate(idxs_batch):
                    pid, patch_annotation = self.idx2pid_annotation[idx]

                    if pid is None:
                        # Placeholder: draw a fresh negative candidate now.
                        pid = self.rng.choice(negative_pids)
                        patient_annotations = self.id2negative_annotations[pid]
                        patch_annotation = patient_annotations[self.rng.randint(len(patient_annotations))]

                    patient_path = self.pid2patient_path[pid]
                    patients_ids.append(pid)

                    y_batch[i] = self.label_prep_fun(patch_annotation, self.properties_included)

                    img, origin, pixel_spacing = utils_lung.read_pkl(patient_path) \
                        if self.file_extension == '.pkl' else utils_lung.read_mhd(patient_path)

                    # Only the first four annotation entries locate the patch
                    # (presumably z, y, x, diameter -- confirm against the reader).
                    patch_zyxd = patch_annotation[:4]
                    x_batch[i, :, :, :] = self.data_prep_fun(data=img, pid=pid,
                                                             patch_center=patch_zyxd,
                                                             pixel_spacing=pixel_spacing,
                                                             luna_origin=origin)

                yield x_batch, y_batch, patients_ids

            if not self.infinite:
                break


class CandidatesLunaValidDataGenerator2(object):
    """Deterministic single-sample generator of LUNA validation patches.

    For every patient in ``patient_ids`` that has positive annotations (read
    with ``utils_lung.read_luna_properties``), as many negative candidates as
    there are positives are drawn without replacement (from
    ``utils_lung.read_luna_negative_candidates``) using a generator seeded with
    a fixed value, so the selection is the same on every construction.

    Parameters
    ----------
    data_path : str
        Directory holding one scan per patient.  Files are read as ``.pkl`` when
        the substring ``'pkl'`` occurs in ``data_path``, otherwise as ``.mhd``.
    transform_params : dict
        When ``label_prep_fun`` is given, ``transform_params['pixel_spacing']``
        has to equal ``(1., 1., 1.)``.
    patient_ids : iterable
        Ids of the patients to consider.
    data_prep_fun : callable
        Called with keywords ``data``, ``pid``, ``patch_center``,
        ``pixel_spacing`` and ``luna_origin``; returns the patch.
    label_prep_fun : callable, optional
        Called as ``label_prep_fun(annotation, properties_included)`` to label
        positive samples.  When ``None``, positives are labelled ``1.``.
    properties_included : list, optional
        Forwarded to ``label_prep_fun``.  Defaults to an empty list.
    **kwargs
        Accepted and ignored.
    """

    def __init__(self, data_path, transform_params, patient_ids, data_prep_fun,
                 label_prep_fun=None, properties_included=None,
                 **kwargs):
        rng = np.random.RandomState(42)  # do not change this!!!

        # A ``None`` default avoids sharing one list object between instances.
        if properties_included is None:
            properties_included = []

        id2positive_annotations = utils_lung.read_luna_properties(pathfinder.LUNA_PROPERTIES_PATH)
        id2negative_annotations = utils_lung.read_luna_negative_candidates(pathfinder.LUNA_CANDIDATES_PATH)

        self.file_extension = '.pkl' if 'pkl' in data_path else '.mhd'
        self.id2positive_annotations = {}
        self.id2negative_annotations = {}
        self.id2patient_path = {}
        n_positive, n_negative = 0, 0
        for pid in patient_ids:
            if pid not in id2positive_annotations:
                continue

            self.id2positive_annotations[pid] = id2positive_annotations[pid]
            negative_annotations = id2negative_annotations[pid]
            n_pos = len(id2positive_annotations[pid])
            n_neg = len(negative_annotations)
            # Balanced draw: one negative candidate per positive annotation.
            # The call sequence on `rng` must stay as is to keep the
            # validation set reproducible.
            neg_idxs = rng.choice(n_neg, size=n_pos, replace=False)
            self.id2negative_annotations[pid] = [negative_annotations[i] for i in neg_idxs]

            self.id2patient_path[pid] = data_path + '/' + pid + self.file_extension
            n_positive += n_pos
            n_negative += n_pos

        print('n positive %d' % n_positive)
        print('n negative %d' % n_negative)

        # Note: this counts patients, not individual patches.
        self.nsamples = len(self.id2patient_path)
        self.data_path = data_path
        self.rng = rng
        self.data_prep_fun = data_prep_fun
        self.transform_params = transform_params
        self.label_prep_fun = label_prep_fun
        if label_prep_fun is not None:
            assert self.transform_params['pixel_spacing'] == (1., 1., 1.)

        self.properties_included = properties_included

    def generate(self):
        """Yield ``(x_batch, y_batch, [pid])`` for one patch at a time.

        For each patient, all positive patches are yielded first, then the
        negatives selected in the constructor.  ``x_batch`` is the float32
        result of ``data_prep_fun`` with a leading axis of length 1;
        ``y_batch`` is a float32 array holding the label (``0.`` for
        negatives).

        The patient's scan is read from disk once and reused for all of its
        patches.  This assumes ``data_prep_fun`` does not modify the volume it
        is given in place.
        """
        for pid in self.id2positive_annotations:
            positive_annotations = self.id2positive_annotations[pid]
            negative_annotations = self.id2negative_annotations[pid]
            if len(positive_annotations) == 0 and len(negative_annotations) == 0:
                # Nothing to yield for this patient: do not touch the disk.
                continue

            patient_path = self.id2patient_path[pid]
            img, origin, pixel_spacing = utils_lung.read_pkl(patient_path) \
                if self.file_extension == '.pkl' else utils_lung.read_mhd(patient_path)

            for annotation in positive_annotations:
                if self.label_prep_fun is None:
                    y_batch = np.array([1.], dtype='float32')
                else:
                    y_batch = np.array([self.label_prep_fun(annotation, self.properties_included)],
                                       dtype='float32')
                # Only the first four annotation entries locate the patch.
                x_batch = np.float32(self.data_prep_fun(data=img, pid=pid,
                                                        patch_center=annotation[0:4],
                                                        pixel_spacing=pixel_spacing,
                                                        luna_origin=origin))[None, :, :, :]

                yield x_batch, y_batch, [pid]

            for patch_center in negative_annotations:
                y_batch = np.array([0.], dtype='float32')
                x_batch = np.float32(self.data_prep_fun(data=img, pid=pid,
                                                        patch_center=patch_center,
                                                        pixel_spacing=pixel_spacing,
                                                        luna_origin=origin))[None, :, :, :]

                yield x_batch, y_batch, [pid]