# %% importing packages
import numpy as np
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from skimage import measure
import cv2 as cv
import time
from joblib import Parallel, delayed
from scipy.stats import variation
import tqdm
import matplotlib.pyplot as plt
import gc
# %% Citations
#############################################################
#############################################################
def parse_tf_elements(element):
    '''Mapper function for retrieving examples from the tfrecord.'''
    # create placeholders for all the features in each example
    data = {
        'height'      : tf.io.FixedLenFeature([],tf.int64),
        'width'       : tf.io.FixedLenFeature([],tf.int64),
        'raw_image'   : tf.io.FixedLenFeature([],tf.string),
        'raw_seg'     : tf.io.FixedLenFeature([],tf.string),
        'bbox_x'      : tf.io.VarLenFeature(tf.float32),
        'bbox_y'      : tf.io.VarLenFeature(tf.float32),
        'bbox_height' : tf.io.VarLenFeature(tf.float32),
        'bbox_width'  : tf.io.VarLenFeature(tf.float32)
    }
    # pull out the current example
    content = tf.io.parse_single_example(element, data)
    # pull out each feature from the example
    height = content['height']
    width = content['width']
    raw_seg = content['raw_seg']
    raw_image = content['raw_image']
    bbox_x = content['bbox_x']
    bbox_y = content['bbox_y']
    bbox_height = content['bbox_height']
    bbox_width = content['bbox_width']
    # convert the images to uint8, and reshape them accordingly
    image = tf.io.parse_tensor(raw_image, out_type=tf.uint8)
    image = tf.reshape(image, shape=[height, width, 3])
    segmentation = tf.io.parse_tensor(raw_seg, out_type=tf.uint8)
    segmentation = tf.reshape(segmentation, shape=[height, width, 1])
    # one-hot encode the segmentation mask into the 7 classes
    one_hot_seg = tf.one_hot(tf.squeeze(segmentation), 7, axis=-1)
    # there is currently a bug with returning the bbox, but fixing it isn't
    # necessary for creating the initial uNet for segmentation exploration
    # bbox = [bbox_x, bbox_y, bbox_height, bbox_width]
    return(image, one_hot_seg)
#############################################################
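# a hypothetical sketch of the writer-side counterpart to parse_tf_elements,
# included only to document the feature layout the parser above expects. This
# function is an assumption (bbox features omitted for brevity) and is not
# used anywhere in this script.
def serialize_example_sketch(image, segmentation):
    '''image and segmentation are uint8 arrays of shape [height, width, 3] and
    [height, width, 1]. The raw bytes come from tf.io.serialize_tensor, which
    is why parse_tf_elements decodes them with tf.io.parse_tensor.'''
    feature = {
        'height'    : tf.train.Feature(
            int64_list=tf.train.Int64List(value=[image.shape[0]])),
        'width'     : tf.train.Feature(
            int64_list=tf.train.Int64List(value=[image.shape[1]])),
        'raw_image' : tf.train.Feature(
            bytes_list=tf.train.BytesList(
                value=[tf.io.serialize_tensor(image).numpy()])),
        'raw_seg'   : tf.train.Feature(
            bytes_list=tf.train.BytesList(
                value=[tf.io.serialize_tensor(segmentation).numpy()])),
    }
    return(tf.train.Example(features=tf.train.Features(feature=feature)))
#############################################################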
def load_dataset(file_names):
    '''Receives a list of file names from a folder that contains tfrecord
    files compiled previously. Takes these names and creates a tensorflow
    dataset from them.'''
    ignore_order = tf.data.Options()
    ignore_order.experimental_deterministic = False
    dataset = tf.data.TFRecordDataset(file_names)
    # you can shard the dataset here if you need to reduce its size
    # dataset = dataset.shard(num_shards=2,index=1)
    # the order of the file names doesn't really matter, so ignore it
    dataset = dataset.with_options(ignore_order)
    # map the dataset using the parse_tf_elements function defined earlier
    dataset = dataset.map(parse_tf_elements,num_parallel_calls=1)
    return(dataset)
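# note: the map call above uses num_parallel_calls=1; if parsing becomes a
# bottleneck, you could let TensorFlow choose the parallelism instead, e.g.
# dataset = dataset.map(parse_tf_elements,
#                       num_parallel_calls=tf.data.AUTOTUNE)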
#############################################################
def get_dataset(file_names,batch_size):
    '''Receives a list of file names of tfrecord shards from a dataset as well
    as a batch size for the dataset.'''
    # use the load_dataset function to retrieve the files and put them into a
    # dataset.
    dataset = load_dataset(file_names)
    # create a shuffle buffer of 50. The number was chosen somewhat
    # arbitrarily, so feel free to alter it to fit your hardware.
    dataset = dataset.shuffle(50)
    # batch the dataset
    dataset = dataset.batch(batch_size=batch_size)
    return(dataset)
#############################################################
def joblib_parallel_function_class_sums(sample):
    '''Receives the one-hot ground truth for a single example and returns the
    per-class pixel sums, in class order.'''
    # sum up each class for this example
    sum_classes = np.sum(sample,axis=(0,1,2))
    try:
        return(sum_classes)
    finally:
        # clean up between calls to keep memory from accumulating
        gc.collect()
        tf.keras.backend.clear_session()
#############################################################
# %%
# pick one directory from which to read the dataset shards
shard_dataset_directory = '/home/briancottle/Research/Semantic_Segmentation/dataset_shards_6/train'
os.chdir(shard_dataset_directory)
file_names = tf.io.gfile.glob(shard_dataset_directory + "/shard_*_of_*.tfrecords")
# retrieve a dataset that includes all of the files in the directory. To
# restrict the analysis to a specific split (e.g. training), move those shard
# files to their own directory and point shard_dataset_directory there.
batch_size = 1
dataset = get_dataset(file_names,batch_size=batch_size)
dataset = dataset.shard(num_shards=10,index=2)
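# optional sanity check: pull a single batch and confirm the shapes match what
# parse_tf_elements produces, i.e. (batch, height, width, 3) images and
# (batch, height, width, 7) one-hot masks. This is only a quick diagnostic and
# can be removed.
for check_image, check_seg in dataset.take(1):
    print('image batch shape:', check_image.shape)
    print('one-hot mask batch shape:', check_seg.shape)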
# %%
# percentages = Parallel(
# n_jobs=15, verbose=5, backend='loky')(delayed(joblib_parallel_function_class_sums)
# (sample[1]) for sample in dataset
# )
# %% iterate through each example in the dataset
percentages = []
variances = np.zeros((32400,3))
all_means = np.zeros((32400,3))
count = 0
for sample in dataset:
    ground_truth = sample[1]
    image = sample[0]
    # sum up each class in the ground truth for this example
    sum_classes = np.sum(ground_truth,axis=(0,1,2))
    # all_means[count,:] = [np.mean(image[0,:,:,0]),
    #                       np.mean(image[0,:,:,1]),
    #                       np.mean(image[0,:,:,2])]
    # variances[count,:] = [np.var(image[0,:,:,0]),
    #                       np.var(image[0,:,:,1]),
    #                       np.var(image[0,:,:,2])]
    # append the sums to the list
    percentages.append(sum_classes)
    # clear memory between iterations
    gc.collect()
    tf.keras.backend.clear_session()
    count += 1
# %%
# convert to array
sums = np.asarray(percentages)
# get the sum of each class divided by the number of pixels in an image. make
# sure to change this value if your images are a different size!
percents = sums/(1024*1024*batch_size)
# get the standard deviation of the percentages
std_dev = np.std(percents,axis=0)
# get the mean of the percentages
means = np.mean(percents,axis=0)
# produce the class weights as the inverse of the mean percentages
weights = 1/means
print(weights)
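# the first class (index 0) has a mean percentage of 0 in these shards, so its
# inverse weight comes out as inf (see the printed results below). As a sketch
# of one possible fix (an assumption, not something used elsewhere in this
# script), you could zero out the non-finite entries and renormalize the
# remaining weights before handing them to a weighted loss:
finite_weights = np.where(np.isfinite(weights), weights, 0)
normalized_weights = finite_weights / np.sum(finite_weights)
print(normalized_weights)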
# mean_means = np.mean(np.asarray(all_means),axis=0)
# var_vars = np.var(np.asarray(variances),axis=0)
# print(f'mean of the means is {mean_means}')
# print(f'variance of the variances is {var_vars}')
# [ inf 2.15248481 3.28798466 5.18559616 46.96594578
# 130.77512742 105.23678672]
# [ inf 2.3971094 3.04084893 4.77029963 39.23478673
# 118.13505703 96.22377396]
# [ inf 2.72403952 2.81034368 4.36437716 36.66264202
# 108.40694198 87.39903838] # results with dataset 5
# means and variances with dataset 5
# mean of the means is [232.69476802 204.16933591 211.45184799]
# variance of the variances is [ 139869.85259648 550311.88980989 1160687.94506812]
# %%