--- a +++ b/scripts/data_exploration.py @@ -0,0 +1,115 @@ +#==============================================================================# +# Author: Dominik Müller # +# Copyright: 2020 IT-Infrastructure for Translational Medical Research, # +# University of Augsburg # +# # +# This program is free software: you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation, either version 3 of the License, or # +# (at your option) any later version. # +# # +# This program is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details. # +# # +# You should have received a copy of the GNU General Public License # +# along with this program. If not, see <http://www.gnu.org/licenses/>. # +#==============================================================================# +#-----------------------------------------------------# +# Library imports # +#-----------------------------------------------------# +from miscnn.data_loading.interfaces import NIFTI_interface +from miscnn import Data_IO +from tqdm import tqdm +import os +import numpy as np +import pandas as pd + +#-----------------------------------------------------# +# Configurations # +#-----------------------------------------------------# +# Data directory +path_data = "data" + +#-----------------------------------------------------# +# Data Exploration # +#-----------------------------------------------------# +# Initialize Data IO Interface for NIfTI data +interface = NIFTI_interface(channels=1, classes=3) + +# Create Data IO object to load and write samples in the file structure +data_io = Data_IO(interface, path_data, delete_batchDir=True) + +# Access all available samples in our file structure +sample_list = data_io.get_indiceslist() +sample_list.sort() + +# Print out the sample list +print("Sample list:", sample_list) + +# Now let's load each sample and obtain collect diverse information from them +sample_data = {} +for index in tqdm(sample_list): + # Sample loading + sample = data_io.sample_loader(index, load_seg=True) + # Create an empty list for the current asmple in our data dictionary + sample_data[index] = [] + # Store the volume shape + sample_data[index].append(sample.img_data.shape) + # Identify minimum and maximum volume intensity + sample_data[index].append(sample.img_data.min()) + sample_data[index].append(sample.img_data.max()) + # Store voxel spacing + sample_data[index].append(sample.details["spacing"]) + # Identify and store class distribution + unique_data, unique_counts = np.unique(sample.seg_data, return_counts=True) + class_freq = unique_counts / np.sum(unique_counts) + class_freq = np.around(class_freq, decimals=6) + sample_data[index].append(tuple(class_freq)) + +# Transform collected data into a pandas dataframe +df = pd.DataFrame.from_dict(sample_data, orient="index", + columns=["vol_shape", "vol_minimum", + "vol_maximum", "voxel_spacing", + "class_frequency"]) + +# Print out the dataframe to console +with pd.option_context('display.max_rows', None, 'display.max_columns', None): + print(df) + +# Calculate mean and median shape sizes +shape_list = np.array(df["vol_shape"].tolist()) +for i, a in enumerate(["X", "Y", "Z"]): + print(a + "-Axes Mean:", np.mean(shape_list[:,i])) + print(a + "-Axes Median:", np.median(shape_list[:,i])) + +# Calculate average class frequency +df_classes = pd.DataFrame(df["class_frequency"].tolist(), + columns=["background", "lung_L", + "lung_R", "infection"]) +print(df_classes.mean(axis=0)) + +## The resolution of the volumes are fixed for the x,y axes to 512x512 for the +## "coronacases" data and 630x630 for the "radiopaedia" volumes. +## Strangely, a single sample "radiopaedia_14_85914_0" has only 630x401 for x,y. +## Furthermore, the sample "radiopaedia_10_85902_3" has a ~ 8x times higher +## z axis than the other radiopaedia samples. +## Overall, the aim to identify suited resampling coordinates will be tricky. +## +## When looking on the voxel spacings from the affinity matrix of the CTs, +## we see that the x,y spacings or quite similiar whereas the z axis reveal +## large differences. Resampling to a common voxel spacing is mandatory. +## +## We see that the radiopaedia ct volumes have already preprocessed +## In the description of the data set, it is documentated that they +## used a clipping to the lung window [-1250, 250] and normalized these values +## to [0,255]. Therefore we have to perform the identical preprocessing to the +## other ct volumes as well +## +## Also, the class frequency reveal a heavy bias towards the background class +## as expected in medical image segmentation +## +## In COVID-19-CT-Seg dataset, the last 10 cases from Radiopaedia have been +## adjusted to lung window [-1250,250], and then normalized to [0,255], +## we recommend to adust the first 10 cases from Coronacases with the same method.