a b/scripts/data_exploration.py
1
#==============================================================================#
2
#  Author:       Dominik Müller                                                #
3
#  Copyright:    2020 IT-Infrastructure for Translational Medical Research,    #
4
#                University of Augsburg                                        #
5
#                                                                              #
6
#  This program is free software: you can redistribute it and/or modify        #
7
#  it under the terms of the GNU General Public License as published by        #
8
#  the Free Software Foundation, either version 3 of the License, or           #
9
#  (at your option) any later version.                                         #
10
#                                                                              #
11
#  This program is distributed in the hope that it will be useful,             #
12
#  but WITHOUT ANY WARRANTY; without even the implied warranty of              #
13
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the               #
14
#  GNU General Public License for more details.                                #
15
#                                                                              #
16
#  You should have received a copy of the GNU General Public License           #
17
#  along with this program.  If not, see <http://www.gnu.org/licenses/>.       #
18
#==============================================================================#
19
#-----------------------------------------------------#
20
#                   Library imports                   #
21
#-----------------------------------------------------#
22
from miscnn.data_loading.interfaces import NIFTI_interface
23
from miscnn import Data_IO
24
from tqdm import tqdm
25
import os
26
import numpy as np
27
import pandas as pd
28
29
#-----------------------------------------------------#
30
#                    Configurations                   #
31
#-----------------------------------------------------#
32
# Data directory
33
path_data = "data"
34
35
#-----------------------------------------------------#
#                   Data Exploration                  #
#-----------------------------------------------------#
# Initialize Data IO Interface for NIfTI data.
# The dataset contains 4 segmentation classes (background, lung_L,
# lung_R, infection), which is exactly what the class-frequency
# analysis at the bottom of this script assumes.
# Fix: was classes=3, inconsistent with the 4 labels; the parameter
# does not affect raw sample loading, so exploration output is unchanged.
interface = NIFTI_interface(channels=1, classes=4)

# Create Data IO object to load and write samples in the file structure
data_io = Data_IO(interface, path_data, delete_batchDir=True)

# Access all available samples in our file structure
# (sorted for a deterministic processing and printing order)
sample_list = data_io.get_indiceslist()
sample_list.sort()

# Print out the sample list
print("Sample list:", sample_list)
50
51
# Now let's load each sample and collect diverse information from it.
# For every sample we record: volume shape, minimum/maximum intensity,
# voxel spacing, and the relative class frequency of the segmentation.
sample_data = {}
for index in tqdm(sample_list):
    # Sample loading (image volume plus segmentation mask)
    sample = data_io.sample_loader(index, load_seg=True)
    # Identify the class distribution in the segmentation mask
    # (only the counts are needed, so the unique labels are discarded)
    unique_counts = np.unique(sample.seg_data, return_counts=True)[1]
    class_freq = np.around(unique_counts / np.sum(unique_counts), decimals=6)
    # Store all collected information for the current sample
    sample_data[index] = [sample.img_data.shape,
                          sample.img_data.min(),
                          sample.img_data.max(),
                          sample.details["spacing"],
                          tuple(class_freq)]
70
71
# Transform the collected per-sample data into a pandas dataframe
df = pd.DataFrame.from_dict(
    sample_data,
    orient="index",
    columns=["vol_shape", "vol_minimum", "vol_maximum",
             "voxel_spacing", "class_frequency"],
)

# Print the complete dataframe to console without row/column truncation
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df)
80
81
# Calculate mean and median volume size along each spatial axis
shape_list = np.array(df["vol_shape"].tolist())
for axis_id, axis_name in enumerate(("X", "Y", "Z")):
    axis_sizes = shape_list[:, axis_id]
    print(axis_name + "-Axes Mean:", np.mean(axis_sizes))
    print(axis_name + "-Axes Median:", np.median(axis_sizes))
86
87
# Calculate the average class frequency across all samples.
# NOTE(review): assumes every segmentation contains all 4 labels
# (0=background, 1=lung_L, 2=lung_R, 3=infection); otherwise the
# frequency tuples would be shorter than the column list — verify.
df_classes = pd.DataFrame(
    df["class_frequency"].tolist(),
    columns=["background", "lung_L", "lung_R", "infection"],
)
print(df_classes.mean(axis=0))
92
93
## The resolution of the volumes is fixed for the x,y axes to 512x512 for the
## "coronacases" data and 630x630 for the "radiopaedia" volumes.
## Strangely, a single sample "radiopaedia_14_85914_0" has only 630x401 for x,y.
## Furthermore, the sample "radiopaedia_10_85902_3" has a ~8x higher
## z axis than the other radiopaedia samples.
## Overall, identifying suitable resampling coordinates will be tricky.
99
##
100
## When looking at the voxel spacings from the affine matrix of the CTs,
## we see that the x,y spacings are quite similar, whereas the z axis reveals
## large differences. Resampling to a common voxel spacing is mandatory.
103
##
104
## We see that the radiopaedia CT volumes have already been preprocessed.
## In the description of the data set, it is documented that they
## used a clipping to the lung window [-1250, 250] and normalized these values
## to [0,255]. Therefore, we have to perform the identical preprocessing on the
## other CT volumes as well.
109
##
110
## Also, the class frequencies reveal a heavy bias towards the background class,
## as expected in medical image segmentation.
112
##
113
## In the COVID-19-CT-Seg dataset, the last 10 cases from Radiopaedia have been
## adjusted to the lung window [-1250,250] and then normalized to [0,255];
## we recommend adjusting the first 10 cases from Coronacases with the same method.