scripts/data_exploration.py
|
#==============================================================================#
#  Author:       Dominik Müller                                                #
#  Copyright:    2020 IT-Infrastructure for Translational Medical Research,    #
#                University of Augsburg                                        #
#                                                                              #
#  This program is free software: you can redistribute it and/or modify        #
#  it under the terms of the GNU General Public License as published by        #
#  the Free Software Foundation, either version 3 of the License, or           #
#  (at your option) any later version.                                         #
#                                                                              #
#  This program is distributed in the hope that it will be useful,             #
#  but WITHOUT ANY WARRANTY; without even the implied warranty of              #
#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the                #
#  GNU General Public License for more details.                                #
#                                                                              #
#  You should have received a copy of the GNU General Public License           #
#  along with this program. If not, see <http://www.gnu.org/licenses/>.        #
#==============================================================================#
#-----------------------------------------------------#
#                   Library imports                    #
#-----------------------------------------------------#
from miscnn.data_loading.interfaces import NIFTI_interface
from miscnn import Data_IO
from tqdm import tqdm
import os
import numpy as np
import pandas as pd

#-----------------------------------------------------#
#                    Configurations                    #
#-----------------------------------------------------#
# Data directory
path_data = "data"

#-----------------------------------------------------#
#                   Data Exploration                   #
#-----------------------------------------------------#
# Initialize Data IO Interface for NIfTI data
# The segmentation masks hold 4 classes: background, lung_L, lung_R, infection
interface = NIFTI_interface(channels=1, classes=4)
|
|
# Create Data IO object to load and write samples in the file structure
data_io = Data_IO(interface, path_data, delete_batchDir=True)

# Access all available samples in our file structure
sample_list = data_io.get_indiceslist()
sample_list.sort()

# Print out the sample list
print("Sample list:", sample_list)

# Now let's load each sample and collect diverse information from them
sample_data = {}
for index in tqdm(sample_list):
    # Sample loading
    sample = data_io.sample_loader(index, load_seg=True)
    # Create an empty list for the current sample in our data dictionary
    sample_data[index] = []
    # Store the volume shape
    sample_data[index].append(sample.img_data.shape)
    # Identify minimum and maximum volume intensity
    sample_data[index].append(sample.img_data.min())
    sample_data[index].append(sample.img_data.max())
    # Store voxel spacing
    sample_data[index].append(sample.details["spacing"])
    # Identify and store class distribution
    unique_data, unique_counts = np.unique(sample.seg_data, return_counts=True)
    class_freq = unique_counts / np.sum(unique_counts)
    class_freq = np.around(class_freq, decimals=6)
    sample_data[index].append(tuple(class_freq))
|
|
# Transform collected data into a pandas dataframe
df = pd.DataFrame.from_dict(sample_data, orient="index",
                            columns=["vol_shape", "vol_minimum",
                                     "vol_maximum", "voxel_spacing",
                                     "class_frequency"])

# Print out the dataframe to console
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    print(df)

# Calculate mean and median shape sizes
shape_list = np.array(df["vol_shape"].tolist())
for i, a in enumerate(["X", "Y", "Z"]):
    print(a + "-Axes Mean:", np.mean(shape_list[:,i]))
    print(a + "-Axes Median:", np.median(shape_list[:,i]))

# Calculate average class frequency
df_classes = pd.DataFrame(df["class_frequency"].tolist(),
                          columns=["background", "lung_L",
                                   "lung_R", "infection"])
print(df_classes.mean(axis=0))
|
|
## The resolution of the volumes is fixed in the x,y axes to 512x512 for the
## "coronacases" data and 630x630 for the "radiopaedia" volumes.
## Strangely, a single sample, "radiopaedia_14_85914_0", has only 630x401 in x,y.
## Furthermore, the sample "radiopaedia_10_85902_3" has a roughly 8x larger
## z-axis than the other radiopaedia samples.
## Overall, identifying suitable resampling coordinates will be tricky.
##
## Looking at the voxel spacings from the affine matrices of the CTs,
## we see that the x,y spacings are quite similar, whereas the z-axis spacings
## reveal large differences. Resampling to a common voxel spacing is mandatory
## (a possible approach is sketched below).
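# A minimal sketch of resampling a volume to a common voxel spacing with
# scipy.ndimage.zoom. It assumes `volume` is a 3D numpy array (single channel)
# and `spacing` is its (x, y, z) voxel spacing as stored above in
# sample.details["spacing"]; the helper name and the default target spacing
# are illustrative assumptions, not values taken from this data set.
import scipy.ndimage as ndi

def resample_volume(volume, spacing, target_spacing=(1.58, 1.58, 2.70)):
    # Zoom factor per axis: current spacing divided by target spacing
    zoom_factors = [current / target for current, target
                    in zip(spacing, target_spacing)]
    # Third-order spline interpolation for image data
    # (use order=0 instead when resampling segmentation masks)
    return ndi.zoom(volume, zoom=zoom_factors, order=3)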
|
|
##
## We see that the radiopaedia CT volumes have already been preprocessed.
## In the description of the data set, it is documented that a clipping to
## the lung window [-1250, 250] was applied and that these values were then
## normalized to [0,255]. Therefore, we have to apply the identical
## preprocessing to the other CT volumes as well (see the sketch below).
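# A minimal sketch of the lung window preprocessing described above, assuming
# `volume` is a numpy array in Hounsfield units: clip to [-1250, 250] and
# rescale the clipped range linearly to [0, 255]. The helper name is an
# assumption, not taken from the original pipeline.
def apply_lung_window(volume, hu_min=-1250, hu_max=250):
    # Clip the intensities to the lung window
    clipped = np.clip(volume, hu_min, hu_max)
    # Rescale the clipped range linearly to [0, 255]
    return (clipped - hu_min) / (hu_max - hu_min) * 255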
|
|
##
## Also, the class frequencies reveal a heavy bias towards the background
## class, as expected in medical image segmentation.
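# A minimal sketch of turning the mean class frequencies computed above into
# inverse-frequency class weights, e.g. as input for a weighted loss function.
# This is only an illustration of how the imbalance could be counteracted.
mean_freq = df_classes.mean(axis=0)
inv_freq = 1.0 / mean_freq
class_weights = inv_freq / inv_freq.sum()
print("Inverse-frequency class weights:", class_weights.round(4).to_dict())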
|
|
##
## In the COVID-19-CT-Seg dataset, the last 10 cases from Radiopaedia have
## already been adjusted to the lung window [-1250, 250] and then normalized
## to [0, 255]; it is recommended to adjust the first 10 cases from
## Coronacases with the same method (see the sketch below).
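# A minimal sketch of applying this recommendation in memory: select the
# "coronacases" samples by their index prefix and pass them through the
# illustrative apply_lung_window() helper sketched further above. Writing the
# adjusted volumes back to disk is intentionally left out here.
for index in sample_list:
    if not index.startswith("coronacases"):
        continue
    sample = data_io.sample_loader(index, load_seg=False)
    windowed = apply_lung_window(sample.img_data)
    print(index, "-> intensity range after windowing:",
          windowed.min(), windowed.max())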