#custom_datasets.py
#Copyright (c) 2020 Rachel Lea Ballantyne Draelos
#MIT License
#Permission is hereby granted, free of charge, to any person obtaining a copy
#of this software and associated documentation files (the "Software"), to deal
#in the Software without restriction, including without limitation the rights
#to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
#copies of the Software, and to permit persons to whom the Software is
#furnished to do so, subject to the following conditions:
#The above copyright notice and this permission notice shall be included in all
#copies or substantial portions of the Software.
#THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
#IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
#FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
#AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
#LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
#OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
#SOFTWARE.
import os
import pickle
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from . import utils
#Set seeds
np.random.seed(0)
torch.manual_seed(0)
torch.cuda.manual_seed(0)
torch.cuda.manual_seed_all(0)
###################################################
# PACE Dataset for Data Stored in 2019-10-BigData #-----------------------------
###################################################
class CTDataset_2019_10(Dataset):
    def __init__(self, setname, label_type_ld,
                 label_meanings, num_channels, pixel_bounds,
                 data_augment, crop_type,
                 selected_note_acc_files):
"""CT Dataset class that works for preprocessed data in 2019-10-BigData.
A single example (for crop_type == 'single') is a 4D CT volume:
if num_channels == 3, shape [134,3,420,420]
if num_channels == 1, shape [402,420,420]
Variables:
<setname> is either 'train' or 'valid' or 'test'
<label_type_ld> is 'disease_new'
<label_meanings>: list of strings indicating which labels should
be kept. Alternatively, can be the string 'all' in which case
all labels are kept.
<num_channels>: number of channels to reshape the image to.
== 3 if the model uses a pretrained feature extractor.
== 1 if the model uses only 3D convolutions.
<pixel_bounds>: list of ints e.g. [-1000,200]
Determines the lower bound, upper bound of pixel value clipping
and normalization.
<data_augment>: if True, perform data augmentation.
<crop_type>: is 'single' for an example consisting of one 4D numpy array
<selected_note_acc_files>: This should be a dictionary
with key equal to setname and value that is a string. If the value
is a path to a file, the file must be a CSV. Only note accessions
in this file will be used. If the value is not a valid file path,
all available note accs will be used, i.e. the model will be
trained on the whole dataset."""
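        #Illustrative only (hypothetical paths, not from the original code):
        #selected_note_acc_files might look like
        #    {'train': './load_dataset/fakedata/train_subset_note_accs.csv',
        #     'valid': '', 'test': ''}
        #where the train CSV restricts training to the listed note accessions,
        #and any value that is not a valid file path (here '') means all
        #available note accessions for that set will be used.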
        self.setname = setname
        self.define_subsets_list()
        self.label_type_ld = label_type_ld
        self.label_meanings = label_meanings
        self.num_channels = num_channels
        self.pixel_bounds = pixel_bounds
        if self.setname == 'train':
            self.data_augment = data_augment
        else:
            self.data_augment = False
        print('For dataset',self.setname,'data_augment is',self.data_augment)
        self.crop_type = crop_type
        assert self.crop_type == 'single'
        self.selected_note_acc_files = selected_note_acc_files
        
        #Define location of the CT volumes
        self.main_clean_path = './load_dataset/fakedata'
        self.volume_log_df = pd.read_csv('./load_dataset/fakedata/CT_Scan_Preprocessing_Log_File_FINAL_SMALL.csv',header=0,index_col=0)
        
        #Get the example ids
        self.volume_accessions = self.get_volume_accessions()
        
        #Get the ground truth labels
        self.labels_df = self.get_labels_df()
    
    # Pytorch Required Methods #------------------------------------------------
    def __len__(self):
        return len(self.volume_accessions)
    
    def __getitem__(self, idx):
        """Return a single sample at index <idx>. The sample is a Python
        dictionary with keys 'data', 'gr_truth', and 'volume_acc' for the
        image, the ground truth label vector, and the volume accession,
        respectively."""
        return self._get_pace(self.volume_accessions[idx])
    
    # Volume Accession Methods #------------------------------------------------
    def get_note_accessions(self):
        setname_file = self.selected_note_acc_files[self.setname]
        if os.path.isfile(setname_file):
            print('\tObtaining note accessions from',setname_file)
            sel_accs = pd.read_csv(setname_file,header=0)
            assert sorted(list(set(sel_accs['Subset_Assigned'].values.tolist())))==sorted(self.subsets_list)
            note_accs = sel_accs.loc[:,'Accession'].values.tolist()
            print('\tTotal theoretical note accessions in subsets:',len(note_accs))
            return note_accs
        else:
            print('\tObtaining note accessions from complete identifiers file')
            #Read in the identifiers file, which contains the note accessions.
            #Columns are MRN, Accession, Set_Assigned, Set_Should_Be, Subset_Assigned
            all_ids = pd.read_csv('./load_dataset/fakedata/all_identifiers.csv',header=0)
            #Extract the note accessions
            note_accs = []
            for subset in self.subsets_list: #e.g. ['imgvalid_a','imgvalid_b']
                subset_note_accs = all_ids[all_ids['Subset_Assigned']==subset].loc[:,'Accession'].values.tolist()
                note_accs += subset_note_accs
            print('\tTotal theoretical note accessions in subsets:',len(note_accs))
            return note_accs
    
    def get_volume_accessions(self):
        note_accs = self.get_note_accessions()
        #Translate note accessions to volume accessions based on what data has
        #been preprocessed successfully. volume_log_df has note accessions as
        #the index, and the column 'full_filename_npz' holds the volume
        #accession. The column 'status' equals 'success' if the volume has
        #been preprocessed correctly.
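        #Illustrative only (hypothetical values): one row of volume_log_df
        #might look like
        #    index='RHAA12345', full_filename_npz='RHAA12345_6.npz', status='success'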
        print('\tTotal theoretical volumes in whole dataset:',self.volume_log_df.shape[0])
        self.volume_log_df = self.volume_log_df[self.volume_log_df['status']=='success']
        print('\tTotal successfully preprocessed volumes in whole dataset:',self.volume_log_df.shape[0])
        volume_accs = []
        for note_acc in note_accs:
            if note_acc in self.volume_log_df.index.values.tolist():
                volume_accs.append(self.volume_log_df.at[note_acc,'full_filename_npz'])
        print('\tFinal total successfully preprocessed volumes in requested subsets:',len(volume_accs))
        #According to this thread: https://github.com/pytorch/pytorch/issues/13246
        #it is better to use a numpy array than a list to reduce memory leaks.
        return np.array(volume_accs)
    
    # Ground Truth Label Methods #----------------------------------------------
    def get_labels_df(self):
        #Get the ground truth labels based on the requested label type.
        labels_df = read_in_labels(self.label_type_ld, self.setname)
        #Now filter the ground truth labels based on the desired label meanings:
        if self.label_meanings != 'all': #i.e. if you want to filter
            labels_df = labels_df[self.label_meanings]
        return labels_df
    
    # Fetch a CT Volume (__getitem__ implementation) #--------------------------
    def _get_pace(self, volume_acc):
        """<volume_acc> is for example 'RHAA12345_6.npz'"""
        #Load the compressed npz file: [slices, square, square]
        ctvol = np.load(os.path.join(self.main_clean_path, volume_acc))['ct']
        #Prepare the CT volume data (returned as torch Tensors)
        data = utils.prepare_ctvol_2019_10_dataset(ctvol, self.pixel_bounds, self.data_augment, self.num_channels, self.crop_type)
        #Get the ground truth:
        note_acc = self.volume_log_df[self.volume_log_df['full_filename_npz']==volume_acc].index.values.tolist()[0]
        gr_truth = self.labels_df.loc[note_acc, :].values
        gr_truth = torch.from_numpy(gr_truth).squeeze().type(torch.float)
        #When training on only one abnormality, unsqueeze to prevent a
        #dimensions error when training the model:
        if len(self.label_meanings)==1:
            gr_truth = gr_truth.unsqueeze(0)
        #Create the sample
        sample = {'data': data, 'gr_truth': gr_truth, 'volume_acc': volume_acc}
        return sample
    
    # Sanity Check #------------------------------------------------------------
    def define_subsets_list(self):
        assert self.setname in ['train','valid','test']
        if self.setname == 'train':
            self.subsets_list = ['imgtrain']
        elif self.setname == 'valid':
            self.subsets_list = ['imgvalid_a']
        elif self.setname == 'test':
            self.subsets_list = ['imgtest_a','imgtest_b','imgtest_c','imgtest_d']
        print('Creating',self.setname,'dataset with subsets',self.subsets_list)
#######################
# Ground Truth Labels #---------------------------------------------------------
#######################
def read_in_labels(label_type_ld, setname):
    """Return a pandas dataframe with the dataset labels.
    Accession numbers are the index and labels (e.g. "pneumonia") are the columns.
    <setname> can be 'train', 'valid', or 'test'."""
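    #Illustrative only (hypothetical excerpt): the labels CSV is expected to
    #look something like
    #    Accession,pneumonia,nodule,atelectasis
    #    RHAA12345,1,0,1
    #    RHAA67890,0,0,1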
    assert label_type_ld == 'disease_new'
    labels_file = './load_dataset/fakedata/2019-12-18_duke_disease/img'+setname+'_BinaryLabels.csv'
    return pd.read_csv(labels_file, header=0, index_col=0)
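
################
# Usage Sketch #-----------------------------------------------------------------
################
#A minimal sketch of how this Dataset is meant to be used, assuming the fake
#data shipped in ./load_dataset/fakedata and hypothetical argument values taken
#from the docstring above. This block is illustrative and not part of the
#original module. (Because of the relative import of utils, run it as a module,
#e.g. python -m load_dataset.custom_datasets.)
if __name__ == '__main__':
    train_dataset = CTDataset_2019_10(setname='train',
        label_type_ld='disease_new',
        label_meanings='all', #keep all abnormality labels
        num_channels=3, #for a model with a pretrained 2D feature extractor
        pixel_bounds=[-1000,200], #example clipping bounds from the docstring
        data_augment=True,
        crop_type='single',
        selected_note_acc_files={'train':'','valid':'','test':''}) #''=use all
    #Each sample is a dict with keys 'data', 'gr_truth', and 'volume_acc':
    sample = train_dataset[0]
    print(sample['data'].shape, sample['gr_truth'].shape, sample['volume_acc'])
    #Wrap the Dataset in a DataLoader for batching during training:
    train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)
    for batch in train_loader:
        print(batch['data'].shape, batch['gr_truth'].shape)
        break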