# utils/dataset.py

# sklearn and nltk imports
from sklearn.model_selection import KFold
from nltk.tokenize import word_tokenize
import numpy as np
import nltk
nltk.download("punkt", quiet=True)
# tensorflow imports
from tensorflow.keras.preprocessing.text import Tokenizer
# progress bar
from tqdm import tqdm
# utils imports
from utils.text_handler import TextHandler
from utils.vocabulary import Vocabulary


class Dataset:

    def __init__(self, image_vectors: dict, captions_data: dict, clear_long_captions: bool = True):
        """ Base class that creates the datasets employed in my research, i.e. ImageCLEF and IU X-Ray.

        Args:
            image_vectors (dict): Dictionary with ImageIDs as keys and image embeddings as values.
            captions_data (dict): Dictionary with ImageIDs as keys and captions as values.
            clear_long_captions (bool, optional): Whether to drop the outlier long captions. Defaults to True.
        """
        self.image_vectors = image_vectors
        self.captions_data = captions_data
        self.clear_long_captions = clear_long_captions
        # init a text handler object to pre-process training captions
        self.text_handler = TextHandler()

    def delete_long_captions(self, data: dict, threshold: int = 80) -> dict:
        """ Removes the long captions from the training set only. This method was utilised during the ImageCLEF campaign.

        Args:
            data (dict): Dictionary with ImageIDs as keys and captions as values.
            threshold (int, optional): The maximum caption length, in tokens. Defaults to 80.

        Returns:
            dict: The same dictionary without the instances whose captions exceed the threshold.
        """
        filtered_data = {}
        for image_id, caption in data.items():
            tokens = word_tokenize(caption)
            if len(tokens) <= threshold:
                filtered_data[image_id] = caption
        return filtered_data

    def build_splits(self) -> tuple[list, list, list]:
        """ Creates the split sets for training, validation and testing.
        In particular, 15% of the images are held out for testing, and 10% of the
        remaining training images (about 8.5% overall) are held out for validation,
        which leaves roughly 76.5% for training.

        Returns:
            tuple[list, list, list]: Training, validation and test set ids.
        """
        image_ids = list(self.captions_data.keys())
        np.random.shuffle(image_ids)
        test_split_threshold = int(0.15 * len(image_ids))
        train, test = (
            image_ids[:-test_split_threshold],
            image_ids[-test_split_threshold:],
        )
        dev_split_threshold = int(0.1 * len(train))
        train, dev = (
            train[:-dev_split_threshold],
            train[-dev_split_threshold:],
        )
        return train, dev, test
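
    # Worked example of the split arithmetic (illustrative numbers): with 1,000
    # image ids, test = int(0.15 * 1000) = 150; of the remaining 850 ids,
    # dev = int(0.1 * 850) = 85 and train = 765, i.e. roughly a
    # 76.5% / 8.5% / 15% split overall.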

    def get_image_vectors(self, keys: list) -> dict:
        """ Fetches from the whole dataset the image embeddings that belong to the given split set.

        Args:
            keys (list): Split set ids.

        Returns:
            dict: Dictionary with ImageIDs as keys and image embeddings as values, for the split set.
        """
        keys = set(keys)  # set membership makes each lookup O(1)
        return {k: v for k, v in tqdm(self.image_vectors.items(), desc="Fetching image embeddings..") if k in keys}

    def get_captions(self, _ids: list) -> dict:
        """ Fetches from the whole dataset the captions that belong to the given split set.

        Args:
            _ids (list): Split set ids.

        Returns:
            dict: Dictionary with ImageIDs as keys and captions as values, for the split set.
        """
        return {key: value for key, value in self.captions_data.items() if key in _ids}

    def build_pseudo_cv_splits(self) -> tuple[list, list]:
        """ Creates cross-validation splits using K-Fold cross-validation. It was used only for the ImageCLEF campaign.
        More details are described in my Thesis.

        Returns:
            tuple[list, list]: Training and test fold sets.
        """
        image_ids = list(self.captions_data.keys())
        np.random.shuffle(image_ids)
        # apply 15-Fold CV
        kf = KFold(n_splits=15)
        train_fold_ids, test_fold_ids = list(), list()
        for train_index, test_index in kf.split(image_ids):
            train_ids = [image_ids[index] for index in train_index]
            test_ids = [image_ids[index] for index in test_index]
            train_fold_ids.append(train_ids)
            test_fold_ids.append(test_ids)
        return train_fold_ids, test_fold_ids
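
    # With n_splits=15, each fold holds out 1/15 (about 6.7%) of the shuffled ids
    # as a test fold and trains on the remaining 14/15, so both returned lists
    # contain 15 id-lists, one per fold.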

    def build_vocab(self, training_captions: list, threshold: int = 3) -> tuple[Vocabulary, Tokenizer, dict, dict]:
        """ Creates the employed vocabulary from the given training captions.

        Args:
            training_captions (list): All training captions.
            threshold (int, optional): The cut-off frequency for the Vocabulary. Defaults to 3.

        Returns:
            tuple[Vocabulary, Tokenizer, dict, dict]: The Vocabulary object, the fitted tokenizer,
            the word-to-index dictionary, and the index-to-word dictionary. The latter two map
            words to indices and indices back to words respectively.
        """
        vocab = Vocabulary(texts=training_captions, threshold=threshold)
        tokenizer, word2idx, idx2word = vocab.build_vocab()
        return vocab, tokenizer, word2idx, idx2word
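
    # Illustrative use of build_vocab (a sketch: the caption strings are made-up
    # placeholders, and the returned Tokenizer is assumed to be fitted by
    # utils.vocabulary on the given captions):
    #   vocab, tokenizer, word2idx, idx2word = dataset.build_vocab(
    #       training_captions=["no acute cardiopulmonary disease", "heart size is normal"])
    #   tokenizer.texts_to_sequences(["heart size is normal"])  # caption -> index sequence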


class IuXrayDataset(Dataset):

    def __init__(self, image_vectors: dict, captions_data: dict, tags_data: dict):
        """ Child class that creates the employed IU X-Ray dataset, inheriting the base class methods.

        Args:
            image_vectors (dict): Dictionary with ImageIDs as keys and image embeddings as values.
            captions_data (dict): Dictionary with ImageIDs as keys and captions as values.
            tags_data (dict): Dictionary with ImageIDs as keys and tag embeddings as values.
        """
        super().__init__(image_vectors=image_vectors, captions_data=captions_data, clear_long_captions=False)
        self.tags_data = tags_data
        # get the splits
        self.train_dataset, self.dev_dataset, self.test_dataset = self.build_dataset()
        # build linguistic attributes
        self.vocab, self.tokenizer, self.word2idx, self.idx2word = super().build_vocab(
            training_captions=list(self.train_dataset[1].values()))

    def __str__(self) -> str:
        """ Summarises the dataset sizes when the object is printed.

        Returns:
            str: The number of instances per split set.
        """
        text = f"Train: patients={len(self.train_dataset[0])}, captions={len(self.train_dataset[1])}, tags={len(self.train_dataset[2])}"
        text += f"\nDev: patients={len(self.dev_dataset[0])}, captions={len(self.dev_dataset[1])}, tags={len(self.dev_dataset[2])}"
        text += f"\nTest: patients={len(self.test_dataset[0])}, captions={len(self.test_dataset[1])}, tags={len(self.test_dataset[2])}"
        return text

    def get_splits_sets(self) -> tuple[list, list, list]:
        """ Fetches the data for each split set.

        Returns:
            tuple[list, list, list]: train_dataset, dev_dataset, test_dataset
        """
        return self.train_dataset, self.dev_dataset, self.test_dataset

    def get_tokenizer_utils(self) -> tuple[Vocabulary, Tokenizer, dict, dict]:
        """ Fetches the linguistic utilities.

        Returns:
            tuple[Vocabulary, Tokenizer, dict, dict]: The Vocabulary object, the fitted tokenizer,
            the word-to-index dictionary, and the index-to-word dictionary.
        """
        return self.vocab, self.tokenizer, self.word2idx, self.idx2word

    def __get_tags(self, _ids: list) -> dict:
        """ Fetches from the whole dataset the tag embeddings that belong to the given split set.

        Args:
            _ids (list): Split set ids.

        Returns:
            dict: Dictionary with ImageIDs as keys and tag embeddings as values.
        """
        return {key: value for key, value in self.tags_data.items() if key in _ids}

    def build_dataset(self) -> tuple[list, list, list]:
        """ Runs the whole dataset-creation process.

        Returns:
            tuple[list, list, list]: The training, validation and test datasets for our models.
            All sets are in list format:
                1st index --> image vectors
                2nd index --> captions
                3rd index --> tags
        """
        # random split
        train_ids, dev_ids, test_ids = super().build_splits()
        # fetch images
        train_images = super().get_image_vectors(train_ids)
        dev_images = super().get_image_vectors(dev_ids)
        test_images = super().get_image_vectors(test_ids)
        # fetch captions
        train_captions = super().get_captions(train_ids)
        dev_captions = super().get_captions(dev_ids)
        test_captions = super().get_captions(test_ids)
        # apply pre-processing to the training captions; zip the processed texts
        # back with the keys of train_captions (not the shuffled train_ids) so
        # that ids and captions stay aligned
        train_captions_prepro = self.text_handler.preprocess_all(
            list(train_captions.values()))
        train_captions_prepro = dict(zip(train_captions.keys(), train_captions_prepro))
        # fetch tags
        train_tags = self.__get_tags(train_ids)
        dev_tags = self.__get_tags(dev_ids)
        test_tags = self.__get_tags(test_ids)
        # build data for each set
        train_dataset = [train_images, train_captions_prepro, train_tags]
        dev_dataset = [dev_images, dev_captions, dev_tags]
        test_dataset = [test_images, test_captions, test_tags]
        return train_dataset, dev_dataset, test_dataset


class ImageCLEFDataset(Dataset):

    def __init__(self, image_vectors: dict, captions_data: dict):
        """ Child class that creates the employed ImageCLEF dataset, inheriting the base class methods.

        Args:
            image_vectors (dict): Dictionary with ImageIDs as keys and image embeddings as values.
            captions_data (dict): Dictionary with ImageIDs as keys and captions as values.
        """
        super().__init__(image_vectors=image_vectors, captions_data=captions_data, clear_long_captions=True)
        # get the splits
        self.train_dataset, self.dev_dataset, self.test_dataset = self.build_dataset()
        # build linguistic attributes
        self.vocab, self.tokenizer, self.word2idx, self.idx2word = super().build_vocab(
            training_captions=list(self.train_dataset[1].values()))

    def __str__(self) -> str:
        """ Summarises the dataset sizes when the object is printed.

        Returns:
            str: The number of instances per split set.
        """
        text = f"Train: images={len(self.train_dataset[0])}, captions={len(self.train_dataset[1])}"
        text += f"\nDev: images={len(self.dev_dataset[0])}, captions={len(self.dev_dataset[1])}"
        text += f"\nTest: images={len(self.test_dataset[0])}, captions={len(self.test_dataset[1])}"
        return text

    def get_splits_sets(self) -> tuple[list, list, list]:
        """ Fetches the data for each split set.

        Returns:
            tuple[list, list, list]: train_dataset, dev_dataset, test_dataset
        """
        return self.train_dataset, self.dev_dataset, self.test_dataset

    def get_tokenizer_utils(self) -> tuple[Vocabulary, Tokenizer, dict, dict]:
        """ Fetches the linguistic utilities.

        Returns:
            tuple[Vocabulary, Tokenizer, dict, dict]: The Vocabulary object, the fitted tokenizer,
            the word-to-index dictionary, and the index-to-word dictionary.
        """
        return self.vocab, self.tokenizer, self.word2idx, self.idx2word

    def build_dataset(self) -> tuple[list, list, list]:
        """ Runs the whole dataset-creation process.

        Returns:
            tuple[list, list, list]: The training, validation and test datasets for our models.
            All sets are in list format:
                1st index --> image vectors
                2nd index --> captions
        """
        # random split
        train_ids, dev_ids, test_ids = super().build_splits()
        # fetch images
        train_images = super().get_image_vectors(train_ids)
        dev_images = super().get_image_vectors(dev_ids)
        test_images = super().get_image_vectors(test_ids)
        # fetch captions
        train_captions = super().get_captions(train_ids)
        dev_captions = super().get_captions(dev_ids)
        test_captions = super().get_captions(test_ids)
        # remove long outlier captions from the training set only
        train_modified_captions = super().delete_long_captions(data=train_captions)
        # keep only the images whose captions survived the filtering
        train_new_ids = list(train_modified_captions.keys())
        train_new_images = {
            key: image_vector for key, image_vector in train_images.items() if key in train_new_ids
        }
        # apply pre-processing to the training captions; zipping with the keys of
        # train_modified_captions keeps ids and processed captions aligned
        train_captions_prepro = self.text_handler.preprocess_all(
            list(train_modified_captions.values()))
        train_captions_prepro = dict(zip(train_new_ids, train_captions_prepro))
        # build data for each set
        train_dataset = [train_new_images, train_captions_prepro]
        dev_dataset = [dev_images, dev_captions]
        test_dataset = [test_images, test_captions]
        return train_dataset, dev_dataset, test_dataset
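

# Minimal usage sketch (an illustration, not part of the module's public API):
# the pickle paths and the exact on-disk format below are hypothetical
# placeholders; only the class interfaces defined above come from this file.
if __name__ == "__main__":
    import pickle

    # each pickle is assumed to hold a dict keyed by ImageID
    with open("iu_xray_image_vectors.pkl", "rb") as f:  # hypothetical path
        image_vectors = pickle.load(f)   # {ImageID: image embedding}
    with open("iu_xray_captions.pkl", "rb") as f:       # hypothetical path
        captions_data = pickle.load(f)   # {ImageID: caption text}
    with open("iu_xray_tags.pkl", "rb") as f:           # hypothetical path
        tags_data = pickle.load(f)       # {ImageID: tags embedding}

    dataset = IuXrayDataset(image_vectors=image_vectors,
                            captions_data=captions_data,
                            tags_data=tags_data)
    print(dataset)  # split sizes via __str__
    train_set, dev_set, test_set = dataset.get_splits_sets()
    vocab, tokenizer, word2idx, idx2word = dataset.get_tokenizer_utils()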