|
a |
|
b/utils/dataset.py |
|
|
1 |
# sklearn and nltk imports |
|
|
2 |
from sklearn.model_selection import KFold |
|
|
3 |
from nltk.tokenize import word_tokenize |
|
|
4 |
import numpy as np |
|
|
5 |
import nltk |
|
|
6 |
nltk.download("punkt", quiet=True) |
|
|
7 |
|
|
|
8 |
# tensorflow imports |
|
|
9 |
import tensorflow |
|
|
10 |
from tensorflow.keras.preprocessing.text import Tokenizer |
|
|
11 |
|
|
|
12 |
# progress bar |
|
|
13 |
from tqdm import tqdm |
|
|
14 |
|
|
|
15 |
# utils imports |
|
|
16 |
from utils.text_handler import TextHandler |
|
|
17 |
from utils.vocabulary import Vocabulary |
|
|
18 |
|
|
|
19 |
|
|
|
20 |
|
|
|
21 |
class Dataset: |
|
|
22 |
def __init__(self, image_vectors:dict, captions_data:dict, clear_long_captions:bool = True): |
|
|
23 |
""" Base class to create the employed dataset for my research, i.e. ImageCLEF and IU X-Ray |
|
|
24 |
|
|
|
25 |
Args: |
|
|
26 |
image_vectors (dict): Dictionary with keys to be the ImageIDs and values the image embeddings. |
|
|
27 |
captions_data (dict): Dictionary with keys to be the ImageIDs and values the captions. |
|
|
28 |
clear_long_captions (bool, optional): If we want to drop the outlier long captions. Defaults to True. |
|
|
29 |
""" |
|
|
30 |
self.image_vectors = image_vectors |
|
|
31 |
self.captions_data = captions_data |
|
|
32 |
self.clear_long_captions = clear_long_captions |
|
|
33 |
# init a text handler object to pre-process training captions |
|
|
34 |
self.text_handler = TextHandler() |
|
|
35 |
|
|
|
36 |
def delete_long_captions(self, data:dict, threshold:int=80) -> dict: |
|
|
37 |
""" Function that removes the long captions only from the training set. This method was utilised during ImageCLEF campaign. |
|
|
38 |
|
|
|
39 |
Args: |
|
|
40 |
data (dict): Dictionary with keys to be the ImageIDs and values the captions. |
|
|
41 |
threshold (int, optional): The maximum length limit. Defaults to 80. |
|
|
42 |
|
|
|
43 |
Returns: |
|
|
44 |
dict: Dictionary with keys to be the ImageIDs and values the captions, without the instances whose captions are long. |
|
|
45 |
""" |
|
|
46 |
filtered_data = {} |
|
|
47 |
|
|
|
48 |
for image_id, caption in data.items(): |
|
|
49 |
tokens = word_tokenize(caption) |
|
|
50 |
if len(tokens) <= threshold: |
|
|
51 |
filtered_data[image_id] = caption |
|
|
52 |
|
|
|
53 |
return filtered_data |
|
|
54 |
|
|
|
55 |
@staticmethod |
|
|
56 |
def build_splits(self) ->tuple[list, list, list]: |
|
|
57 |
""" This function makes the split sets for trainig, validation and test. |
|
|
58 |
In particulare, we followed the next splits: |
|
|
59 |
train: 80% |
|
|
60 |
valid: 5% |
|
|
61 |
test: 15% |
|
|
62 |
|
|
|
63 |
Returns: |
|
|
64 |
tuple[list, list, list]: Training, validation, test set ids. |
|
|
65 |
""" |
|
|
66 |
|
|
|
67 |
image_ids = list( self.captions_data.keys() ) |
|
|
68 |
np.random.shuffle(image_ids) |
|
|
69 |
|
|
|
70 |
test_split_threshold = int(0.15 * len(image_ids)) |
|
|
71 |
train, test = ( |
|
|
72 |
image_ids[:-test_split_threshold], |
|
|
73 |
image_ids[-test_split_threshold:], |
|
|
74 |
) |
|
|
75 |
|
|
|
76 |
dev_split_threshold = int(0.1 * len(train)) |
|
|
77 |
train, dev = ( |
|
|
78 |
train[:-dev_split_threshold], |
|
|
79 |
train[-dev_split_threshold:], |
|
|
80 |
) |
|
|
81 |
|
|
|
82 |
return train, dev, test |
|
|
83 |
|
|
|
84 |
@staticmethod |
|
|
85 |
def get_image_vectors(self, keys:list) -> dict: |
|
|
86 |
""" Fetches from the whole dataset the image embeddings according to the utilised set. |
|
|
87 |
|
|
|
88 |
Args: |
|
|
89 |
keys (list): Split set ids |
|
|
90 |
|
|
|
91 |
Returns: |
|
|
92 |
dict: Dictionary with keys to be the ImageIDs and values the image embeddings, for each split set. |
|
|
93 |
""" |
|
|
94 |
|
|
|
95 |
return { k: v for k, v in tqdm(self.image_vectors.items(), desc="Fetching image embeddings..") if k in keys } |
|
|
96 |
|
|
|
97 |
def get_captions(self, _ids:list) -> dict: |
|
|
98 |
return { key:value for key, value in self.captions_data.items() if key in _ids} |
|
|
99 |
|
|
|
100 |
def build_pseudo_cv_splits(self) -> tuple[list, list]: |
|
|
101 |
""" This function makes cross-validaion splis using K-Fold cross validation. It was used only for ImageCLEF campaign. |
|
|
102 |
More details are described in my Thesis. |
|
|
103 |
|
|
|
104 |
Returns: |
|
|
105 |
tuple[list, list]: Training and test fold sets. |
|
|
106 |
""" |
|
|
107 |
image_ids = list( self.captions_data.keys() ) |
|
|
108 |
np.random.shuffle(image_ids) |
|
|
109 |
|
|
|
110 |
# apply 15-Fold CV |
|
|
111 |
kf = KFold(n_splits=15) |
|
|
112 |
train_fold_ids, test_fold_ids = list(), list() |
|
|
113 |
for train_index, test_index in kf.split(image_ids): |
|
|
114 |
train_ids = [image_ids[index] for index in train_index] |
|
|
115 |
test_ids = [image_ids[index] for index in test_index] |
|
|
116 |
train_fold_ids.append(train_ids) |
|
|
117 |
test_fold_ids.append(test_ids) |
|
|
118 |
|
|
|
119 |
return train_fold_ids, test_fold_ids |
|
|
120 |
|
|
|
121 |
def build_vocab(self, training_captions:list, threshold:int = 3) -> tuple[Vocabulary, Tokenizer, dict, dict]: |
|
|
122 |
""" This method creates the employed vocabulary given the training captions |
|
|
123 |
|
|
|
124 |
Args: |
|
|
125 |
training_captions (list): All training captions |
|
|
126 |
threshold (int, optional): The cut-off frequence for Vocabulary. Defaults to 3. |
|
|
127 |
|
|
|
128 |
Returns: |
|
|
129 |
tuple[Vocabulary, Tokenizer, dict, dict]: The Vocabulary object, the fitted tokenizer, the word-to-idx dictionary, and idx-to-word dictionary. |
|
|
130 |
The latters are mappers for words and index respectively |
|
|
131 |
""" |
|
|
132 |
vocab = Vocabulary(texts=training_captions, threshold=threshold) |
|
|
133 |
tokenizer, word2idx, idx2word = vocab.build_vocab() |
|
|
134 |
return vocab, tokenizer, word2idx, idx2word |
|
|
135 |
|
|
|
136 |
|
|
|
137 |
|
|
|
138 |
class IuXrayDataset(Dataset):
    """ Child class to create the employed IU X-Ray dataset, inheriting the base class methods. """

    def __init__(self, image_vectors: dict, captions_data: dict, tags_data: dict):
        """ Child class to create the employed IU X-Ray, inheriting the base class methods

        Args:
            image_vectors (dict): Dictionary with keys to be the ImageIDs and values the image embeddings.
            captions_data (dict): Dictionary with keys to be the ImageIDs and values the captions.
            tags_data (dict): Dictionary with keys to be the ImageIDs and values the tags embeddings.
        """
        # IU X-Ray keeps all captions, hence clear_long_captions=False
        super().__init__(image_vectors=image_vectors, captions_data=captions_data, clear_long_captions=False)
        self.tags_data = tags_data
        # create the split sets
        self.train_dataset, self.dev_dataset, self.test_dataset = self.build_dataset()
        # build linguistic attributes from the pre-processed training captions
        self.vocab, self.tokenizer, self.word2idx, self.idx2word = super().build_vocab(
            training_captions=list(self.train_dataset[1].values())
        )

    def __str__(self) -> str:
        """ Python built-in function for prints

        Returns:
            str: A modified print.
        """
        named_splits = (
            ("Train", self.train_dataset),
            ("Dev", self.dev_dataset),
            ("Test", self.test_dataset),
        )
        parts = [
            f"{name}: patients={len(split[0])}, captions={len(split[1])}, tags={len(split[2])}"
            for name, split in named_splits
        ]
        return "\n".join(parts)

    def get_splits_sets(self) -> tuple[list, list, list]:
        """ Fetches the data for each split set.

        Returns:
            tuple[list, list, list]: train_dataset, dev_dataset, test_dataset
        """
        return self.train_dataset, self.dev_dataset, self.test_dataset

    def get_tokenizer_utils(self) -> tuple[Vocabulary, Tokenizer, dict, dict]:
        """ Fetches the linguistic utilities.

        Returns:
            tuple[Vocabulary, Tokenizer, dict, dict]: The Vocabulary object, the fitted tokenizer, the word-to-idx dictionary, and idx-to-word dictionary.
            The latter two are mappers for words and indices respectively.
        """
        return self.vocab, self.tokenizer, self.word2idx, self.idx2word

    def __get_tags(self, _ids:list) -> dict:
        """ Fetches from the whole dataset the tags embeddings according to the utilised set.

        Args:
            _ids (list): Split set ids

        Returns:
            dict: Dictionary with keys to be the ImageIDs and values the tags embeddings
        """
        selected = {}
        for tag_key, tag_value in self.tags_data.items():
            if tag_key in _ids:
                selected[tag_key] = tag_value
        return selected

    def build_dataset(self) -> tuple[list, list, list]:
        """ Begins the whole process for the dataset creation.

        Returns:
            tuple[list, list, list]: The training dataset, the validation dataset and the test dataset for our models.
            All sets are in list format.
                1st index --> image vectors
                2nd index --> captions
                3rd index --> tags
        """
        # random split of the ids
        train_ids, dev_ids, test_ids = super().build_splits()

        # image embeddings for each split
        images_train = super().get_image_vectors(train_ids)
        images_dev = super().get_image_vectors(dev_ids)
        images_test = super().get_image_vectors(test_ids)

        # captions for each split
        captions_train = super().get_captions(train_ids)
        captions_dev = super().get_captions(dev_ids)
        captions_test = super().get_captions(test_ids)

        # pre-process only the training captions, keeping the id -> caption mapping
        preprocessed = self.text_handler.preprocess_all(list(captions_train.values()))
        captions_train_prepro = dict(zip(train_ids, preprocessed))

        # tags for each split
        tags_train = self.__get_tags(train_ids)
        tags_dev = self.__get_tags(dev_ids)
        tags_test = self.__get_tags(test_ids)

        # assemble [images, captions, tags] per split
        return (
            [images_train, captions_train_prepro, tags_train],
            [images_dev, captions_dev, tags_dev],
            [images_test, captions_test, tags_test],
        )
|
|
231 |
|
|
|
232 |
|
|
|
233 |
|
|
|
234 |
class ImageCLEFDataset(Dataset):
    """ Child class to create the employed ImageCLEF dataset, inheriting the base class methods. """

    def __init__(self, image_vectors: dict, captions_data: dict):
        """ Child class to create the employed ImageCLEF dataset, inheriting the base class methods.

        Args:
            image_vectors (dict): Dictionary with keys to be the ImageIDs and values the image embeddings.
            captions_data (dict): Dictionary with keys to be the ImageIDs and values the captions.
        """
        # ImageCLEF drops outlier long captions from the training set, hence clear_long_captions=True
        super().__init__(image_vectors=image_vectors, captions_data=captions_data, clear_long_captions=True)
        # get the splits
        self.train_dataset, self.dev_dataset, self.test_dataset = self.build_dataset()
        # build linguistic attributes from the pre-processed training captions
        self.vocab, self.tokenizer, self.word2idx, self.idx2word = super().build_vocab(training_captions=list(self.train_dataset[1].values()))

    def __str__(self) -> str:
        """ Python built-in function for prints

        Returns:
            str: A modified print.
        """
        text = f"Train: patients={len(self.train_dataset[0])}, captions={len(self.train_dataset[1])}"
        text += f"\nDev: patients={len(self.dev_dataset[0])}, captions={len(self.dev_dataset[1])}"
        text += f"\nTest: patients={len(self.test_dataset[0])}, captions={len(self.test_dataset[1])}"
        return text

    def get_splits_sets(self) -> tuple[list, list, list]:
        """ Fetches the data for each split set.

        Returns:
            tuple[list, list, list]: train_dataset, dev_dataset, test_dataset
        """
        return self.train_dataset, self.dev_dataset, self.test_dataset

    def get_tokenizer_utils(self) -> tuple[Vocabulary, Tokenizer, dict, dict]:
        """ Fetches the linguistic utilities.

        Returns:
            tuple[Vocabulary, Tokenizer, dict, dict]: The Vocabulary object, the fitted tokenizer, the word-to-idx dictionary, and idx-to-word dictionary.
            The latter two are mappers for words and indices respectively.
        """
        return self.vocab, self.tokenizer, self.word2idx, self.idx2word

    def build_dataset(self) -> tuple[list, list, list]:
        """ Begins the whole process for the dataset creation.

        Returns:
            tuple[list, list, list]: The training dataset, the validation dataset and the test dataset for our models.
            All sets are in list format.
                1st index --> image vectors
                2nd index --> captions
        """
        # random split
        train_ids, dev_ids, test_ids = super().build_splits()
        # fetch images
        train_images = super().get_image_vectors(train_ids)
        dev_images = super().get_image_vectors(dev_ids)
        test_images = super().get_image_vectors(test_ids)
        # fetch captions
        train_captions = super().get_captions(train_ids)
        dev_captions = super().get_captions(dev_ids)
        test_captions = super().get_captions(test_ids)

        # remove long outlier captions from the training set only
        train_modified_captions = super().delete_long_captions(data=train_captions)
        # get new training ids after the removal, and keep only the matching images
        train_new_ids = list(train_modified_captions.keys())
        train_new_images = {
            key:image_vector for key, image_vector in train_images.items() if key in train_new_ids
        }
        # apply preprocess to training captions, keeping the id -> caption mapping
        train_captions_prepro = self.text_handler.preprocess_all(
            list(train_modified_captions.values()))

        train_captions_prepro = dict( zip( train_new_ids, train_captions_prepro ) )
        # build data for each set (dev/test captions stay unprocessed)
        train_dataset = [train_new_images, train_captions_prepro]
        dev_dataset = [dev_images, dev_captions]
        test_dataset = [test_images, test_captions]

        return train_dataset, dev_dataset, test_dataset
|
|
314 |
|
|
|
315 |
|