# utils/dataset.py

# sklearn and nltk imports
from sklearn.model_selection import KFold
from nltk.tokenize import word_tokenize
import numpy as np
import nltk
nltk.download("punkt", quiet=True)

# tensorflow imports
from tensorflow.keras.preprocessing.text import Tokenizer

# progress bar
from tqdm import tqdm

# utils imports
from utils.text_handler import TextHandler
from utils.vocabulary import Vocabulary


class Dataset:
    def __init__(self, image_vectors:dict, captions_data:dict, clear_long_captions:bool = True):
        """ Base class for the datasets employed in this research, i.e. ImageCLEF and IU X-Ray.

        Args:
            image_vectors (dict): Dictionary with keys being the ImageIDs and values the image embeddings.
            captions_data (dict): Dictionary with keys being the ImageIDs and values the captions.
            clear_long_captions (bool, optional): Whether to drop outlier long captions. Defaults to True.
        """
        self.image_vectors = image_vectors
        self.captions_data = captions_data
        self.clear_long_captions = clear_long_captions
        # init a text handler object to pre-process training captions
        self.text_handler = TextHandler()

    def delete_long_captions(self, data:dict, threshold:int=80) -> dict:
        """ Removes the long captions from the training set only. This method was utilised during the ImageCLEF campaign.

        Args:
            data (dict): Dictionary with keys being the ImageIDs and values the captions.
            threshold (int, optional): The maximum caption length in tokens. Defaults to 80.

        Returns:
            dict: Dictionary with keys being the ImageIDs and values the captions, without the instances whose captions exceed the threshold.
        """
        filtered_data = {}

        for image_id, caption in data.items():
            tokens = word_tokenize(caption)
            if len(tokens) <= threshold:
                filtered_data[image_id] = caption

        return filtered_data
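    # For example (illustrative only): with the default threshold of 80, a
    # caption that word_tokenize splits into 81 tokens is dropped, while a
    # caption of exactly 80 tokens is kept, since the comparison is inclusive.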
    
    def build_splits(self) -> tuple[list, list, list]:
        """ Creates the split sets for training, validation and testing.
        In particular, the splits below produce the following proportions:
        train: ~76.5%
        valid: ~8.5%
        test: 15%

        Returns:
            tuple[list, list, list]: Training, validation and test set ids.
        """
        image_ids = list( self.captions_data.keys() )
        np.random.shuffle(image_ids)

        test_split_threshold = int(0.15 * len(image_ids))
        train, test = (
            image_ids[:-test_split_threshold],
            image_ids[-test_split_threshold:],
        )

        dev_split_threshold = int(0.1 * len(train))
        train, dev = (
            train[:-dev_split_threshold],
            train[-dev_split_threshold:],
        )

        return train, dev, test
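    # Worked example (illustrative): with 1,000 image ids,
    # test_split_threshold = int(0.15 * 1000) = 150, so the last 150 shuffled
    # ids become the test set; dev_split_threshold = int(0.1 * 850) = 85,
    # leaving 765 train / 85 dev / 150 test, i.e. 76.5% / 8.5% / 15%.
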
    def get_image_vectors(self, keys:list) -> dict:
        """ Fetches from the whole dataset the image embeddings according to the utilised set.

        Args:
            keys (list): Split set ids

        Returns:
            dict: Dictionary with keys being the ImageIDs and values the image embeddings, for each split set.
        """
        keys = set(keys)  # set membership keeps the lookup O(1) per image id
        return { k: v for k, v in tqdm(self.image_vectors.items(), desc="Fetching image embeddings..") if k in keys }

    def get_captions(self, _ids:list) -> dict:
        """ Fetches from the whole dataset the captions according to the utilised set.

        Args:
            _ids (list): Split set ids

        Returns:
            dict: Dictionary with keys being the ImageIDs and values the captions, for each split set.
        """
        return { key:value for key, value in self.captions_data.items() if key in _ids}

    def build_pseudo_cv_splits(self) -> tuple[list, list]:
        """ Creates cross-validation splits using K-Fold cross-validation. It was used only for the ImageCLEF campaign.
        More details are described in my Thesis.

        Returns:
            tuple[list, list]: Training and test fold sets.
        """
        image_ids = list( self.captions_data.keys() )
        np.random.shuffle(image_ids)

        # apply 15-Fold CV
        kf = KFold(n_splits=15)
        train_fold_ids, test_fold_ids = list(), list()
        for train_index, test_index in kf.split(image_ids):
            train_ids = [image_ids[index] for index in train_index]
            test_ids = [image_ids[index] for index in test_index]
            train_fold_ids.append(train_ids)
            test_fold_ids.append(test_ids)

        return train_fold_ids, test_fold_ids
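    # Worked example (illustrative): with 1,500 image ids and 15 folds, each
    # iteration of kf.split yields 1,400 training ids and 100 test ids, so
    # train_fold_ids and test_fold_ids each end up holding 15 id lists.
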
    def build_vocab(self, training_captions:list, threshold:int = 3) -> tuple[Vocabulary, Tokenizer, dict, dict]:
        """ Creates the employed vocabulary given the training captions.

        Args:
            training_captions (list): All training captions
            threshold (int, optional): The cut-off frequency for the Vocabulary. Defaults to 3.

        Returns:
            tuple[Vocabulary, Tokenizer, dict, dict]: The Vocabulary object, the fitted tokenizer, the word-to-idx dictionary, and the idx-to-word dictionary.
            The latter two map words to indices and indices to words, respectively.
        """
        vocab = Vocabulary(texts=training_captions, threshold=threshold)
        tokenizer, word2idx, idx2word = vocab.build_vocab()
        return vocab, tokenizer, word2idx, idx2word
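    # Usage sketch (hedged): the exact index values depend on the Vocabulary
    # implementation, but the two mappers are inverses of each other, e.g.
    #   vocab, tokenizer, word2idx, idx2word = self.build_vocab(captions)
    #   idx = word2idx["opacity"]       # some integer id
    #   assert idx2word[idx] == "opacity"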


class IuXrayDataset(Dataset):
    def __init__(self, image_vectors: dict, captions_data: dict, tags_data: dict):
        """ Child class for the IU X-Ray dataset, inheriting the base class methods.

        Args:
            image_vectors (dict): Dictionary with keys being the ImageIDs and values the image embeddings.
            captions_data (dict): Dictionary with keys being the ImageIDs and values the captions.
            tags_data (dict): Dictionary with keys being the ImageIDs and values the tag embeddings.
        """
        super().__init__(image_vectors=image_vectors, captions_data=captions_data, clear_long_captions=False)
        self.tags_data = tags_data
        # get the splits
        self.train_dataset, self.dev_dataset, self.test_dataset = self.build_dataset()
        # build linguistic attributes
        self.vocab, self.tokenizer, self.word2idx, self.idx2word = super().build_vocab(training_captions=list(self.train_dataset[1].values()))

    def __str__(self) -> str:
        """ Python built-in method for string representation.

        Returns:
            str: A formatted summary of the split sizes.
        """
        text = f"Train: patients={len(self.train_dataset[0])}, captions={len(self.train_dataset[1])}, tags={len(self.train_dataset[2])}"
        text += f"\nDev: patients={len(self.dev_dataset[0])}, captions={len(self.dev_dataset[1])}, tags={len(self.dev_dataset[2])}"
        text += f"\nTest: patients={len(self.test_dataset[0])}, captions={len(self.test_dataset[1])}, tags={len(self.test_dataset[2])}"
        return text

    def get_splits_sets(self) -> tuple[list, list, list]:
        """ Fetches the data for each split set.

        Returns:
            tuple[list, list, list]: train_dataset, dev_dataset, test_dataset
        """
        return self.train_dataset, self.dev_dataset, self.test_dataset

    def get_tokenizer_utils(self) -> tuple[Vocabulary, Tokenizer, dict, dict]:
        """ Fetches the linguistic utilities.

        Returns:
            tuple[Vocabulary, Tokenizer, dict, dict]: The Vocabulary object, the fitted tokenizer, the word-to-idx dictionary, and the idx-to-word dictionary.
            The latter two map words to indices and indices to words, respectively.
        """
        return self.vocab, self.tokenizer, self.word2idx, self.idx2word

    def __get_tags(self, _ids:list) -> dict:
        """ Fetches from the whole dataset the tag embeddings according to the utilised set.

        Args:
            _ids (list): Split set ids

        Returns:
            dict: Dictionary with keys being the ImageIDs and values the tag embeddings.
        """
        return { key:value for key, value in self.tags_data.items() if key in _ids}

    def build_dataset(self) -> tuple[list, list, list]:
        """ Begins the whole dataset creation process.

        Returns:
            tuple[list, list, list]: The training dataset, the validation dataset and the test dataset for our models.
            All sets are in list format.
            1st index --> image vectors
            2nd index --> captions
            3rd index --> tags
        """
        # random split
        train_ids, dev_ids, test_ids = super().build_splits()

        # fetch images
        train_images = super().get_image_vectors(train_ids)
        dev_images = super().get_image_vectors(dev_ids)
        test_images = super().get_image_vectors(test_ids)
        # fetch captions
        train_captions = super().get_captions(train_ids)
        dev_captions = super().get_captions(dev_ids)
        test_captions = super().get_captions(test_ids)
        # apply preprocessing to the training captions
        train_captions_prepro = self.text_handler.preprocess_all(
            list(train_captions.values()))
        # zip against train_captions' own keys, so that each preprocessed
        # caption is re-attached to the id it came from (train_ids may be
        # ordered differently than the fetched dictionary)
        train_captions_prepro = dict( zip( list(train_captions.keys()), train_captions_prepro ) )
        # fetch tags
        train_tags = self.__get_tags(train_ids)
        dev_tags = self.__get_tags(dev_ids)
        test_tags = self.__get_tags(test_ids)
        # build data for each set
        train_dataset = [train_images, train_captions_prepro, train_tags]
        dev_dataset = [dev_images, dev_captions, dev_tags]
        test_dataset = [test_images, test_captions, test_tags]

        return train_dataset, dev_dataset, test_dataset


class ImageCLEFDataset(Dataset):
    def __init__(self, image_vectors: dict, captions_data: dict):
        """ Child class for the ImageCLEF dataset, inheriting the base class methods.

        Args:
            image_vectors (dict): Dictionary with keys being the ImageIDs and values the image embeddings.
            captions_data (dict): Dictionary with keys being the ImageIDs and values the captions.
        """
        super().__init__(image_vectors=image_vectors, captions_data=captions_data, clear_long_captions=True)
        # get the splits
        self.train_dataset, self.dev_dataset, self.test_dataset = self.build_dataset()
        # build linguistic attributes
        self.vocab, self.tokenizer, self.word2idx, self.idx2word = super().build_vocab(training_captions=list(self.train_dataset[1].values()))

    def __str__(self) -> str:
        """ Python built-in method for string representation.

        Returns:
            str: A formatted summary of the split sizes.
        """
        text = f"Train: patients={len(self.train_dataset[0])}, captions={len(self.train_dataset[1])}"
        text += f"\nDev: patients={len(self.dev_dataset[0])}, captions={len(self.dev_dataset[1])}"
        text += f"\nTest: patients={len(self.test_dataset[0])}, captions={len(self.test_dataset[1])}"
        return text

    def get_splits_sets(self) -> tuple[list, list, list]:
        """ Fetches the data for each split set.

        Returns:
            tuple[list, list, list]: train_dataset, dev_dataset, test_dataset
        """
        return self.train_dataset, self.dev_dataset, self.test_dataset

    def get_tokenizer_utils(self) -> tuple[Vocabulary, Tokenizer, dict, dict]:
        """ Fetches the linguistic utilities.

        Returns:
            tuple[Vocabulary, Tokenizer, dict, dict]: The Vocabulary object, the fitted tokenizer, the word-to-idx dictionary, and the idx-to-word dictionary.
            The latter two map words to indices and indices to words, respectively.
        """
        return self.vocab, self.tokenizer, self.word2idx, self.idx2word

    def build_dataset(self) -> tuple[list, list, list]:
        """ Begins the whole dataset creation process.

        Returns:
            tuple[list, list, list]: The training dataset, the validation dataset and the test dataset for our models.
            All sets are in list format.
            1st index --> image vectors
            2nd index --> captions
        """
        # random split
        train_ids, dev_ids, test_ids = super().build_splits()
        # fetch images
        train_images = super().get_image_vectors(train_ids)
        dev_images = super().get_image_vectors(dev_ids)
        test_images = super().get_image_vectors(test_ids)
        # fetch captions
        train_captions = super().get_captions(train_ids)
        dev_captions = super().get_captions(dev_ids)
        test_captions = super().get_captions(test_ids)

        # remove long outlier captions from the training set
        train_modified_captions = super().delete_long_captions(data=train_captions)
        # get the new training ids after the removal
        train_new_ids = list(train_modified_captions.keys())
        train_new_images = {
            key:image_vector for key, image_vector in train_images.items() if key in train_new_ids
        }
        # apply preprocessing to the training captions
        train_captions_prepro = self.text_handler.preprocess_all(
            list(train_modified_captions.values()))
        # re-attach each preprocessed caption to its id; the order matches
        # because train_new_ids comes from the same dictionary
        train_captions_prepro = dict( zip( train_new_ids, train_captions_prepro ) )
        # build data for each set
        train_dataset = [train_new_images, train_captions_prepro]
        dev_dataset = [dev_images, dev_captions]
        test_dataset = [test_images, test_captions]

        return train_dataset, dev_dataset, test_dataset

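
# A minimal smoke-test sketch (illustrative only): the toy ids, the 512-dim
# random vectors and the repeated caption below are made up, and the base
# Dataset constructor still needs utils.text_handler.TextHandler to be
# importable. Expected sizes for 100 ids are deterministic: train=77, dev=8,
# test=15, given the int truncation in build_splits.
if __name__ == "__main__":
    toy_vectors = {f"img_{i}": np.random.rand(512) for i in range(100)}
    toy_captions = {f"img_{i}": "no acute cardiopulmonary abnormality" for i in range(100)}

    dataset = Dataset(image_vectors=toy_vectors, captions_data=toy_captions)

    train_ids, dev_ids, test_ids = dataset.build_splits()
    print(f"train={len(train_ids)}, dev={len(dev_ids)}, test={len(test_ids)}")

    train_folds, test_folds = dataset.build_pseudo_cv_splits()
    print(f"folds={len(train_folds)}, first test fold size={len(test_folds[0])}")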