Diff of /modules/greedy_search.py [000000] .. [03245f]

import numpy as np
# tensorflow imports
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Model


class GreedySearch:
    def __init__(self, start_token:str, end_token:str, max_length:int, tokenizer:Tokenizer, idx_to_word:dict, word_to_idx:dict):
        """ The Greedy Search sampling method for generating captions.

        Args:
            start_token (str): The start token used during the pre-processing of the training captions
            end_token (str): The end token used during the pre-processing of the training captions
            max_length (int): The maximum length (limit) for the generated captions
            tokenizer (Tokenizer): The fitted tokenizer from the Vocabulary object
            idx_to_word (dict): Dictionary whose keys are index numbers and whose values are the words of the created vocabulary
            word_to_idx (dict): Dictionary whose keys are words and whose values are their index numbers in the created vocabulary
        """
        self.start_token = start_token
        self.end_token = end_token
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.idx_to_word = idx_to_word
        self.word_to_idx = word_to_idx
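    # A minimal instantiation sketch (all names below are hypothetical, assuming a fitted
    # Vocabulary object that exposes the tokenizer and the two lookup dictionaries):
    #
    #   searcher = GreedySearch(start_token='startseq', end_token='endseq', max_length=40,
    #                           tokenizer=vocab.tokenizer, idx_to_word=vocab.idx_to_word,
    #                           word_to_idx=vocab.word_to_idx)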

    def get_word(self, idx:int) -> str:
        """ Fetches the word from the index-to-word vocab, which was created during the pre-processing of the training captions.

        Args:
            idx (int): The index to look up in the index-to-word vocab.

        Returns:
            str: The word for the given index if it exists in the created index-to-word vocab, else None
        """
        return self.idx_to_word.get(idx, None)

    def get_idx(self, word:str) -> int:
        """ Fetches the index number from the word-to-index vocab, which was created during the pre-processing of the training captions.

        Args:
            word (str): The word whose index we want to look up in the word-to-index dictionary.

        Returns:
            int: The index for the given word if it exists in the created word-to-index vocab, else -1. The latter value stands for None (word not found).
        """
        return self.word_to_idx.get(word, -1)

    def greedy_search_predict(self, model:Model, photo:np.ndarray, tag:np.ndarray, dataset:str='iuxray', multi_modal:bool=False) -> str:
        """ Executes the greedy search algorithm, employing the pre-trained model along with the test instance's data.

        Args:
            model (Model): The model we want to evaluate on our employed dataset
            photo (np.ndarray): The image embedding of the current test instance
            tag (np.ndarray): The tag embedding of the current test instance. This is used only for the IU X-Ray dataset.
            dataset (str, optional): The dataset we employed for the model. Defaults to 'iuxray'.
            multi_modal (bool, optional): Whether to use the multi-modal version of the model. This is used only for the IU X-Ray dataset. Defaults to False.

        Returns:
            str: The generated description for the given image
        """
        # seed the generation process
        in_text = self.start_token
        # iterate over the whole length of the sequence
        for i in range(self.max_length):
            # integer encode input sequence
            sequence = self.tokenizer.texts_to_sequences([in_text])[0]
            # pad input
            sequence = tf.keras.preprocessing.sequence.pad_sequences(
                [sequence], maxlen=self.max_length
            )
            # predict next word (the multi-modal branch supports only the IU X-Ray dataset)
            if multi_modal:
                if dataset == 'iuxray':
                    yhat = model.predict([photo[0], photo[1], tag, sequence], verbose=0)
            else:
                if dataset == 'iuxray':
                    yhat = model.predict([photo[0], photo[1], sequence], verbose=0)
                else:
                    yhat = model.predict([photo, sequence], verbose=0)

            # convert probability to integer
            yhat = np.argmax(yhat)
            # map integer to word
            word = self.get_word(yhat)
            # stop if we cannot map the word
            if word is None:
                break
            # append as input for generating the next word
            in_text += " " + word
            # stop if we predict the end of the sequence
            if word == self.end_token:
                break
        return in_text
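    # A rough usage sketch for the single-model case (variable names are hypothetical;
    # for IU X-Ray, `photo` is assumed to hold the two view embeddings at indices 0 and 1):
    #
    #   caption = searcher.greedy_search_predict(model, photo=image_embedding,
    #                                            tag=tag_embedding, dataset='iuxray',
    #                                            multi_modal=False)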

    def greedy_search_ensembles_AP(self, models:list, photos:list, tags:list, dataset:str='iuxray', multi_modal:bool=False) -> str:
        """ Executes the Average Probability Greedy Search algorithm, employing the pre-trained models along with the test instance's data.
        More details are provided in my Thesis. Acknowledgements: https://ieeexplore.ieee.org/document/9031513

        Args:
            models (list): The models we want to evaluate on our employed dataset
            photos (list): The image embeddings of the current test instance, one for each encoder we used.
            tags (list): The tag embeddings of the current test instance, one for each encoder we used.
            dataset (str, optional): The dataset we employed for the model. Defaults to 'iuxray'.
            multi_modal (bool, optional): Whether to use the multi-modal version of the model. This is used only for the IU X-Ray dataset. Defaults to False.

        Returns:
            str: The generated description for the given image ID.
        """
        # seed the generation process
        in_text = self.start_token
        # iterate over the whole length of the sequence
        for i in range(self.max_length):
            # integer encode input sequence
            sequence = self.tokenizer.texts_to_sequences([in_text])[0]
            # pad input
            sequence = tf.keras.preprocessing.sequence.pad_sequences(
                [sequence], maxlen=self.max_length
            )
            # predict next word with every model (the multi-modal branch supports only the IU X-Ray dataset)
            if multi_modal:
                if dataset == 'iuxray':
                    yhats = [model.predict([photo[0], photo[1], tags, sequence], verbose=0) for model, photo in zip(models, photos)]
            else:
                if dataset == 'iuxray':
                    yhats = [model.predict([photo[0], photo[1], sequence], verbose=0) for model, photo in zip(models, photos)]
                else:
                    yhats = [model.predict([photo, sequence], verbose=0) for model, photo in zip(models, photos)]

            # sum the probability distributions of all models
            summed = np.sum(yhats, axis=0)
            # convert probability to integer
            yhat = np.argmax(summed, axis=1)

            # map integer to word
            word = self.get_word(yhat[0])

            # stop if we cannot map the word
            if word is None:
                break
            # append as input for generating the next word
            in_text += " " + word
            # stop if we predict the end of the sequence
            if word == self.end_token:
                break

        return in_text
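    # Toy illustration of the averaging step above, with assumed numbers (two models,
    # a three-word vocabulary): the per-model distributions are summed and the argmax of
    # the sum picks the word the ensemble is jointly most confident about.
    #
    #   yhats = [np.array([[0.2, 0.5, 0.3]]), np.array([[0.1, 0.3, 0.6]])]
    #   np.argmax(np.sum(yhats, axis=0), axis=1)   # summed row is [0.3, 0.8, 0.9] -> array([2])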

    def greedy_search_ensembles_MVP(self, models:list, photos:list, tags:list, dataset:str='iuxray', multi_modal:bool=False) -> str:
        """ Executes the Maximum Voting Probability Greedy Search algorithm, employing the pre-trained models along with the test instance's data.
        More details are provided in my Thesis. Acknowledgements: https://ieeexplore.ieee.org/document/9031513

        Args:
            models (list): The models we want to evaluate on our employed dataset
            photos (list): The image embeddings of the current test instance, one for each encoder we used.
            tags (list): The tag embeddings of the current test instance, one for each encoder we used.
            dataset (str, optional): The dataset we employed for the model. Defaults to 'iuxray'.
            multi_modal (bool, optional): Whether to use the multi-modal version of the model. This is used only for the IU X-Ray dataset. Defaults to False.

        Returns:
            str: The generated description for the given image ID.
        """
        # seed the generation process
        in_text = self.start_token
        # iterate over the whole length of the sequence
        for i in range(self.max_length):
            pred = []
            for index, each_model in enumerate(models):
                # integer encode input sequence
                sequence = self.tokenizer.texts_to_sequences([in_text])[0]
                # pad input
                sequence = tf.keras.preprocessing.sequence.pad_sequences(
                    [sequence], maxlen=self.max_length
                )
                # predict next word with the current model (the multi-modal branch supports only the IU X-Ray dataset)
                if multi_modal:
                    if dataset == 'iuxray':
                        yhat = each_model.predict([photos[index][0], photos[index][1], tags, sequence], verbose=0)
                else:
                    if dataset == 'iuxray':
                        yhat = each_model.predict([photos[index][0], photos[index][1], sequence], verbose=0)
                    else:
                        yhat = each_model.predict([photos[index], sequence], verbose=0)
                pred.append(np.argmax(yhat))

            # predict next word: majority vote over the per-model predictions
            yhat = max(pred, key=pred.count)

            # map integer to word
            word = self.get_word(yhat)

            # stop if we cannot map the word
            if word is None:
                break
            # append as input for generating the next word
            in_text += " " + word
            # stop if we predict the end of the sequence
            if word == self.end_token:
                break

        return in_text
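    # Toy illustration of the voting step above, with assumed numbers: each model votes
    # with the argmax of its own distribution and the most frequent index wins.
    #
    #   pred = [7, 3, 7]              # per-model argmax indices
    #   max(pred, key=pred.count)     # -> 7 (two of the three models agree)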

    def greedy_search_ensembles_MP(self, models:list, photos:list, tags:list, dataset:str='iuxray', multi_modal:bool=False) -> str:
        """ Executes the Maximum Probability Greedy Search algorithm, employing the pre-trained models along with the test instance's data.
        More details are provided in my Thesis.

        Args:
            models (list): The models we want to evaluate on our employed dataset
            photos (list): The image embeddings of the current test instance, one for each encoder we used.
            tags (list): The tag embeddings of the current test instance, one for each encoder we used.
            dataset (str, optional): The dataset we employed for the model. Defaults to 'iuxray'.
            multi_modal (bool, optional): Whether to use the multi-modal version of the model. This is used only for the IU X-Ray dataset. Defaults to False.

        Returns:
            str: The generated description for the given image ID.
        """
        # seed the generation process
        in_text = self.start_token
        # iterate over the whole length of the sequence
        for i in range(self.max_length):
            pred = []
            max_value = []
            for index, each_model in enumerate(models):
                # integer encode input sequence
                sequence = self.tokenizer.texts_to_sequences([in_text])[0]
                # pad input
                sequence = tf.keras.preprocessing.sequence.pad_sequences(
                    [sequence], maxlen=self.max_length
                )
                # predict next word with the current model (the multi-modal branch supports only the IU X-Ray dataset)
                if multi_modal:
                    if dataset == 'iuxray':
                        yhat = each_model.predict([photos[index][0], photos[index][1], tags, sequence], verbose=0)
                else:
                    if dataset == 'iuxray':
                        yhat = each_model.predict([photos[index][0], photos[index][1], sequence], verbose=0)
                    else:
                        yhat = each_model.predict([photos[index], sequence], verbose=0)
                # keep each model's peak probability and the index it proposes
                max_value.append(np.amax(yhat))
                pred.append(np.argmax(yhat))

            # predict next word: keep the prediction of the most confident model
            max_prob = max(max_value)
            max_index = max_value.index(max_prob)
            yhat = pred[max_index]

            # map integer to word
            word = self.get_word(yhat)

            # stop if we cannot map the word
            if word is None:
                break
            # append as input for generating the next word
            in_text += " " + word
            # stop if we predict the end of the sequence
            if word == self.end_token:
                break

        return in_text
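    # Toy illustration of the Maximum Probability selection above, with assumed numbers:
    # the ensemble keeps the word index proposed by whichever model is most confident.
    #
    #   max_value = [0.42, 0.91, 0.67]             # per-model peak probabilities
    #   pred      = [12, 5, 12]                    # per-model argmax indices
    #   pred[max_value.index(max(max_value))]      # -> 5 (the second model is most confident)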