|
a |
|
b/modules/greedy_search.py |
|
|
1 |
import numpy as np |
|
|
2 |
# tensorflow imports |
|
|
3 |
import tensorflow as tf |
|
|
4 |
import tensorflow |
|
|
5 |
from tensorflow.keras.preprocessing.text import Tokenizer |
|
|
6 |
from tensorflow.keras.models import Model |
|
|
7 |
|
|
|
8 |
class GreedySearch:

    def __init__(self, start_token:str, end_token:str, max_length:int, tokenizer:Tokenizer, idx_to_word:dict, word_to_idx:dict):
        """ Greedy Search decoder for caption generation.

        Stores the vocabulary artefacts produced while pre-processing the
        training captions, so the sampling methods can encode partial
        captions and map predicted indices back to words.

        Args:
            start_token (str): The start-token used during pre-processing of the training captions
            end_token (str): The end-token used during pre-processing of the training captions
            max_length (int): The maximum length (limit) for the generated captions
            tokenizer (Tokenizer): The fitted tokenizer from the Vocabulary object
            idx_to_word (dict): Dictionary with keys to be the index number and values the words in the created vocabulary
            word_to_idx (dict): Dictionary with keys to be the words and values the index number in the created vocabulary
        """
        # Sentinel tokens that delimit every generated caption.
        self.start_token = start_token
        self.end_token = end_token
        # Hard cap on the number of decoding steps per caption.
        self.max_length = max_length
        # Fitted Keras tokenizer used to integer-encode partial captions.
        self.tokenizer = tokenizer
        # Bidirectional vocabulary lookup tables.
        self.idx_to_word = idx_to_word
        self.word_to_idx = word_to_idx
|
|
26 |
|
|
|
27 |
def get_word(self, idx:int) -> str: |
|
|
28 |
""" Fetches the word from the index-to-word vocab, which was created after the pre-processing of the Training captions |
|
|
29 |
|
|
|
30 |
Args: |
|
|
31 |
idx (int): The index for the index-to-word vocab. |
|
|
32 |
|
|
|
33 |
Returns: |
|
|
34 |
str: The word for the given index if exist in the created index-to-word vocab, else None |
|
|
35 |
""" |
|
|
36 |
return self.idx_to_word.get(idx, None) |
|
|
37 |
|
|
|
38 |
def get_idx(self, word:str)->int: |
|
|
39 |
""" Fetches the index number from the word-to-index vocab, which was created after the pre-processing of the Training captions |
|
|
40 |
|
|
|
41 |
Args: |
|
|
42 |
word (str): The word for which we want its index in the word-to-index dictionary. |
|
|
43 |
|
|
|
44 |
Returns: |
|
|
45 |
int: The index for the given word if exist in the created word-to-index vocab, else -1. The latter number refer to None |
|
|
46 |
""" |
|
|
47 |
return self.word_to_idx.get(word, -1) |
|
|
48 |
|
|
|
49 |
def greedy_search_predict(self, model:Model, photo:np.array, tag:np.array, dataset:str='iuxray', multi_modal:bool=False)->str: |
|
|
50 |
""" Executes the greedy search algorithm, employing the pre-trained model along with the test instance's data. |
|
|
51 |
|
|
|
52 |
Args: |
|
|
53 |
model (Model): The model we want to evaluate on our employed dataset |
|
|
54 |
photo (np.array): Current test image embedding |
|
|
55 |
tag (np.array): The tag embedding for the current test instance. This is used only for IU X-Ray dataset. |
|
|
56 |
dataset (str, optional): The dataset we employed for the model. Defaults to 'iuxray'. |
|
|
57 |
multi_modal (bool, optional): If we want to use the multi-modal version of model. This is used only for IU X-Ray dataset. Defaults to False. |
|
|
58 |
|
|
|
59 |
Returns: |
|
|
60 |
str: The generated description for the given image |
|
|
61 |
""" |
|
|
62 |
# seed the generation process |
|
|
63 |
in_text = self.start_token |
|
|
64 |
# iterate over the whole length of the sequence |
|
|
65 |
for i in range(self.max_length): |
|
|
66 |
# integer encode input sequence |
|
|
67 |
sequence = self.tokenizer.texts_to_sequences([in_text])[0] |
|
|
68 |
# pad input |
|
|
69 |
sequence = tf.keras.preprocessing.sequence.pad_sequences( |
|
|
70 |
[sequence], maxlen=self.max_length |
|
|
71 |
) |
|
|
72 |
# predict next word |
|
|
73 |
if multi_modal: |
|
|
74 |
if dataset=='iuxray': |
|
|
75 |
yhat = model.predict([photo[0], photo[1], tag, sequence], verbose=0) |
|
|
76 |
else: |
|
|
77 |
if dataset=='iuxray': |
|
|
78 |
yhat = model.predict([photo[0], photo[1], sequence], verbose=0) |
|
|
79 |
else: |
|
|
80 |
yhat = model.predict([photo, sequence], verbose=0) |
|
|
81 |
|
|
|
82 |
|
|
|
83 |
# convert probability to integer |
|
|
84 |
yhat = np.argmax(yhat) |
|
|
85 |
# map integer to word |
|
|
86 |
word = self.get_word(yhat) |
|
|
87 |
# stop if we cannot map the word |
|
|
88 |
if word is None: |
|
|
89 |
break |
|
|
90 |
# append as input for generating the next word |
|
|
91 |
in_text += " " + word |
|
|
92 |
# stop if we predict the end of the sequence |
|
|
93 |
if word == self.end_token: |
|
|
94 |
break |
|
|
95 |
return in_text |
|
|
96 |
|
|
|
97 |
def greedy_search_ensembles_AP(self, models:list, photos:list, tags:list, dataset:str='iuxray', multi_modal:bool=False)->str: |
|
|
98 |
""" Executes the Average Probability Greedy Search algorithm employing the pre-trained models along with the test instances data. |
|
|
99 |
More details are provided in my Thesis. Acknowledgements: https://ieeexplore.ieee.org/document/9031513 |
|
|
100 |
|
|
|
101 |
Args: |
|
|
102 |
models (list): The models we want to evaluate on our employed dataset |
|
|
103 |
photos (list): Current test images embeddings for each encoder we used. |
|
|
104 |
tags (list): Current test tags embeddings for each encoder we used. |
|
|
105 |
dataset (str, optional): The dataset we employed for the model. Defaults to 'iuxray'. |
|
|
106 |
multi_modal (bool, optional): If we want to use the multi-modal version of model. This is used only for IU X-Ray dataset. Defaults to False. |
|
|
107 |
|
|
|
108 |
Returns: |
|
|
109 |
str: The generated description for the given image ID. |
|
|
110 |
""" |
|
|
111 |
|
|
|
112 |
# seed the generation process |
|
|
113 |
in_text = self.start_token |
|
|
114 |
# iterate over the whole length of the sequence |
|
|
115 |
for i in range(self.max_length): |
|
|
116 |
# integer encode input sequence |
|
|
117 |
sequence = self.tokenizer.texts_to_sequences([in_text])[0] |
|
|
118 |
# pad input |
|
|
119 |
sequence = tf.keras.preprocessing.sequence.pad_sequences( |
|
|
120 |
[sequence], maxlen=self.max_length |
|
|
121 |
) |
|
|
122 |
# predict next word |
|
|
123 |
|
|
|
124 |
if multi_modal: |
|
|
125 |
if dataset=='iuxray': |
|
|
126 |
yhats = [model.predict([photo[0], photo[1], tags, sequence], verbose=0) for model, photo in zip(models, photos)] |
|
|
127 |
else: |
|
|
128 |
if dataset=='iuxray': |
|
|
129 |
yhats = [model.predict([photo[0], photo[1], sequence], verbose=0) for model, photo in zip(models, photos)] |
|
|
130 |
else: |
|
|
131 |
yhats = [model.predict([photo, sequence], verbose=0) for model, photo in zip(models, photos)] |
|
|
132 |
|
|
|
133 |
# yhats = [ |
|
|
134 |
# model.predict([photo, sequence], verbose=0) |
|
|
135 |
# for model, photo in zip(models, photos) |
|
|
136 |
# ] |
|
|
137 |
summed = np.sum(yhats, axis=0) |
|
|
138 |
# convert probability to integer |
|
|
139 |
yhat = np.argmax(summed, axis=1) |
|
|
140 |
|
|
|
141 |
# map integer to word |
|
|
142 |
word = self.get_word(yhat[0]) |
|
|
143 |
|
|
|
144 |
# stop if we cannot map the word |
|
|
145 |
if word is None: |
|
|
146 |
break |
|
|
147 |
# append as input for generating the next word |
|
|
148 |
in_text += " " + word |
|
|
149 |
# stop if we predict the end of the sequence |
|
|
150 |
if word == self.end_token: |
|
|
151 |
break |
|
|
152 |
|
|
|
153 |
return in_text |
|
|
154 |
|
|
|
155 |
def greedy_search_ensembles_MVP(self, models:list, photos:list, tags:list, dataset:str='iuxray', multi_modal:bool=False)->str: |
|
|
156 |
""" Executes the Maximum Voting Probability Greedy Search algorithm employing the pre-trained models along with the test instances data. |
|
|
157 |
More details are provided in my Thesis. Acknowledgements: https://ieeexplore.ieee.org/document/9031513 |
|
|
158 |
|
|
|
159 |
Args: |
|
|
160 |
models (list): The models we want to evaluate on our employed dataset |
|
|
161 |
photos (list): Current test images embeddings for each encoder we used. |
|
|
162 |
tags (list): Current test tags embeddings for each encoder we used. |
|
|
163 |
dataset (str, optional): The dataset we employed for the model. Defaults to 'iuxray'. |
|
|
164 |
multi_modal (bool, optional): If we want to use the multi-modal version of model. This is used only for IU X-Ray dataset. Defaults to False. |
|
|
165 |
|
|
|
166 |
Returns: |
|
|
167 |
str: The generated description for the given image ID. |
|
|
168 |
""" |
|
|
169 |
# seed the generation process |
|
|
170 |
in_text = self.start_token |
|
|
171 |
# iterate over the whole length of the sequence |
|
|
172 |
for i in range(self.max_length): |
|
|
173 |
pred = [] |
|
|
174 |
index = 0 |
|
|
175 |
for each_model in models: |
|
|
176 |
|
|
|
177 |
# integer encode input sequence |
|
|
178 |
sequence = self.tokenizer.texts_to_sequences([in_text])[0] |
|
|
179 |
# pad input |
|
|
180 |
sequence = tf.keras.preprocessing.sequence.pad_sequences( |
|
|
181 |
[sequence], maxlen=self.max_length |
|
|
182 |
) |
|
|
183 |
if multi_modal: |
|
|
184 |
if dataset == 'iuxray': |
|
|
185 |
yhat = each_model.predict([photos[index][0], photos[index][1], tags, sequence], verbose=0) |
|
|
186 |
else: |
|
|
187 |
if dataset == 'iuxray': |
|
|
188 |
yhat = each_model.predict([photos[index][0], photos[index][1], sequence], verbose=0) |
|
|
189 |
else: |
|
|
190 |
yhat = each_model.predict([photos[index], sequence], verbose=0) |
|
|
191 |
pred.append(np.argmax(yhat)) |
|
|
192 |
index += 1 |
|
|
193 |
|
|
|
194 |
# predict next word |
|
|
195 |
yhats = max(pred, key=pred.count) |
|
|
196 |
|
|
|
197 |
# map integer to word |
|
|
198 |
word = self.get_word(yhats) |
|
|
199 |
|
|
|
200 |
# stop if we cannot map the word |
|
|
201 |
if word is None: |
|
|
202 |
break |
|
|
203 |
# append as input for generating the next word |
|
|
204 |
in_text += " " + word |
|
|
205 |
# stop if we predict the end of the sequence |
|
|
206 |
if word == self.end_token: |
|
|
207 |
break |
|
|
208 |
|
|
|
209 |
return in_text |
|
|
210 |
|
|
|
211 |
def greedy_search_ensembles_MP(self, models:list, photos:list, tags:list, dataset:str='iuxray', multi_modal:bool=False)->str: |
|
|
212 |
""" Executes the Maximum Probability Greedy Search algorithm employing the pre-trained models along with the test instances data. |
|
|
213 |
More details are provided in my Thesis. |
|
|
214 |
|
|
|
215 |
Args: |
|
|
216 |
models (list): The models we want to evaluate on our employed dataset |
|
|
217 |
photos (list): Current test images embeddings for each encoder we used. |
|
|
218 |
tags (list): Current test tags embeddings for each encoder we used. |
|
|
219 |
dataset (str, optional): The dataset we employed for the model. Defaults to 'iuxray'. |
|
|
220 |
multi_modal (bool, optional): If we want to use the multi-modal version of model. This is used only for IU X-Ray dataset. Defaults to False. |
|
|
221 |
|
|
|
222 |
Returns: |
|
|
223 |
str: The generated description for the given image ID. |
|
|
224 |
""" |
|
|
225 |
# seed the generation process |
|
|
226 |
in_text = self.start_token |
|
|
227 |
# iterate over the whole length of the sequence |
|
|
228 |
for i in range(self.max_length): |
|
|
229 |
pred = [] |
|
|
230 |
max_value = [] |
|
|
231 |
index = 0 |
|
|
232 |
for each_model in models: |
|
|
233 |
|
|
|
234 |
# integer encode input sequence |
|
|
235 |
sequence = self.tokenizer.texts_to_sequences([in_text])[0] |
|
|
236 |
# pad input |
|
|
237 |
sequence = tf.keras.preprocessing.sequence.pad_sequences( |
|
|
238 |
[sequence], maxlen=self.max_length |
|
|
239 |
) |
|
|
240 |
if multi_modal: |
|
|
241 |
if dataset == 'iuxray': |
|
|
242 |
yhat = each_model.predict([photos[index][0], photos[index][1], tags, sequence], verbose=0) |
|
|
243 |
else: |
|
|
244 |
if dataset == 'iuxray': |
|
|
245 |
yhat = each_model.predict([photos[index][0], photos[index][1], sequence], verbose=0) |
|
|
246 |
else: |
|
|
247 |
yhat = each_model.predict([photos[index], sequence], verbose=0) |
|
|
248 |
max_value.append(np.amax(yhat)) |
|
|
249 |
pred.append(np.argmax(yhat)) |
|
|
250 |
index += 1 |
|
|
251 |
|
|
|
252 |
# predict next word |
|
|
253 |
yhats = max(max_value) |
|
|
254 |
max_index = max_value.index(yhats) |
|
|
255 |
yhats = pred[max_index] |
|
|
256 |
|
|
|
257 |
# map integer to word |
|
|
258 |
word = self.get_word(yhats) |
|
|
259 |
|
|
|
260 |
# stop if we cannot map the word |
|
|
261 |
if word is None: |
|
|
262 |
break |
|
|
263 |
# append as input for generating the next word |
|
|
264 |
in_text += " " + word |
|
|
265 |
# stop if we predict the end of the sequence |
|
|
266 |
if word == self.end_token: |
|
|
267 |
break |
|
|
268 |
|
|
|
269 |
return in_text |