"""Greedy-search caption decoding for single models and model ensembles."""
from typing import Callable, Optional

import numpy as np
# tensorflow imports
import tensorflow as tf
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Model


class GreedySearch:
    def __init__(self, start_token: str, end_token: str, max_length: int, tokenizer: Tokenizer,
                 idx_to_word: dict, word_to_idx: dict):
        """ The Greedy Search sampling method for generating captions.

        Args:
            start_token (str): The start-token used during pre-processing of the training captions
            end_token (str): The end-token used during pre-processing of the training captions
            max_length (int): The maximum length (limit) for the generated captions
            tokenizer (Tokenizer): The fitted tokenizer from the Vocabulary object
            idx_to_word (dict): Dictionary with keys to be the index number and values the words in the created vocabulary
            word_to_idx (dict): Dictionary with keys to be the words and values the index number in the created vocabulary
        """
        self.start_token = start_token
        self.end_token = end_token
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.idx_to_word = idx_to_word
        self.word_to_idx = word_to_idx

    def get_word(self, idx: int) -> Optional[str]:
        """ Fetches the word for the given index from the index-to-word vocab.

        Args:
            idx (int): The index for the index-to-word vocab.

        Returns:
            Optional[str]: The word for the given index if it exists in the index-to-word vocab, else None.
        """
        return self.idx_to_word.get(idx, None)

    def get_idx(self, word: str) -> int:
        """ Fetches the index number for the given word from the word-to-index vocab.

        Args:
            word (str): The word for which we want its index in the word-to-index dictionary.

        Returns:
            int: The index for the given word if it exists in the word-to-index vocab, else -1
                (i.e. -1 signals that the word is not part of the vocabulary).
        """
        return self.word_to_idx.get(word, -1)

    def _encode(self, in_text: str) -> np.ndarray:
        """ Tokenizes the caption generated so far and pads it to `max_length`.

        Args:
            in_text (str): The caption prefix generated so far (it begins with the start-token).

        Returns:
            np.ndarray: The padded integer sequence, as a batch holding a single sequence.
        """
        # integer encode input sequence
        sequence = self.tokenizer.texts_to_sequences([in_text])[0]
        # pad input
        return tf.keras.preprocessing.sequence.pad_sequences(
            [sequence], maxlen=self.max_length
        )

    @staticmethod
    def _model_inputs(photo, tag, sequence, dataset: str, multi_modal: bool) -> list:
        """ Builds the list of inputs that is handed to `model.predict` for one decoding step.

        Args:
            photo: The image embedding(s). For 'iuxray' it is indexed as `photo[0]` and `photo[1]`.
            tag: The tag embedding. Only used when `multi_modal` is True and the dataset is 'iuxray'.
            sequence: The padded integer sequence of the caption generated so far.
            dataset (str): The dataset we employed for the model.
            multi_modal (bool): If we want to use the multi-modal version of model.

        Returns:
            list: The model inputs, in the order the original per-dataset branches used.
        """
        if dataset == 'iuxray':
            if multi_modal:
                return [photo[0], photo[1], tag, sequence]
            return [photo[0], photo[1], sequence]
        # The multi-modal option exists only for IU X-Ray, so the flag is ignored for any other
        # dataset. Previously this combination matched no branch and the search crashed with an
        # UnboundLocalError because no prediction was ever assigned.
        return [photo, sequence]

    def _generate(self, next_index: Callable[[np.ndarray], int]) -> str:
        """ Runs the greedy decoding loop that is shared by every search variant.

        Args:
            next_index (Callable): Maps the padded sequence of the current step to the vocabulary
                index of the next word.

        Returns:
            str: The generated text. It starts with the start-token and contains the end-token
                when the end-token was predicted.
        """
        # seed the generation process
        in_text = self.start_token
        # iterate over the whole length of the sequence
        for _ in range(self.max_length):
            # map the predicted integer to a word
            word = self.get_word(next_index(self._encode(in_text)))
            # stop if we cannot map the word
            if word is None:
                break
            # append as input for generating the next word
            in_text += " " + word
            # stop if we predict the end of the sequence
            if word == self.end_token:
                break
        return in_text

    def greedy_search_predict(self, model: Model, photo: np.ndarray, tag: np.ndarray, dataset: str = 'iuxray',
                              multi_modal: bool = False) -> str:
        """ Executes the greedy search algorithm, employing the pre-trained model along with the test instance's data.

        Args:
            model (Model): The model we want to evaluate on our employed dataset
            photo (np.ndarray): Current test image embedding
            tag (np.ndarray): The tag embedding for the current test instance. This is used only for IU X-Ray dataset.
            dataset (str, optional): The dataset we employed for the model. Defaults to 'iuxray'.
            multi_modal (bool, optional): If we want to use the multi-modal version of model.
                This is used only for IU X-Ray dataset (it is ignored otherwise). Defaults to False.

        Returns:
            str: The generated description for the given image
        """
        def next_index(sequence):
            # predict next word
            yhat = model.predict(
                self._model_inputs(photo, tag, sequence, dataset, multi_modal), verbose=0
            )
            # convert probability to integer
            return int(np.argmax(yhat))

        return self._generate(next_index)

    def greedy_search_ensembles_AP(self, models: list, photos: list, tags: list, dataset: str = 'iuxray',
                                   multi_modal: bool = False) -> str:
        """ Executes the Average Probability Greedy Search algorithm employing the pre-trained models along with the test instances data.
        More details are provided in my Thesis. Acknowledgements: https://ieeexplore.ieee.org/document/9031513

        Args:
            models (list): The models we want to evaluate on our employed dataset
            photos (list): Current test images embeddings for each encoder we used.
            tags (list): Current test tags embeddings for each encoder we used.
                NOTE(review): the whole `tags` object is forwarded to every model - confirm that
                this is intended and that it should not be indexed per encoder like `photos`.
            dataset (str, optional): The dataset we employed for the model. Defaults to 'iuxray'.
            multi_modal (bool, optional): If we want to use the multi-modal version of model.
                This is used only for IU X-Ray dataset (it is ignored otherwise). Defaults to False.

        Returns:
            str: The generated description for the given image ID.
        """
        def next_index(sequence):
            # predict next word with every model of the ensemble
            yhats = [
                model.predict(self._model_inputs(photo, tags, sequence, dataset, multi_modal), verbose=0)
                for model, photo in zip(models, photos)
            ]
            # the sum has the same argmax as the average, so the division is skipped
            summed = np.sum(yhats, axis=0)
            # convert probability to integer
            return int(np.argmax(summed, axis=1)[0])

        return self._generate(next_index)

    def greedy_search_ensembles_MVP(self, models: list, photos: list, tags: list, dataset: str = 'iuxray',
                                    multi_modal: bool = False) -> str:
        """ Executes the Maximum Voting Probability Greedy Search algorithm employing the pre-trained models along with the test instances data.
        More details are provided in my Thesis. Acknowledgements: https://ieeexplore.ieee.org/document/9031513

        Args:
            models (list): The models we want to evaluate on our employed dataset
            photos (list): Current test images embeddings for each encoder we used.
            tags (list): Current test tags embeddings for each encoder we used.
                NOTE(review): the whole `tags` object is forwarded to every model - confirm that
                this is intended and that it should not be indexed per encoder like `photos`.
            dataset (str, optional): The dataset we employed for the model. Defaults to 'iuxray'.
            multi_modal (bool, optional): If we want to use the multi-modal version of model.
                This is used only for IU X-Ray dataset (it is ignored otherwise). Defaults to False.

        Returns:
            str: The generated description for the given image ID.
        """
        def next_index(sequence):
            # every model votes for its own most probable word
            votes = []
            for index, each_model in enumerate(models):
                yhat = each_model.predict(
                    self._model_inputs(photos[index], tags, sequence, dataset, multi_modal), verbose=0
                )
                votes.append(int(np.argmax(yhat)))
            # most frequent vote wins; on a tie the earliest model's vote is kept
            return max(votes, key=votes.count)

        return self._generate(next_index)

    def greedy_search_ensembles_MP(self, models: list, photos: list, tags: list, dataset: str = 'iuxray',
                                   multi_modal: bool = False) -> str:
        """ Executes the Maximum Probability Greedy Search algorithm employing the pre-trained models along with the test instances data.
        More details are provided in my Thesis.

        Args:
            models (list): The models we want to evaluate on our employed dataset
            photos (list): Current test images embeddings for each encoder we used.
            tags (list): Current test tags embeddings for each encoder we used.
                NOTE(review): the whole `tags` object is forwarded to every model - confirm that
                this is intended and that it should not be indexed per encoder like `photos`.
            dataset (str, optional): The dataset we employed for the model. Defaults to 'iuxray'.
            multi_modal (bool, optional): If we want to use the multi-modal version of model.
                This is used only for IU X-Ray dataset (it is ignored otherwise). Defaults to False.

        Returns:
            str: The generated description for the given image ID.
        """
        def next_index(sequence):
            # per model: its highest probability and the word index that achieved it
            confidences = []
            candidates = []
            for index, each_model in enumerate(models):
                yhat = each_model.predict(
                    self._model_inputs(photos[index], tags, sequence, dataset, multi_modal), verbose=0
                )
                confidences.append(np.amax(yhat))
                candidates.append(int(np.argmax(yhat)))
            # keep the word of the most confident model; on a tie the earliest model wins
            return candidates[confidences.index(max(confidences))]

        return self._generate(next_index)