Diff of /modules/greedy_search.py [000000] .. [03245f]

Switch to side-by-side view

--- a
+++ b/modules/greedy_search.py
@@ -0,0 +1,269 @@
+import numpy as np
+# tensorflow imports
+import tensorflow as tf
+import tensorflow
+from tensorflow.keras.preprocessing.text import Tokenizer
+from tensorflow.keras.models import Model
+
+class GreedySearch:
+    """Greedy-search caption decoder for a single model or an ensemble of models.
+
+    Every public ``greedy_search_*`` method starts from the start token and appends
+    one word per step until the end token is produced, the chosen index has no entry
+    in ``idx_to_word``, or ``max_length`` steps have run. The methods differ only in
+    how the next word index is chosen from the model output(s).
+    """
+
+    def __init__(self, start_token:str, end_token:str, max_length:int, tokenizer:Tokenizer, idx_to_word:dict, word_to_idx:dict):
+        """ The Greedy Search sampling method for generating captions.
+
+        Args:
+            start_token (str): The start-token used during pre-processing of the training captions
+            end_token (str): The end-token used during pre-processing of the training captions
+            max_length (int): The maximum number of generated words; also the length the input sequence is padded to
+            tokenizer (Tokenizer): The fitted tokenizer from the Vocabulary object
+            idx_to_word (dict): Dictionary with keys to be the index number and values the words in the created vocabulary
+            word_to_idx (dict): Dictionary with keys to be the words and values the index number in the created vocabulary
+        """
+        self.start_token = start_token
+        self.end_token = end_token
+        self.max_length = max_length
+        self.tokenizer = tokenizer
+        self.idx_to_word = idx_to_word
+        self.word_to_idx = word_to_idx
+
+    def get_word(self, idx:int) -> str:
+        """ Fetches the word from the index-to-word vocab, which was created after the pre-processing of the Training captions
+
+        Args:
+            idx (int): The index for the index-to-word vocab.
+
+        Returns:
+            str: The word for the given index if it exists in the index-to-word vocab, else None
+        """
+        return self.idx_to_word.get(idx, None)
+
+    def get_idx(self, word:str)->int:
+        """ Fetches the index number from the word-to-index vocab, which was created after the pre-processing of the Training captions
+
+        Args:
+            word (str): The word for which we want its index in the word-to-index dictionary.
+
+        Returns:
+            int: The index for the given word if it exists in the word-to-index vocab, else -1 (meaning "not found")
+        """
+        return self.word_to_idx.get(word, -1)
+
+    def _encode(self, in_text:str):
+        """ Integer-encodes the caption generated so far and pads it to ``max_length``.
+
+        Args:
+            in_text (str): The caption prefix (start token plus the words generated so far)
+
+        Returns:
+            The padded sequence returned by Keras ``pad_sequences`` for a batch of one
+        """
+        token_ids = self.tokenizer.texts_to_sequences([in_text])[0]
+        return tf.keras.preprocessing.sequence.pad_sequences(
+            [token_ids], maxlen=self.max_length
+        )
+
+    def _predict_next(self, model, photo, tag, sequence, dataset:str, multi_modal:bool):
+        """ Runs one model on the inputs that match the dataset / modality combination.
+
+        Args:
+            model (Model): The model to query
+            photo: Image embedding(s); for 'iuxray' it is indexed as ``photo[0]`` and ``photo[1]``
+            tag: Tag embedding, only fed to the model when ``multi_modal`` is True
+            sequence: The padded, integer-encoded caption prefix
+            dataset (str): The dataset the model was built for
+            multi_modal (bool): Whether the model also takes the tag embedding as input
+
+        Returns:
+            The raw output of ``model.predict`` for the next word
+
+        Raises:
+            ValueError: If ``multi_modal`` is requested for a dataset other than 'iuxray'
+        """
+        if dataset == 'iuxray':
+            # IU X-Ray instances carry two image embeddings per case.
+            if multi_modal:
+                inputs = [photo[0], photo[1], tag, sequence]
+            else:
+                inputs = [photo[0], photo[1], sequence]
+        elif multi_modal:
+            # Previously this combination left the prediction unassigned and
+            # crashed with an UnboundLocalError further down; fail explicitly instead.
+            raise ValueError(
+                f"multi_modal=True is only supported for dataset='iuxray', got dataset={dataset!r}"
+            )
+        else:
+            inputs = [photo, sequence]
+        return model.predict(inputs, verbose=0)
+
+    def _ensemble_outputs(self, models, photos, tags, sequence, dataset:str, multi_modal:bool) -> list:
+        """ Collects the next-word output of every ensemble member for the same caption prefix.
+
+        Args:
+            models (list): The ensemble members
+            photos (list): Image embeddings, one entry per model (paired by position)
+            tags: Tag embeddings, passed unchanged to every model when ``multi_modal`` is True
+            sequence: The padded, integer-encoded caption prefix
+            dataset (str): The dataset the models were built for
+            multi_modal (bool): Whether the models also take the tag embedding as input
+
+        Returns:
+            list: One ``model.predict`` output per model, in the order of ``models``
+
+        Raises:
+            ValueError: If ``models`` is empty or there are fewer ``photos`` than ``models``
+        """
+        if len(models) == 0:
+            raise ValueError("At least one model is required for ensemble greedy search.")
+        if len(photos) < len(models):
+            # zip() would silently drop the models that have no matching photo entry.
+            raise ValueError(
+                f"Got {len(models)} models but only {len(photos)} photo entries; one entry per model is required."
+            )
+        # NOTE(review): the whole `tags` object is given to every model, although the
+        # public docstrings describe it as per-encoder -- confirm against the callers.
+        return [
+            self._predict_next(model, photo, tags, sequence, dataset, multi_modal)
+            for model, photo in zip(models, photos)
+        ]
+
+    def _generate(self, pick_next) -> str:
+        """ Shared greedy decoding loop.
+
+        Args:
+            pick_next (callable): Receives the padded caption prefix and returns the index of the next word
+
+        Returns:
+            str: The start token followed by the generated words (including the end token if it was produced)
+        """
+        in_text = self.start_token
+        for _ in range(self.max_length):
+            word = self.get_word(pick_next(self._encode(in_text)))
+            # stop if we cannot map the index back to a word
+            if word is None:
+                break
+            # the new word becomes part of the input for the next step
+            in_text += " " + word
+            # stop if we predict the end of the sequence
+            if word == self.end_token:
+                break
+        return in_text
+
+    def greedy_search_predict(self, model:Model, photo:np.ndarray, tag:np.ndarray, dataset:str='iuxray', multi_modal:bool=False)->str:
+        """ Executes the greedy search algorithm, employing the pre-trained model along with the test instance's data.
+
+        Args:
+            model (Model): The model we want to evaluate on our employed dataset
+            photo (np.ndarray): Current test image embedding
+            tag (np.ndarray): The tag embedding for the current test instance. This is used only for IU X-Ray dataset.
+            dataset (str, optional): The dataset we employed for the model. Defaults to 'iuxray'.
+            multi_modal (bool, optional): If we want to use the multi-modal version of model. This is used only for IU X-Ray dataset. Defaults to False.
+
+        Returns:
+            str: The generated description for the given image
+
+        Raises:
+            ValueError: If ``multi_modal`` is True for a dataset other than 'iuxray'
+        """
+        def pick_next(sequence):
+            yhat = self._predict_next(model, photo, tag, sequence, dataset, multi_modal)
+            # index of the highest score over the (flattened) model output
+            return np.argmax(yhat)
+
+        return self._generate(pick_next)
+
+    def greedy_search_ensembles_AP(self, models:list, photos:list, tags:list, dataset:str='iuxray', multi_modal:bool=False)->str:
+        """ Executes the Average Probability Greedy Search algorithm employing the pre-trained models along with the test instances data.
+        More details are provided in my Thesis. Acknowledgements: https://ieeexplore.ieee.org/document/9031513
+
+        Args:
+            models (list): The models we want to evaluate on our employed dataset
+            photos (list): Current test images embeddings for each encoder we used.
+            tags (list): Current test tags embeddings for each encoder we used.
+            dataset (str, optional): The dataset we employed for the model. Defaults to 'iuxray'.
+            multi_modal (bool, optional): If we want to use the multi-modal version of model. This is used only for IU X-Ray dataset. Defaults to False.
+
+        Returns:
+            str: The generated description for the given image ID.
+
+        Raises:
+            ValueError: If ``models`` is empty, there are fewer ``photos`` than ``models``,
+                or ``multi_modal`` is True for a dataset other than 'iuxray'
+        """
+        def pick_next(sequence):
+            yhats = self._ensemble_outputs(models, photos, tags, sequence, dataset, multi_modal)
+            # The outputs are summed rather than averaged: the argmax is the same.
+            summed = np.sum(yhats, axis=0)
+            return np.argmax(summed, axis=1)[0]
+
+        return self._generate(pick_next)
+
+    def greedy_search_ensembles_MVP(self, models:list, photos:list, tags:list, dataset:str='iuxray', multi_modal:bool=False)->str:
+        """ Executes the Maximum Voting Probability Greedy Search algorithm employing the pre-trained models along with the test instances data.
+        More details are provided in my Thesis. Acknowledgements: https://ieeexplore.ieee.org/document/9031513
+
+        Args:
+            models (list): The models we want to evaluate on our employed dataset
+            photos (list): Current test images embeddings for each encoder we used.
+            tags (list): Current test tags embeddings for each encoder we used.
+            dataset (str, optional): The dataset we employed for the model. Defaults to 'iuxray'.
+            multi_modal (bool, optional): If we want to use the multi-modal version of model. This is used only for IU X-Ray dataset. Defaults to False.
+
+        Returns:
+            str: The generated description for the given image ID.
+
+        Raises:
+            ValueError: If ``models`` is empty, there are fewer ``photos`` than ``models``,
+                or ``multi_modal`` is True for a dataset other than 'iuxray'
+        """
+        def pick_next(sequence):
+            yhats = self._ensemble_outputs(models, photos, tags, sequence, dataset, multi_modal)
+            # each model votes for its own top-scoring word index
+            votes = [np.argmax(yhat) for yhat in yhats]
+            # most frequent vote wins; on a tie the earliest model's vote is kept
+            return max(votes, key=votes.count)
+
+        return self._generate(pick_next)
+
+    def greedy_search_ensembles_MP(self, models:list, photos:list, tags:list, dataset:str='iuxray', multi_modal:bool=False)->str:
+        """ Executes the Maximum Probability Greedy Search algorithm employing the pre-trained models along with the test instances data.
+        More details are provided in my Thesis.
+
+        Args:
+            models (list): The models we want to evaluate on our employed dataset
+            photos (list): Current test images embeddings for each encoder we used.
+            tags (list): Current test tags embeddings for each encoder we used.
+            dataset (str, optional): The dataset we employed for the model. Defaults to 'iuxray'.
+            multi_modal (bool, optional): If we want to use the multi-modal version of model. This is used only for IU X-Ray dataset. Defaults to False.
+
+        Returns:
+            str: The generated description for the given image ID.
+
+        Raises:
+            ValueError: If ``models`` is empty, there are fewer ``photos`` than ``models``,
+                or ``multi_modal`` is True for a dataset other than 'iuxray'
+        """
+        def pick_next(sequence):
+            yhats = self._ensemble_outputs(models, photos, tags, sequence, dataset, multi_modal)
+            # keep the output of the model with the highest peak score
+            # (the earliest model wins a tie) and take that model's top word index
+            most_confident = max(yhats, key=np.amax)
+            return np.argmax(most_confident)
+
+        return self._generate(pick_next)