Thesis-Diagnostic-Caption / Git / [03245f] /modules/greedy

Models:
philipB/
Thesis-Diagnostic-Caption
Downloads: 1
[03245f]: / modules / greedy_search.py
History
Download this file
270 lines (231 with data), 12.6 kB

import numpy as np
# tensorflow imports
import tensorflow as tf
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Model

class GreedySearch:
    """ Greedy decoding of captions with a single model or with an ensemble of models.

    Every strategy starts from the start-token and repeatedly asks the model(s) for the next
    word, appending it to the running text. Generation stops when the end-token is produced,
    when the predicted index has no entry in the index-to-word vocab, or after ``max_length``
    words have been appended. The returned text keeps the start-token (and the end-token,
    when one was generated).
    """

    def __init__(self, start_token: str, end_token: str, max_length: int, tokenizer: "Tokenizer", idx_to_word: dict, word_to_idx: dict):
        """ The Greedy Search sampling method for generating captions.

        Args:
            start_token (str): The start-token used during pre-processing of the training captions
            end_token (str): The end-token used during pre-processing of the training captions
            max_length (int): The maximum length (limit) for the generated captions
            tokenizer (Tokenizer): The fitted tokenizer from the Vocabulary object
            idx_to_word (dict): Dictionary with keys to be the index number and values the words in the created vocabulary
            word_to_idx (dict): Dictionary with keys to be the words and values the index number in the created vocabulary
        """
        self.start_token = start_token
        self.end_token = end_token
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.idx_to_word = idx_to_word
        self.word_to_idx = word_to_idx

    def get_word(self, idx: int) -> str:
        """ Fetches the word from the index-to-word vocab, which was created after the pre-processing of the Training captions

        Args:
            idx (int): The index for the index-to-word vocab.

        Returns:
            str: The word for the given index if it exists in the index-to-word vocab, else None
        """
        return self.idx_to_word.get(idx, None)

    def get_idx(self, word: str) -> int:
        """ Fetches the index number from the word-to-index vocab, which was created after the pre-processing of the Training captions

        Args:
            word (str): The word for which we want its index in the word-to-index dictionary.

        Returns:
            int: The index for the given word if it exists in the word-to-index vocab, else -1 (meaning "not found")
        """
        return self.word_to_idx.get(word, -1)

    @staticmethod
    def _check_mode(dataset: str, multi_modal: bool) -> None:
        """ Rejects the one dataset / multi_modal combination that has no model input layout.

        Raises:
            ValueError: If ``multi_modal`` is True for a dataset other than 'iuxray'.
        """
        # Without this guard the prediction variable is never assigned and the search
        # fails later with an unrelated-looking UnboundLocalError.
        if multi_modal and dataset != 'iuxray':
            raise ValueError(
                f"multi_modal=True is only supported for dataset='iuxray', got dataset={dataset!r}"
            )

    @staticmethod
    def _check_ensemble(models: list, photos: list) -> None:
        """ Ensures the ensemble is non-empty and that every model has its own image embedding.

        Raises:
            ValueError: If ``models`` is empty or ``models`` and ``photos`` differ in length.
        """
        if len(models) == 0:
            raise ValueError("At least one model is required for an ensemble search")
        if len(models) != len(photos):
            raise ValueError(
                f"models and photos must have the same length, got {len(models)} and {len(photos)}"
            )

    @staticmethod
    def _model_inputs(photo, tag, sequence, dataset: str, multi_modal: bool) -> list:
        """ Builds the input list handed to ``predict`` (the mode must already be validated).

        For 'iuxray' the image embedding is split into ``photo[0]`` and ``photo[1]``, and the
        tag embedding is inserted before the text sequence in the multi-modal setting.
        Any other dataset passes the image embedding as a single input.
        """
        if dataset == 'iuxray':
            if multi_modal:
                return [photo[0], photo[1], tag, sequence]
            return [photo[0], photo[1], sequence]
        return [photo, sequence]

    def _encode(self, in_text: str):
        """ Integer-encodes the running caption and pads it to ``max_length`` for the model. """
        sequence = self.tokenizer.texts_to_sequences([in_text])[0]
        return tf.keras.preprocessing.sequence.pad_sequences(
            [sequence], maxlen=self.max_length
        )

    def _generate(self, next_index) -> str:
        """ Runs the greedy loop shared by all search strategies.

        Args:
            next_index (callable): Receives the padded input sequence of the text generated
                so far and returns the vocabulary index of the next word.

        Returns:
            str: The generated text, starting with the start-token.
        """
        # seed the generation process
        in_text = self.start_token
        for _ in range(self.max_length):
            # the text is encoded once per step, whatever the number of models consulted
            word = self.get_word(next_index(self._encode(in_text)))
            # stop if we cannot map the index to a word
            if word is None:
                break
            # append as input for generating the next word
            in_text += " " + word
            # stop if we predict the end of the sequence
            if word == self.end_token:
                break
        return in_text

    def greedy_search_predict(self, model: "Model", photo: np.ndarray, tag: np.ndarray, dataset: str = 'iuxray', multi_modal: bool = False) -> str:
        """ Executes the greedy search algorithm, employing the pre-trained model along with the test instance's data.

        Args:
            model (Model): The model we want to evaluate on our employed dataset
            photo (np.ndarray): Current test image embedding. For 'iuxray' it is indexed as photo[0] and photo[1].
            tag (np.ndarray): The tag embedding for the current test instance. Only used when multi_modal is True.
            dataset (str, optional): The dataset we employed for the model. Defaults to 'iuxray'.
            multi_modal (bool, optional): If we want to use the multi-modal version of model. This is used only for IU X-Ray dataset. Defaults to False.

        Returns:
            str: The generated description for the given image

        Raises:
            ValueError: If multi_modal is True for a dataset other than 'iuxray'.
        """
        self._check_mode(dataset, multi_modal)

        def next_index(sequence):
            inputs = self._model_inputs(photo, tag, sequence, dataset, multi_modal)
            # the most probable vocabulary index is the next word
            return np.argmax(model.predict(inputs, verbose=0))

        return self._generate(next_index)

    def greedy_search_ensembles_AP(self, models: list, photos: list, tags: list, dataset: str = 'iuxray', multi_modal: bool = False) -> str:
        """ Executes the Average Probability Greedy Search algorithm employing the pre-trained models along with the test instances data.
        More details are provided in my Thesis. Acknowledgements: https://ieeexplore.ieee.org/document/9031513

        Args:
            models (list): The models we want to evaluate on our employed dataset
            photos (list): Current test images embeddings, one per model (same order as models).
            tags (list): Current test tags embeddings. Passed unchanged to every model when multi_modal is True.
            dataset (str, optional): The dataset we employed for the model. Defaults to 'iuxray'.
            multi_modal (bool, optional): If we want to use the multi-modal version of model. This is used only for IU X-Ray dataset. Defaults to False.

        Returns:
            str: The generated description for the given image ID.

        Raises:
            ValueError: If multi_modal is True for a dataset other than 'iuxray', if models is
                empty, or if models and photos differ in length.
        """
        self._check_mode(dataset, multi_modal)
        self._check_ensemble(models, photos)

        def next_index(sequence):
            # NOTE(review): the whole `tags` object goes to every model; confirm with the
            # caller whether a per-model tag embedding was intended.
            yhats = [
                model.predict(
                    self._model_inputs(photo, tags, sequence, dataset, multi_modal), verbose=0
                )
                for model, photo in zip(models, photos)
            ]
            # summing instead of averaging is enough: both share the same argmax
            summed = np.sum(yhats, axis=0)
            return np.argmax(summed, axis=1)[0]

        return self._generate(next_index)

    def greedy_search_ensembles_MVP(self, models: list, photos: list, tags: list, dataset: str = 'iuxray', multi_modal: bool = False) -> str:
        """ Executes the Maximum Voting Probability Greedy Search algorithm employing the pre-trained models along with the test instances data.
        More details are provided in my Thesis. Acknowledgements: https://ieeexplore.ieee.org/document/9031513

        Args:
            models (list): The models we want to evaluate on our employed dataset
            photos (list): Current test images embeddings, one per model (same order as models).
            tags (list): Current test tags embeddings. Passed unchanged to every model when multi_modal is True.
            dataset (str, optional): The dataset we employed for the model. Defaults to 'iuxray'.
            multi_modal (bool, optional): If we want to use the multi-modal version of model. This is used only for IU X-Ray dataset. Defaults to False.

        Returns:
            str: The generated description for the given image ID.

        Raises:
            ValueError: If multi_modal is True for a dataset other than 'iuxray', if models is
                empty, or if models and photos differ in length.
        """
        self._check_mode(dataset, multi_modal)
        self._check_ensemble(models, photos)

        def next_index(sequence):
            # NOTE(review): the whole `tags` object goes to every model; confirm with the
            # caller whether a per-model tag embedding was intended.
            votes = [
                np.argmax(
                    model.predict(
                        self._model_inputs(photo, tags, sequence, dataset, multi_modal), verbose=0
                    )
                )
                for model, photo in zip(models, photos)
            ]
            # majority vote; on a tie the index voted by the earliest model wins
            return max(votes, key=votes.count)

        return self._generate(next_index)

    def greedy_search_ensembles_MP(self, models: list, photos: list, tags: list, dataset: str = 'iuxray', multi_modal: bool = False) -> str:
        """ Executes the Maximum Probability Greedy Search algorithm employing the pre-trained models along with the test instances data.
        More details are provided in my Thesis.

        Args:
            models (list): The models we want to evaluate on our employed dataset
            photos (list): Current test images embeddings, one per model (same order as models).
            tags (list): Current test tags embeddings. Passed unchanged to every model when multi_modal is True.
            dataset (str, optional): The dataset we employed for the model. Defaults to 'iuxray'.
            multi_modal (bool, optional): If we want to use the multi-modal version of model. This is used only for IU X-Ray dataset. Defaults to False.

        Returns:
            str: The generated description for the given image ID.

        Raises:
            ValueError: If multi_modal is True for a dataset other than 'iuxray', if models is
                empty, or if models and photos differ in length.
        """
        self._check_mode(dataset, multi_modal)
        self._check_ensemble(models, photos)

        def next_index(sequence):
            confidences = []
            candidates = []
            # NOTE(review): the whole `tags` object goes to every model; confirm with the
            # caller whether a per-model tag embedding was intended.
            for model, photo in zip(models, photos):
                yhat = model.predict(
                    self._model_inputs(photo, tags, sequence, dataset, multi_modal), verbose=0
                )
                confidences.append(np.amax(yhat))
                candidates.append(np.argmax(yhat))
            # keep the word of the single most confident model (earliest model on a tie)
            return candidates[confidences.index(max(confidences))]

        return self._generate(next_index)