# sklearn and nltk imports
from sklearn.model_selection import KFold
from nltk.tokenize import word_tokenize
import numpy as np
import nltk
nltk.download("punkt", quiet=True)
# tensorflow imports
from tensorflow.keras.preprocessing.text import Tokenizer
# progress bar
from tqdm import tqdm
# utils imports
from utils.text_handler import TextHandler
from utils.vocabulary import Vocabulary
class Dataset:
    def __init__(self, image_vectors: dict, captions_data: dict, clear_long_captions: bool = True):
        """ Base class for the datasets employed in my research, i.e. ImageCLEF and IU X-Ray.

        Args:
            image_vectors (dict): Dictionary mapping ImageIDs to image embeddings.
            captions_data (dict): Dictionary mapping ImageIDs to captions.
            clear_long_captions (bool, optional): Whether to drop the outlier long captions. Defaults to True.
        """
self.image_vectors = image_vectors
self.captions_data = captions_data
self.clear_long_captions = clear_long_captions
# init a text handler object to pre-process training captions
self.text_handler = TextHandler()
    def delete_long_captions(self, data: dict, threshold: int = 80) -> dict:
        """ Removes the long captions, only from the training set. This method was utilised during the ImageCLEF campaign.

        Args:
            data (dict): Dictionary mapping ImageIDs to captions.
            threshold (int, optional): The maximum caption length, in tokens. Defaults to 80.

        Returns:
            dict: Dictionary mapping ImageIDs to captions, without the instances whose captions are long.
        """
filtered_data = {}
for image_id, caption in data.items():
tokens = word_tokenize(caption)
if len(tokens) <= threshold:
filtered_data[image_id] = caption
return filtered_data
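    # Usage sketch (hypothetical captions): with the default threshold of 80
    # tokens, only the second caption below would survive the filter.
    #
    #   data = {"img1": " ".join(["word"] * 100), "img2": "no acute disease"}
    #   dataset.delete_long_captions(data)  # -> {"img2": "no acute disease"}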
    def build_splits(self) -> tuple[list, list, list]:
        """ Creates the split sets for training, validation and testing.
        In particular, we followed the next splits:
            test: 15% of all ids
            valid: 10% of the remaining ids (i.e. 8.5% overall)
            train: the rest (i.e. 76.5% overall)

        Returns:
            tuple[list, list, list]: Training, validation and test set ids.
        """
        image_ids = list(self.captions_data.keys())
np.random.shuffle(image_ids)
test_split_threshold = int(0.15 * len(image_ids))
train, test = (
image_ids[:-test_split_threshold],
image_ids[-test_split_threshold:],
)
dev_split_threshold = int(0.1 * len(train))
train, dev = (
train[:-dev_split_threshold],
train[-dev_split_threshold:],
)
return train, dev, test
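    # Worked example (hypothetical count): for 1000 image ids, the code above
    # keeps int(0.15 * 1000) = 150 ids for testing, int(0.1 * 850) = 85 of the
    # remaining ids for validation, and the final 765 ids for training.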
    def get_image_vectors(self, keys: list) -> dict:
        """ Fetches from the whole dataset the image embeddings according to the utilised set.

        Args:
            keys (list): Split set ids.

        Returns:
            dict: Dictionary mapping ImageIDs to image embeddings, for each split set.
        """
        keys = set(keys)  # set membership is O(1), vs. O(n) for a list
        return {k: v for k, v in tqdm(self.image_vectors.items(), desc="Fetching image embeddings..") if k in keys}
    def get_captions(self, _ids: list) -> dict:
        """ Fetches from the whole dataset the captions according to the utilised set.

        Args:
            _ids (list): Split set ids.

        Returns:
            dict: Dictionary mapping ImageIDs to captions, for each split set.
        """
        _ids = set(_ids)  # set membership is O(1), vs. O(n) for a list
        return {key: value for key, value in self.captions_data.items() if key in _ids}
    def build_pseudo_cv_splits(self) -> tuple[list, list]:
        """ Creates cross-validation splits using K-Fold cross validation. It was used only for the ImageCLEF campaign.
        More details are described in my Thesis.

        Returns:
            tuple[list, list]: Training and test fold sets.
        """
        image_ids = list(self.captions_data.keys())
np.random.shuffle(image_ids)
# apply 15-Fold CV
kf = KFold(n_splits=15)
train_fold_ids, test_fold_ids = list(), list()
for train_index, test_index in kf.split(image_ids):
train_ids = [image_ids[index] for index in train_index]
test_ids = [image_ids[index] for index in test_index]
train_fold_ids.append(train_ids)
test_fold_ids.append(test_ids)
return train_fold_ids, test_fold_ids
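    # Usage sketch (hypothetical size): with 1500 image ids, KFold(n_splits=15)
    # yields 15 folds of 100 test ids and 1400 training ids each.
    #
    #   train_folds, test_folds = dataset.build_pseudo_cv_splits()
    #   len(train_folds)    # -> 15
    #   len(test_folds[0])  # -> 100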
    def build_vocab(self, training_captions: list, threshold: int = 3) -> tuple[Vocabulary, Tokenizer, dict, dict]:
        """ Creates the employed vocabulary given the training captions.

        Args:
            training_captions (list): All training captions.
            threshold (int, optional): The cut-off frequency for the Vocabulary. Defaults to 3.

        Returns:
            tuple[Vocabulary, Tokenizer, dict, dict]: The Vocabulary object, the fitted tokenizer, the word-to-idx dictionary, and the idx-to-word dictionary.
            The latter two map words to indices and indices to words, respectively.
        """
vocab = Vocabulary(texts=training_captions, threshold=threshold)
tokenizer, word2idx, idx2word = vocab.build_vocab()
return vocab, tokenizer, word2idx, idx2word
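# Usage sketch for the vocabulary utilities (hypothetical captions and instance
# names); word2idx and idx2word are plain dicts, so captions can be encoded to
# index sequences and decoded back directly:
#
#   vocab, tokenizer, word2idx, idx2word = dataset.build_vocab(train_captions)
#   encoded = [word2idx[t] for t in "the heart is normal".split() if t in word2idx]
#   decoded = [idx2word[i] for i in encoded]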
class IuXrayDataset(Dataset):
def __init__(self, image_vectors: dict, captions_data: dict, tags_data: dict):
""" Child class to create the employed IU X-Ray, inheriting the base class methods
Args:
image_vectors (dict): Dictionary with keys to be the ImageIDs and values the image embeddings.
captions_data (dict): Dictionary with keys to be the ImageIDs and values the captions.
tags_data (dict): Dictionary with keys to be the ImageIDs and values the tags embeddings.
"""
super().__init__(image_vectors=image_vectors, captions_data=captions_data, clear_long_captions=False)
self.tags_data = tags_data
# get the splits
self.train_dataset, self.dev_dataset, self.test_dataset = self.build_dataset()
# build linguistic attributes
self.vocab, self.tokenizer, self.word2idx, self.idx2word = super().build_vocab(training_captions=list(self.train_dataset[1].values()))
    def __str__(self) -> str:
        """ Python built-in method for printing.

        Returns:
            str: A summary of the sizes of each split set.
        """
text = f"Train: patients={len(self.train_dataset[0])}, captions={len(self.train_dataset[1])}, tags={len(self.train_dataset[2])}"
text += f"\nDev: patients={len(self.dev_dataset[0])}, captions={len(self.dev_dataset[1])}, tags={len(self.dev_dataset[2])}"
text += f"\nTest: patients={len(self.test_dataset[0])}, captions={len(self.test_dataset[1])}, tags={len(self.test_dataset[2])}"
return text
    def get_splits_sets(self) -> tuple[list, list, list]:
""" Fetches the data for each split set.
Returns:
tuple[list, list, list]: train_dataset, dev_dataset, test_dataset
"""
return self.train_dataset, self.dev_dataset, self.test_dataset
    def get_tokenizer_utils(self) -> tuple[Vocabulary, Tokenizer, dict, dict]:
        """ Fetches the linguistic utilities.

        Returns:
            tuple[Vocabulary, Tokenizer, dict, dict]: The Vocabulary object, the fitted tokenizer, the word-to-idx dictionary, and the idx-to-word dictionary.
            The latter two map words to indices and indices to words, respectively.
        """
return self.vocab, self.tokenizer, self.word2idx, self.idx2word
    def __get_tags(self, _ids: list) -> dict:
        """ Fetches from the whole dataset the tag embeddings according to the utilised set.

        Args:
            _ids (list): Split set ids.

        Returns:
            dict: Dictionary mapping ImageIDs to tag embeddings.
        """
        _ids = set(_ids)  # set membership is O(1), vs. O(n) for a list
        return {key: value for key, value in self.tags_data.items() if key in _ids}
def build_dataset(self) -> tuple[list, list, list]:
""" Begins the whole process for the dataset creation.
Returns:
tuple[list, list, list]: The training dataset, the validation dataset and the test dataset for our models.
All sets are in list format.
1st index --> image vectors
2nd index --> captions
3rd index --> tags
"""
# random split
train_ids, dev_ids, test_ids = super().build_splits()
# fetch images
train_images = super().get_image_vectors(train_ids)
dev_images = super().get_image_vectors(dev_ids)
test_images = super().get_image_vectors(test_ids)
# fetch captions
train_captions = super().get_captions(train_ids)
dev_captions = super().get_captions(dev_ids)
test_captions = super().get_captions(test_ids)
# apply preprocess to training captions
        train_captions_prepro = self.text_handler.preprocess_all(
            list(train_captions.values()))
        # zip against train_captions.keys() (not train_ids): the preprocessed list
        # follows the dict's iteration order, which may differ from the order of train_ids
        train_captions_prepro = dict(zip(train_captions.keys(), train_captions_prepro))
# fetch tags
train_tags = self.__get_tags(train_ids)
dev_tags = self.__get_tags(dev_ids)
test_tags = self.__get_tags(test_ids)
# build data for each set
train_dataset = [train_images, train_captions_prepro, train_tags]
dev_dataset = [dev_images, dev_captions, dev_tags]
test_dataset = [test_images, test_captions, test_tags]
return train_dataset, dev_dataset, test_dataset
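# Access sketch for the IU X-Ray splits (illustrative names): each split is a
# [images, captions, tags] list of dicts that share the same ImageID keys.
#
#   train_set, dev_set, test_set = iu_xray_dataset.get_splits_sets()
#   train_images, train_captions, train_tags = train_set
#   some_id = next(iter(train_captions))
#   vector, report, tags = train_images[some_id], train_captions[some_id], train_tags[some_id]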
class ImageCLEFDataset(Dataset):
def __init__(self, image_vectors: dict, captions_data: dict):
"""_summary_
Args:
image_vectors (dict): _description_
captions_data (dict): _description_
"""
super().__init__(image_vectors=image_vectors, captions_data=captions_data, clear_long_captions=True)
self.train_dataset, self.dev_dataset, self.test_dataset = self.build_dataset()
self.vocab, self.tokenizer, self.word2idx, self.idx2word = super().build_vocab(training_captions=list(self.train_dataset[1].values()))
    def __str__(self) -> str:
        """ Python built-in method for printing.

        Returns:
            str: A summary of the sizes of each split set.
        """
text = f"Train: patients={len(self.train_dataset[0])}, captions={len(self.train_dataset[1])}"
text += f"\nDev: patients={len(self.dev_dataset[0])}, captions={len(self.dev_dataset[1])}"
text += f"\nTest: patients={len(self.test_dataset[0])}, captions={len(self.test_dataset[1])}"
return text
def get_splits_sets(self) -> tuple[list, list, list]:
""" Fetches the data for each split set.
Returns:
tuple[list, list, list]: train_dataset, dev_dataset, test_dataset
"""
return self.train_dataset, self.dev_dataset, self.test_dataset
def get_tokenizer_utils(self) -> tuple[Vocabulary, Tokenizer, dict, dict]:
""" Fetches the linguistic utilities.
Returns:
            tuple[Vocabulary, Tokenizer, dict, dict]: The Vocabulary object, the fitted tokenizer, the word-to-idx dictionary, and the idx-to-word dictionary.
            The latter two map words to indices and indices to words, respectively.
"""
return self.vocab, self.tokenizer, self.word2idx, self.idx2word
def build_dataset(self) -> tuple[list, list, list]:
""" Begins the whole process for the dataset creation.
Returns:
tuple[list, list, list]: The training dataset, the validation dataset and the test dataset for our models.
All sets are in list format.
1st index --> image vectors
2nd index --> captions
"""
# random split
train_ids, dev_ids, test_ids = super().build_splits()
# fetch images
train_images = super().get_image_vectors(train_ids)
dev_images = super().get_image_vectors(dev_ids)
test_images = super().get_image_vectors(test_ids)
# fetch captions
train_captions = super().get_captions(train_ids)
dev_captions = super().get_captions(dev_ids)
test_captions = super().get_captions(test_ids)
# remove long outlier captions from training set
train_modified_captions = super().delete_long_captions(data=train_captions)
# get new training ids after removing
        train_new_ids = list(train_modified_captions.keys())
        train_new_id_set = set(train_new_ids)  # set membership is O(1), vs. O(n) for a list
        train_new_images = {
            key: image_vector for key, image_vector in train_images.items() if key in train_new_id_set
        }
# apply preprocess to training captions
train_captions_prepro = self.text_handler.preprocess_all(
list(train_modified_captions.values()))
train_captions_prepro = dict( zip( train_new_ids, train_captions_prepro ) )
# build data for each set
train_dataset = [train_new_images, train_captions_prepro]
dev_dataset = [dev_images, dev_captions]
test_dataset = [test_images, test_captions]
return train_dataset, dev_dataset, test_dataset
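# Minimal smoke test (assumed toy data: random vectors stand in for real image
# embeddings and a repeated sentence stands in for real captions). It exercises
# the ImageCLEFDataset pipeline end to end: splitting, long-caption filtering,
# preprocessing and vocabulary building.
if __name__ == "__main__":
    toy_ids = [f"img_{i}" for i in range(40)]
    toy_vectors = {image_id: np.random.rand(512) for image_id in toy_ids}
    toy_captions = {
        image_id: "the heart size and mediastinal contours are within normal limits"
        for image_id in toy_ids
    }
    imageclef_dataset = ImageCLEFDataset(image_vectors=toy_vectors, captions_data=toy_captions)
    print(imageclef_dataset)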