Diff of /utils/dataset.py [000000] .. [03245f]


--- a
+++ b/utils/dataset.py
@@ -0,0 +1,315 @@
+# sklearn, nltk and numpy imports
+from sklearn.model_selection import KFold
+from nltk.tokenize import word_tokenize
+import numpy as np
+import nltk
+nltk.download("punkt", quiet=True)
+
+# tensorflow imports
+from tensorflow.keras.preprocessing.text import Tokenizer
+
+# progress bar
+from tqdm import tqdm
+
+# utils imports
+from utils.text_handler import TextHandler
+from utils.vocabulary import Vocabulary
+
+
+
+class Dataset:
+    def __init__(self, image_vectors: dict, captions_data: dict, clear_long_captions: bool = True):
+        """ Base class for the datasets employed in this research, i.e. ImageCLEF and IU X-Ray.
+
+        Args:
+            image_vectors (dict): Dictionary with ImageIDs as keys and image embeddings as values.
+            captions_data (dict): Dictionary with ImageIDs as keys and captions as values.
+            clear_long_captions (bool, optional): Whether to drop outlier long captions. Defaults to True.
+        """
+        self.image_vectors = image_vectors
+        self.captions_data = captions_data
+        self.clear_long_captions = clear_long_captions
+        # init a text handler object to pre-process training captions
+        self.text_handler = TextHandler()
+
+    def delete_long_captions(self, data: dict, threshold: int = 80) -> dict:
+        """ Removes overly long captions from the training set. This method was utilised during the ImageCLEF campaign.
+
+        Args:
+            data (dict): Dictionary with ImageIDs as keys and captions as values.
+            threshold (int, optional): The maximum caption length, in tokens. Defaults to 80.
+
+        Returns:
+            dict: The input dictionary without the instances whose captions exceed the threshold.
+        """
+        filtered_data = {}
+
+        for image_id, caption in data.items():
+            tokens = word_tokenize(caption)
+            if len(tokens) <= threshold:
+                filtered_data[image_id] = caption
+
+        return filtered_data
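+    # Example (sketch): with the default threshold of 80 tokens, a caption that
+    # word-tokenizes to 80 or fewer words is kept:
+    #   delete_long_captions({"id1": "short caption"})  ->  {"id1": "short caption"}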
+    
+    def build_splits(self) -> tuple[list, list, list]:
+        """ Creates the split sets for training, validation and testing.
+        In particular, the following splits are used:
+        train: ~76.5%
+        valid: ~8.5% (10% of the ids left after the test split)
+        test: 15%
+
+        Returns:
+            tuple[list, list, list]: Training, validation and test set ids.
+        """
+    
+        image_ids = list( self.captions_data.keys() )
+        np.random.shuffle(image_ids)
+
+        test_split_threshold = int(0.15 * len(image_ids))
+        train, test = (
+            image_ids[:-test_split_threshold],
+            image_ids[-test_split_threshold:],
+        )
+
+        dev_split_threshold = int(0.1 * len(train))
+        train, dev = (
+            train[:-dev_split_threshold],
+            train[-dev_split_threshold:],
+        )
+
+        return train, dev, test
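+    # Worked example (sketch): with 1000 ids, the code first holds out
+    # int(0.15 * 1000) = 150 test ids, then takes int(0.1 * 850) = 85 of the
+    # remaining 850 ids as the dev set, leaving 765 training ids
+    # (i.e. 76.5% / 8.5% / 15%).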
+    
+    def get_image_vectors(self, keys: list) -> dict:
+        """ Fetches the image embeddings that belong to the given split set.
+
+        Args:
+            keys (list): Split set ids.
+
+        Returns:
+            dict: Dictionary with ImageIDs as keys and image embeddings as values, restricted to the split set.
+        """
+
+        keys = set(keys)  # set membership keeps the per-item lookup O(1)
+        return { k: v for k, v in tqdm(self.image_vectors.items(), desc="Fetching image embeddings..") if k in keys }
+
+    def get_captions(self, _ids: list) -> dict:
+        """ Fetches the captions that belong to the given split set ids. """
+        return { key: value for key, value in self.captions_data.items() if key in _ids }
+ 
+    def build_pseudo_cv_splits(self) -> tuple[list, list]:
+        """ Creates cross-validation splits using K-Fold cross-validation. It was used only for the ImageCLEF campaign.
+        More details are described in my Thesis.
+
+        Returns:
+            tuple[list, list]: Training and test fold sets.
+        """
+        image_ids = list( self.captions_data.keys() )
+        np.random.shuffle(image_ids)
+
+        # apply 15-Fold CV
+        kf = KFold(n_splits=15)
+        train_fold_ids, test_fold_ids = list(), list()
+        for train_index, test_index in kf.split(image_ids):
+            train_ids = [image_ids[index] for index in train_index]
+            test_ids = [image_ids[index] for index in test_index]
+            train_fold_ids.append(train_ids)
+            test_fold_ids.append(test_ids)
+
+        return train_fold_ids, test_fold_ids
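+    # Example (sketch): with 15 folds, the method returns 15 aligned
+    # (train, test) id lists, each test fold holding ~1/15 of the ids:
+    #   train_folds, test_folds = dataset.build_pseudo_cv_splits()
+    #   assert len(train_folds) == len(test_folds) == 15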
+
+    def build_vocab(self, training_captions: list, threshold: int = 3) -> tuple[Vocabulary, Tokenizer, dict, dict]:
+        """ Creates the employed vocabulary given the training captions.
+
+        Args:
+            training_captions (list): All training captions.
+            threshold (int, optional): The cut-off frequency for the Vocabulary. Defaults to 3.
+
+        Returns:
+            tuple[Vocabulary, Tokenizer, dict, dict]: The Vocabulary object, the fitted tokenizer, the word-to-idx dictionary, and the idx-to-word dictionary.
+            The latter two map words to indices and indices to words, respectively.
+        """
+        vocab = Vocabulary(texts=training_captions, threshold=threshold)
+        tokenizer, word2idx, idx2word = vocab.build_vocab()
+        return vocab, tokenizer, word2idx, idx2word
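+    # Example (sketch, assuming the two returned mappers are mutual inverses):
+    #   vocab, tokenizer, word2idx, idx2word = dataset.build_vocab(captions)
+    #   idx2word[word2idx[w]] == w   # for any in-vocabulary word w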
+    
+    
+    
+class IuXrayDataset(Dataset):
+    def __init__(self, image_vectors: dict, captions_data: dict, tags_data: dict):
+        """ Child class to create the employed IU X-Ray, inheriting the base class methods
+
+        Args:
+            image_vectors (dict): Dictionary with keys to be the ImageIDs and values the image embeddings.
+            captions_data (dict): Dictionary with keys to be the ImageIDs and values the captions.
+            tags_data (dict): Dictionary with keys to be the ImageIDs and values the tags embeddings.
+        """
+        super().__init__(image_vectors=image_vectors, captions_data=captions_data, clear_long_captions=False)
+        self.tags_data = tags_data
+        # get the splits
+        self.train_dataset, self.dev_dataset, self.test_dataset = self.build_dataset()
+        # build linguistic attributes
+        self.vocab, self.tokenizer, self.word2idx, self.idx2word = super().build_vocab(training_captions=list(self.train_dataset[1].values()))
+    
+    def __str__(self) -> str:
+        """ Python built-in method for prints.
+
+        Returns:
+            str: A summary of the sizes of each split set.
+        """
+        text = f"Train: patients={len(self.train_dataset[0])}, captions={len(self.train_dataset[1])}, tags={len(self.train_dataset[2])}"
+        text += f"\nDev: patients={len(self.dev_dataset[0])}, captions={len(self.dev_dataset[1])}, tags={len(self.dev_dataset[2])}"
+        text += f"\nTest: patients={len(self.test_dataset[0])}, captions={len(self.test_dataset[1])}, tags={len(self.test_dataset[2])}"
+        return text
+    
+    def get_splits_sets(self) -> tuple[list, list, list]:
+        """ Fetches the data for each split set.
+
+        Returns:
+            tuple[list, list, list]: train_dataset, dev_dataset, test_dataset
+        """
+        return self.train_dataset, self.dev_dataset, self.test_dataset
+    
+    def get_tokenizer_utils(self) -> tuple[Vocabulary, Tokenizer, dict, dict]:
+        """ Fetches the linguistic utilities.
+
+        Returns:
+            tuple[Vocabulary, Tokenizer, dict, dict]: The Vocabulary object, the fitted tokenizer, the word-to-idx dictionary, and the idx-to-word dictionary.
+            The latter two map words to indices and indices to words, respectively.
+        """
+        return self.vocab, self.tokenizer, self.word2idx, self.idx2word
+        
+    def __get_tags(self, _ids: list) -> dict:
+        """ Fetches the tag embeddings that belong to the given split set.
+
+        Args:
+            _ids (list): Split set ids.
+
+        Returns:
+            dict: Dictionary with ImageIDs as keys and tag embeddings as values.
+        """
+        return { key: value for key, value in self.tags_data.items() if key in _ids }
+        
+    def build_dataset(self) -> tuple[list, list, list]:
+        """ Runs the whole dataset-creation process.
+
+        Returns:
+            tuple[list, list, list]: The training, validation and test datasets for our models.
+            All sets are in list format:
+            1st index --> image vectors
+            2nd index --> captions
+            3rd index --> tags
+        """
+        # random split
+        train_ids, dev_ids, test_ids = super().build_splits()
+
+        # fetch images
+        train_images = super().get_image_vectors(train_ids)
+        dev_images = super().get_image_vectors(dev_ids)
+        test_images = super().get_image_vectors(test_ids)
+        # fetch captions
+        train_captions = super().get_captions(train_ids)
+        dev_captions = super().get_captions(dev_ids)
+        test_captions = super().get_captions(test_ids)
+        # apply preprocessing to the training captions
+        train_captions_prepro = self.text_handler.preprocess_all(
+            list(train_captions.values()))
+        # re-attach ids by zipping over the dict's own keys, whose order is
+        # guaranteed to match the order of the preprocessed values
+        # (train_ids is shuffled and may not match that order)
+        train_captions_prepro = dict( zip( train_captions.keys(), train_captions_prepro ) )
+        # fetch tags
+        train_tags = self.__get_tags(train_ids)
+        dev_tags = self.__get_tags(dev_ids)
+        test_tags = self.__get_tags(test_ids)
+        # build data for each set    
+        train_dataset = [train_images, train_captions_prepro, train_tags]
+        dev_dataset = [dev_images, dev_captions, dev_tags]
+        test_dataset = [test_images, test_captions, test_tags]
+
+        return train_dataset, dev_dataset, test_dataset
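+    # Usage sketch (hypothetical toy inputs; real embeddings come from the
+    # image and tag encoders used in this research):
+    #   iu_xray = IuXrayDataset(image_vectors=vecs, captions_data=caps, tags_data=tags)
+    #   train, dev, test = iu_xray.get_splits_sets()
+    #   print(iu_xray)  # patients/captions/tags counts per split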
+    
+
+
+class ImageCLEFDataset(Dataset):
+    def __init__(self, image_vectors: dict, captions_data: dict):
+        """_summary_
+
+        Args:
+            image_vectors (dict): _description_
+            captions_data (dict): _description_
+        """
+        super().__init__(image_vectors=image_vectors, captions_data=captions_data, clear_long_captions=True)
+        self.train_dataset, self.dev_dataset, self.test_dataset = self.build_dataset()
+        
+        self.vocab, self.tokenizer, self.word2idx, self.idx2word = super().build_vocab(training_captions=list(self.train_dataset[1].values()))
+        
+    def __str__(self) -> str:
+        """ Python built-in method for prints.
+
+        Returns:
+            str: A summary of the sizes of each split set.
+        """
+        text = f"Train: patients={len(self.train_dataset[0])}, captions={len(self.train_dataset[1])}"
+        text += f"\nDev: patients={len(self.dev_dataset[0])}, captions={len(self.dev_dataset[1])}"
+        text += f"\nTest: patients={len(self.test_dataset[0])}, captions={len(self.test_dataset[1])}"
+        return text
+    
+    def get_splits_sets(self) -> tuple[list, list, list]:
+        """ Fetches the data for each split set.
+
+        Returns:
+            tuple[list, list, list]: train_dataset, dev_dataset, test_dataset
+        """
+        return self.train_dataset, self.dev_dataset, self.test_dataset
+    
+    def get_tokenizer_utils(self) -> tuple[Vocabulary, Tokenizer, dict, dict]:
+        """ Fetches the linguistic utilities.
+
+        Returns:
+            tuple[Vocabulary, Tokenizer, dict, dict]: The Vocabulary object, the fitted tokenizer, the word-to-idx dictionary, and the idx-to-word dictionary.
+            The latter two map words to indices and indices to words, respectively.
+        """
+        return self.vocab, self.tokenizer, self.word2idx, self.idx2word
+        
+    def build_dataset(self) -> tuple[list, list, list]:
+        """ Runs the whole dataset-creation process.
+
+        Returns:
+            tuple[list, list, list]: The training, validation and test datasets for our models.
+            All sets are in list format:
+            1st index --> image vectors
+            2nd index --> captions
+        """
+        # random split
+        train_ids, dev_ids, test_ids = super().build_splits()
+        # fetch images
+        train_images = super().get_image_vectors(train_ids)
+        dev_images = super().get_image_vectors(dev_ids)
+        test_images = super().get_image_vectors(test_ids)
+        # fetch captions
+        train_captions = super().get_captions(train_ids)
+        dev_captions = super().get_captions(dev_ids)
+        test_captions = super().get_captions(test_ids)
+        
+        # remove long outlier captions from training set
+        train_modified_captions = super().delete_long_captions(data=train_captions)
+        # get the new training ids after removal
+        train_new_ids = list(train_modified_captions.keys())
+        train_new_images = {
+            key:image_vector for key, image_vector in train_images.items() if key in train_new_ids
+        }
+        # apply preprocessing to the training captions
+        train_captions_prepro = self.text_handler.preprocess_all(
+            list(train_modified_captions.values()))
+            
+        train_captions_prepro = dict( zip( train_new_ids, train_captions_prepro ) )
+        # build data for each set  
+        train_dataset = [train_new_images, train_captions_prepro]
+        dev_dataset = [dev_images, dev_captions]
+        test_dataset = [test_images, test_captions]
+
+        return train_dataset, dev_dataset, test_dataset
+
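+if __name__ == "__main__":
+    # Minimal smoke-test sketch (not part of the training pipeline). The ids,
+    # vector size and caption text below are hypothetical placeholders, and we
+    # assume TextHandler/Vocabulary accept plain English caption strings.
+    rng = np.random.default_rng(0)
+    toy_ids = [f"img_{i}" for i in range(100)]
+    toy_vectors = {_id: rng.random(512) for _id in toy_ids}
+    toy_captions = {_id: "no acute cardiopulmonary abnormality" for _id in toy_ids}
+    toy_dataset = ImageCLEFDataset(image_vectors=toy_vectors, captions_data=toy_captions)
+    print(toy_dataset)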
+