Diff of /utils/text_handler.py [000000] .. [03245f]

--- a
+++ b/utils/text_handler.py
@@ -0,0 +1,177 @@
+# re imports
+import re
+
+# numpy imports 
+import numpy as np
+
+# tensorflow imports
+from tensorflow.keras.preprocessing.text import Tokenizer
+
+# inflect imports
+import inflect
+# create the NUMBER_TO_TEXT engine, which converts numeric tokens to their textual representation
+NUMBER_TO_TEXT = inflect.engine()
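+# e.g. NUMBER_TO_TEXT.number_to_words(10) returns "ten" (not "one zero")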
+
+# utils imports
+from utils.vocabulary import Vocabulary
+# nltk imports
+import nltk
+nltk.download("punkt", quiet=True)  # sentence tokenizer model used by separate_sequences
+
+class TextHandler:
+    def __init__(self, clean:bool=False, use_sep:bool=True):
+        """ Text Hanlder class we used to pre-process our captions.
+        The steps are provided in my Thesis.
+
+        Args:
+            clean (bool, optional): Whether to clean the text of special words like x-XXXX. Defaults to False.
+            use_sep (bool, optional): Whether to separate sentences with a SEQ_SEP token. Defaults to True.
+        """
+        self.__clean = clean
+        self.__start_token = "startsequence"
+        self.__end_token = "endsequence"
+        self.__seq_sep = None
+        if use_sep:
+            self.__seq_sep = " endofsequence "
+
+    def get_basic_token(self) -> tuple[str, str, str]:
+        """ Returns the start, end, and seq_sep special tokens
+
+        Returns:
+            tuple[str, str, str]: start, end, and seq_sep tokens
+        """
+        return self.__start_token, self.__end_token, self.__seq_sep
+    
+    def remove_punctuations(self, text:str) -> str:
+        """ Removes punctuations from training captions as well as cpecial characters
+
+        Args:
+            text (str): Text to pre-process
+
+        Returns:
+            str: Pre-processed text, without punctuation
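+
+        Example (illustrative):
+            >>> TextHandler().remove_punctuations("x-ray: no #1 finding!")
+            'xray no 1 finding'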
+        """
+        return re.sub(r"[-()\"#/@;:<>{}`+=~|!.?$%^&*'\[\]_]+", "", text)
+    
+    def num2words(self, text:str) -> str:
+        """This function converts each numerical text to its textual represantation. Like 10 to ten, and not onezero.
+
+        Args:
+            text (str): Text to pre-process
+
+        Returns:
+            str: Pre-processed text, with textual numbers
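+
+        Example (illustrative):
+            >>> TextHandler().num2words("there are 10 ribs")
+            'there are ten ribs'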
+        """
+        sentences = text.split('.')
+        new_seqs = list()
+        # get all sequences
+        for s in sentences:
+            tokens = s.split()
+            new_tokens = list()
+            # for each seq, get all words
+            for token in tokens:
+                # try to parse the token as an integer
+                try:
+                    number = int(token)
+                    # convert to its textual representation, e.g. 10 -> "ten" (not "one zero")
+                    word = NUMBER_TO_TEXT.number_to_words(number)
+                except ValueError:
+                    word = token
+                new_tokens.append(word)
+            new_seqs.append(' '.join(new_tokens))
+
+        # re-join the sentences into the whole text
+        modified_text = '. '.join(new_seqs)
+        return modified_text
+
+    def __preprocess_text(self, text:str) -> str:
+        """ Exetures the pre-processed steps. More details are provided in my Thesis
+
+        Args:
+            text (str): Text to pre-process
+
+        Returns:
+            str: Pre-processed text (np.nan if the text ends up empty).
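+
+        Example (illustrative; the method is private, so Python name mangling is used to call it):
+            >>> TextHandler()._TextHandler__preprocess_text("can't rule out 3 nodules")
+            'can not rule out three nodules'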
+        """
+        text = re.sub(r"won\'t", "will not", text)
+        text = re.sub(r"can\'t", "can not", text)
+        text = re.sub(r"n\'t", " not", text)
+        text = re.sub(r"\'re", " are", text)
+        text = re.sub(r"\'s", " is", text)
+        text = re.sub(r"\'d", " would", text)
+        text = re.sub(r"\'ll", " will", text)
+        text = re.sub(r"\'t", " not", text)
+        text = re.sub(r"\'ve", " have", text)
+        text = re.sub(r"\'m", " am", text)
+        text = re.sub("&", "and", text)
+        text = re.sub("@", "at", text)
+        text = re.sub("year old", "", text)
+        text = re.sub("yearold", "", text)
+        
+        text = self.num2words(text)
+        
+        if self.__clean:
+            text = self.__clean_text(text)
+
+        text = text.strip().lower()
+        text = " ".join(text.split())  # removes unwanted spaces
+        if text == "":
+            text = np.nan
+
+        return text
+
+    def __clean_text(self, text:str) -> str:
+        """ This function cleans the text from special words.
+
+        Args:
+            text (str): Text to pre-process
+
+        Returns:
+            str: Pre-processed text, without special words.
+        """
+        regex = r"\d."
+        text = re.sub(regex, "", text)
+
+        regex = r"X+"
+        text = re.sub(regex, "", text)
+
+        regex = r"[^.a-zA-Z]"
+        text = re.sub(regex, " ", text)
+
+        regex = r"http\S+"
+        text = re.sub(regex, "", text)
+
+        return text
+
+    def separate_sequences(self, text:str) -> str:
+        """ This function reads a sequence of texts and appends a SEQ_SEP token between sentences, for better training.
+        More details are provided in my Thesis
+
+        Args:
+            text (str): Text to pre-process
+
+        Returns:
+            str: Pre-processed text, with SEQ_SEP special token.
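+
+        Example (illustrative, with the default use_sep=True):
+            >>> TextHandler().separate_sequences("the heart is normal. no acute disease.")
+            'startsequence the heart is normal endofsequence no acute disease endsequence'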
+        """
+        start, end, seq_sep = self.get_basic_token()
+        if seq_sep is not None:
+            sequences = nltk.tokenize.sent_tokenize(text)
+            # keep only sentences longer than 5 characters
+            sequences = [s for s in sequences if len(s) > 5]
+            text = seq_sep.join(sequences)
+            text = self.remove_punctuations(text)
+        return start + " " + text + " " + end
+
+    def preprocess_all(self, texts:list) -> list:
+        """ Begins the pre-processing for a list of texts.
+
+        Args:
+            texts (list): All texts to which we want to apply the pre-processing.
+
+        Returns:
+            list: Pre-processed texts
+        """
+        preprocessed_texts = [self.__preprocess_text(text) for text in texts]
+        # texts that became empty during pre-processing are np.nan; only real strings are split into sequences
+        separated_texts = [self.separate_sequences(text) if isinstance(text, str) else text
+                           for text in preprocessed_texts]
+        return separated_texts
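+
+
+if __name__ == "__main__":
+    # Minimal usage sketch with an illustrative caption (not from any dataset):
+    # run the whole pipeline on a single text and print the result.
+    handler = TextHandler(clean=False, use_sep=True)
+    captions = ["The heart size is normal. There are 2 nodules."]
+    print(handler.preprocess_all(captions))
+    # expected output, approximately:
+    # ['startsequence the heart size is normal endofsequence there are two nodules endsequence']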