--- a
+++ b/utils/text_handler.py
@@ -0,0 +1,177 @@
+# re imports
+import re
+
+# numpy imports
+import numpy as np
+
+# tensorflow imports
+import tensorflow
+from tensorflow.keras.preprocessing.text import Tokenizer
+
+# inflect imports
+import inflect
+# create the NUMBER_TO_TEXT engine, which converts numbers to their textual representation
+NUMBER_TO_TEXT = inflect.engine()
+
+# utils imports
+from utils.vocabulary import Vocabulary
+# nltk imports
+import nltk
+nltk.download("punkt")
+
+
+class TextHandler:
+    def __init__(self, clean: bool = False, use_sep: bool = True):
+        """ Text Handler class used to pre-process our captions.
+        The steps are described in my Thesis.
+
+        Args:
+            clean (bool, optional): Whether to clean the text from special words like x-XXXX. Defaults to False.
+            use_sep (bool, optional): Whether to separate sentences with a SEQ_SEP token. Defaults to True.
+        """
+        self.__clean = clean
+        self.__start_token = "startsequence"
+        self.__end_token = "endsequence"
+        self.__seq_sep = None
+        if use_sep:
+            self.__seq_sep = " endofsequence "
+
+    def get_basic_token(self) -> tuple[str, str, str]:
+        """ Returns the start, end, and seq_sep special tokens.
+
+        Returns:
+            tuple[str, str, str]: start, end, and seq_sep tokens
+        """
+        return self.__start_token, self.__end_token, self.__seq_sep
+
+    def remove_punctuations(self, text: str) -> str:
+        """ Removes punctuation and special characters from training captions.
+
+        Args:
+            text (str): Text to pre-process
+
+        Returns:
+            str: Pre-processed text, without punctuation
+        """
+        return re.sub(r"[-()\"#/@;:<>{}`+=~|!.?$%^&*'/+\[\]_]+", "", text)
+
+    def num2words(self, text: str) -> str:
+        """ Converts each numeric token to its textual representation, e.g. 10 to "ten" and not "onezero".
+
+        Args:
+            text (str): Text to pre-process
+
+        Returns:
+            str: Pre-processed text, with textual numbers
+        """
+        sentences = text.split('.')
+        new_seqs = list()
+        # process every sentence separately
+        for s in sentences:
+            tokens = s.split()
+            new_tokens = list()
+            # for each sentence, inspect all tokens
+            for token in tokens:
+                try:
+                    # if the token is an integer, spell it out
+                    number = int(token)
+                    # 10 becomes "ten", not "onezero"
+                    word = NUMBER_TO_TEXT.number_to_words(number)
+                except ValueError:
+                    word = token
+                new_tokens.append(word)
+            new_seqs.append(' '.join(new_tokens))
+
+        # re-connect the whole text
+        modified_text = '. '.join(new_seqs)
+        return modified_text
+
+    def __preprocess_text(self, text: str) -> str:
+        """ Executes the pre-processing steps. More details are provided in my Thesis.
+
+        Args:
+            text (str): Text to pre-process
+
+        Returns:
+            str: Pre-processed text.
+ """ + text = re.sub(r"won\'t", "will not", text) + text = re.sub(r"can\'t", "can not", text) + text = re.sub(r"n\'t", " not", text) + text = re.sub(r"\'re", " are", text) + text = re.sub(r"\'s", " is", text) + text = re.sub(r"\'d", " would", text) + text = re.sub(r"\'ll", " will", text) + text = re.sub(r"\'t", " not", text) + text = re.sub(r"\'ve", " have", text) + text = re.sub(r"\'m", " am", text) + text = re.sub("&", "and", text) + text = re.sub("@", "at", text) + text = re.sub("year old", "", text) + text = re.sub("yearold", "", text) + + text = self.num2words(text) + + if self.__clean: + text = self.__clean_text(text) + + text = text.strip().lower() + text = " ".join(text.split()) # removes unwanted spaces + if text == "": + text = np.nan + + return text + + def __clean_text(self, text:str) -> str: + """ This function cleans the text from special words. + + Args: + text (str): Text to pre-process + + Returns: + str: Pre-processed text, without special wortds. + """ + regex = r"\d." + text = re.sub(regex, "", text) + + regex = r"X+" + text = re.sub(regex, "", text) + + regex = r"[^.a-zA-Z]" + text = re.sub(regex, " ", text) + + regex = r"http\S+" + text = re.sub(regex, "", text) + + return text + + def separate_sequences(self, text:str) -> str: + """ This function reads a sequence of texts and appends a SEQ_SEP token between sentences, for better training. + More details are provided in my Thesis + + Args: + text (str): Text to pre-process + + Returns: + str: Pre-processed text, with SEQ_SEP special token. + """ + start, end, seq_sep = self.get_basic_token() + if seq_sep is not None: + sequences = nltk.tokenize.sent_tokenize(text) + sequences = [s for s in sequences if len(s) > 5] + text = seq_sep.join(sequences) + text = self.remove_punctuations(text) + return start + " " + text + " " + end + + def preprocess_all(self, texts:list) -> list: + """ Begins the pre-processing for a list of texts. + + Args: + texts (list): All texts in which we want to apply the pre-process. + + Returns: + list: Pre-processed texts + """ + preprocessed_texts = [self.__preprocess_text(text) for text in texts] + separated_texts = [self.separate_sequences(text) for text in preprocessed_texts] + return separated_texts