# utils/text_handler.py

# standard library imports
import re

# third-party imports
import inflect
import nltk
import numpy as np
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer

# project imports
from utils.vocabulary import Vocabulary

# inflect engine used to convert whole numbers to their textual
# representation, e.g. 10 -> "ten" (not "onezero")
NUMBER_TO_TEXT = inflect.engine()

# fetch the sentence tokenizer model required by nltk.tokenize.sent_tokenize
nltk.download("punkt")


class TextHandler:
    def __init__(self, clean: bool = False, use_sep: bool = True):
        """ Text Handler class used to pre-process our captions.
        The steps are described in my Thesis.

        Args:
            clean (bool, optional): Whether to clean the text of special words like x-XXXX. Defaults to False.
            use_sep (bool, optional): Whether to separate sentences with a SEQ_SEP token. Defaults to True.
        """
        self.__clean = clean
        self.__start_token = "startsequence"
        self.__end_token = "endsequence"
        self.__seq_sep = None
        if use_sep:
            self.__seq_sep = " endofsequence "
    def get_basic_token(self) -> tuple[str, str, str]:
        """ Returns the start, end, and seq_sep special tokens.

        Returns:
            tuple[str, str, str]: start, end, and seq_sep tokens
        """
        return self.__start_token, self.__end_token, self.__seq_sep
    def remove_punctuations(self, text: str) -> str:
        """ Removes punctuation and special characters from training captions.

        Args:
            text (str): Text to pre-process

        Returns:
            str: Pre-processed text, without punctuation
        """
        return re.sub(r"[-()\"#/@;:<>{}`+=~|!.?$%^&*'/+\[\]_]+", "", text)
    def num2words(self, text: str) -> str:
        """ Converts each numerical token to its textual representation, like 10 to "ten", and not "onezero".

        Args:
            text (str): Text to pre-process

        Returns:
            str: Pre-processed text, with textual numbers
        """
        sentences = text.split('.')
        new_seqs = list()
        # get all sequences
        for s in sentences:
            tokens = s.split()
            new_tokens = list()
            # for each sequence, inspect every word
            for token in tokens:
                try:
                    # if the token is a whole number, convert it to words;
                    # inflect turns 10 into "ten", not "onezero"
                    number = int(token)
                    word = NUMBER_TO_TEXT.number_to_words(number)
                except ValueError:
                    # not a plain integer, keep the token as-is
                    word = token
                new_tokens.append(word)
            new_seqs.append(' '.join(new_tokens))
        # re-join the sentences into one text
        modified_text = '. '.join(new_seqs)
        return modified_text
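    # e.g. num2words("10 mm nodule") -> "ten mm nodule"; tokens that are not
    # plain integers, such as "10mm", raise ValueError and are kept unchanged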
    def __preprocess_text(self, text: str) -> str:
        """ Executes the pre-processing steps. More details are provided in my Thesis.

        Args:
            text (str): Text to pre-process

        Returns:
            str: Pre-processed text.
        """
        # expand common English contractions
        text = re.sub(r"won\'t", "will not", text)
        text = re.sub(r"can\'t", "can not", text)
        text = re.sub(r"n\'t", " not", text)
        text = re.sub(r"\'re", " are", text)
        text = re.sub(r"\'s", " is", text)
        text = re.sub(r"\'d", " would", text)
        text = re.sub(r"\'ll", " will", text)
        text = re.sub(r"\'t", " not", text)
        text = re.sub(r"\'ve", " have", text)
        text = re.sub(r"\'m", " am", text)
        # spell out common symbols
        text = re.sub("&", "and", text)
        text = re.sub("@", "at", text)
        # drop "year old" phrases
        text = re.sub("year old", "", text)
        text = re.sub("yearold", "", text)
        text = self.num2words(text)
        if self.__clean:
            text = self.__clean_text(text)
        text = text.strip().lower()
        text = " ".join(text.split())  # collapse repeated whitespace
        if text == "":
            # empty captions become NaN so they can be dropped downstream
            text = np.nan
        return text
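    # e.g. __preprocess_text("Heart size isn't enlarged") ->
    #   "heart size is not enlarged"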
    def __clean_text(self, text: str) -> str:
        """ This function cleans the text of special words.

        Args:
            text (str): Text to pre-process

        Returns:
            str: Pre-processed text, without special words.
        """
        # remove URLs first, while they are still intact
        regex = r"http\S+"
        text = re.sub(regex, "", text)
        # remove a digit together with the character that follows it
        # (e.g. numbered-list markers like "1.")
        regex = r"\d."
        text = re.sub(regex, "", text)
        # remove runs of X (special words like x-XXXX)
        regex = r"X+"
        text = re.sub(regex, "", text)
        # replace anything that is not a letter or a period with a space
        regex = r"[^.a-zA-Z]"
        text = re.sub(regex, " ", text)
        return text
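    # e.g. __clean_text("there is a 5-mm XXXX nodule") -> "there is a mm  nodule";
    # the leftover double spaces are collapsed later in __preprocess_text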
    def separate_sequences(self, text: str) -> str:
        """ Splits the text into sentences and joins them with the SEQ_SEP token, for better training.
        More details are provided in my Thesis.

        Args:
            text (str): Text to pre-process

        Returns:
            str: Pre-processed text, with the SEQ_SEP special token.
        """
        start, end, seq_sep = self.get_basic_token()
        if seq_sep is not None:
            sequences = nltk.tokenize.sent_tokenize(text)
            # keep only sentences longer than 5 characters
            sequences = [s for s in sequences if len(s) > 5]
            text = seq_sep.join(sequences)
            text = self.remove_punctuations(text)
        return start + " " + text + " " + end
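    # e.g. separate_sequences("no acute disease. heart size normal.") ->
    #   "startsequence no acute disease endofsequence heart size normal endsequence"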
    def preprocess_all(self, texts: list) -> list:
        """ Runs the full pre-processing pipeline over a list of texts.

        Args:
            texts (list): All texts to which we want to apply the pre-processing.

        Returns:
            list: Pre-processed texts
        """
        preprocessed_texts = [self.__preprocess_text(text) for text in texts]
        separated_texts = [self.separate_sequences(text) for text in preprocessed_texts]
        return separated_texts
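

# Minimal usage sketch (illustrative only; the caption strings below are
# made up and not part of the training data):
if __name__ == "__main__":
    handler = TextHandler(clean=False, use_sep=True)
    captions = [
        "The heart size is normal. There are 2 nodules.",
        "Lungs are clear. No acute disease.",
    ]
    for original, processed in zip(captions, handler.preprocess_all(captions)):
        print(f"{original!r} -> {processed!r}")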