Diff of /utils/text_handler.py [000000] .. [03245f]

Switch to unified view

a b/utils/text_handler.py
1
# re imports
2
import re
3
4
# numpy imports 
5
import numpy as np
6
7
# tensorflow imports
8
import tensorflow
9
from tensorflow.keras.preprocessing.text import Tokenizer
10
11
# inflect imports
12
import inflect
13
# create the NUMBER2TEXT object which helps us to convert each numerical text to its textual represantation
14
NUMBER_TO_TEXT = inflect.engine()
15
16
# utils imports
17
from utils.vocabulary import Vocabulary
18
# nltk imports
19
import nltk
20
nltk.download("punkt")
21
22
class TextHandler:
23
    def __init__(self, clean:bool=False, use_sep:bool=True):
24
        """ Text Hanlder class we used to pre-process our captions.
25
        The steps are provided in my Thesis.
26
27
        Args:
28
            clean (bool, optional): If we want to clean our text from special words like x-XXXX. Defaults to False.
29
            use_sep (bool, optional): If we want to separate our sentences with a SEQ_SEP token. Defaults to True.
30
        """
31
        self.__clean = clean
32
        self.__start_token = "startsequence"
33
        self.__end_token = "endsequence"
34
        self.__seq_sep = None
35
        if use_sep:
36
            self.__seq_sep = " endofsequence "
37
38
    def get_basic_token(self) -> tuple[str, str, str]:
39
        """ Returns the start, end, and seq_sep special tokens
40
41
        Returns:
42
            tuple[str, str, str]: start, end, and seq_sep tokens
43
        """
44
        return self.__start_token, self.__end_token, self.__seq_sep
45
    
46
    def remove_punctuations(self, text:str) -> str:
        """ Strip punctuation and special characters from a caption.

        Note that commas are deliberately not part of the removed set.

        Args:
            text (str): Text to pre-process

        Returns:
            str: Pre-processed text, without punctuation
        """
        punctuation = re.compile(r"[-()\"#/@;:<>{}`+=~|!.?$%^&*'/+\[\]_]+")
        return punctuation.sub("", text)
56
    
57
    def num2words(self, text:str) -> str:
        """ Converts each standalone integer token to its textual representation.

        Uses the module-level NUMBER_TO_TEXT inflect engine, so 10 becomes
        "ten" (and not "onezero").

        Args:
            text (str): Text to pre-process

        Returns:
            str: Pre-processed text, with numeric tokens spelled out
        """
        new_seqs = list()
        # split on '.' so each sentence is handled on its own
        for sentence in text.split('.'):
            new_tokens = list()
            for token in sentence.split():
                # Only genuine integer tokens are converted; words, or mixed
                # tokens such as "x2", are kept verbatim.
                try:
                    number = int(token)
                except ValueError:
                    # BUGFIX: was a bare `except:`, which also swallowed
                    # KeyboardInterrupt/SystemExit. int() on a str can only
                    # raise ValueError, so catch exactly that.
                    new_tokens.append(token)
                else:
                    new_tokens.append(NUMBER_TO_TEXT.number_to_words(number))
            new_seqs.append(' '.join(new_tokens))

        # connect the whole sentence again
        return '. '.join(new_seqs)
87
            
88
89
    def __preprocess_text(self, text:str) -> str:
        """ Executes the pre-processing steps on a single caption. More details
        are provided in my Thesis.

        The order of the substitutions matters: the specific contractions
        ("won't", "can't") must be expanded before the generic "n't" rule,
        number conversion runs before the optional cleaning pass, and
        lower-casing / whitespace normalisation happen last.

        Args:
            text (str): Text to pre-process

        Returns:
            str: Pre-processed text, or ``numpy.nan`` when the caption is
            empty after pre-processing (presumably so a NaN-aware caller can
            drop it — confirm against the calling pipeline).
        """
        # Expand English contractions; "won't"/"can't" are handled first so
        # the generic r"n\'t" rule does not turn them into "wo not"/"ca not".
        text = re.sub(r"won\'t", "will not", text)
        text = re.sub(r"can\'t", "can not", text)
        text = re.sub(r"n\'t", " not", text)
        text = re.sub(r"\'re", " are", text)
        text = re.sub(r"\'s", " is", text)
        text = re.sub(r"\'d", " would", text)
        text = re.sub(r"\'ll", " will", text)
        text = re.sub(r"\'t", " not", text)
        text = re.sub(r"\'ve", " have", text)
        text = re.sub(r"\'m", " am", text)
        # Spell out common symbols and drop age boilerplate ("N year old").
        text = re.sub("&", "and", text)
        text = re.sub("@", "at", text)
        text = re.sub("year old", "", text)
        text = re.sub("yearold", "", text)
        
        # Convert remaining integer tokens to words (e.g. 10 -> "ten").
        text = self.num2words(text)
        
        # Optional removal of special artefact words (e.g. "XXXX" runs).
        if self.__clean:
            text = self.__clean_text(text)

        text = text.strip().lower()
        text = " ".join(text.split())  # removes unwanted spaces
        # Empty captions are mapped to NaN instead of the empty string.
        if text == "":
            text = np.nan

        return text
124
125
    def __clean_text(self, text:str) -> str:
126
        """ This function cleans the text from special words.
127
128
        Args:
129
            text (str): Text to pre-process
130
131
        Returns:
132
            str: Pre-processed text, without special wortds.
133
        """
134
        regex = r"\d."
135
        text = re.sub(regex, "", text)
136
137
        regex = r"X+"
138
        text = re.sub(regex, "", text)
139
140
        regex = r"[^.a-zA-Z]"
141
        text = re.sub(regex, " ", text)
142
143
        regex = r"http\S+"
144
        text = re.sub(regex, "", text)
145
146
        return text
147
148
    def separate_sequences(self, text:str) -> str:
149
        """ This function reads a sequence of texts and appends a SEQ_SEP token between sentences, for better training.
150
        More details are provided in my Thesis
151
152
        Args:
153
            text (str): Text to pre-process
154
155
        Returns:
156
            str: Pre-processed text, with SEQ_SEP special token.
157
        """
158
        start, end, seq_sep = self.get_basic_token()
159
        if seq_sep is not None:
160
            sequences = nltk.tokenize.sent_tokenize(text)
161
            sequences = [s for s in sequences if len(s) > 5]
162
            text = seq_sep.join(sequences)
163
            text = self.remove_punctuations(text)
164
        return start + " " + text + " " + end
165
166
    def preprocess_all(self, texts:list) -> list:
167
        """ Begins the pre-processing for a list of texts.
168
169
        Args:
170
            texts (list): All texts in which we want to apply the pre-process.
171
172
        Returns:
173
            list: Pre-processed texts
174
        """
175
        preprocessed_texts = [self.__preprocess_text(text) for text in texts]
176
        separated_texts = [self.separate_sequences(text) for text in preprocessed_texts]
177
        return separated_texts