|
b/utils/text_handler.py

# re imports
import re

# numpy imports
import numpy as np

# tensorflow imports
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer

# inflect imports
import inflect

# create the NUMBER_TO_TEXT engine, which converts numbers to their textual representation
NUMBER_TO_TEXT = inflect.engine()

# utils imports
from utils.vocabulary import Vocabulary

# nltk imports
import nltk
nltk.download("punkt")
|
|
class TextHandler:
    def __init__(self, clean:bool=False, use_sep:bool=True):
        """ Text Handler class used to pre-process our captions.
        The steps are described in my Thesis.

        Args:
            clean (bool, optional): Whether to clean the text of special words like x-XXXX. Defaults to False.
            use_sep (bool, optional): Whether to separate sentences with a SEQ_SEP token. Defaults to True.
        """
        self.__clean = clean
        self.__start_token = "startsequence"
        self.__end_token = "endsequence"
        self.__seq_sep = None
        if use_sep:
            self.__seq_sep = " endofsequence "
|
|
    def get_basic_token(self) -> tuple[str, str, str]:
        """ Returns the start, end, and seq_sep special tokens.

        Returns:
            tuple[str, str, str]: start, end, and seq_sep tokens
        """
        return self.__start_token, self.__end_token, self.__seq_sep
|
|
    def remove_punctuations(self, text:str) -> str:
        """ Removes punctuation as well as special characters from training captions.

        Args:
            text (str): Text to pre-process

        Returns:
            str: Pre-processed text, without punctuation
        """
        return re.sub(r"[-()\"#/@;:<>{}`+=~|!.?$%^&*'/+\[\]_]+", "", text)
|
|
    def num2words(self, text:str) -> str:
        """ Converts each numerical token to its textual representation, e.g. 10 to ten, and not onezero.

        Args:
            text (str): Text to pre-process

        Returns:
            str: Pre-processed text, with textual numbers
        """
        sentences = text.split('.')
        new_seqs = list()
        # get all sequences
        for s in sentences:
            tokens = s.split()
            new_tokens = list()
            # for each sequence, check every token
            for token in tokens:
                try:
                    # if the token is an integer, convert it to its textual
                    # representation (10 becomes "ten", not "onezero")
                    number = int(token)
                    word = NUMBER_TO_TEXT.number_to_words(number)
                except ValueError:
                    # not a number: keep the token as-is
                    word = token
                new_tokens.append(word)
            new_seqs.append(' '.join(new_tokens))

        # join the sentences back together
        modified_text = '. '.join(new_seqs)
        return modified_text
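
    # Illustrative sketch (not in the original file): for the caption
    # "heart size is 10 cm", num2words returns "heart size is ten cm";
    # tokens that are not pure integers (e.g. "3.5" or "10cm") pass through unchanged.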
|
|
    def __preprocess_text(self, text:str) -> str:
        """ Executes the pre-processing steps. More details are provided in my Thesis.

        Args:
            text (str): Text to pre-process

        Returns:
            str: Pre-processed text.
        """
        # expand English contractions
        text = re.sub(r"won\'t", "will not", text)
        text = re.sub(r"can\'t", "can not", text)
        text = re.sub(r"n\'t", " not", text)
        text = re.sub(r"\'re", " are", text)
        text = re.sub(r"\'s", " is", text)
        text = re.sub(r"\'d", " would", text)
        text = re.sub(r"\'ll", " will", text)
        text = re.sub(r"\'t", " not", text)
        text = re.sub(r"\'ve", " have", text)
        text = re.sub(r"\'m", " am", text)
        text = re.sub("&", "and", text)
        text = re.sub("@", "at", text)
        # drop age expressions
        text = re.sub("year old", "", text)
        text = re.sub("yearold", "", text)

        text = self.num2words(text)

        if self.__clean:
            text = self.__clean_text(text)

        text = text.strip().lower()
        text = " ".join(text.split())  # removes unwanted spaces
        # mark captions that end up empty as missing values
        if text == "":
            text = np.nan

        return text
|
|
    def __clean_text(self, text:str) -> str:
        """ Cleans the text of special words.

        Args:
            text (str): Text to pre-process

        Returns:
            str: Pre-processed text, without special words.
        """
        # remove URLs first, before the character filter below breaks them apart
        regex = r"http\S+"
        text = re.sub(regex, "", text)

        # remove enumeration markers such as "1." (dot escaped to match literally)
        regex = r"\d\."
        text = re.sub(regex, "", text)

        # remove placeholder words such as XXXX
        regex = r"X+"
        text = re.sub(regex, "", text)

        # keep only letters and periods
        regex = r"[^.a-zA-Z]"
        text = re.sub(regex, " ", text)

        return text
|
|
    def separate_sequences(self, text:str) -> str:
        """ Reads a sequence of sentences and inserts a SEQ_SEP token between them, for better training.
        More details are provided in my Thesis.

        Args:
            text (str): Text to pre-process

        Returns:
            str: Pre-processed text, with the SEQ_SEP special token.
        """
        start, end, seq_sep = self.get_basic_token()
        if seq_sep is not None:
            sequences = nltk.tokenize.sent_tokenize(text)
            # keep only sentences longer than 5 characters
            sequences = [s for s in sequences if len(s) > 5]
            text = seq_sep.join(sequences)
        text = self.remove_punctuations(text)
        return start + " " + text + " " + end
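
    # Illustrative sketch (not in the original file): for the already
    # pre-processed text "the heart is normal. there is no effusion."
    # this returns roughly:
    #   "startsequence the heart is normal endofsequence there is no effusion endsequence"
    # (punctuation is stripped by remove_punctuations after the sentences are joined).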
|
|
    def preprocess_all(self, texts:list) -> list:
        """ Runs the pre-processing pipeline on a list of texts.

        Args:
            texts (list): All texts to which we want to apply the pre-processing.

        Returns:
            list: Pre-processed texts
        """
        preprocessed_texts = [self.__preprocess_text(text) for text in texts]
        # note: __preprocess_text may return np.nan for empty captions;
        # separate_sequences would fail on those, so inputs are assumed non-empty
        separated_texts = [self.separate_sequences(text) for text in preprocessed_texts]
        return separated_texts
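

# A minimal usage sketch (my addition, not part of the original module);
# the sample caption below is invented for illustration.
if __name__ == "__main__":
    handler = TextHandler(clean=False, use_sep=True)
    captions = ["The heart is 10 cm wide. There's no pleural effusion."]
    processed = handler.preprocess_all(captions)
    # expected output, roughly:
    # startsequence the heart is ten cm wide endofsequence there is no pleural effusion endsequence
    print(processed[0])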