# utils/text_handler.py

# standard library imports
import re

# third-party imports
import inflect
import nltk
import numpy as np
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer

# project imports
from utils.vocabulary import Vocabulary

# inflect engine used to convert whole numbers to their textual
# representation, e.g. 10 -> "ten" (not "onezero")
NUMBER_TO_TEXT = inflect.engine()

# fetch the sentence tokenizer model required by nltk.tokenize.sent_tokenize
nltk.download("punkt")


class TextHandler:
    def __init__(self, clean: bool = False, use_sep: bool = True):
        """ Text Handler class used to pre-process our captions.
        The steps are described in my Thesis.

        Args:
            clean (bool, optional): Whether to clean the text of special words like x-XXXX. Defaults to False.
            use_sep (bool, optional): Whether to separate sentences with a SEQ_SEP token. Defaults to True.
        """
        self.__clean = clean
        self.__start_token = "startsequence"
        self.__end_token = "endsequence"
        self.__seq_sep = None
        if use_sep:
            self.__seq_sep = " endofsequence "
    def get_basic_token(self) -> tuple[str, str, str]:
        """ Returns the start, end, and seq_sep special tokens.

        Returns:
            tuple[str, str, str]: start, end, and seq_sep tokens
        """
        return self.__start_token, self.__end_token, self.__seq_sep
    def remove_punctuations(self, text: str) -> str:
        """ Removes punctuation and special characters from training captions.

        Args:
            text (str): Text to pre-process

        Returns:
            str: Pre-processed text, without punctuation
        """
        return re.sub(r"[-()\"#/@;:<>{}`+=~|!.?$%^&*'/+\[\]_]+", "", text)
    def num2words(self, text: str) -> str:
        """ Converts each numerical token to its textual representation, like 10 to "ten", and not "onezero".

        Args:
            text (str): Text to pre-process

        Returns:
            str: Pre-processed text, with textual numbers
        """
        sentences = text.split('.')
        new_seqs = list()
        # get all sequences
        for s in sentences:
            tokens = s.split()
            new_tokens = list()
            # for each sequence, inspect every word
            for token in tokens:
                try:
                    # if the token is a whole number, convert it to words;
                    # inflect turns 10 into "ten", not "onezero"
                    number = int(token)
                    word = NUMBER_TO_TEXT.number_to_words(number)
                except ValueError:
                    # not a plain integer, keep the token as-is
                    word = token
                new_tokens.append(word)
            new_seqs.append(' '.join(new_tokens))
        # re-join the sentences into one text
        modified_text = '. '.join(new_seqs)
        return modified_text
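    # e.g. num2words("10 mm nodule") -> "ten mm nodule"; tokens that are not
    # plain integers, such as "10mm", raise ValueError and are kept unchanged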
    def __preprocess_text(self, text: str) -> str:
        """ Executes the pre-processing steps. More details are provided in my Thesis.

        Args:
            text (str): Text to pre-process

        Returns:
            str: Pre-processed text.
        """
        # expand common English contractions
        text = re.sub(r"won\'t", "will not", text)
        text = re.sub(r"can\'t", "can not", text)
        text = re.sub(r"n\'t", " not", text)
        text = re.sub(r"\'re", " are", text)
        text = re.sub(r"\'s", " is", text)
        text = re.sub(r"\'d", " would", text)
        text = re.sub(r"\'ll", " will", text)
        text = re.sub(r"\'t", " not", text)
        text = re.sub(r"\'ve", " have", text)
        text = re.sub(r"\'m", " am", text)
        # spell out common symbols
        text = re.sub("&", "and", text)
        text = re.sub("@", "at", text)
        # drop "year old" phrases
        text = re.sub("year old", "", text)
        text = re.sub("yearold", "", text)
        text = self.num2words(text)
        if self.__clean:
            text = self.__clean_text(text)
        text = text.strip().lower()
        text = " ".join(text.split())  # collapse repeated whitespace
        if text == "":
            # empty captions become NaN so they can be dropped downstream
            text = np.nan
        return text
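    # e.g. __preprocess_text("Heart size isn't enlarged") ->
    #   "heart size is not enlarged"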
    def __clean_text(self, text: str) -> str:
        """ This function cleans the text of special words.

        Args:
            text (str): Text to pre-process

        Returns:
            str: Pre-processed text, without special words.
        """
        # remove URLs first, while they are still intact
        regex = r"http\S+"
        text = re.sub(regex, "", text)
        # remove a digit together with the character that follows it
        # (e.g. numbered-list markers like "1.")
        regex = r"\d."
        text = re.sub(regex, "", text)
        # remove runs of X (special words like x-XXXX)
        regex = r"X+"
        text = re.sub(regex, "", text)
        # replace anything that is not a letter or a period with a space
        regex = r"[^.a-zA-Z]"
        text = re.sub(regex, " ", text)
        return text
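    # e.g. __clean_text("there is a 5-mm XXXX nodule") -> "there is a mm  nodule";
    # the leftover double spaces are collapsed later in __preprocess_text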
    def separate_sequences(self, text: str) -> str:
        """ Splits the text into sentences and joins them with the SEQ_SEP token, for better training.
        More details are provided in my Thesis.

        Args:
            text (str): Text to pre-process

        Returns:
            str: Pre-processed text, with the SEQ_SEP special token.
        """
        start, end, seq_sep = self.get_basic_token()
        if seq_sep is not None:
            sequences = nltk.tokenize.sent_tokenize(text)
            # keep only sentences longer than 5 characters
            sequences = [s for s in sequences if len(s) > 5]
            text = seq_sep.join(sequences)
            text = self.remove_punctuations(text)
        return start + " " + text + " " + end
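    # e.g. separate_sequences("no acute disease. heart size normal.") ->
    #   "startsequence no acute disease endofsequence heart size normal endsequence"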
    def preprocess_all(self, texts: list) -> list:
        """ Runs the full pre-processing pipeline over a list of texts.

        Args:
            texts (list): All texts to which we want to apply the pre-processing.

        Returns:
            list: Pre-processed texts
        """
        preprocessed_texts = [self.__preprocess_text(text) for text in texts]
        separated_texts = [self.separate_sequences(text) for text in preprocessed_texts]
        return separated_texts
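

# Minimal usage sketch (illustrative only; the caption strings below are
# made up and not part of the training data):
if __name__ == "__main__":
    handler = TextHandler(clean=False, use_sep=True)
    captions = [
        "The heart size is normal. There are 2 nodules.",
        "Lungs are clear. No acute disease.",
    ]
    for original, processed in zip(captions, handler.preprocess_all(captions)):
        print(f"{original!r} -> {processed!r}")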