utils/vocabulary.py
# tensorflow imports
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer


class Vocabulary:
    def __init__(self, texts: list, threshold: int = 3):
        """Vocabulary class used by the CNN-RNN models. It builds a vocabulary
        over the training captions with a cut-off frequency.

        Args:
            texts (list): All training captions from which we extract the vocabulary.
            threshold (int, optional): The cut-off frequency for the dictionary. Defaults to 3.
        """
        self.texts = texts
        # add the <pad> and <unk> tokens to the vocabulary. <unk> refers to OOV (out-of-vocabulary) tokens.
        self.pad_token = "<pad>"
        self.unk_token = "<unk>"
        self.threshold = threshold
        # init the tokenizer
        self.tokenizer = Tokenizer(oov_token="<unk>")
        # mapping dictionaries
        self.word2idx = {}
        self.idx2word = {}

    def build_vocab(self) -> tuple[Tokenizer, dict, dict]:
        """Creates the vocabulary employed by the CNN-RNN model.

        Returns:
            tuple[Tokenizer, dict, dict]: The fitted tokenizer, the word-to-index
                dictionary, and the index-to-word dictionary. The latter two map
                words to indices and indices to words, respectively.
        """
        # fit the tokenizer on the training captions
        self.tokenizer.fit_on_texts(self.texts)
        # sort the vocabulary by word frequency (descending)
        sorted_vocab = dict(
            sorted(self.tokenizer.word_counts.items(), key=lambda t: t[1], reverse=True)
        )

        word_index_threshold, index_word_threshold = {}, {}

        # add the pad and unk tokens to the tokenizer dictionaries
        word_index_threshold[self.pad_token] = 0
        index_word_threshold[0] = self.pad_token
        word_index_threshold[self.unk_token] = 1
        index_word_threshold[1] = self.unk_token

        # begin from index=2
        idx = 2
        for k, v in sorted_vocab.items():
            # keep only words that occur at least `threshold` times; rarer words are dropped
            if v >= self.threshold:
                word_index_threshold[k] = idx
                index_word_threshold[idx] = k
                idx += 1

        # overwrite the tokenizer dictionaries with the thresholded ones
        self.tokenizer.word_index = word_index_threshold
        self.tokenizer.index_word = index_word_threshold

        dictionary = self.tokenizer.word_index

        # copy the mappings into the instance-level dictionaries
        for k, v in dictionary.items():
            self.word2idx[k] = v
            self.idx2word[v] = k

        print(f"Made a vocabulary with {len(self.word2idx)} words!")

        return self.tokenizer, self.word2idx, self.idx2word
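

# A minimal usage sketch (illustrative only, not part of the original pipeline):
# the captions below are made-up placeholders standing in for the real training
# captions. It shows that words under the cut-off frequency are absent from
# word2idx and get mapped to the <unk> index (1) by texts_to_sequences.
if __name__ == "__main__":
    captions = [
        "a dog runs on the beach",
        "a dog plays on the beach",
        "a cat sleeps on the sofa",
    ]
    vocab = Vocabulary(texts=captions, threshold=2)
    tokenizer, word2idx, idx2word = vocab.build_vocab()

    # "sleeps" occurs only once (below threshold=2) and "grass" is unseen,
    # so both are encoded with the <unk> index
    print(tokenizer.texts_to_sequences(["a dog sleeps on the grass"]))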