# utils/vocabulary.py
# tensorflow imports
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer


class Vocabulary:
    def __init__(self, texts: list, threshold: int = 3):
        """Vocabulary class used in our CNN-RNN models. Builds a vocabulary over the training captions with a cut-off frequency.

        Args:
            texts (list): All training captions from which the vocabulary is extracted.
            threshold (int, optional): The cut-off frequency; words occurring fewer times than this are dropped. Defaults to 3.
        """
        self.texts = texts
        # add <pad> and <unk> tokens to the vocabulary; <unk> stands in for OOV (out-of-vocabulary) words
        self.pad_token = "<pad>"
        self.unk_token = "<unk>"
        self.threshold = threshold
        # init the tokenizer, with <unk> as its out-of-vocabulary token
        self.tokenizer = Tokenizer(oov_token=self.unk_token)
        # word <-> index dictionaries
        self.word2idx = {}
        self.idx2word = {}

    def build_vocab(self) -> tuple[Tokenizer, dict, dict]:
        """Build the vocabulary we employ for our CNN-RNN model.

        Returns:
            tuple[Tokenizer, dict, dict]: The fitted tokenizer, the word-to-index
            dictionary, and the index-to-word dictionary; the latter two map words
            to indices and indices back to words, respectively.
        """
        # fit the tokenizer on the training captions
        self.tokenizer.fit_on_texts(self.texts)
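        # fit_on_texts fills tokenizer.word_counts, an OrderedDict mapping each
        # word to its raw frequency across the captions; we sort and prune it below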
        # sort the vocabulary by word frequency, most frequent first
        sorted_vocab = dict(
            sorted(self.tokenizer.word_counts.items(), key=lambda t: t[1], reverse=True)
        )

        word_index_threshold, index_word_threshold = {}, {}

        # reserve index 0 for <pad> (matching Keras's default padding value) and index 1 for <unk>
        word_index_threshold[self.pad_token] = 0
        index_word_threshold[0] = self.pad_token
        word_index_threshold[self.unk_token] = 1
        index_word_threshold[1] = self.unk_token

        # real words start from index 2, right after <pad> and <unk>
        idx = 2
        for k, v in sorted_vocab.items():
            # keep only words whose occurrence count reaches the cut-off threshold
            if v >= self.threshold:
                word_index_threshold[k] = idx
                index_word_threshold[idx] = k
                idx += 1

        # install the thresholded dictionaries on the tokenizer, so that
        # texts_to_sequences uses the pruned mapping
        self.tokenizer.word_index = word_index_threshold
        self.tokenizer.index_word = index_word_threshold

        dictionary = self.tokenizer.word_index

        # copy the mappings into the instance-level dictionaries
        for k, v in dictionary.items():
            self.word2idx[k] = v
            self.idx2word[v] = k

        print(f"Made a vocabulary with {len(self.word2idx)} words!")

        return self.tokenizer, self.word2idx, self.idx2word
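

# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of how this class is meant to be driven: build a vocabulary
# from a few captions and round-trip a word through the two mappers. The
# captions and threshold below are made up for illustration.
if __name__ == "__main__":
    captions = [
        "a dog runs on the beach",
        "a dog plays with a ball",
        "a cat sleeps on the couch",
    ]
    # threshold=2 keeps only words appearing at least twice ("a", "dog", "on", "the")
    vocab = Vocabulary(captions, threshold=2)
    tokenizer, word2idx, idx2word = vocab.build_vocab()
    print(word2idx)  # {'<pad>': 0, '<unk>': 1, 'a': 2, 'dog': 3, ...}
    assert idx2word[word2idx["dog"]] == "dog"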