--- /dev/null
+++ b/bilstm_crf_ner/model/config.py
@@ -0,0 +1,116 @@
+import os
+
+
+from .general_utils import get_logger
+from .data_utils import get_trimmed_glove_vectors, load_vocab, \
+    get_processing_word
+
+WORKING_DIR = "bilstm_crf_ner/"
+
+class Config:
+    def __init__(self, load=True):
+        """Initialize hyperparameters and load vocabs
+
+        Args:
+            load: (bool) if True, load vocabularies and pre-trained
+                embeddings right away; else call load() manually later
+
+        """
+        # directory for training outputs
+        if not os.path.exists(self.dir_output):
+            os.makedirs(self.dir_output)
+
+        # create instance of logger
+        self.logger = get_logger(self.path_log)
+
+        # load if requested (default)
+        if load:
+            self.load()
+
+    def load(self):
+        """Loads vocabulary, processing functions and embeddings
+
+        Supposes that build_data.py has been run successfully and that
+        the corresponding files have been created (vocab and trimmed GloVe
+        vectors)
+
+        """
+        # 1. vocabulary
+        self.vocab_words = load_vocab(self.filename_words)
+        self.vocab_tags = load_vocab(self.filename_tags)
+        self.vocab_chars = load_vocab(self.filename_chars)
+
+        self.nwords = len(self.vocab_words)
+        self.nchars = len(self.vocab_chars)
+        self.ntags = len(self.vocab_tags)
+
+        # 2. get processing functions that map str -> id
+        self.processing_word = get_processing_word(self.vocab_words,
+                self.vocab_chars, lowercase=True, chars=self.use_chars)
+        self.processing_tag = get_processing_word(self.vocab_tags,
+                lowercase=False, allow_unk=False)
+
+        # 3. get pre-trained embeddings
+        self.embeddings = (get_trimmed_glove_vectors(self.filename_trimmed)
+                if self.use_pretrained else None)
+
+
+    # general config
+    dir_output = WORKING_DIR + "output_full/"
+    dir_model = dir_output
+    path_log = dir_output + "log.txt"
+
+    # embeddings
+    dim_word = 300
+    dim_char = 100
+
+    # glove files
+    filename_glove = WORKING_DIR + "dataset/glove.6B/glove.6B.{}d.txt".format(dim_word)
+    # trimmed embeddings (created from filename_glove with build_data.py)
+    filename_trimmed = WORKING_DIR + "dataset/glove.6B.{}d.trimmed.npz".format(dim_word)
+    use_pretrained = True
+
+    # dataset
+    # filename_dev = "data/coNLL/eng/eng.testa.iob"
+    # filename_test = "data/coNLL/eng/eng.testb.iob"
+    # filename_train = "data/coNLL/eng/eng.train.iob"
+
+    # filename_dev = filename_test = filename_train = "data/test.txt"  # quick test
+
+    filename_dev = WORKING_DIR + "dataset/devel.txt"
+    filename_test = WORKING_DIR + "dataset/test.txt"
+    filename_train = WORKING_DIR + "dataset/train.txt"
+
+    max_iter = None  # if not None, max number of examples in Dataset
+
+    # vocab (created from dataset with build_data.py)
+    filename_words = WORKING_DIR + "dataset/words.txt"
+    filename_tags = WORKING_DIR + "dataset/tags.txt"
+    filename_chars = WORKING_DIR + "dataset/chars.txt"
+    filename_test_preds = WORKING_DIR + "dataset/test_preds.txt"
+
+    # training
+    train_embeddings = False
+    nepochs = 15
+    dropout = 0.5
+    batch_size = 5
+    lr_method = "adam"
+    lr = 0.001
+    lr_decay = 0.9
+    epoch_drop = 1  # step decay: apply lr_decay every epoch_drop epochs
+    clip = -1  # if negative, no gradient clipping
+    nepoch_no_imprv = 3
+
+    # model hyperparameters
+    hidden_size_char = 100  # lstm on chars
+    hidden_size_lstm = 300  # lstm on word embeddings
+
+    ner_model_path = "ner_{}e_bilstm_crf_elmo".format(nepochs)
+
+    # elmo config
+    use_elmo = True
+    dim_elmo = 1024
+
+    # NOTE: if both chars and crf, only 1.6x slower on GPU
+    use_crf = True  # if crf, training is 1.7x slower on CPU
+    use_chars = not use_elmo  # char embeddings (3.5x slower on CPU); disabled when ELMo is used
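
For context, a minimal usage sketch (a hypothetical driver script, not part of this diff; it assumes `build_data.py` has already written the vocab files and trimmed GloVe vectors under `bilstm_crf_ner/dataset/`):

```python
# hypothetical driver -- not part of this PR
from bilstm_crf_ner.model.config import Config

config = Config(load=True)  # creates output dir + logger, then calls load()
config.logger.info("vocab: {} words, {} chars, {} tags".format(
    config.nwords, config.nchars, config.ntags))

# processing_word maps a raw token to its id(s); with use_chars=True it
# yields (char_ids, word_id), with use_chars=False just the word id
token_ids = config.processing_word("Geneva")
```

Because the hyperparameters are plain class attributes, they can also be overridden on an instance (e.g. `config.batch_size = 32`) before the model is built.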
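The `epoch_drop` / `lr_decay` settings describe a step-decay learning-rate schedule. The actual update lives in the training loop rather than in this config, but it amounts to something like this sketch:

```python
def decayed_lr(lr, lr_decay, epoch_drop, epoch):
    """Step decay: multiply lr by lr_decay once every epoch_drop epochs."""
    return lr * (lr_decay ** (epoch // epoch_drop))

# with lr=0.001, lr_decay=0.9, epoch_drop=1:
# epoch 0 -> 0.001, epoch 1 -> 0.0009, epoch 2 -> 0.00081, ...
```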