bilstm_crf_ner/model/config.py

import os

from .general_utils import get_logger
from .data_utils import get_trimmed_glove_vectors, load_vocab, \
        get_processing_word


WORKING_DIR = "bilstm_crf_ner/"


class Config():
    def __init__(self, load=True):
        """Initialize hyperparameters and load vocabs

        Args:
            load: (bool) if True, load vocabs and embeddings into
                np arrays, else defer loading

        """
        # directory for training outputs
        if not os.path.exists(self.dir_output):
            os.makedirs(self.dir_output)

        # create instance of logger
        self.logger = get_logger(self.path_log)

        # load if requested (default)
        if load:
            self.load()

    def load(self):
        """Loads vocabulary, processing functions and embeddings

        Supposes that build_data.py has been run successfully and that
        the corresponding files have been created (vocab and trimmed GloVe
        vectors)

        """
        # 1. vocabulary
        self.vocab_words = load_vocab(self.filename_words)
        self.vocab_tags = load_vocab(self.filename_tags)
        self.vocab_chars = load_vocab(self.filename_chars)
        self.nwords = len(self.vocab_words)
        self.nchars = len(self.vocab_chars)
        self.ntags = len(self.vocab_tags)

        # 2. get processing functions that map str -> id
        self.processing_word = get_processing_word(self.vocab_words,
                self.vocab_chars, lowercase=True, chars=self.use_chars)
        self.processing_tag = get_processing_word(self.vocab_tags,
                lowercase=False, allow_unk=False)

        # 3. get pre-trained embeddings
        self.embeddings = (get_trimmed_glove_vectors(self.filename_trimmed)
                if self.use_pretrained else None)
    # general config
    dir_output = WORKING_DIR + "output_full/"
    dir_model = dir_output
    path_log = dir_output + "log.txt"

    # embeddings
    dim_word = 300
    dim_char = 100

    # glove files
    filename_glove = WORKING_DIR + "dataset/glove.6B/glove.6B.{}d.txt".format(dim_word)
    # trimmed embeddings (created from glove_filename with build_data.py)
    filename_trimmed = WORKING_DIR + "dataset/glove.6B.{}d.trimmed.npz".format(dim_word)
    use_pretrained = True

    # dataset
    # filename_dev = "data/coNLL/eng/eng.testa.iob"
    # filename_test = "data/coNLL/eng/eng.testb.iob"
    # filename_train = "data/coNLL/eng/eng.train.iob"
    # filename_dev = filename_test = filename_train = "data/test.txt"  # test
    filename_dev = WORKING_DIR + "dataset/devel.txt"
    filename_test = WORKING_DIR + "dataset/test.txt"
    filename_train = WORKING_DIR + "dataset/train.txt"

    max_iter = None  # if not None, max number of examples in Dataset

    # vocab (created from dataset with build_data.py)
    filename_words = WORKING_DIR + "dataset/words.txt"
    filename_tags = WORKING_DIR + "dataset/tags.txt"
    filename_chars = WORKING_DIR + "dataset/chars.txt"
    filename_test_preds = WORKING_DIR + "dataset/test_preds.txt"

    # training
    train_embeddings = False
    nepochs = 15
    dropout = 0.5
    batch_size = 5
    lr_method = "adam"
    lr = 0.001
    lr_decay = 0.9
    epoch_drop = 1  # Step Decay: per # epochs to apply lr_decay
    clip = -1  # if negative, no clipping
    nepoch_no_imprv = 3

    # model hyperparameters
    hidden_size_char = 100  # lstm on chars
    hidden_size_lstm = 300  # lstm on word embeddings

    ner_model_path = "ner_{}e_bilstm_crf_elmo".format(nepochs)

    # elmo config
    use_elmo = True
    dim_elmo = 1024

    # NOTE: if both chars and crf, only 1.6x slower on GPU
    use_crf = True  # if crf, training is 1.7x slower on CPU
    use_chars = False if use_elmo else True  # if char embedding, training is 3.5x slower on CPU
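
A minimal usage sketch (not part of config.py) of how this Config class is typically consumed, assuming build_data.py has already produced the vocab files and trimmed GloVe vectors under bilstm_crf_ner/dataset/. The package-style import path and the example strings "Geneva" and "O" are assumptions for illustration; only attributes defined in the class above are accessed.

# usage sketch -- assumes the bilstm_crf_ner package is importable and
# the vocab / trimmed-GloVe files referenced above already exist
from bilstm_crf_ner.model.config import Config

config = Config(load=True)   # creates dir_output, sets up the logger, loads vocabs

# vocabulary sizes read from words.txt / tags.txt / chars.txt
print(config.nwords, config.ntags, config.nchars)

# with use_elmo = True, use_chars is switched off and dim_elmo = 1024 applies
print(config.use_elmo, config.use_chars, config.dim_elmo)

# processing functions map raw strings to ids; the exact return shape
# depends on data_utils.get_processing_word (not shown in this file)
word_repr = config.processing_word("Geneva")   # hypothetical token
tag_id = config.processing_tag("O")            # assumes "O" is in tags.txt

# trimmed GloVe matrix (numpy array) when use_pretrained is True, else None
if config.embeddings is not None:
    print(config.embeddings.shape)   # roughly (nwords, dim_word)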