--- /dev/null
+++ b/bilstm_crf_ner/model/config.py
@@ -0,0 +1,116 @@
+import os
+
+
+from .general_utils import get_logger
+from .data_utils import get_trimmed_glove_vectors, load_vocab, \
+    get_processing_word
+
+WORKING_DIR = "bilstm_crf_ner/"
+
+class Config:
+    def __init__(self, load=True):
+        """Initialize hyperparameters and load vocabs
+
+        Args:
+            load: (bool) if True, load vocabularies and pre-trained
+                embeddings right away; else call load() manually later
+
+        """
+        # directory for training outputs
+        if not os.path.exists(self.dir_output):
+            os.makedirs(self.dir_output)
+
+        # create instance of logger
+        self.logger = get_logger(self.path_log)
+
+        # load if requested (default)
+        if load:
+            self.load()
+
+    def load(self):
+        """Loads vocabulary, processing functions and embeddings
+
+        Supposes that build_data.py has been run successfully and that
+        the corresponding files have been created (vocab and trimmed GloVe
+        vectors)
+
+        """
+        # 1. vocabulary
+        self.vocab_words = load_vocab(self.filename_words)
+        self.vocab_tags = load_vocab(self.filename_tags)
+        self.vocab_chars = load_vocab(self.filename_chars)
+
+        self.nwords = len(self.vocab_words)
+        self.nchars = len(self.vocab_chars)
+        self.ntags = len(self.vocab_tags)
+
+        # 2. get processing functions that map str -> id
+        self.processing_word = get_processing_word(self.vocab_words,
+                self.vocab_chars, lowercase=True, chars=self.use_chars)
+        self.processing_tag = get_processing_word(self.vocab_tags,
+                lowercase=False, allow_unk=False)
+
+        # 3. get pre-trained embeddings
+        self.embeddings = (get_trimmed_glove_vectors(self.filename_trimmed)
+                if self.use_pretrained else None)
+
+
+    # general config
+    dir_output = WORKING_DIR + "output_full/"
+    dir_model = dir_output
+    path_log = dir_output + "log.txt"
+
+    # embeddings
+    dim_word = 300
+    dim_char = 100
+
+    # glove files
+    filename_glove = WORKING_DIR + "dataset/glove.6B/glove.6B.{}d.txt".format(dim_word)
+    # trimmed embeddings (created from filename_glove with build_data.py)
+    filename_trimmed = WORKING_DIR + "dataset/glove.6B.{}d.trimmed.npz".format(dim_word)
+    use_pretrained = True
+
+    # dataset
+    # filename_dev = "data/coNLL/eng/eng.testa.iob"
+    # filename_test = "data/coNLL/eng/eng.testb.iob"
+    # filename_train = "data/coNLL/eng/eng.train.iob"
+
+    # filename_dev = filename_test = filename_train = "data/test.txt"  # quick test
+
+    filename_dev = WORKING_DIR + "dataset/devel.txt"
+    filename_test = WORKING_DIR + "dataset/test.txt"
+    filename_train = WORKING_DIR + "dataset/train.txt"
+
+    max_iter = None  # if not None, max number of examples in Dataset
+
+    # vocab (created from dataset with build_data.py)
+    filename_words = WORKING_DIR + "dataset/words.txt"
+    filename_tags = WORKING_DIR + "dataset/tags.txt"
+    filename_chars = WORKING_DIR + "dataset/chars.txt"
+    filename_test_preds = WORKING_DIR + "dataset/test_preds.txt"
+
+    # training
+    train_embeddings = False
+    nepochs = 15
+    dropout = 0.5
+    batch_size = 5
+    lr_method = "adam"
+    lr = 0.001
+    lr_decay = 0.9
+    epoch_drop = 1  # step decay: apply lr_decay every epoch_drop epochs
+    clip = -1  # if negative, no gradient clipping
+    nepoch_no_imprv = 3
+
+    # model hyperparameters
+    hidden_size_char = 100  # lstm on chars
+    hidden_size_lstm = 300  # lstm on word embeddings
+
+    ner_model_path = "ner_{}e_bilstm_crf_elmo".format(nepochs)
+
+    # elmo config
+    use_elmo = True
+    dim_elmo = 1024
+
+    # NOTE: if both chars and crf, only 1.6x slower on GPU
+    use_crf = True  # if crf, training is 1.7x slower on CPU
+    use_chars = not use_elmo  # char embeddings (3.5x slower on CPU); disabled when ELMo is used
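
For context, a minimal usage sketch (a hypothetical driver script, not part of this diff; it assumes `build_data.py` has already written the vocab files and trimmed GloVe vectors under `bilstm_crf_ner/dataset/`):

```python
# hypothetical driver -- not part of this PR
from bilstm_crf_ner.model.config import Config

config = Config(load=True)  # creates output dir + logger, then calls load()
config.logger.info("vocab: {} words, {} chars, {} tags".format(
    config.nwords, config.nchars, config.ntags))

# processing_word maps a raw token to its id(s); with use_chars=True it
# yields (char_ids, word_id), with use_chars=False just the word id
token_ids = config.processing_word("Geneva")
```

Because the hyperparameters are plain class attributes, they can also be overridden on an instance (e.g. `config.batch_size = 32`) before the model is built.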
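The `epoch_drop` / `lr_decay` settings describe a step-decay learning-rate schedule. The actual update lives in the training loop rather than in this config, but it amounts to something like this sketch:

```python
def decayed_lr(lr, lr_decay, epoch_drop, epoch):
    """Step decay: multiply lr by lr_decay once every epoch_drop epochs."""
    return lr * (lr_decay ** (epoch // epoch_drop))

# with lr=0.001, lr_decay=0.9, epoch_drop=1:
# epoch 0 -> 0.001, epoch 1 -> 0.0009, epoch 2 -> 0.00081, ...
```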