--- /dev/null
+++ b/bilstm_crf_ner/build_data.py
@@ -0,0 +1,54 @@
+from model.config import Config
+from model.data_utils import CoNLLDataset, get_vocabs, UNK, NUM, \
+    get_glove_vocab, write_vocab, load_vocab, get_char_vocab, \
+    export_trimmed_glove_vectors, get_processing_word
+
+
+def main():
+    """Procedure to build data.
+
+    You MUST RUN this procedure. It iterates over the whole dataset (train,
+    dev and test) and extracts the vocabularies in terms of words, tags, and
+    characters. Having built the vocabularies, it writes them to a file;
+    writing the vocabulary to a file assigns an id (the line number) to each
+    word. It then extracts the relevant GloVe vectors and stores them in a
+    numpy array such that the i-th entry corresponds to the i-th word in the
+    vocabulary.
+    """
+    # 1. Get config (hyper-parameters, file paths, ...)
+    config = Config(load=False)
+
+    # 2. Get processing word generator
+    processing_word = get_processing_word(lowercase=True)
+
+    # 3. Generators
+    dev = CoNLLDataset(config.filename_dev, processing_word)
+    test = CoNLLDataset(config.filename_test, processing_word)
+    train = CoNLLDataset(config.filename_train, processing_word)
+
+    # 4. Build word and tag vocab
+    vocab_words, vocab_tags = get_vocabs([train, dev, test])
+    vocab_glove = get_glove_vocab(config.filename_glove)
+
+    # 5. Keep only the words that appear in both vocab_words and vocab_glove
+    vocab = vocab_words & vocab_glove
+    vocab.add(UNK)
+    vocab.add(NUM)
+
+    # 6. Save vocab
+    write_vocab(vocab, config.filename_words)
+    write_vocab(vocab_tags, config.filename_tags)
+
+    # 7. Trim GloVe vectors
+    vocab = load_vocab(config.filename_words)
+    export_trimmed_glove_vectors(vocab, config.filename_glove,
+                                 config.filename_trimmed, config.dim_word)
+
+    # 8. Build and save char vocab
+    train = CoNLLDataset(config.filename_train)
+    vocab_chars = get_char_vocab(train)
+    write_vocab(vocab_chars, config.filename_chars)
+
+
+if __name__ == "__main__":
+    main()
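
A note on step 2: get_processing_word returns a closure that normalizes each raw token before it enters the vocabulary. A minimal sketch of the idea, assuming the helper only lowercases and collapses digit tokens to the NUM placeholder (the actual version in model/data_utils.py also supports char-level features and word-id lookup, which this sketch omits):

def get_processing_word(lowercase=False):
    """Returns a function word -> normalized word, e.g.
    "Paris" -> "paris" and "1984" -> NUM when lowercase=True."""
    def f(word):
        if lowercase:
            word = word.lower()
        if word.isdigit():
            # assumption: NUM is the module-level placeholder imported above
            word = NUM
        return word
    return f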
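
Steps 6 and 7 depend on the vocabulary file format described in the docstring: one word per line, so that a word's id is simply its line number. A minimal sketch of how write_vocab and load_vocab could realize this contract (the exact implementations live in model/data_utils.py and are not part of this diff):

def write_vocab(vocab, filename):
    """Writes one word per line; the line number becomes the word id."""
    with open(filename, "w") as f:
        for i, word in enumerate(vocab):
            if i != len(vocab) - 1:
                f.write("{}\n".format(word))
            else:
                # no trailing newline, so the file has exactly len(vocab) lines
                f.write(word)


def load_vocab(filename):
    """Reads the file back into a dict mapping word -> id."""
    d = dict()
    with open(filename) as f:
        for idx, word in enumerate(f):
            d[word.strip()] = idx
    return d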
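
Step 7 ("Trim GloVe vectors") keeps only the rows of the full GloVe file that correspond to words in the final vocabulary, shrinking a very large embedding file down to exactly len(vocab) vectors. A sketch under the assumption that the GloVe file is in the standard text format, one word followed by its dim floats per line:

import numpy as np


def export_trimmed_glove_vectors(vocab, glove_filename, trimmed_filename, dim):
    """Saves a [len(vocab), dim] matrix whose i-th row is the GloVe
    vector of the word with id i; words missing from GloVe stay zero."""
    embeddings = np.zeros([len(vocab), dim])
    with open(glove_filename) as f:
        for line in f:
            parts = line.strip().split(" ")
            word = parts[0]
            if word in vocab:
                embeddings[vocab[word]] = np.asarray(
                    [float(x) for x in parts[1:]])
    np.savez_compressed(trimmed_filename, embeddings=embeddings)

At training time the trimmed matrix can then be recovered with np.load(config.filename_trimmed)["embeddings"], assuming config.filename_trimmed ends in .npz.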