NLP_CRT / Git / [8d2107] /doc2vec

Models:
philipB/
NLP_CRT
Downloads: 1
[8d2107]: / doc2vec_trainer.py
History
Download this file
70 lines (55 with data), 2.4 kB

import argparse
import lockfile

from daemon import DaemonContext
from gensim.models import Doc2Vec
from gensim.models.doc2vec import LabeledSentence

from loader import get_data


class LabeledDocIterator(object):
    """Re-iterable stream of labeled documents for Doc2Vec training.

    Every call to ``iter()`` starts a fresh pass: each entry of
    ``patient_list`` is loaded through ``get_data`` and one
    ``LabeledSentence`` is yielded per document found under the requested
    categories.

    Args:
        patient_list: re-iterable collection of values handed to
            ``get_data`` (one at a time, wrapped in a single-element list).
        categories: keys of the patient record whose documents are used.
        status: writable file-like object; the ``NEW_EMPI`` of every loaded
            patient is written to it, one per line.
    """

    def __init__(self, patient_list, categories, status):
        self.patient_list = patient_list
        # Attribute name kept as ``category`` for backward compatibility.
        self.category = categories
        self.status = status

    def __iter__(self):
        """Yield a ``LabeledSentence`` for each document of each patient."""
        for i in self.patient_list:
            # get_data is called with a one-element list; its first result
            # is used as the patient record (a mapping keyed by category
            # names plus 'NEW_EMPI').
            p = get_data([i])[0]
            empi = p['NEW_EMPI']
            self.status.write(empi + '\n')
            # Iterate the categories given to the constructor rather than a
            # module-level ``categories`` name, which only exists when the
            # file is run as a script.
            for category in self.category:
                if category in p:
                    for idx, doc in enumerate(p[category]):
                        # NOTE(review): the tag ends with '\n'; this looks
                        # unintentional but is preserved because saved
                        # models / lookups may rely on it -- confirm.
                        tag = empi + '_' + category + '_' + str(idx) + '\n'
                        yield LabeledSentence(words=doc['free_text'].split(),
                                              tags=[tag])


def train_doc2vec_model(categories, n_patients, output_file, status_file, dm):
    """Train a Doc2Vec model on patient documents and save it.

    Args:
        categories: record keys whose documents are fed to the model.
        n_patients: number of patients; ``range(n_patients)`` is handed to
            ``LabeledDocIterator`` as the patient list.
        output_file: path passed to ``model.save``.
        status_file: path of the progress log; opened in ``'w'`` mode, it
            receives the epoch banners and the iterator's per-patient lines.
        dm: forwarded unchanged as ``Doc2Vec(dm=...)``.
    """
    with open(status_file, 'w') as status:
        it = LabeledDocIterator(range(n_patients), categories, status)

        # alpha == min_alpha: use a fixed learning rate within each pass;
        # the decay is applied manually between epochs below.
        model = Doc2Vec(size=300, window=10, dm=dm, min_count=5, workers=11,
                        alpha=0.025, min_alpha=0.025)
        model.build_vocab(it)
        for epoch in range(10):
            message = ("***********Training Epoch: " + str(epoch)
                       + ("***********") + '\n')
            print(message)
            status.write(message)
            # Exactly one pass over the corpus per epoch (the original
            # called train() a second time after the alpha update, doubling
            # the passes made for every logged epoch).
            model.train(it)
            model.alpha -= 0.002  # decrease the learning rate
            model.min_alpha = model.alpha  # fix the learning rate, no decay

        # Save the model
        model.save(output_file)


if __name__ == "__main__":
    # Command line: output_file n_patients categories dm
    # ``categories`` is a comma-separated list of record keys.
    parser = argparse.ArgumentParser()
    parser.add_argument("output_file")
    # Numeric arguments are converted by argparse so that a bad value is
    # reported on the terminal before the process is daemonized, instead of
    # raising inside the daemon context.
    parser.add_argument("n_patients", type=int)
    parser.add_argument("categories")
    # Switches between Distributed Memory and Distributed Bag of Words Model
    parser.add_argument("dm", type=int)
    args = parser.parse_args()
    status_file = args.output_file + '.status'
    categories = args.categories.split(',')

    base = '/home/ubuntu/josh_project'
    context = DaemonContext(
        working_directory=base,
        umask=0o002,
        # Explicit separator: plain ``base + 'doc2vec_trainer.pid'`` produced
        # '/home/ubuntu/josh_projectdoc2vec_trainer.pid', outside ``base``.
        pidfile=lockfile.FileLock(base + '/doc2vec_trainer.pid'),
    )

    # Training runs inside the daemon context, whose working directory is
    # ``base``.
    with context:
        train_doc2vec_model(categories, args.n_patients,
                            args.output_file, status_file, args.dm)