medaCy / Git / [6c353a] /medacy/pipelines/lstm_systematic_review

Models:
philipB/
medaCy
Downloads: 1
[6c353a]: / medacy / pipelines / lstm_systematic_review_pipeline.py
History
Download this file
53 lines (39 with data), 2.1 kB

import spacy

from medacy.pipeline_components.feature_extractors.discrete_feature_extractor import FeatureExtractor
from medacy.pipeline_components.feature_overlayers.metamap.metamap import MetaMap
from medacy.pipeline_components.feature_overlayers.metamap.metamap_component import MetaMapOverlayer
from medacy.pipeline_components.learners.bilstm_crf_learner import BiLstmCrfLearner
from medacy.pipeline_components.tokenizers.systematic_review_tokenizer import SystematicReviewTokenizer
from medacy.pipelines.base.base_pipeline import BasePipeline


class LstmSystematicReviewPipeline(BasePipeline):
    """
    A named entity recognition pipeline that pairs the systematic review
    tokenizer with the BiLSTM+CRF learner.

    Created by Jorge Vargas of NLP@VCU
    """

    def __init__(self, entities, metamap=None, **kwargs):
        """
        Create the pipeline on top of spaCy's small English model.

        :param entities: a list of entities to be used by this pipeline
        :param metamap: optional; when truthy it is wrapped with ``MetaMap(...)`` and a
            MetaMapOverlayer component is added, otherwise MetaMap is not used
        :param word_embeddings: (keyword, required) path to a word embeddings file
        :param cuda_device: (keyword, optional) int for which GPU to use; stored as -1
            when omitted (assumed to select the CPU in BiLstmCrfLearner -- TODO confirm)
        :raises ValueError: if word_embeddings is missing or empty
        """
        # Use .get() so an omitted keyword reaches the intended ValueError instead of
        # surfacing as a bare KeyError, and check before the spaCy model is loaded so
        # an invalid call fails fast.
        word_embeddings = kwargs.get('word_embeddings')
        if not word_embeddings:
            raise ValueError('This pipeline requires word embeddings.')

        # kwargs is forwarded untouched; the base class receives exactly what the caller passed.
        super().__init__(entities, spacy_pipeline=spacy.load("en_core_web_sm"), **kwargs)

        if metamap:
            metamap = MetaMap(metamap)
            self.add_component(MetaMapOverlayer, metamap)

        self.word_embeddings = word_embeddings
        # The documented contract is "defaults to using the CPU"; indexing kwargs directly
        # made cuda_device mandatory in practice.
        self.cuda_device = kwargs.get('cuda_device', -1)

    def get_learner(self):
        """
        Build the learner for this pipeline.

        :return: a tuple of the learner's display name ('BiLSTM+CRF') and a
            BiLstmCrfLearner built from the stored word embeddings and CUDA device
        """
        learner = BiLstmCrfLearner(self.word_embeddings, self.cuda_device)
        return 'BiLSTM+CRF', learner

    def get_tokenizer(self):
        """
        :return: a SystematicReviewTokenizer wrapping this pipeline's spaCy pipeline
        """
        return SystematicReviewTokenizer(self.spacy_pipeline)

    def get_feature_extractor(self):
        """
        :return: a FeatureExtractor with a window size of 0 that extracts the
            text, pos, shape, prefix and suffix spaCy features
        """
        return FeatureExtractor(
            window_size=0,
            spacy_features=['text', 'pos', 'shape', 'prefix', 'suffix']
        )