# medacy/pipelines/clinical_pipeline.py
import spacy

from medacy.pipeline_components.feature_extractors.discrete_feature_extractor import FeatureExtractor
from medacy.pipeline_components.feature_overlayers.metamap.metamap_component import MetaMapOverlayer
from medacy.pipeline_components.learners.crf_learner import get_crf
from medacy.pipeline_components.tokenizers.clinical_tokenizer import ClinicalTokenizer
from medacy.pipelines.base.base_pipeline import BasePipeline


class ClinicalPipeline(BasePipeline):
    """
    Pipeline for clinical named entity recognition.

    What sets this pipeline apart is its tokenizer (``ClinicalTokenizer``), a special
    tokenizer that breaks a clinical document down to character level tokens. The
    pipeline was created for the extraction of ADE related entities from the
    2018 N2C2 Shared Task.

    Created by Andriy Mulyar (andriymulyar.com) of NLP@VCU
    """

    def __init__(self, entities, metamap=None, **kwargs):
        """
        Build the pipeline on top of spaCy's small English model ("en_core_web_sm").

        :param entities: a list of entities to use in this pipeline.
        :param metamap: an instance of MetaMap if metamap should be used, defaults to None.
        :param kwargs: any further keyword arguments, passed through unchanged to
            ``BasePipeline.__init__``.
        """
        language_model = spacy.load("en_core_web_sm")
        super().__init__(entities, spacy_pipeline=language_model, **kwargs)

        # Without a (truthy) MetaMap instance there is no overlayer to register.
        if not metamap:
            return

        self.add_component(MetaMapOverlayer, metamap)

    def get_learner(self):
        """
        Provide the learner used by this pipeline.

        :return: a 2-tuple of the label "CRF_l2sgd" and the object returned by ``get_crf()``.
        """
        learner_name = "CRF_l2sgd"
        learner = get_crf()
        return learner_name, learner

    def get_tokenizer(self):
        """
        Provide the tokenizer used by this pipeline.

        :return: a ``ClinicalTokenizer`` constructed from ``self.spacy_pipeline``
            (presumably set by ``BasePipeline.__init__`` -- confirm against the base class).
        """
        tokenizer = ClinicalTokenizer(self.spacy_pipeline)
        return tokenizer

    def get_feature_extractor(self):
        """
        Provide the feature extractor used by this pipeline.

        :return: a ``FeatureExtractor`` configured with ``window_size=3`` and the spaCy
            features 'pos_', 'shape_', 'prefix_', 'suffix_' and 'text'.
        """
        token_features = ['pos_', 'shape_', 'prefix_', 'suffix_', 'text']
        return FeatureExtractor(window_size=3, spacy_features=token_features)