# medacy/pipelines/scispacy_pipeline.py
1
import spacy
2
3
from medacy.pipeline_components.feature_extractors.discrete_feature_extractor import FeatureExtractor
4
from medacy.pipeline_components.feature_overlayers.metamap.metamap_component import MetaMapOverlayer
5
from medacy.pipeline_components.learners.crf_learner import get_crf
6
from medacy.pipelines.base.base_pipeline import BasePipeline
7
8
9
class ScispacyPipeline(BasePipeline):
    """
    Named entity recognition pipeline built on ScispaCy, see https://allenai.github.io/scispacy/

    Unlike the ClinicalPipeline, this pipeline loads AllenAI's 'en_core_sci_md' model and
    does not supply a tokenizer of its own, so tokenization is left to spaCy.

    Created by Steele Farnsworth of NLP@VCU

    Requirements:
    scispacy
    https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.0/en_core_sci_md-0.2.0.tar.gz
    """

    def __init__(self, entities, metamap=None, **kwargs):
        """
        Load the 'en_core_sci_md' model and initialise the base pipeline with it.

        :param entities: a list of entities
        :param metamap: an instance of MetaMap if metamap should be used, defaults to None.
        :param kwargs: any further keyword arguments, passed through to BasePipeline.__init__
        """
        scispacy_model = spacy.load("en_core_sci_md")
        super().__init__(entities, spacy_pipeline=scispacy_model, **kwargs)

        # The MetaMap overlayer is optional; nothing more to do without one.
        if not metamap:
            return
        self.add_component(MetaMapOverlayer, metamap)

    def get_learner(self):
        """
        :return: a tuple of the learner's name, "CRF_l2sgd", and the object returned by get_crf()
        """
        crf_model = get_crf()
        return "CRF_l2sgd", crf_model

    def get_tokenizer(self):
        """
        :return: None, since this pipeline defines no tokenizer of its own
        """
        return None

    def get_feature_extractor(self):
        """
        :return: a FeatureExtractor with a window size of 3 over a fixed list of spaCy token attributes
        """
        token_attributes = ['pos_', 'shape_', 'prefix_', 'suffix_', 'text']
        return FeatureExtractor(window_size=3, spacy_features=token_attributes)