|
a |
|
b/medacy/pipelines/scispacy_pipeline.py |
|
|
1 |
import spacy |
|
|
2 |
|
|
|
3 |
from medacy.pipeline_components.feature_extractors.discrete_feature_extractor import FeatureExtractor |
|
|
4 |
from medacy.pipeline_components.feature_overlayers.metamap.metamap_component import MetaMapOverlayer |
|
|
5 |
from medacy.pipeline_components.learners.crf_learner import get_crf |
|
|
6 |
from medacy.pipelines.base.base_pipeline import BasePipeline |
|
|
7 |
|
|
|
8 |
|
|
|
9 |
class ScispacyPipeline(BasePipeline):
    """
    Named entity recognition pipeline built on ScispaCy, see https://allenai.github.io/scispacy/

    Unlike the ClinicalPipeline, this pipeline loads AllenAI's 'en_core_sci_md' model and
    relies on spaCy's own tokenizer instead of a custom one.

    Created by Steele Farnsworth of NLP@VCU

    Requirements:
    scispacy
    https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.0/en_core_sci_md-0.2.0.tar.gz
    """

    def __init__(self, entities, metamap=None, **kwargs):
        """
        Load the 'en_core_sci_md' model and hand it to the base pipeline.

        :param entities: a list of entities
        :param metamap: an instance of MetaMap if metamap should be used, defaults to None.
        :param kwargs: any further keyword arguments, passed through to BasePipeline unchanged.
        """
        scispacy_model = spacy.load("en_core_sci_md")
        super().__init__(entities, spacy_pipeline=scispacy_model, **kwargs)

        # The MetaMap overlayer is optional: it is registered only for a truthy metamap.
        if metamap:
            self.add_component(MetaMapOverlayer, metamap)

    def get_learner(self):
        """
        :return: a 2-tuple of the learner name "CRF_l2sgd" and the object returned by get_crf().
        """
        learner = get_crf()
        return "CRF_l2sgd", learner

    def get_tokenizer(self):
        """
        :return: always None; per the class description, tokenization is left to spaCy's tokenizer.
        """
        return None

    def get_feature_extractor(self):
        """
        :return: a FeatureExtractor with a window size of 3 over the listed spaCy token attributes.
        """
        token_attributes = ['pos_', 'shape_', 'prefix_', 'suffix_', 'text']
        return FeatureExtractor(window_size=3, spacy_features=token_attributes)