|
a |
|
b/medacy/pipelines/systematic_review_pipeline.py |
|
|
1 |
import spacy |
|
|
2 |
|
|
|
3 |
from medacy.pipeline_components.feature_extractors.discrete_feature_extractor import FeatureExtractor |
|
|
4 |
from medacy.pipeline_components.feature_overlayers.metamap.metamap import MetaMap |
|
|
5 |
from medacy.pipeline_components.feature_overlayers.metamap.metamap_component import MetaMapOverlayer |
|
|
6 |
from medacy.pipeline_components.learners.crf_learner import get_crf |
|
|
7 |
from medacy.pipeline_components.tokenizers.systematic_review_tokenizer import SystematicReviewTokenizer |
|
|
8 |
from medacy.pipelines.base.base_pipeline import BasePipeline |
|
|
9 |
|
|
|
10 |
|
|
|
11 |
class SystematicReviewPipeline(BasePipeline):
    """
    A pipeline for clinical named entity recognition, designed over-top of
    the TAC 2018 SRIE track challenge.

    Created by Andriy Mulyar (andriymulyar.com) of NLP@VCU
    """

    def __init__(self, entities, metamap=None, **kwargs):
        """
        Initialise the base pipeline with spaCy's small English model
        ("en_core_web_sm") and, when requested, register a MetaMap overlayer.

        :param entities: a list of entities, forwarded to BasePipeline
        :param metamap: optional value handed to the MetaMap constructor; when
            falsy, no MetaMap component is added.
            NOTE(review): this is wrapped in MetaMap(...) below, so it is
            presumably not an already-built MetaMap instance -- confirm
            against the MetaMap constructor.
        :param kwargs: extra keyword arguments forwarded unchanged to BasePipeline
        """
        nlp = spacy.load("en_core_web_sm")
        super().__init__(entities, spacy_pipeline=nlp, **kwargs)

        # Without a MetaMap argument there is nothing to overlay.
        if not metamap:
            return

        self.add_component(MetaMapOverlayer, MetaMap(metamap))

    def get_learner(self):
        """
        :return: a 2-tuple of the learner name "CRF_l2sgd" and the object
            returned by get_crf()
        """
        learner = get_crf()
        return "CRF_l2sgd", learner

    def get_tokenizer(self):
        """
        :return: a SystematicReviewTokenizer built from this pipeline's
            spacy_pipeline attribute
        """
        nlp = self.spacy_pipeline
        return SystematicReviewTokenizer(nlp)

    def get_feature_extractor(self):
        """
        :return: a FeatureExtractor configured with a window size of 10 and
            the spaCy feature names listed below
        """
        token_features = ['pos_', 'shape_', 'prefix_', 'suffix_', 'text']
        return FeatureExtractor(window_size=10, spacy_features=token_features)