[6c353a]: medacy/pipelines/base/base_pipeline.py

import inspect
import time
from abc import ABC, abstractmethod

import spacy

import medacy
from medacy.pipeline_components.feature_overlayers.gold_annotator_component import GoldAnnotatorOverlayer


class BasePipeline(ABC):
    """
    An abstract wrapper for a Medical NER Pipeline
    """

    def __init__(self, entities, spacy_pipeline, **kwargs):
        """
        Initializes a pipeline

        :param entities: a list of entities, or an empty list (or None) if the pipeline is for a model that
            has already been fitted
        :param spacy_pipeline: the corresponding spaCy pipeline (language) to utilize.
        :param cuda_device: the GPU to use, if any (defaults to -1 for using the CPU)
        """
        self.entities = entities or []
        self.spacy_pipeline = spacy_pipeline
        self.overlayers = []  # Stores feature overlayers
        self._kwargs = kwargs

        # Set tokenizer, if something other than the spaCy pipeline's tokenizer is specified in get_tokenizer()
        tokenizer = self.get_tokenizer()
        if tokenizer:
            # 'tokenizer' is a class with an attribute named 'tokenizer'
            self.spacy_pipeline.tokenizer = tokenizer.tokenizer

        self.add_component(GoldAnnotatorOverlayer, entities)

        # The following code was causing GPU errors because you cannot specify which GPU spaCy will use;
        # you may uncomment this code if you know you have access to the GPU that spaCy will use.
        # if cuda_device >= 0:
        #     spacy.require_gpu()
    @abstractmethod
    def get_tokenizer(self):
        """Returns an instance of a tokenizer"""
        pass

    @abstractmethod
    def get_learner(self):
        """Returns an instance of a scikit-learn compatible learning algorithm."""
        pass

    @abstractmethod
    def get_feature_extractor(self):
        """Returns an instance of FeatureExtractor with all configs set."""
        pass
    def add_component(self, component, *argv, **kwargs):
        """
        Adds a given component to the pipeline

        :param component: a subclass of BaseOverlayer (the class itself, not an instance)
        :param argv, kwargs: arguments to pass to the constructor of the component
        """
        current_components = [component_name for component_name, proc in self.spacy_pipeline.pipeline]

        assert component.name not in current_components, "%s is already in the pipeline." % component.name

        for dependent in component.dependencies:
            assert dependent in current_components, "%s depends on %s but it hasn't been added to the pipeline" % (component, dependent)

        new_component = component(self.spacy_pipeline, *argv, **kwargs)
        self.spacy_pipeline.add_pipe(new_component)

        if component.name != "gold_annotator":
            self.overlayers.append(new_component)
    def get_component_names(self):
        """
        Retrieves a listing of all components currently in the pipeline.

        :return: a list of components inside the pipeline.
        """
        return [component_name for component_name, _ in self.spacy_pipeline.pipeline if component_name != 'ner']
    def __call__(self, doc, predict=False):
        """
        Passes a single document through the pipeline.
        All relevant document attributes should be set prior to this call.

        :param doc: the document to annotate over
        :param predict: if True, the gold annotator component is skipped
        :return: the annotated document
        """
        for component_name, proc in self.spacy_pipeline.pipeline:
            if predict and component_name == "gold_annotator":
                continue
            doc = proc(doc)
            if component_name == 'ner':
                # Remove labeled default entities
                doc.ents = []

        return doc
    def get_report(self):
        """
        Generates a report about the pipeline class's configuration

        :return: str
        """
        # Get data about these components
        learner_name, learner = self.get_learner()
        tokenizer = self.get_tokenizer()
        feature_extractor = self.get_feature_extractor()
        spacy_metadata = self.spacy_pipeline.meta

        # Start the report with the name of the class and the docstring
        report = f"{type(self).__name__}\n{self.__doc__}\n\n"
        report += f"Report created at {time.asctime()}\n\n"
        report += f"MedaCy Version: {medacy.__version__}\nSpaCy Version: {spacy.__version__}\n"
        report += f"SpaCy Model: {spacy_metadata['name']}, version {spacy_metadata['version']}\n"
        report += f"Entities: {self.entities}\n"
        report += f"Constructor arguments: {self._kwargs}\n\n"

        # Print data about the feature overlayers
        if self.overlayers:
            report += "Feature Overlayers:\n\n"
            report += "\n\n".join(o.get_report() for o in self.overlayers) + '\n\n'

        # Print data about the feature extractor
        report += f"Feature Extractor: {type(feature_extractor).__name__} at {inspect.getfile(type(feature_extractor))}\n"
        report += f"\tWindow Size: {feature_extractor.window_size}\n"
        report += f"\tSpaCy Features: {feature_extractor.spacy_features}\n"

        # Print the name and location of the remaining components
        report += f"Learner: {learner_name} at {inspect.getfile(type(learner))}\n"

        if self.get_tokenizer():
            report += f"Tokenizer: {type(tokenizer).__name__} at {inspect.getfile(type(tokenizer))}\n"
        else:
            report += "Tokenizer: spaCy pipeline default\n"

        return report