import inspect
import time
from abc import ABC, abstractmethod
import spacy
import medacy
from medacy.pipeline_components.feature_overlayers.gold_annotator_component import GoldAnnotatorOverlayer
class BasePipeline(ABC):
    """
    An abstract wrapper for a Medical NER Pipeline
    """

    def __init__(self, entities, spacy_pipeline, **kwargs):
        """
        Initializes a pipeline

        :param entities: a list of entities, or an empty list (or None) if the pipeline is for a model that
            has already been fitted
        :param spacy_pipeline: the corresponding spacy pipeline (language) to utilize.
        :param kwargs: any additional constructor arguments; they are stored unchanged and only echoed
            back by get_report(). (Earlier documentation mentions a ``cuda_device`` argument here; this
            class does not read it itself.)
        """
        # Normalize None to an empty list so the rest of the class can rely on a list.
        self.entities = entities or []
        self.spacy_pipeline = spacy_pipeline
        self.overlayers = []  # Stores feature overlayers
        self._kwargs = kwargs

        # Set tokenizer, if something other than the spaCy pipeline's tokenizer is specified in get_tokenizer()
        tokenizer = self.get_tokenizer()
        if tokenizer:
            # 'tokenizer' is a class with an attribute named 'tokenizer'
            self.spacy_pipeline.tokenizer = tokenizer.tokenizer

        # Pass the normalized list (never None) so the component sees the same value as self.entities.
        self.add_component(GoldAnnotatorOverlayer, self.entities)

        # The following code was causing GPU errors because you cannot specify which GPU spaCy will use;
        # You may uncomment this code if you know you have access to the GPU that spaCy will use.
        # if cuda_device >= 0:
        #     spacy.require_gpu()

    @abstractmethod
    def get_tokenizer(self):
        """
        Returns an instance of a tokenizer, or a falsy value to keep the spaCy pipeline's own tokenizer.

        A returned tokenizer must expose a ``tokenizer`` attribute, which is what gets installed on the
        spaCy pipeline.
        """
        pass

    @abstractmethod
    def get_learner(self):
        """
        Returns the learning algorithm used by this pipeline.

        get_report() unpacks the result as a ``(learner_name, learner)`` pair, so implementations are
        expected to return the display name together with the learner instance.
        """
        pass

    @abstractmethod
    def get_feature_extractor(self):
        """
        Returns an instance of FeatureExtractor with all configs set.

        get_report() reads the ``window_size`` and ``spacy_features`` attributes of the returned object.
        """
        pass

    def add_component(self, component, *argv, **kwargs):
        """
        Adds a given component to pipeline

        :param component: a subclass of BaseOverlayer (the class itself; not an instance)
        :param argv, kwargs: arguments to pass to the constructor of the component (after the spaCy pipeline)
        :raises AssertionError: if a component with the same name is already in the pipeline, or if one of
            the component's dependencies has not been added yet
        """
        current_components = {component_name for component_name, _ in self.spacy_pipeline.pipeline}

        # Raised explicitly (rather than via ``assert``) so the checks still run under ``python -O``;
        # the exception type is kept as AssertionError for backward compatibility.
        if component.name in current_components:
            raise AssertionError(f"{component.name} is already in the pipeline.")

        for dependent in component.dependencies:
            if dependent not in current_components:
                raise AssertionError(
                    f"{component.name} depends on {dependent} but it hasn't been added to the pipeline"
                )

        new_component = component(self.spacy_pipeline, *argv, **kwargs)
        self.spacy_pipeline.add_pipe(new_component)

        # The gold annotator is part of the spaCy pipeline but is not tracked as a feature overlayer.
        if component.name != "gold_annotator":
            self.overlayers.append(new_component)

    def get_component_names(self):
        """
        Retrieves a listing of all components currently in the pipeline.

        :return: a list of the names of the components inside the pipeline, excluding 'ner'.
        """
        return [component_name for component_name, _ in self.spacy_pipeline.pipeline if component_name != 'ner']

    def __call__(self, doc, predict=False):
        """
        Passes a single document through the pipeline.
        All relevant document attributes should be set prior to this call.

        :param doc: the document to annotate over
        :param predict: when True, the "gold_annotator" component is skipped
        :return: the annotated document
        """
        for component_name, proc in self.spacy_pipeline.pipeline:
            if predict and component_name == "gold_annotator":
                continue
            doc = proc(doc)
            if component_name == 'ner':
                # remove labeled default entities
                doc.ents = []
        return doc

    def get_report(self):
        """
        Generates a report about the pipeline class's configuration

        :return: str
        """
        # Get data about these components (each getter is called exactly once)
        learner_name, learner = self.get_learner()
        tokenizer = self.get_tokenizer()
        feature_extractor = self.get_feature_extractor()
        spacy_metadata = self.spacy_pipeline.meta

        # Start the report with the name of the class and the docstring
        sections = [
            f"{type(self).__name__}\n{self.__doc__}\n\n",
            f"Report created at {time.asctime()}\n\n",
            f"MedaCy Version: {medacy.__version__}\nSpaCy Version: {spacy.__version__}\n",
            f"SpaCy Model: {spacy_metadata['name']}, version {spacy_metadata['version']}\n",
            f"Entities: {self.entities}\n",
            f"Constructor arguments: {self._kwargs}\n\n",
        ]

        # Print data about the feature overlayers
        if self.overlayers:
            sections.append("Feature Overlayers:\n\n")
            sections.append("\n\n".join(o.get_report() for o in self.overlayers) + '\n\n')

        # Print data about the feature extractor
        extractor_type = type(feature_extractor)
        sections.append(f"Feature Extractor: {extractor_type.__name__} at {inspect.getfile(extractor_type)}\n")
        sections.append(f"\tWindow Size: {feature_extractor.window_size}\n")
        sections.append(f"\tSpaCy Features: {feature_extractor.spacy_features}\n")

        # Print the name and location of the remaining components
        sections.append(f"Learner: {learner_name} at {inspect.getfile(type(learner))}\n")

        # Reuse the tokenizer fetched above instead of constructing a second one.
        if tokenizer:
            tokenizer_type = type(tokenizer)
            sections.append(f"Tokenizer: {tokenizer_type.__name__} at {inspect.getfile(tokenizer_type)}\n")
        else:
            sections.append("Tokenizer: spaCy pipeline default\n")

        return "".join(sections)