[6c353a]: /medacy/model/multi_model.py

import os
from shutil import copyfile

from medacy.data.annotations import Annotations
from medacy.data.dataset import Dataset
from medacy.model.model import Model
from medacy.pipelines.base.base_pipeline import BasePipeline


def _activate_model(model_path, pipeline_class, args, kwargs):
    """
    Creates a Model with the given pipeline configuration and loads its weights from the pickle file at model_path

    :param model_path: path to the model pickle file
    :param pipeline_class: the pipeline class for the pickled model
    :param args, kwargs: arguments to pass to the pipeline constructor
    :return: a usable Model instance
    """
    pipeline_instance = pipeline_class(*args, **kwargs)
    model = Model(pipeline_instance)
    model.load(model_path)
    return model


class MultiModel:
    """
    Allows for prediction with multiple models, ensuring that only the model being used at a given time and its
    pipeline are present in memory.

    An example use case:

    >>> from medacy.model.multi_model import MultiModel
    >>> from medacy.pipelines.clinical_pipeline import ClinicalPipeline
    >>> from medacy.pipelines.scispacy_pipeline import ScispacyPipeline
    >>> multimodel = MultiModel()
    >>> multimodel.add_model('path/to/model_one.pkl', ClinicalPipeline, ['Drug', 'ADE'])
    >>> multimodel.add_model('path/to/model_two.pkl', ScispacyPipeline, ['Dose', 'Frequency'])
    >>> for model in multimodel:
    ...     model.predict('The patient was prescribed 5mg of Tylenol and got a headache.')
    >>> predicted_data = multimodel.predict_directory('path/to/input/data', 'path/to/output/directory')
    """

    def __init__(self):
        """No values are needed to instantiate a new MultiModel."""
        self.models = []

    def __len__(self):
        return len(self.models)

    def add_model(self, model_path, pipeline_class, *args, **kwargs):
        """
        Adds a new model to the MultiModel

        :param model_path: path to the model pickle file
        :param pipeline_class: the pipeline class for the pickled model
        :param args, kwargs: arguments to pass to the pipeline constructor
        :return: None
        """
        if not os.path.isfile(model_path):
            raise FileNotFoundError(f"'model_path' is not a path to an existing file, but is {repr(model_path)}")
        if not issubclass(pipeline_class, BasePipeline):
            raise TypeError(f"'pipeline_class' must be a subclass of BasePipeline, but is {repr(pipeline_class)}")

        self.models.append((model_path, pipeline_class, args, kwargs))
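
    # Note (illustration only): any extra *args/**kwargs are stored as-is and
    # forwarded verbatim to pipeline_class when the model is activated. A
    # hypothetical call, assuming the pipeline's __init__ accepts a
    # cuda_device keyword:
    #     multimodel.add_model('path/to/model.pkl', ClinicalPipeline, ['Drug'], cuda_device=0)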

    def __iter__(self):
        """
        Individually activates and yields usable Model instances one at a time
        """
        for model_path, pipeline_class, args, kwargs in self.models:
            yield _activate_model(model_path, pipeline_class, args, kwargs)

    def predict_directory(self, data_directory, prediction_directory):
        """
        Predicts over all txt files in a directory using every Model. Note that this method spends a lot of time
        on file IO because each txt file is opened as many times as there are models.

        :param data_directory: path to a directory of text files to predict over
        :param prediction_directory: a directory to write predictions to
        :return: a Dataset of the predictions
        """
        if not os.path.isdir(data_directory):
            raise ValueError(f"'data_directory' must be an existing directory, but is {repr(data_directory)}")
        if not os.path.isdir(prediction_directory):
            raise ValueError(f"'prediction_directory' must be an existing directory, but is {repr(prediction_directory)}")

        # Get all the txt files in the input directory
        txt_files = [f for f in os.listdir(data_directory) if f.endswith('.txt')]

        # Create a dictionary of empty Annotations objects to accumulate the predictions
        annotation_dict = {f: Annotations([], source_text_path=f) for f in txt_files}

        for model in self:
            for file_name in txt_files:
                file_path = os.path.join(data_directory, file_name)
                with open(file_path) as f:
                    text = f.read()

                this_annotations = annotation_dict[file_name]
                resulting_annotations = model.predict(text)
                # Merge the two Annotations together and store them back in the dictionary
                annotation_dict[file_name] = this_annotations | resulting_annotations

        # Write the merged predictions out as a new Dataset directory
        for file_name, ann in annotation_dict.items():
            source_path = os.path.join(data_directory, file_name)
            # Get the name of the output ann file
            base_name = os.path.basename(source_path)[:-4]
            output_ann = os.path.join(prediction_directory, base_name + '.ann')
            output_txt = os.path.join(prediction_directory, base_name + '.txt')

            # Write the ann file
            ann.to_ann(output_ann)
            # Copy the txt file
            copyfile(source_path, output_txt)

        return Dataset(prediction_directory)
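
# Usage sketch (illustrative; the paths and pipeline choice below are
# hypothetical placeholders): predict_directory requires both directories to
# already exist, so the output directory is created before the call.
#
#     import os
#     from medacy.model.multi_model import MultiModel
#     from medacy.pipelines.clinical_pipeline import ClinicalPipeline
#
#     multimodel = MultiModel()
#     multimodel.add_model('path/to/model_one.pkl', ClinicalPipeline, ['Drug', 'ADE'])
#     os.makedirs('path/to/output/directory', exist_ok=True)
#     dataset = multimodel.predict_directory('path/to/input/data', 'path/to/output/directory')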