# medacy/tools/json_to_pipeline.py
import json
import os
import spacy
from medacy.pipeline_components.feature_extractors.discrete_feature_extractor import FeatureExtractor
from medacy.pipeline_components.feature_extractors.text_extractor import TextExtractor
from medacy.pipeline_components.feature_overlayers.metamap.metamap import MetaMap
from medacy.pipeline_components.feature_overlayers.metamap.metamap_all_types_component import MetaMapAllTypesOverlayer
from medacy.pipeline_components.feature_overlayers.metamap.metamap_component import MetaMapOverlayer
from medacy.pipeline_components.learners.bert_learner import BertLearner
from medacy.pipeline_components.learners.bilstm_crf_learner import BiLstmCrfLearner
from medacy.pipeline_components.learners.crf_learner import get_crf
from medacy.pipeline_components.tokenizers.character_tokenizer import CharacterTokenizer
from medacy.pipeline_components.tokenizers.clinical_tokenizer import ClinicalTokenizer
from medacy.pipeline_components.tokenizers.systematic_review_tokenizer import SystematicReviewTokenizer
from medacy.pipelines.base.base_pipeline import BasePipeline
required_keys = [
    'learner',
    'spacy_pipeline',
]


def json_to_pipeline(json_path):
    """
    Constructs a custom pipeline from a json file

    The json must have the following keys:
    'learner': 'CRF', 'BiLSTM', or 'BERT'
    'spacy_pipeline': the spaCy model to use

    The following keys are optional:
    'spacy_features': a list of features that exist as spaCy token annotations
    'window_size': the number of words +/- the target word whose features should be used along with the target word; defaults to 0
    'tokenizer': 'clinical', 'systematic_review', or 'character'; defaults to the spaCy model's tokenizer
    'metamap': the path to the MetaMap binary; MetaMap will only be used if this key is present
        if 'metamap' is a key, 'semantic_types' must also be a key, with value 'all', 'none', or
        a list of semantic type strings

    :param json_path: the path to the json file, or a dict of what that json would be
    :return: a custom pipeline class
    :raises TypeError: if json_path is neither a path nor a dict
    :raises ValueError: if a required key is missing from the configuration
    """
    if isinstance(json_path, (str, os.PathLike)):
        with open(json_path, 'rb') as f:
            input_json = json.load(f)
    elif isinstance(json_path, dict):
        input_json = json_path
    else:
        # Previously this fell through silently, producing a confusing
        # NameError on the next line; fail fast with a clear message instead.
        raise TypeError(f"'json_path' must be a path to a json file or a dict, but is {type(json_path)}")

    missing_keys = [key for key in required_keys if key not in input_json]
    if missing_keys:
        raise ValueError(f"Required key(s) '{missing_keys}' was/were not found in the json file.")

    class CustomPipeline(BasePipeline):
        """A custom pipeline configured from a JSON file"""

        def __init__(self, entities, **kwargs):
            """
            :param entities: a list of entity labels this pipeline predicts
            :param kwargs: learner hyperparameters (cuda_device, batch_size, learning_rate,
                epochs, pretrained_model, using_crf, word_embeddings)
            """
            super().__init__(entities, spacy_pipeline=spacy.load(input_json['spacy_pipeline']))

            # MetaMap is only overlaid when the config asks for it
            if 'metamap' in input_json:
                if 'semantic_types' not in input_json:
                    raise ValueError("'semantic_types' must be a key when 'metamap' is a key.")

                metamap = MetaMap(input_json['metamap'])
                semantic_types = input_json['semantic_types']

                if semantic_types == 'all':
                    self.add_component(MetaMapAllTypesOverlayer, metamap)
                elif semantic_types == 'none':
                    self.add_component(MetaMapOverlayer, metamap, semantic_type_labels=[])
                elif isinstance(semantic_types, list):
                    self.add_component(MetaMapOverlayer, metamap, semantic_type_labels=semantic_types)
                else:
                    raise ValueError("'semantic_types' must be 'all', 'none', or a list of strings")

            # BERT values (read by get_learner when the learner is 'BERT')
            self.cuda_device = kwargs.get('cuda_device', -1)
            self.batch_size = kwargs.get('batch_size', 8)
            self.learning_rate = kwargs.get('learning_rate', 1e-5)
            self.epochs = kwargs.get('epochs', 3)
            self.pretrained_model = kwargs.get('pretrained_model', 'bert-large-cased')
            self.using_crf = kwargs.get('using_crf', False)

            # BiLSTM value: pretrained word embeddings are mandatory for this learner
            if input_json['learner'] == 'BiLSTM':
                if 'word_embeddings' not in kwargs:
                    raise ValueError("BiLSTM learner requires word embeddings; use the parameter '--word_embeddings' "
                                     "to specify an embedding path")
                self.word_embeddings = kwargs['word_embeddings']

        def get_tokenizer(self):
            """Return the configured tokenizer, or None to fall back to the spaCy model's tokenizer."""
            if 'tokenizer' not in input_json:
                return None

            selection = input_json['tokenizer']
            options = {
                'clinical': ClinicalTokenizer,
                'systematic_review': SystematicReviewTokenizer,
                'character': CharacterTokenizer,
            }

            if selection not in options:
                raise ValueError(f"Tokenizer selection '{selection}' not an option")

            Tokenizer = options[selection]
            return Tokenizer(self.spacy_pipeline)

        def get_learner(self):
            """Return a (display_name, learner) tuple for the configured learning algorithm."""
            learner_selection = input_json['learner']

            if learner_selection == 'CRF':
                return "CRF_l2sgd", get_crf()
            if learner_selection == 'BiLSTM':
                return 'BiLSTM+CRF', BiLstmCrfLearner(self.word_embeddings, self.cuda_device)
            if learner_selection == 'BERT':
                learner = BertLearner(
                    self.cuda_device,
                    pretrained_model=self.pretrained_model,
                    batch_size=self.batch_size,
                    learning_rate=self.learning_rate,
                    epochs=self.epochs,
                    using_crf=self.using_crf
                )
                return 'BERT', learner

            raise ValueError(f"'learner' must be 'CRF', 'BiLSTM', or 'BERT', but is {learner_selection}")

        def get_feature_extractor(self):
            """Return the feature extractor; BERT consumes raw text, others take discrete features."""
            if input_json['learner'] == 'BERT':
                return TextExtractor()
            # Defaults: no context window, token text as the only feature
            return FeatureExtractor(
                window_size=input_json.get('window_size', 0),
                spacy_features=input_json.get('spacy_features', ['text'])
            )

        def get_report(self):
            """Append the originating JSON configuration and path to the base report."""
            report = super().get_report() + '\n'
            report += f"Pipeline configured from a JSON: {json.dumps(input_json)}\nJSON path: {json_path}"
            return report

    return CustomPipeline