import json
import os
import spacy
from medacy.pipeline_components.feature_extractors.discrete_feature_extractor import FeatureExtractor
from medacy.pipeline_components.feature_extractors.text_extractor import TextExtractor
from medacy.pipeline_components.feature_overlayers.metamap.metamap import MetaMap
from medacy.pipeline_components.feature_overlayers.metamap.metamap_all_types_component import MetaMapAllTypesOverlayer
from medacy.pipeline_components.feature_overlayers.metamap.metamap_component import MetaMapOverlayer
from medacy.pipeline_components.learners.bert_learner import BertLearner
from medacy.pipeline_components.learners.bilstm_crf_learner import BiLstmCrfLearner
from medacy.pipeline_components.learners.crf_learner import get_crf
from medacy.pipeline_components.tokenizers.character_tokenizer import CharacterTokenizer
from medacy.pipeline_components.tokenizers.clinical_tokenizer import ClinicalTokenizer
from medacy.pipeline_components.tokenizers.systematic_review_tokenizer import SystematicReviewTokenizer
from medacy.pipelines.base.base_pipeline import BasePipeline
required_keys = [
'learner',
'spacy_pipeline',
]
def json_to_pipeline(json_path):
"""
Constructs a custom pipeline from a json file
The json must have the following keys:
'learner': 'CRF', 'BiLSTM', or 'BERT'
'spacy_pipeline': the spaCy model to use
The following keys are optional:
'spacy_features': a list of features that exist as spaCy token annotations
'window_size': the number of words +/- the target word whose features should be used along with the target word; defaults to 0
'tokenizer': 'clinical', 'systematic_review', or 'character'; defaults to the spaCy model's tokenizer
'metamap': the path to the MetaMap binary; MetaMap will only be used if this key is present
if 'metamap' is a key, 'semantic_types' must also be a key, with value 'all', 'none', or
a list of semantic type strings
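    For example (illustrative values; 'en_core_web_sm' is assumed to be an installed spaCy model):
        {
            "learner": "CRF",
            "spacy_pipeline": "en_core_web_sm",
            "window_size": 2,
            "tokenizer": "clinical"
        }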
    :param json_path: the path to the JSON file, or a dict with the same structure as that JSON
:return: a custom pipeline class
"""
    if isinstance(json_path, (str, os.PathLike)):
        with open(json_path, 'rb') as f:
            input_json = json.load(f)
    elif isinstance(json_path, dict):
        input_json = json_path
    else:
        raise TypeError(f"json_path must be a path to a JSON file or a dict, but is {type(json_path)}")
    missing_keys = [key for key in required_keys if key not in input_json]
    if missing_keys:
        raise ValueError(f"Required key(s) {missing_keys} not found in the pipeline JSON.")
class CustomPipeline(BasePipeline):
"""A custom pipeline configured from a JSON file"""
def __init__(self, entities, **kwargs):
super().__init__(entities, spacy_pipeline=spacy.load(input_json['spacy_pipeline']))
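            # Overlay MetaMap semantic types as token features when a MetaMap binary is configured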
            if 'metamap' in input_json:
                if 'semantic_types' not in input_json:
                    raise ValueError("'semantic_types' must be a key when 'metamap' is a key.")
metamap = MetaMap(input_json['metamap'])
if input_json['semantic_types'] == 'all':
self.add_component(MetaMapAllTypesOverlayer, metamap)
elif input_json['semantic_types'] == 'none':
self.add_component(MetaMapOverlayer, metamap, semantic_type_labels=[])
elif isinstance(input_json['semantic_types'], list):
self.add_component(MetaMapOverlayer, metamap, semantic_type_labels=input_json['semantic_types'])
else:
raise ValueError("'semantic_types' must be 'all', 'none', or a list of strings")
            # BERT hyperparameters; cuda_device is also passed to the BiLSTM learner
            self.cuda_device = kwargs.get('cuda_device', -1)
            self.batch_size = kwargs.get('batch_size', 8)
            self.learning_rate = kwargs.get('learning_rate', 1e-5)
            self.epochs = kwargs.get('epochs', 3)
            self.pretrained_model = kwargs.get('pretrained_model', 'bert-large-cased')
            self.using_crf = kwargs.get('using_crf', False)
            # BiLSTM requirement: pretrained word embeddings
            if input_json['learner'] == 'BiLSTM':
                if 'word_embeddings' not in kwargs:
                    raise ValueError("The BiLSTM learner requires word embeddings; use the parameter "
                                     "'--word_embeddings' to specify an embedding path")
                self.word_embeddings = kwargs['word_embeddings']
def get_tokenizer(self):
            if 'tokenizer' not in input_json:
return None
selection = input_json['tokenizer']
options = {
'clinical': ClinicalTokenizer,
'systematic_review': SystematicReviewTokenizer,
'character': CharacterTokenizer
}
if selection not in options:
raise ValueError(f"Tokenizer selection '{selection}' not an option")
Tokenizer = options[selection]
return Tokenizer(self.spacy_pipeline)
def get_learner(self):
learner_selection = input_json['learner']
if learner_selection == 'CRF':
return "CRF_l2sgd", get_crf()
if learner_selection == 'BiLSTM':
return 'BiLSTM+CRF', BiLstmCrfLearner(self.word_embeddings, self.cuda_device)
if learner_selection == 'BERT':
learner = BertLearner(
self.cuda_device,
pretrained_model=self.pretrained_model,
batch_size=self.batch_size,
learning_rate=self.learning_rate,
epochs=self.epochs,
using_crf=self.using_crf
)
return 'BERT', learner
            raise ValueError(f"'learner' must be 'CRF', 'BiLSTM', or 'BERT', but is '{learner_selection}'")
def get_feature_extractor(self):
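            # BERT consumes raw token text, so it bypasses discrete feature extraction;
            # every other learner gets windowed token features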
if input_json['learner'] == 'BERT':
return TextExtractor()
            return FeatureExtractor(
                window_size=input_json.get('window_size', 0),
                spacy_features=input_json.get('spacy_features', ['text'])
            )
def get_report(self):
report = super().get_report() + '\n'
report += f"Pipeline configured from a JSON: {json.dumps(input_json)}\nJSON path: {json_path}"
return report
return CustomPipeline
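# Example usage, a minimal sketch: the entity labels are illustrative, and
# 'en_core_web_sm' must be an installed spaCy model for spacy.load to succeed.
#
#     config = {
#         'learner': 'CRF',
#         'spacy_pipeline': 'en_core_web_sm',
#         'window_size': 2,
#     }
#     PipelineClass = json_to_pipeline(config)
#     pipeline = PipelineClass(entities=['Drug', 'Dose'])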