"""
medaCy CLI Setup
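
Example invocations (illustrative; paths and file names are placeholders):
    python -m medacy -d ./my_dataset train -f my_model.pkl
    python -m medacy -d ./my_dataset validate -k 5
    python -m medacy -d ./my_dataset predict -m my_model.pkl -pd ./predictions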
"""
import argparse
import datetime as dt
import importlib
import json
import logging
from sys import argv

from medacy import __version__
from medacy.data.dataset import Dataset
from medacy.model.model import Model, DEFAULT_NUM_FOLDS
from medacy.pipelines import bert_pipeline
from medacy.tools.json_to_pipeline import json_to_pipeline


def setup(args):
    """
    Sets up the dataset and the pipeline/model, since every command uses them.

    :param args: Argparse args object.
    :return dataset, model: The dataset and model objects created.
    """
    dataset = Dataset(args.dataset)
    entities = list(dataset.get_labels())
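
    # Test mode limits the dataset to a single data file so that runs finish
    # quickly (matches the -t/--test_mode help text in main())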
    if args.test_mode:
        dataset.data_limit = 1
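
    # The entities file is expected to hold a json object with an "entities" key,
    # e.g. (illustrative entity names): {"entities": ["Drug", "Strength"]}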
    if args.entities is not None:
        with open(args.entities, 'rb') as f:
            data = json.load(f)
        json_entities = data['entities']
        if not set(json_entities) <= set(entities):
            raise ValueError(f"The following entities from the json file are not in the provided dataset: {set(json_entities) - set(entities)}")
        entities = json_entities
    if args.custom_pipeline is not None:
        logging.info(f"Using custom pipeline configured at {args.custom_pipeline}")
        # Construct a pipeline class (not an instance) based on the provided
        # json path; args.custom_pipeline is that path
        Pipeline = json_to_pipeline(args.custom_pipeline)
    else:
        # Parse the argument as a class name in the module medacy.pipelines
        module = importlib.import_module("medacy.pipelines")
        Pipeline = getattr(module, args.pipeline)
        logging.info('Using %s', args.pipeline)
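
    # The chosen pipeline class is instantiated with the full keyword set below;
    # presumably each pipeline uses the arguments relevant to its learner and
    # ignores the rest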
    pipeline = Pipeline(
        entities=entities,
        cuda_device=args.cuda,
        word_embeddings=args.word_embeddings,
        batch_size=args.batch_size,
        learning_rate=args.learning_rate,
        epochs=args.epochs,
        pretrained_model=args.pretrained_model,
        using_crf=args.using_crf
    )

    model = Model(pipeline)
    return dataset, model


def train(args, dataset, model):
    """
    Used for training new models.

    :param args: Argparse args object.
    :param dataset: Dataset to use for training.
    :param model: Untrained model object to use.
    """
    if args.filename is None:
        raise RuntimeError("A file name must be specified with -f when training a model")

    model.fit(dataset, groundtruth_directory=args.groundtruth)
    model.dump(args.filename)


def predict(args, dataset, model):
    """
    Used for running predictions on new datasets.

    :param args: Argparse args object.
    :param dataset: Dataset to run prediction over.
    :param model: Trained model to use for predictions.
    """
    if not args.predictions:
        args.predictions = None

    model.load(args.model_path)
    model.predict(
        dataset,
        prediction_directory=args.predictions
    )


def cross_validate(args, dataset, model):
    """
    Used for running k-fold cross validations.

    :param args: Argparse args object.
    :param dataset: Dataset to use for training.
    :param model: Untrained model object to use.
    """
    model.cross_validate(
        num_folds=args.k_folds,
        training_dataset=dataset,
        prediction_directory=args.predictions,
        groundtruth_directory=args.groundtruth
    )


def main():
    """
    Main function where initial argument parsing happens.
    """
    # Argparse setup
    parser = argparse.ArgumentParser(prog='medacy', description='Train, evaluate, and predict with medaCy.')
    # Global arguments
    parser.add_argument('-pl', '--pipeline', default='ClinicalPipeline', help='Pipeline to use for training. Write the exact name of the class.')
    parser.add_argument('-cpl', '--custom_pipeline', default=None, help='Path to a json file of a custom pipeline, as an alternative to a medaCy pipeline')
    parser.add_argument('-d', '--dataset', required=True, help='Directory of dataset to use for training.')
    parser.add_argument('-ent', '--entities', default=None,
                        help='Path to a json file containing an "entities" key of a list of entities to use; otherwise all the entities in the dataset will be used.')

    # Logging and testing arguments
    test_group = parser.add_argument_group('Logging and testing arguments')
    test_group.add_argument('-lc', '--log_console', action='store_true', help='Use to print logs to console.')
    test_group.add_argument('-lf', '--log_file', default=None, help='Specify a log file path, if something other than the default is desired')
    test_group.add_argument('-t', '--test_mode', default=False, action='store_true',
                            help='Specify that the action is a test (automatically uses only a single '
                                 'data file from the dataset and sets logging to debug mode)')

    # GPU-specific arguments
    gpu_group = parser.add_argument_group('GPU Arguments', 'Arguments that relate to the GPU, used by the BiLSTM and BERT')
    gpu_group.add_argument('-c', '--cuda', type=int, default=-1, help='CUDA device to use; -1 to use the CPU.')

    # BiLSTM-specific arguments
    bilstm_group = parser.add_argument_group('BiLSTM Arguments', 'Arguments for the BiLSTM learner')
    bilstm_group.add_argument('-w', '--word_embeddings', help='Path to word embeddings, needed for the BiLSTM.')

    # BERT-specific arguments
    bert_group = parser.add_argument_group('BERT Arguments', 'Arguments for the BERT learner')
    bert_group.add_argument('-b', '--batch_size', type=int, default=bert_pipeline.BATCH_SIZE, help='Batch size.')
    bert_group.add_argument('-lr', '--learning_rate', type=float, default=bert_pipeline.LEARNING_RATE, help='Learning rate for train and cross validate.')
    bert_group.add_argument('-e', '--epochs', type=int, default=bert_pipeline.EPOCHS, help='Number of epochs to train for.')
    bert_group.add_argument('-pm', '--pretrained_model', type=str, default='bert-large-cased', help='Which pretrained model to use.')
    bert_group.add_argument('-crf', '--using_crf', action='store_true', help='Use a CRF layer.')

    subparsers = parser.add_subparsers()
    # Cross validation arguments
    parser_validate = subparsers.add_parser('validate', help='Cross validate a model on a given dataset.')
    parser_validate.add_argument('-k', '--k_folds', default=DEFAULT_NUM_FOLDS, type=int, help='Number of folds to use for cross-validation.')
    parser_validate.add_argument('-gt', '--groundtruth', type=str, default=None, help='Directory to write groundtruth files.')
    parser_validate.add_argument('-pd', '--predictions', type=str, default=None, help='Directory to write prediction files.')
    parser_validate.set_defaults(func=cross_validate)

    # Train arguments
    parser_train = subparsers.add_parser('train', help='Train a new model.')
    parser_train.add_argument('-f', '--filename', help='Filename to use for the saved model.')
    parser_train.add_argument('-gt', '--groundtruth', type=str, default=None, help='Directory to write groundtruth files.')
    parser_train.set_defaults(func=train)

    # Predict arguments
    parser_predict = subparsers.add_parser('predict', help='Run predictions on the dataset using a trained model.')
    parser_predict.add_argument('-m', '--model_path', required=True, help='Trained model to load.')
    parser_predict.add_argument('-pd', '--predictions', default=None, help='Directory to store prediction files.')
    parser_predict.set_defaults(func=predict)

    # Parse initial args
    args = parser.parse_args()

    # Subparsers are optional by default in Python 3, so dispatching to
    # args.func would raise AttributeError if no subcommand was given
    if not hasattr(args, 'func'):
        parser.error('A subcommand (train, predict, or validate) is required')

    # Logging
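    # The default log file name encodes the device, e.g. "medacy_cpu.log" or "medacy_0.log"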
    device = str(args.cuda) if args.cuda >= 0 else 'cpu'
    log_file_name = args.log_file or f"medacy_{device}.log"
    logging.basicConfig(filename=log_file_name, format='%(asctime)-15s: %(message)s', level=logging.INFO)
    logger = logging.getLogger()

    if args.log_console or args.test_mode:
        logger.addHandler(logging.StreamHandler())
    if args.test_mode:
        logger.setLevel(logging.DEBUG)
        logging.info("Test mode enabled: logging set to debug")

    logging.info(f"medaCy v{__version__}\nCommand: python -m medacy {' '.join(argv[1:])}")

    start_time = dt.datetime.now()
    start_timestamp = start_time.strftime('%Y-%m-%d %H:%M:%S')
    logging.info(f'\n\nSTART TIME: {start_timestamp}')

    # Run the function selected by the subcommand
    dataset, model = setup(args)
    args.func(args, dataset, model)

    # Log end time
    end_time = dt.datetime.now()
    end_timestamp = end_time.strftime('%Y-%m-%d %H:%M:%S')
    logging.info(f'END TIME: {end_timestamp}')

    # Log time elapsed
    elapsed_time: dt.timedelta = end_time - start_time
    logging.info(f'TIME ELAPSED: {elapsed_time}')


if __name__ == '__main__':
    main()