"""
A medaCy Dataset facilitates the management of data for both model training and model prediction.

A Dataset object provides a wrapper for a unix file directory containing training/prediction
data. If a Dataset, at training time, is fed into a pipeline requiring auxiliary files
(Metamap for instance) the Dataset will automatically create those files in the most efficient way possible.

Training
#################
When a directory contains **both** raw text files alongside annotation files, an instantiated Dataset
detects and facilitates access to those files.

Assuming your directory looks like this (where .ann files are in `BRAT <http://brat.nlplab.org/standoff.html>`_ format):
::
    home/medacy/data
    ├── file_one.ann
    ├── file_one.txt
    ├── file_two.ann
    └── file_two.txt

A common data work flow might look as follows.

Running:
::
    >>> from medacy.data import Dataset
    >>> from medacy.pipeline_components.feature_overlayers.metamap.metamap import MetaMap

    >>> dataset = Dataset('/home/datasets/some_dataset')
    >>> for data_file in dataset:
    ...     (data_file.file_name, data_file.raw_path, dataset.ann_path)
    (file_one, file_one.txt, file_one.ann)
    (file_two, file_two.txt, file_two.ann)
    >>> dataset
    ['file_one', 'file_two']
    >>> dataset.is_metamapped()
    False
    >>> metamap = MetaMap('/home/path/to/metamap/binary')
    >>> with metamap:
    ...     metamap.metamap_dataset(dataset)
    >>> dataset.is_metamapped()
    True

MedaCy **does not** alter the data you load in any way - it only reads from it.

Prediction
##########
When a directory contains **only** raw text files, an instantiated Dataset object interprets this as
a directory of files that need to be predicted. This means that the internal Datafile that aggregates
meta-data for a given prediction file does not have fields for annotation_file_path set.

When a directory contains **only** ann files, an instantiated Dataset object interprets this as
a directory of files that are predictions. Useful methods for analysis include :meth:`medacy.data.dataset.Dataset.compute_confusion_matrix`,
:meth:`medacy.data.dataset.Dataset.compute_ambiguity` and :meth:`medacy.data.dataset.Dataset.compute_counts`.

External Datasets
#################

In the real world, datasets (regardless of domain) are evolving entities. Hence, it is essential to version them.
A medaCy compatible dataset can be created to facilitate this versioning. A medaCy compatible dataset lives a python
packages that can be hooked into medaCy or used for any other purpose - it is simply a loose wrapper for this Dataset
object. Instructions for creating such a dataset can be found `here <https://github.com/NLPatVCU/medaCy/tree/master/examples/guide>`_.
"""

import argparse
import json
import logging
import os
import pprint
from collections import Counter
from pathlib import Path

from medacy.data.annotations import Annotations
from medacy.data.data_file import DataFile


class Dataset:
    """
    A facilitation class for data management.
    """

    def __init__(self, data_directory, data_limit=None):
        """
        Manages a directory of training data along with other medaCy generated files.

        Only text files: considers a directory for managing metamapping.
        Only ann files: considers a directory of predictions.
        Both text and ann files: considers a directory for training.

        :param data_directory: Directory containing data for training or prediction.
        :param data_limit: A limit to the number of files to process. Must be between 1 and number
            of raw text files in data_directory
        """
        self.data_directory = Path(data_directory)

        # The 'metamapped' subdirectory is only tracked if it already exists;
        # medaCy never creates it implicitly here.
        metamap_dir = self.data_directory / 'metamapped'
        self.metamapped_files_directory = metamap_dir if metamap_dir.is_dir() else None

        self.data_files = self._create_data_files()
        # A falsy data_limit (None or 0) means "use every file".
        self.data_limit = data_limit or len(self.data_files)

    def _create_data_files(self):
        """
        Pairs up .txt/.ann (and metamapped) files sharing a base name into DataFile objects.

        :return: a list of DataFile objects sorted by file name
        """
        data_files = []
        all_files_in_directory = os.listdir(self.data_directory)
        # Base name is everything before the first '.'; files sharing a base name are one document.
        all_file_base_names = {f.split(".")[0] for f in all_files_in_directory}

        for file_name in all_file_base_names:
            txt_path = None
            ann_path = None
            metamapped_path = None

            potential_txt_path = self.data_directory / (file_name + ".txt")
            if potential_txt_path.exists():
                txt_path = potential_txt_path

            potential_ann_path = self.data_directory / (file_name + ".ann")
            if potential_ann_path.exists():
                ann_path = potential_ann_path

            if self.metamapped_files_directory:
                potential_mm_path = self.metamapped_files_directory / (file_name + ".metamapped")
                if potential_mm_path.exists():
                    metamapped_path = potential_mm_path

            # Skip base names that have neither a text nor an annotation file
            # (e.g. stray files with other extensions).
            if txt_path or ann_path:
                new_df = DataFile(file_name, txt_path, ann_path, metamapped_path)
                data_files.append(new_df)

        return sorted(data_files, key=lambda x: x.file_name)

    def __iter__(self):
        # Iteration respects the data limit; len() deliberately does not.
        return iter(self.data_files[0:self.data_limit])

    def __len__(self):
        return len(self.data_files)

    def is_metamapped(self):
        """
        Verifies if all files in the Dataset are metamapped.

        :return: True if all data files are metamapped, False otherwise.
        """
        if self.metamapped_files_directory is None or not self.metamapped_files_directory.exists():
            return False

        for file in self.data_files:
            potential_file_path = self.metamapped_files_directory / f"{file.file_name}.metamapped"
            if not potential_file_path.exists():
                return False

            # Metamapped file could exist, but metamapping it could have failed.
            # If the file is less than 200 bytes, log a warning.
            file_size_in_bytes = os.path.getsize(potential_file_path)
            if file_size_in_bytes < 200:
                logging.warning(f"Metamapped version of {file.file_name} is only {file_size_in_bytes} bytes. "
                                f"Metamapping could have failed: {potential_file_path}")

        return True

    def __str__(self):
        """
        Prints a list-like string of the names of the Datafile objects up to the data limit
        (can't be used if copied and pasted)
        """
        return str([d.file_name for d in self])

    def compute_counts(self):
        """
        Computes entity counts over all documents in this dataset.

        :return: a Counter of entity counts
        """
        total = Counter()

        for ann in self.generate_annotations():
            total += ann.compute_counts()

        return total

    def compute_confusion_matrix(self, other, leniency=0):
        """
        Generates a confusion matrix where this Dataset serves as the gold standard annotations and `other` serves
        as the predicted annotations. A typical workflow would involve creating a Dataset object with the prediction
        directory outputted by a model and then passing it into this method.

        :param other: a Dataset object containing a predicted version of this dataset.
        :param leniency: a floating point value between [0,1] defining the leniency of the character spans to count as
            different. A value of zero considers only exact character matches while a positive value considers entities
            that differ by up to :code:`ceil(leniency * len(span)/2)` on either side.
        :return: two element tuple containing a label array (of entity names) and a matrix where rows are gold labels
            and columns are predicted labels. matrix[i][j] indicates that entities[i] in this dataset was predicted as
            entities[j] in 'other' matrix[i][j] times
        """
        if not isinstance(other, Dataset):
            raise ValueError("other must be instance of Dataset")

        # verify files are consistent
        diff = {d.file_name for d in self} - {d.file_name for d in other}
        if diff:
            raise ValueError(f"Dataset of predictions is missing the files: {repr(diff)}")

        # sort entities in ascending order by count.
        entities = [key for key, _ in sorted(self.compute_counts().items(), key=lambda x: x[1])]
        # BUG FIX: the original built [[0 * len(entities)] * len(entities)], i.e. a single
        # row of zeros, instead of an n x n matrix. Build each row independently so the
        # rows are distinct lists (no shared-row aliasing).
        confusion_matrix = [[0] * len(entities) for _ in range(len(entities))]

        # One-time lookup table replaces the original O(n^2) linear re-scan of `other`
        # for every gold file; the diff check above guarantees every key is present.
        predictions_by_name = {str(d): d for d in other}

        for gold_data_file in self:
            prediction_data_file = predictions_by_name[str(gold_data_file)]

            gold_annotation = Annotations(gold_data_file.ann_path)
            pred_annotation = Annotations(prediction_data_file.ann_path)

            # compute matrix on the Annotation file level and accumulate
            ann_confusion_matrix = gold_annotation.compute_confusion_matrix(pred_annotation, entities,
                                                                            leniency=leniency)
            for i in range(len(entities)):
                for j in range(len(entities)):
                    confusion_matrix[i][j] += ann_confusion_matrix[i][j]

        return entities, confusion_matrix

    def compute_ambiguity(self, dataset):
        """
        Finds occurrences of spans from 'dataset' that intersect with a span from this annotation but do not have this
        span's label. If 'dataset' comprises a model's predictions, this method provides a strong indicator
        of a model's inability to disambiguate between entities. For a full analysis, compute a confusion matrix.

        :param dataset: a Dataset object containing a predicted version of this dataset.
        :return: a dictionary containing the ambiguity computations on each gold, predicted file pair
        """
        if not isinstance(dataset, Dataset):
            raise ValueError("dataset must be instance of Dataset")

        # verify files are consistent
        diff = {d.file_name for d in self} - {d.file_name for d in dataset}
        if diff:
            raise ValueError(f"Dataset of predictions is missing the files: {repr(diff)}")

        # Dictionary storing ambiguity over dataset
        ambiguity_dict = {}

        # One-time lookup table replaces the original O(n^2) linear re-scan of `dataset`
        # for every gold file; the diff check above guarantees every key is present.
        predictions_by_name = {str(d): d for d in dataset}

        for gold_data_file in self:
            prediction_data_file = predictions_by_name[str(gold_data_file)]

            gold_annotation = Annotations(gold_data_file.ann_path)
            pred_annotation = Annotations(prediction_data_file.ann_path)

            # compute ambiguity on the Annotation file level
            ambiguity_dict[str(gold_data_file)] = gold_annotation.compute_ambiguity(pred_annotation)

        return ambiguity_dict

    def get_labels(self, as_list=False):
        """
        Get all of the entities/labels used in the dataset.

        :param as_list: bool for if to return the results as a list; defaults to False
        :return: A set of strings. Each string is a label used.
        """
        labels = set()

        for ann in self.generate_annotations():
            labels.update(ann.get_labels())

        if as_list:
            return list(labels)
        return labels

    def generate_annotations(self):
        """Generates Annotation objects for all the files in this Dataset"""
        for file in self:
            if file.ann_path is not None:
                yield Annotations(file.ann_path, source_text_path=file.txt_path)
            else:
                # Prediction-only files have no annotations yet; yield an empty set.
                yield Annotations([])

    def __getitem__(self, item):
        """
        Creates and returns the Annotations object with the given file name, else raises FileNotFoundError;
        useful for getting Annotations objects from parallel Datasets

        :param item: the name of the file to be represented (not including the extension or parent directories)
        :return: an Annotations object
        """
        path = self.data_directory / (item + '.ann')
        return Annotations(path)


def main():
    """CLI for retrieving dataset information"""
    parser = argparse.ArgumentParser(description='Calculate data about a given data directory')
    parser.add_argument('directory')
    args = parser.parse_args()

    dataset = Dataset(args.directory)

    entities = json.dumps(dataset.get_labels(as_list=True))
    counts = dataset.compute_counts()

    print(f"Entities: {entities}")
    pprint.pprint(counts)


if __name__ == '__main__':
    main()