Diff of /medacy/data/dataset.py [000000] .. [6c353a]

Switch to side-by-side view

--- a
+++ b/medacy/data/dataset.py
@@ -0,0 +1,302 @@
+"""
+A medaCy Dataset facilitates the management of data for both model training and model prediction.
+
+A Dataset object provides a wrapper for a unix file directory containing training/prediction
+data. If a Dataset, at training time, is fed into a pipeline requiring auxiliary files
+(Metamap for instance) the Dataset will automatically create those files in the most efficient way possible.
+
+Training
+#################
+When a directory contains **both** raw text files alongside annotation files, an instantiated Dataset
+detects and facilitates access to those files.
+
+Assuming your directory looks like this (where .ann files are in `BRAT <http://brat.nlplab.org/standoff.html>`_ format):
+::
+    home/medacy/data
+    ├── file_one.ann
+    ├── file_one.txt
+    ├── file_two.ann
+    └── file_two.txt
+
+A common data work flow might look as follows.
+
+Running:
+::
+    >>> from medacy.data import Dataset
+    >>> from medacy.pipeline_components.feature_overlayers.metamap.metamap import MetaMap
+
+    >>> dataset = Dataset('/home/datasets/some_dataset')
+    >>> for data_file in dataset:
+    ...    (data_file.file_name, data_file.txt_path, data_file.ann_path)
+    (file_one, file_one.txt, file_one.ann)
+    (file_two, file_two.txt, file_two.ann)
+    >>> print(dataset)
+    ['file_one', 'file_two']
+    >>> dataset.is_metamapped()
+    False
+    >>> metamap = MetaMap('/home/path/to/metamap/binary')
+    >>> with metamap:
+    ...     metamap.metamap_dataset(dataset)
+    >>> dataset.is_metamapped()
+    True
+
+MedaCy **does not** alter the data you load in any way - it only reads from it.
+
+Prediction
+##########
+When a directory contains **only** raw text files, an instantiated Dataset object interprets this as
+a directory of files that need to be predicted. This means that the internal DataFile that aggregates
+meta-data for a given prediction file has no annotation file path set.
+
+When a directory contains **only** ann files, an instantiated Dataset object interprets this as
+a directory of files that are predictions. Useful methods for analysis include :meth:`medacy.data.dataset.Dataset.compute_confusion_matrix`,
+:meth:`medacy.data.dataset.Dataset.compute_ambiguity` and :meth:`medacy.data.dataset.Dataset.compute_counts`.
+
+External Datasets
+#################
+
+In the real world, datasets (regardless of domain) are evolving entities. Hence, it is essential to version them.
+A medaCy compatible dataset can be created to facilitate this versioning. A medaCy compatible dataset lives in a Python
+package that can be hooked into medaCy or used for any other purpose - it is simply a loose wrapper for this Dataset
+object. Instructions for creating such a dataset can be found `here <https://github.com/NLPatVCU/medaCy/tree/master/examples/guide>`_.
+"""
+
+import argparse
+import json
+import logging
+import os
+import pprint
+from collections import Counter
+from pathlib import Path
+
+from medacy.data.annotations import Annotations
+from medacy.data.data_file import DataFile
+
+
+class Dataset:
+    """
+    A facilitation class for data management.
+
+    A Dataset indexes the ``.txt`` and ``.ann`` files found directly inside a directory (plus any
+    ``.metamapped`` files inside its ``metamapped`` sub-directory) as a list of DataFile objects sorted by
+    file name. Iteration honours ``data_limit``; ``len()`` and :meth:`is_metamapped` consider every indexed file.
+    """
+
+    def __init__(self, data_directory, data_limit=None):
+        """
+        Manages directory of training data along with other medaCy generated files.
+
+        Only text files: considers a directory for managing metamapping.
+        Only ann files: considers a directory of predictions.
+        Both text and ann files: considers a directory for training.
+
+        :param data_directory: Directory containing data for training or prediction.
+        :param data_limit: A limit to the number of files to process. Must be between 1 and number of raw text files
+            in data_directory; a falsy value (None or 0) means that all files are processed.
+        """
+        self.data_directory = Path(data_directory)
+
+        # The metamapped sub-directory is optional; it is only remembered if it exists at construction time.
+        metamap_dir = self.data_directory / 'metamapped'
+        self.metamapped_files_directory = metamap_dir if metamap_dir.is_dir() else None
+
+        self.data_files = self._create_data_files()
+        self.data_limit = data_limit or len(self.data_files)
+
+    @staticmethod
+    def _existing_path(path):
+        """Return `path` if it exists on disk, otherwise None."""
+        return path if path.exists() else None
+
+    def _create_data_files(self):
+        """
+        Indexes the data directory.
+
+        :return: a list of DataFile objects sorted by file name, one for every base name in the data directory
+            that has a .txt file and/or a .ann file
+        """
+        candidate_names = set()
+        for entry in os.listdir(self.data_directory):
+            # Historical rule: the base name is everything before the first dot.
+            candidate_names.add(entry.split(".")[0])
+            # Also try everything before the last extension, so that names which themselves contain dots
+            # (e.g. "note.v2.txt") are not silently skipped. Candidates without a matching file are dropped below.
+            candidate_names.add(Path(entry).stem)
+
+        data_files = []
+        # Candidate names are unique, so iterating them in sorted order yields a list sorted by file_name.
+        for file_name in sorted(candidate_names):
+            txt_path = self._existing_path(self.data_directory / (file_name + ".txt"))
+            ann_path = self._existing_path(self.data_directory / (file_name + ".ann"))
+
+            metamapped_path = None
+            if self.metamapped_files_directory:
+                metamapped_path = self._existing_path(self.metamapped_files_directory / (file_name + ".metamapped"))
+
+            # A metamapped file alone does not make a data file; a text or annotation file is required.
+            if txt_path or ann_path:
+                data_files.append(DataFile(file_name, txt_path, ann_path, metamapped_path))
+
+        return data_files
+
+    def __iter__(self):
+        """Iterates over the DataFile objects of this Dataset, stopping at the data limit."""
+        return iter(self.data_files[0:self.data_limit])
+
+    def __len__(self):
+        """Returns the number of indexed data files; the data limit is not applied here."""
+        return len(self.data_files)
+
+    def is_metamapped(self):
+        """
+        Verifies if all files in the Dataset are metamapped.
+
+        A warning is logged for every metamapped file smaller than 200 bytes, but such a file still counts
+        as metamapped.
+
+        :return: True if all data files are metamapped, False otherwise.
+        """
+        if self.metamapped_files_directory is None or not self.metamapped_files_directory.exists():
+            return False
+
+        for file in self.data_files:
+            potential_file_path = self.metamapped_files_directory / f"{file.file_name}.metamapped"
+            if not potential_file_path.exists():
+                return False
+
+            # Metamapped file could exist, but metamapping it could have failed.
+            # If the file is less than 200 bytes, log a warning.
+            file_size_in_bytes = os.path.getsize(potential_file_path)
+            if file_size_in_bytes < 200:
+                logging.warning(f"Metamapped version of {file.file_name} is only {file_size_in_bytes} bytes. "
+                                f"Metamapping could have failed: {potential_file_path}")
+
+        return True
+
+    def __str__(self):
+        """
+        Prints a list-like string of the names of the Datafile objects up to the data limit
+        (can't be used if copied and pasted)
+        """
+        return str([d.file_name for d in self])
+
+    def compute_counts(self):
+        """
+        Computes entity counts over all documents in this dataset.
+
+        :return: a Counter of entity counts
+        """
+        total = Counter()
+
+        for ann in self.generate_annotations():
+            total += ann.compute_counts()
+
+        return total
+
+    def _index_predictions(self, predictions):
+        """
+        Maps file names to the DataFile objects of `predictions` and verifies the files are consistent.
+
+        :param predictions: a Dataset object containing a predicted version of this dataset.
+        :return: a dictionary of file name to the DataFile of `predictions` with that name
+        :raises ValueError: if `predictions` lacks a file that this dataset contains
+        """
+        predictions_by_name = {d.file_name: d for d in predictions}
+
+        diff = {d.file_name for d in self}.difference(predictions_by_name)
+        if diff:
+            raise ValueError(f"Dataset of predictions is missing the files: {repr(diff)}")
+
+        return predictions_by_name
+
+    def compute_confusion_matrix(self, other, leniency=0):
+        """
+        Generates a confusion matrix where this Dataset serves as the gold standard annotations and `other` serves
+        as the predicted annotations. A typical workflow would involve creating a Dataset object with the prediction directory
+        outputted by a model and then passing it into this method.
+
+        :param other: a Dataset object containing a predicted version of this dataset.
+        :param leniency: a floating point value between [0,1] defining the leniency of the character spans to count as different. A value of zero considers only exact character matches while a positive value considers entities that differ by up to :code:`ceil(leniency * len(span)/2)` on either side.
+        :return: two element tuple containing a label array (of entity names) and a matrix where rows are gold labels and columns are predicted labels. matrix[i][j] indicates that entities[i] in this dataset was predicted as entities[j] in `other` matrix[i][j] times
+        :raises ValueError: if `other` is not a Dataset or is missing files that this dataset contains
+        """
+        if not isinstance(other, Dataset):
+            raise ValueError("other must be instance of Dataset")
+
+        # verify files are consistent
+        predictions_by_name = self._index_predictions(other)
+
+        # sort entities in ascending order by count.
+        entities = [key for key, _ in sorted(self.compute_counts().items(), key=lambda x: x[1])]
+
+        # One independent row per entity; multiplying a nested list would alias a single row.
+        size = len(entities)
+        confusion_matrix = [[0] * size for _ in range(size)]
+
+        for gold_data_file in self:
+            prediction_data_file = predictions_by_name[gold_data_file.file_name]
+
+            gold_annotation = Annotations(gold_data_file.ann_path)
+            pred_annotation = Annotations(prediction_data_file.ann_path)
+
+            # compute matrix on the Annotation file level
+            ann_confusion_matrix = gold_annotation.compute_confusion_matrix(pred_annotation, entities, leniency=leniency)
+            for i in range(size):
+                for j in range(size):
+                    confusion_matrix[i][j] += ann_confusion_matrix[i][j]
+
+        return entities, confusion_matrix
+
+    def compute_ambiguity(self, dataset):
+        """
+        Finds occurrences of spans from 'dataset' that intersect with a span from this annotation but do not have this spans
+        label. If 'dataset' comprises a models predictions, this method provides a strong indicators
+        of a model's in-ability to dis-ambiguate between entities. For a full analysis, compute a confusion matrix.
+
+        :param dataset: a Dataset object containing a predicted version of this dataset.
+        :return: a dictionary containing the ambiguity computations on each gold, predicted file pair
+        :raises ValueError: if `dataset` is not a Dataset or is missing files that this dataset contains
+        """
+        if not isinstance(dataset, Dataset):
+            raise ValueError("dataset must be instance of Dataset")
+
+        # verify files are consistent
+        predictions_by_name = self._index_predictions(dataset)
+
+        # Dictionary storing ambiguity over dataset
+        ambiguity_dict = {}
+
+        for gold_data_file in self:
+            prediction_data_file = predictions_by_name[gold_data_file.file_name]
+
+            gold_annotation = Annotations(gold_data_file.ann_path)
+            pred_annotation = Annotations(prediction_data_file.ann_path)
+
+            # compute ambiguity on the Annotation file level
+            ambiguity_dict[str(gold_data_file)] = gold_annotation.compute_ambiguity(pred_annotation)
+
+        return ambiguity_dict
+
+    def get_labels(self, as_list=False):
+        """
+        Get all of the entities/labels used in the dataset.
+        :param as_list: bool for if to return the results as a list; defaults to False
+        :return: A set of strings. Each string is a label used.
+        """
+        labels = set()
+
+        for ann in self.generate_annotations():
+            labels.update(ann.get_labels())
+
+        return list(labels) if as_list else labels
+
+    def generate_annotations(self):
+        """
+        Generates Annotation objects for all the files in this Dataset (up to the data limit).
+        A file without an annotation file yields an Annotations object constructed from an empty list.
+        """
+        for file in self:
+            if file.ann_path is not None:
+                yield Annotations(file.ann_path, source_text_path=file.txt_path)
+            else:
+                yield Annotations([])
+
+    def __getitem__(self, item):
+        """
+        Creates and returns the Annotations object with the given file name, else raises FileNotFoundError
+        (presumably raised by Annotations when the .ann file does not exist);
+        useful for getting Annotations objects from parallel Datasets
+        :param item: the name of the file to be represented (not including the extension or parent directories)
+        :return: an Annotations object
+        """
+        path = os.path.join(self.data_directory, item + '.ann')
+        return Annotations(path)
+
+
+def main():
+    """Command-line entry point: print the labels and the entity counts of a data directory."""
+    cli = argparse.ArgumentParser(description='Calculate data about a given data directory')
+    cli.add_argument('directory')
+    target_directory = cli.parse_args().directory
+
+    dataset = Dataset(target_directory)
+
+    # Labels are serialized as a JSON array; counts are pretty-printed as returned by the Dataset.
+    serialized_labels = json.dumps(dataset.get_labels(as_list=True))
+    entity_counts = dataset.compute_counts()
+
+    print(f"Entities: {serialized_labels}")
+    pprint.pprint(entity_counts)
+
+
+# Run the command-line interface only when this module is executed as a script, not when it is imported.
+if __name__ == '__main__':
+    main()