"""
2
A medaCy Dataset facilities the management of data for both model training and model prediction.
3
4
A Dataset object provides a wrapper for a unix file directory containing training/prediction
5
data. If a Dataset, at training time, is fed into a pipeline requiring auxilary files
6
(Metamap for instance) the Dataset will automatically create those files in the most efficient way possible.
7
8
Training
9
#################
10
When a directory contains **both** raw text files alongside annotation files, an instantiated Dataset
11
detects and facilitates access to those files.
12
13
Assuming your directory looks like this (where .ann files are in `BRAT <http://brat.nlplab.org/standoff.html>`_ format):
14
::
15
    home/medacy/data
16
    ├── file_one.ann
17
    ├── file_one.txt
18
    ├── file_two.ann
19
    └── file_two.txt
20
21
A common data work flow might look as follows.
22
23
Running:
24
::
25
    >>> from medacy.data import Dataset
26
    >>> from medacy.pipeline_components.feature_overlayers.metamap.metamap import MetaMap
27
28
    >>> dataset = Dataset('/home/datasets/some_dataset')
29
    >>> for data_file in dataset:
30
    ...    (data_file.file_name, data_file.raw_path, dataset.ann_path)
31
    (file_one, file_one.txt, file_one.ann)
32
    (file_two, file_two.txt, file_two.ann)
33
    >>> dataset
34
    ['file_one', 'file_two']
35
    >>>> dataset.is_metamapped()
36
    False
37
    >>> metamap = MetaMap('/home/path/to/metamap/binary')
38
    >>> with metamap:
39
    ...     metamap.metamap_dataset(dataset)
40
    >>> dataset.is_metamapped()
41
    True
42
43
MedaCy **does not** alter the data you load in any way - it only reads from it.
44
45
Prediction
46
##########
47
When a directory contains **only** raw text files, an instantiated Dataset object interprets this as
48
a directory of files that need to be predicted. This means that the internal Datafile that aggregates
49
meta-data for a given prediction file does not have fields for annotation_file_path set.
50
51
When a directory contains **only** ann files, an instantiated Dataset object interprets this as
52
a directory of files that are predictions. Useful methods for analysis include :meth:`medacy.data.dataset.Dataset.compute_confusion_matrix`,
53
:meth:`medacy.data.dataset.Dataset.compute_ambiguity` and :meth:`medacy.data.dataset.Dataset.compute_counts`.
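
For example (the directory paths below are hypothetical), a Dataset of gold-standard annotations can be compared
against a Dataset built from a model's prediction directory:
::
    >>> gold_dataset = Dataset('/home/datasets/some_dataset')
    >>> predicted_dataset = Dataset('/home/datasets/some_dataset_predictions')
    >>> entities, confusion_matrix = gold_dataset.compute_confusion_matrix(predicted_dataset)
    >>> ambiguity = gold_dataset.compute_ambiguity(predicted_dataset)
    >>> entity_counts = gold_dataset.compute_counts()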

External Datasets
#################

In the real world, datasets (regardless of domain) are evolving entities. Hence, it is essential to version them.
A medaCy compatible dataset can be created to facilitate this versioning. A medaCy compatible dataset lives as a Python
package that can be hooked into medaCy or used for any other purpose - it is simply a loose wrapper for this Dataset
object. Instructions for creating such a dataset can be found `here <https://github.com/NLPatVCU/medaCy/tree/master/examples/guide>`_.
"""

import argparse
import json
import logging
import os
import pprint
from collections import Counter
from pathlib import Path

from medacy.data.annotations import Annotations
from medacy.data.data_file import DataFile


class Dataset:
    """
    A facilitation class for data management.
    """

    def __init__(self, data_directory, data_limit=None):
        """
        Manages a directory of training data along with other medaCy generated files.

        Only text files: considers a directory for managing metamapping.
        Only ann files: considers a directory of predictions.
        Both text and ann files: considers a directory for training.

        :param data_directory: Directory containing data for training or prediction.
        :param data_limit: A limit to the number of files to process. Must be between 1 and the number of raw text files in data_directory
        """
        self.data_directory = Path(data_directory)

        metamap_dir = self.data_directory / 'metamapped'
        self.metamapped_files_directory = metamap_dir if metamap_dir.is_dir() else None

        self.data_files = self._create_data_files()
        self.data_limit = data_limit or len(self.data_files)

    def _create_data_files(self):
        """Creates a DataFile for each base file name in the data directory that has a .txt or .ann file."""
        data_files = []
        all_files_in_directory = os.listdir(self.data_directory)
        all_file_base_names = {f.split(".")[0] for f in all_files_in_directory}

        for file_name in all_file_base_names:
            txt_path = None
            ann_path = None
            metamapped_path = None

            potential_txt_path = self.data_directory / (file_name + ".txt")
            if potential_txt_path.exists():
                txt_path = potential_txt_path

            potential_ann_path = self.data_directory / (file_name + ".ann")
            if potential_ann_path.exists():
                ann_path = potential_ann_path

            if self.metamapped_files_directory:
                potential_mm_path = self.metamapped_files_directory / (file_name + ".metamapped")
                if potential_mm_path.exists():
                    metamapped_path = potential_mm_path

            if txt_path or ann_path:
                new_df = DataFile(file_name, txt_path, ann_path, metamapped_path)
                data_files.append(new_df)

        return sorted(data_files, key=lambda x: x.file_name)

    def __iter__(self):
        return iter(self.data_files[0:self.data_limit])

    def __len__(self):
        return len(self.data_files)

    def is_metamapped(self):
        """
        Verifies if all files in the Dataset are metamapped.

        :return: True if all data files are metamapped, False otherwise.
        """
        if self.metamapped_files_directory is None or not self.metamapped_files_directory.exists():
            return False

        for file in self.data_files:
            potential_file_path = self.metamapped_files_directory / f"{file.file_name}.metamapped"
            if not potential_file_path.exists():
                return False

            # Metamapped file could exist, but metamapping it could have failed.
            # If the file is less than 200 bytes, log a warning.
            file_size_in_bytes = os.path.getsize(potential_file_path)
            if file_size_in_bytes < 200:
                logging.warning(f"Metamapped version of {file.file_name} is only {file_size_in_bytes} bytes. "
                                f"Metamapping could have failed: {potential_file_path}")

        return True

    def __str__(self):
        """
        Returns a list-like string of the names of the DataFile objects up to the data limit
        (note that this is only a display string, not a list that can be reused if copied and pasted).
        """
        return str([d.file_name for d in self])

    def compute_counts(self):
        """
        Computes entity counts over all documents in this dataset.

        :return: a Counter of entity counts
        """
        total = Counter()

        for ann in self.generate_annotations():
            total += ann.compute_counts()

        return total

    def compute_confusion_matrix(self, other, leniency=0):
        """
        Generates a confusion matrix where this Dataset serves as the gold standard annotations and `other` serves
        as the predicted annotations. A typical workflow would involve creating a Dataset object with the prediction directory
        output by a model and then passing it into this method.

        :param other: a Dataset object containing a predicted version of this dataset.
        :param leniency: a floating point value between [0,1] defining the leniency of the character spans to count as different. A value of zero considers only exact character matches while a positive value considers entities that differ by up to :code:`ceil(leniency * len(span)/2)` on either side.
        :return: a two-element tuple containing a label array (of entity names) and a matrix where rows are gold labels and columns are predicted labels. matrix[i][j] indicates the number of times entities[i] in this dataset was predicted as entities[j] in `other`.
        """
        if not isinstance(other, Dataset):
            raise ValueError("other must be instance of Dataset")

        # verify files are consistent
        diff = {d.file_name for d in self} - {d.file_name for d in other}
        if diff:
            raise ValueError(f"Dataset of predictions is missing the files: {repr(diff)}")

        # sort entities in ascending order by count.
        entities = [key for key, _ in sorted(self.compute_counts().items(), key=lambda x: x[1])]
        confusion_matrix = [[0] * len(entities) for _ in range(len(entities))]

        for gold_data_file in self:
            prediction_iter = iter(other)
            prediction_data_file = next(prediction_iter)
            while str(gold_data_file) != str(prediction_data_file):
                prediction_data_file = next(prediction_iter)

            gold_annotation = Annotations(gold_data_file.ann_path)
            pred_annotation = Annotations(prediction_data_file.ann_path)

            # compute matrix on the Annotation file level
            ann_confusion_matrix = gold_annotation.compute_confusion_matrix(pred_annotation, entities, leniency=leniency)
            for i in range(len(confusion_matrix)):
                for j in range(len(confusion_matrix)):
                    confusion_matrix[i][j] += ann_confusion_matrix[i][j]

        return entities, confusion_matrix

    def compute_ambiguity(self, dataset):
        """
        Finds occurrences of spans from 'dataset' that intersect with a span from this dataset's annotations but do not share its
        label. If 'dataset' comprises a model's predictions, this method provides a strong indicator
        of a model's inability to disambiguate between entities. For a full analysis, compute a confusion matrix.

        :param dataset: a Dataset object containing a predicted version of this dataset.
        :return: a dictionary containing the ambiguity computations on each gold, predicted file pair
        """
        if not isinstance(dataset, Dataset):
            raise ValueError("dataset must be instance of Dataset")

        # verify files are consistent
        diff = {d.file_name for d in self} - {d.file_name for d in dataset}
        if diff:
            raise ValueError(f"Dataset of predictions is missing the files: {repr(diff)}")

        # Dictionary storing ambiguity over dataset
        ambiguity_dict = {}

        for gold_data_file in self:
            prediction_iter = iter(dataset)
            prediction_data_file = next(prediction_iter)
            while str(gold_data_file) != str(prediction_data_file):
                prediction_data_file = next(prediction_iter)

            gold_annotation = Annotations(gold_data_file.ann_path)
            pred_annotation = Annotations(prediction_data_file.ann_path)

            # compute ambiguity on the Annotation file level
            ambiguity_dict[str(gold_data_file)] = gold_annotation.compute_ambiguity(pred_annotation)

        return ambiguity_dict

    def get_labels(self, as_list=False):
        """
        Get all of the entities/labels used in the dataset.
        :param as_list: whether to return the results as a list; defaults to False
        :return: A set of strings. Each string is a label used.
        """
        labels = set()

        for ann in self.generate_annotations():
            labels.update(ann.get_labels())

        if as_list:
            return list(labels)
        return labels

    def generate_annotations(self):
        """Generates Annotation objects for all the files in this Dataset"""
        for file in self:
            if file.ann_path is not None:
                yield Annotations(file.ann_path, source_text_path=file.txt_path)
            else:
                yield Annotations([])

    def __getitem__(self, item):
        """
        Creates and returns the Annotations object with the given file name, else raises FileNotFoundError;
        useful for getting Annotations objects from parallel Datasets
        :param item: the name of the file to be represented (not including the extension or parent directories)
        :return: an Annotations object
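
        For example, given a Dataset instance named dataset that contains a (hypothetical) file_one.ann::

            >>> annotations = dataset['file_one']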
        """
        path = os.path.join(self.data_directory, item + '.ann')
        return Annotations(path)


def main():
    """CLI for retrieving dataset information"""
    parser = argparse.ArgumentParser(description='Calculate data about a given data directory')
    parser.add_argument('directory')
    args = parser.parse_args()

    dataset = Dataset(args.directory)

    entities = json.dumps(dataset.get_labels(as_list=True))
    counts = dataset.compute_counts()

    print(f"Entities: {entities}")
    pprint.pprint(counts)


if __name__ == '__main__':
    main()