[6c353a]: / medacy / data / dataset.py

Download this file

303 lines (235 with data), 12.6 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
"""
A medaCy Dataset facilitates the management of data for both model training and model prediction.
A Dataset object provides a wrapper for a unix file directory containing training/prediction
data. If a Dataset, at training time, is fed into a pipeline requiring auxiliary files
(Metamap for instance) the Dataset will automatically create those files in the most efficient way possible.
Training
#################
When a directory contains **both** raw text files alongside annotation files, an instantiated Dataset
detects and facilitates access to those files.
Assuming your directory looks like this (where .ann files are in `BRAT <http://brat.nlplab.org/standoff.html>`_ format):
::
home/medacy/data
├── file_one.ann
├── file_one.txt
├── file_two.ann
└── file_two.txt
A common data work flow might look as follows.
Running:
::
>>> from medacy.data import Dataset
>>> from medacy.pipeline_components.feature_overlayers.metamap.metamap import MetaMap
>>> dataset = Dataset('/home/datasets/some_dataset')
>>> for data_file in dataset:
... (data_file.file_name, data_file.raw_path, data_file.ann_path)
(file_one, file_one.txt, file_one.ann)
(file_two, file_two.txt, file_two.ann)
>>> dataset
['file_one', 'file_two']
>>> dataset.is_metamapped()
False
>>> metamap = MetaMap('/home/path/to/metamap/binary')
>>> with metamap:
... metamap.metamap_dataset(dataset)
>>> dataset.is_metamapped()
True
MedaCy **does not** alter the data you load in any way - it only reads from it.
Prediction
##########
When a directory contains **only** raw text files, an instantiated Dataset object interprets this as
a directory of files that need to be predicted. This means that the internal Datafile that aggregates
meta-data for a given prediction file does not have fields for annotation_file_path set.
When a directory contains **only** ann files, an instantiated Dataset object interprets this as
a directory of files that are predictions. Useful methods for analysis include :meth:`medacy.data.dataset.Dataset.compute_confusion_matrix`,
:meth:`medacy.data.dataset.Dataset.compute_ambiguity` and :meth:`medacy.data.dataset.Dataset.compute_counts`.
External Datasets
#################
In the real world, datasets (regardless of domain) are evolving entities. Hence, it is essential to version them.
A medaCy compatible dataset can be created to facilitate this versioning. A medaCy compatible dataset lives as a Python
package that can be hooked into medaCy or used for any other purpose - it is simply a loose wrapper for this Dataset
object. Instructions for creating such a dataset can be found `here <https://github.com/NLPatVCU/medaCy/tree/master/examples/guide>`_.
"""
import argparse
import json
import logging
import os
import pprint
from collections import Counter
from pathlib import Path
from medacy.data.annotations import Annotations
from medacy.data.data_file import DataFile
class Dataset:
"""
A facilitation class for data management.
"""
def __init__(self, data_directory, data_limit=None):
"""
Manages directory of training data along with other medaCy generated files.
Only text files: considers a directory for managing metamapping.
Only ann files: considers a directory of predictions.
Both text and ann files: considers a directory for training.
:param data_directory: Directory containing data for training or prediction.
:param data_limit: A limit to the number of files to process. Must be between 1 and number of raw text files in data_directory
"""
self.data_directory = Path(data_directory)
metamap_dir = self.data_directory / 'metamapped'
self.metamapped_files_directory = metamap_dir if metamap_dir.is_dir() else None
self.data_files = self._create_data_files()
self.data_limit = data_limit or len(self.data_files)
def _create_data_files(self):
data_files = []
all_files_in_directory = os.listdir(self.data_directory)
all_file_base_names = {f.split(".")[0] for f in all_files_in_directory}
for file_name in all_file_base_names:
txt_path = None
ann_path = None
metamapped_path = None
potential_txt_path = self.data_directory / (file_name + ".txt")
if potential_txt_path.exists():
txt_path = potential_txt_path
potential_ann_path = self.data_directory / (file_name + ".ann")
if potential_ann_path.exists():
ann_path = potential_ann_path
if self.metamapped_files_directory:
potential_mm_path = self.metamapped_files_directory / (file_name + ".metamapped")
if potential_mm_path.exists():
metamapped_path = potential_mm_path
if txt_path or ann_path:
new_df = DataFile(file_name, txt_path, ann_path, metamapped_path)
data_files.append(new_df)
return sorted(data_files, key=lambda x: x.file_name)
def __iter__(self):
return iter(self.data_files[0:self.data_limit])
def __len__(self):
return len(self.data_files)
def is_metamapped(self):
"""
Verifies if all fil es in the Dataset are metamapped.
:return: True if all data files are metamapped, False otherwise.
"""
if self.metamapped_files_directory is None or not self.metamapped_files_directory.exists():
return False
for file in self.data_files:
potential_file_path = self.metamapped_files_directory / f"{file.file_name}.metamapped"
if not potential_file_path.exists():
return False
# Metamapped file could exist, but metamapping it could have failed.
# If the file is less than 200 bytes, log a warning.
file_size_in_bytes = os.path.getsize(potential_file_path)
if file_size_in_bytes < 200:
logging.warning(f"Metamapped version of {file.file_name} is only {file_size_in_bytes} bytes. "
f"Metamapping could have failed: {potential_file_path}")
return True
def __str__(self):
"""
Prints a list-like string of the names of the Datafile objects up to the data limit
(can't be used if copied and pasted)
"""
return str([d.file_name for d in self])
def compute_counts(self):
"""
Computes entity counts over all documents in this dataset.
:return: a Counter of entity counts
"""
total = Counter()
for ann in self.generate_annotations():
total += ann.compute_counts()
return total
def compute_confusion_matrix(self, other, leniency=0):
"""
Generates a confusion matrix where this Dataset serves as the gold standard annotations and `dataset` serves
as the predicted annotations. A typical workflow would involve creating a Dataset object with the prediction directory
outputted by a model and then passing it into this method.
:param other: a Dataset object containing a predicted version of this dataset.
:param leniency: a floating point value between [0,1] defining the leniency of the character spans to count as different. A value of zero considers only exact character matches while a positive value considers entities that differ by up to :code:`ceil(leniency * len(span)/2)` on either side.
:return: two element tuple containing a label array (of entity names) and a matrix where rows are gold labels and columns are predicted labels. matrix[i][j] indicates that entities[i] in this dataset was predicted as entities[j] in 'annotation' matrix[i][j] times
"""
if not isinstance(other, Dataset):
raise ValueError("other must be instance of Dataset")
# verify files are consistent
diff = {d.file_name for d in self} - {d.file_name for d in other}
if diff:
raise ValueError(f"Dataset of predictions is missing the files: {repr(diff)}")
# sort entities in ascending order by count.
entities = [key for key, _ in sorted(self.compute_counts().items(), key=lambda x: x[1])]
confusion_matrix = [[0 * len(entities)] * len(entities)]
for gold_data_file in self:
prediction_iter = iter(other)
prediction_data_file = next(prediction_iter)
while str(gold_data_file) != str(prediction_data_file):
prediction_data_file = next(prediction_iter)
gold_annotation = Annotations(gold_data_file.ann_path)
pred_annotation = Annotations(prediction_data_file.ann_path)
# compute matrix on the Annotation file level
ann_confusion_matrix = gold_annotation.compute_confusion_matrix(pred_annotation, entities, leniency=leniency)
for i in range(len(confusion_matrix)):
for j in range(len(confusion_matrix)):
confusion_matrix[i][j] += ann_confusion_matrix[i][j]
return entities, confusion_matrix
def compute_ambiguity(self, dataset):
"""
Finds occurrences of spans from 'dataset' that intersect with a span from this annotation but do not have this spans label.
label. If 'dataset' comprises a models predictions, this method provides a strong indicators
of a model's in-ability to dis-ambiguate between entities. For a full analysis, compute a confusion matrix.
:param dataset: a Dataset object containing a predicted version of this dataset.
:return: a dictionary containing the ambiguity computations on each gold, predicted file pair
"""
if not isinstance(dataset, Dataset):
raise ValueError("dataset must be instance of Dataset")
# verify files are consistent
diff = {d.file_name for d in self} - {d.file_name for d in dataset}
if diff:
raise ValueError(f"Dataset of predictions is missing the files: {repr(diff)}")
# Dictionary storing ambiguity over dataset
ambiguity_dict = {}
for gold_data_file in self:
prediction_iter = iter(dataset)
prediction_data_file = next(prediction_iter)
while str(gold_data_file) != str(prediction_data_file):
prediction_data_file = next(prediction_iter)
gold_annotation = Annotations(gold_data_file.ann_path)
pred_annotation = Annotations(prediction_data_file.ann_path)
# compute matrix on the Annotation file level
ambiguity_dict[str(gold_data_file)] = gold_annotation.compute_ambiguity(pred_annotation)
return ambiguity_dict
def get_labels(self, as_list=False):
"""
Get all of the entities/labels used in the dataset.
:param as_list: bool for if to return the results as a list; defaults to False
:return: A set of strings. Each string is a label used.
"""
labels = set()
for ann in self.generate_annotations():
labels.update(ann.get_labels())
if as_list:
return list(labels)
return labels
def generate_annotations(self):
"""Generates Annotation objects for all the files in this Dataset"""
for file in self:
if file.ann_path is not None:
yield Annotations(file.ann_path, source_text_path=file.txt_path)
else:
yield Annotations([])
def __getitem__(self, item):
"""
Creates and returns the Annotations object with the given file name, else raises FileNotFoundError;
useful for getting Annotations objects from parallel Datasets
:param item: the name of the file to be represented (not including the extension or parent directories)
:return: an Annotations object
"""
path = os.path.join(self.data_directory, item + '.ann')
return Annotations(path)
def main():
"""CLI for retrieving dataset information"""
parser = argparse.ArgumentParser(description='Calculate data about a given data directory')
parser.add_argument('directory')
args = parser.parse_args()
dataset = Dataset(args.directory)
entities = json.dumps(dataset.get_labels(as_list=True))
counts = dataset.compute_counts()
print(f"Entities: {entities}")
pprint.pprint(counts)
if __name__ == '__main__':
main()