|
a |
|
b/medacy/data/dataset.py |
|
|
1 |
""" |
|
|
2 |
A medaCy Dataset facilitates the management of data for both model training and model prediction. |
|
|
3 |
|
|
|
4 |
A Dataset object provides a wrapper for a unix file directory containing training/prediction |
|
|
5 |
data. If a Dataset, at training time, is fed into a pipeline requiring auxiliary files |
|
|
6 |
(Metamap for instance) the Dataset will automatically create those files in the most efficient way possible. |
|
|
7 |
|
|
|
8 |
Training |
|
|
9 |
################# |
|
|
10 |
When a directory contains **both** raw text files alongside annotation files, an instantiated Dataset |
|
|
11 |
detects and facilitates access to those files. |
|
|
12 |
|
|
|
13 |
Assuming your directory looks like this (where .ann files are in `BRAT <http://brat.nlplab.org/standoff.html>`_ format): |
|
|
14 |
:: |
|
|
15 |
home/medacy/data |
|
|
16 |
├── file_one.ann |
|
|
17 |
├── file_one.txt |
|
|
18 |
├── file_two.ann |
|
|
19 |
└── file_two.txt |
|
|
20 |
|
|
|
21 |
A common data work flow might look as follows. |
|
|
22 |
|
|
|
23 |
Running: |
|
|
24 |
:: |
|
|
25 |
>>> from medacy.data import Dataset |
|
|
26 |
>>> from medacy.pipeline_components.feature_overlayers.metamap.metamap import MetaMap |
|
|
27 |
|
|
|
28 |
>>> dataset = Dataset('/home/datasets/some_dataset') |
|
|
29 |
>>> for data_file in dataset: |
|
|
30 |
... (data_file.file_name, data_file.raw_path, data_file.ann_path) |
|
|
31 |
(file_one, file_one.txt, file_one.ann) |
|
|
32 |
(file_two, file_two.txt, file_two.ann) |
|
|
33 |
>>> dataset |
|
|
34 |
['file_one', 'file_two'] |
|
|
35 |
>>> dataset.is_metamapped() |
|
|
36 |
False |
|
|
37 |
>>> metamap = MetaMap('/home/path/to/metamap/binary') |
|
|
38 |
>>> with metamap: |
|
|
39 |
... metamap.metamap_dataset(dataset) |
|
|
40 |
>>> dataset.is_metamapped() |
|
|
41 |
True |
|
|
42 |
|
|
|
43 |
MedaCy **does not** alter the data you load in any way - it only reads from it. |
|
|
44 |
|
|
|
45 |
Prediction |
|
|
46 |
########## |
|
|
47 |
When a directory contains **only** raw text files, an instantiated Dataset object interprets this as |
|
|
48 |
a directory of files that need to be predicted. This means that the internal Datafile that aggregates |
|
|
49 |
meta-data for a given prediction file does not have fields for annotation_file_path set. |
|
|
50 |
|
|
|
51 |
When a directory contains **only** ann files, an instantiated Dataset object interprets this as |
|
|
52 |
a directory of files that are predictions. Useful methods for analysis include :meth:`medacy.data.dataset.Dataset.compute_confusion_matrix`, |
|
|
53 |
:meth:`medacy.data.dataset.Dataset.compute_ambiguity` and :meth:`medacy.data.dataset.Dataset.compute_counts`. |
|
|
54 |
|
|
|
55 |
External Datasets |
|
|
56 |
################# |
|
|
57 |
|
|
|
58 |
In the real world, datasets (regardless of domain) are evolving entities. Hence, it is essential to version them. |
|
|
59 |
A medaCy compatible dataset can be created to facilitate this versioning. A medaCy compatible dataset lives in a python |
|
|
60 |
package that can be hooked into medaCy or used for any other purpose - it is simply a loose wrapper for this Dataset |
|
|
61 |
object. Instructions for creating such a dataset can be found `here <https://github.com/NLPatVCU/medaCy/tree/master/examples/guide>`_. |
|
|
62 |
|
|
63 |
""" |
|
|
64 |
|
|
|
65 |
import argparse |
|
|
66 |
import json |
|
|
67 |
import logging |
|
|
68 |
import os |
|
|
69 |
import pprint |
|
|
70 |
from collections import Counter |
|
|
71 |
from pathlib import Path |
|
|
72 |
|
|
|
73 |
from medacy.data.annotations import Annotations |
|
|
74 |
from medacy.data.data_file import DataFile |
|
|
75 |
|
|
|
76 |
|
|
|
77 |
class Dataset:
    """
    A facilitation class for data management.

    Wraps a flat directory of raw text (.txt) and BRAT standoff annotation (.ann)
    files, pairing them by base name into DataFile objects.
    """

    def __init__(self, data_directory, data_limit=None):
        """
        Manages directory of training data along with other medaCy generated files.

        Only text files: considers a directory for managing metamapping.
        Only ann files: considers a directory of predictions.
        Both text and ann files: considers a directory for training.

        :param data_directory: Directory containing data for training or prediction.
        :param data_limit: A limit to the number of files to process. Must be between 1 and
            the number of raw text files in data_directory; defaults to all files.
        """
        self.data_directory = Path(data_directory)

        # A 'metamapped' subdirectory, if present, holds MetaMap output per file.
        metamap_dir = self.data_directory / 'metamapped'
        self.metamapped_files_directory = metamap_dir if metamap_dir.is_dir() else None

        self.data_files = self._create_data_files()
        # NOTE: a data_limit of 0 (falsy) is treated the same as None (no limit).
        self.data_limit = data_limit or len(self.data_files)

    def _create_data_files(self):
        """
        Builds one DataFile per unique base name that has at least a .txt or .ann file.

        :return: a list of DataFile objects sorted by file name.
        """
        data_files = []
        all_files_in_directory = os.listdir(self.data_directory)
        # Base name is everything before the first '.', so 'x.txt' and 'x.ann' pair up.
        all_file_base_names = {f.split(".")[0] for f in all_files_in_directory}

        for file_name in all_file_base_names:
            txt_path = None
            ann_path = None
            metamapped_path = None

            potential_txt_path = self.data_directory / (file_name + ".txt")
            if potential_txt_path.exists():
                txt_path = potential_txt_path

            potential_ann_path = self.data_directory / (file_name + ".ann")
            if potential_ann_path.exists():
                ann_path = potential_ann_path

            if self.metamapped_files_directory:
                potential_mm_path = self.metamapped_files_directory / (file_name + ".metamapped")
                if potential_mm_path.exists():
                    metamapped_path = potential_mm_path

            # Skip names that came only from stray files (e.g. the metamapped dir itself).
            if txt_path or ann_path:
                new_df = DataFile(file_name, txt_path, ann_path, metamapped_path)
                data_files.append(new_df)

        return sorted(data_files, key=lambda x: x.file_name)

    def __iter__(self):
        # Iteration honors data_limit; __len__ intentionally reports the full count.
        return iter(self.data_files[0:self.data_limit])

    def __len__(self):
        return len(self.data_files)

    def is_metamapped(self):
        """
        Verifies if all files in the Dataset are metamapped.

        :return: True if all data files are metamapped, False otherwise.
        """
        if self.metamapped_files_directory is None or not self.metamapped_files_directory.exists():
            return False

        for file in self.data_files:
            potential_file_path = self.metamapped_files_directory / f"{file.file_name}.metamapped"
            if not potential_file_path.exists():
                return False

            # Metamapped file could exist, but metamapping it could have failed.
            # If the file is less than 200 bytes, log a warning.
            file_size_in_bytes = os.path.getsize(potential_file_path)
            if file_size_in_bytes < 200:
                logging.warning(f"Metamapped version of {file.file_name} is only {file_size_in_bytes} bytes. "
                                f"Metamapping could have failed: {potential_file_path}")

        return True

    def __str__(self):
        """
        Prints a list-like string of the names of the Datafile objects up to the data limit
        (can't be used if copied and pasted)
        """
        return str([d.file_name for d in self])

    def compute_counts(self):
        """
        Computes entity counts over all documents in this dataset.

        :return: a Counter of entity counts
        """
        total = Counter()

        for ann in self.generate_annotations():
            total += ann.compute_counts()

        return total

    def compute_confusion_matrix(self, other, leniency=0):
        """
        Generates a confusion matrix where this Dataset serves as the gold standard annotations and `other` serves
        as the predicted annotations. A typical workflow would involve creating a Dataset object with the prediction
        directory outputted by a model and then passing it into this method.

        :param other: a Dataset object containing a predicted version of this dataset.
        :param leniency: a floating point value between [0,1] defining the leniency of the character spans to count as different. A value of zero considers only exact character matches while a positive value considers entities that differ by up to :code:`ceil(leniency * len(span)/2)` on either side.
        :return: two element tuple containing a label array (of entity names) and a matrix where rows are gold labels and columns are predicted labels. matrix[i][j] indicates that entities[i] in this dataset was predicted as entities[j] in 'annotation' matrix[i][j] times
        """
        if not isinstance(other, Dataset):
            raise ValueError("other must be instance of Dataset")

        # verify files are consistent
        diff = {d.file_name for d in self} - {d.file_name for d in other}
        if diff:
            raise ValueError(f"Dataset of predictions is missing the files: {repr(diff)}")

        # sort entities in ascending order by count.
        entities = [key for key, _ in sorted(self.compute_counts().items(), key=lambda x: x[1])]
        # BUG FIX: the original `[[0 * len(entities)] * len(entities)]` evaluates to a
        # single row ([[0, ..., 0]]), so the accumulation below only ever touched
        # cell [0][0]. Build a proper N x N matrix of independent rows instead.
        confusion_matrix = [[0] * len(entities) for _ in range(len(entities))]

        for gold_data_file in self:
            # Locate the prediction DataFile whose name matches the gold file.
            prediction_iter = iter(other)
            prediction_data_file = next(prediction_iter)
            while str(gold_data_file) != str(prediction_data_file):
                prediction_data_file = next(prediction_iter)

            gold_annotation = Annotations(gold_data_file.ann_path)
            pred_annotation = Annotations(prediction_data_file.ann_path)

            # compute matrix on the Annotation file level
            ann_confusion_matrix = gold_annotation.compute_confusion_matrix(pred_annotation, entities, leniency=leniency)
            for i in range(len(entities)):
                for j in range(len(entities)):
                    confusion_matrix[i][j] += ann_confusion_matrix[i][j]

        return entities, confusion_matrix

    def compute_ambiguity(self, dataset):
        """
        Finds occurrences of spans from 'dataset' that intersect with a span from this annotation but do not have
        this span's label. If 'dataset' comprises a model's predictions, this method provides a strong indicator
        of a model's inability to disambiguate between entities. For a full analysis, compute a confusion matrix.

        :param dataset: a Dataset object containing a predicted version of this dataset.
        :return: a dictionary containing the ambiguity computations on each gold, predicted file pair
        """
        if not isinstance(dataset, Dataset):
            raise ValueError("dataset must be instance of Dataset")

        # verify files are consistent
        diff = {d.file_name for d in self} - {d.file_name for d in dataset}
        if diff:
            raise ValueError(f"Dataset of predictions is missing the files: {repr(diff)}")

        # Dictionary storing ambiguity over dataset
        ambiguity_dict = {}

        for gold_data_file in self:
            # Locate the prediction DataFile whose name matches the gold file.
            prediction_iter = iter(dataset)
            prediction_data_file = next(prediction_iter)
            while str(gold_data_file) != str(prediction_data_file):
                prediction_data_file = next(prediction_iter)

            gold_annotation = Annotations(gold_data_file.ann_path)
            pred_annotation = Annotations(prediction_data_file.ann_path)

            # compute matrix on the Annotation file level
            ambiguity_dict[str(gold_data_file)] = gold_annotation.compute_ambiguity(pred_annotation)

        return ambiguity_dict

    def get_labels(self, as_list=False):
        """
        Get all of the entities/labels used in the dataset.

        :param as_list: bool for if to return the results as a list; defaults to False
        :return: A set of strings. Each string is a label used.
        """
        labels = set()

        for ann in self.generate_annotations():
            labels.update(ann.get_labels())

        if as_list:
            return list(labels)
        return labels

    def generate_annotations(self):
        """Generates Annotation objects for all the files in this Dataset"""
        for file in self:
            if file.ann_path is not None:
                yield Annotations(file.ann_path, source_text_path=file.txt_path)
            else:
                # No annotation file: yield an empty Annotations object.
                yield Annotations([])

    def __getitem__(self, item):
        """
        Creates and returns the Annotations object with the given file name, else raises FileNotFoundError;
        useful for getting Annotations objects from parallel Datasets

        :param item: the name of the file to be represented (not including the extension or parent directories)
        :return: an Annotations object
        """
        path = os.path.join(self.data_directory, item + '.ann')
        return Annotations(path)
|
|
284 |
|
|
|
285 |
|
|
|
286 |
def main():
    """CLI for retrieving dataset information"""
    arg_parser = argparse.ArgumentParser(description='Calculate data about a given data directory')
    arg_parser.add_argument('directory')
    cli_args = arg_parser.parse_args()

    # Build the dataset from the directory given on the command line.
    target_dataset = Dataset(cli_args.directory)

    # Serialize the label set as JSON and tally entity occurrences.
    entities = json.dumps(target_dataset.get_labels(as_list=True))
    entity_counts = target_dataset.compute_counts()

    print(f"Entities: {entities}")
    pprint.pprint(entity_counts)


if __name__ == '__main__':
    main()