[7fc5df]: / deidentify / dataset / nursing2brat.py

Download this file

97 lines (72 with data), 2.6 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
import argparse
import os
import re
from typing import Dict, List
from deidentify.base import Annotation, Document
from deidentify.dataset import brat
def readlines(filename):
    """Return the content of a text file as a list of lines.

    The file is opened in text mode with the platform's default encoding and
    is closed again before the function returns.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one string per line; line terminators are kept.
    """
    with open(filename) as handle:
        return handle.readlines()
def documents_iter(notes):
    """Iterate over the records of a raw notes file as ``Document`` objects.

    A record starts at a line beginning with ``START_OF_RECORD`` and ends at a
    line beginning with ``||||END_OF_RECORD``. The two digit groups found on
    the start line are used as patient id and record id. Every line in between
    belongs to the record text.

    Args:
        notes: Path to the raw notes file.

    Yields:
        One ``Document`` per record, named ``note-<patient_id>-<record_id>``,
        whose text is the concatenated record lines with trailing whitespace
        removed and whose annotation list is empty.
    """
    body = []
    for raw_line in readlines(notes):
        if raw_line.startswith('START_OF_RECORD'):
            # A new record begins: drop whatever was buffered and read the
            # two numeric ids from the header line.
            body = []
            patient_id, record_id = re.findall(r'\d+', raw_line)
            continue

        if not raw_line.startswith('||||END_OF_RECORD'):
            # Ordinary content line of the current record.
            body.append(raw_line)
            continue

        # End marker reached: emit the buffered record.
        note_text = ''.join(body).rstrip()
        note_name = 'note-{}-{}'.format(patient_id, record_id)
        yield Document(name=note_name, text=note_text, annotations=[])
def annotations_iter(annotations):
    """Iterate over the annotations file, grouped by record.

    Every non-blank line is split into six whitespace-separated fields:
    patient id, record id, start offset, end offset, tag and annotated text.
    The text is the remainder of the line and may itself contain whitespace.
    Consecutive lines with the same patient id and record id form one group.

    Args:
        annotations: Path to the annotations file.

    Yields:
        One non-empty list of ``Annotation`` objects per group. Within a group
        the annotation ids are ``T1``, ``T2``, ... and every annotation carries
        the document id ``note-<patient_id>-<record_id>``. Nothing is yielded
        when the file contains no annotation lines.

    Raises:
        ValueError: If a non-blank line has fewer than six fields or if its
            start/end offsets are not integers.
    """
    current_key = None
    doc_annotations = []

    for line in readlines(annotations):
        stripped = line.strip()
        if not stripped:
            # Blank lines (e.g. a trailing newline at the end of the file)
            # carry no annotation; skip them instead of failing to unpack.
            continue

        pid, rid, start, end, tag, text = stripped.split(maxsplit=5)

        if (pid, rid) != current_key:
            # The record changed: hand out the finished group. The emptiness
            # check covers the very first line, where no group exists yet.
            if doc_annotations:
                yield doc_annotations
            doc_annotations = []
            current_key = (pid, rid)

        doc_annotations.append(Annotation(
            text=text,
            start=int(start),
            end=int(end),
            tag=tag,
            # Ids restart at T1 for every document.
            ann_id='T{}'.format(len(doc_annotations) + 1),
            doc_id='note-{}-{}'.format(pid, rid)
        ))

    # Emit the last group; an empty file yields nothing at all.
    if doc_annotations:
        yield doc_annotations
def _map_annotations(annotations):
    """Index groups of annotations by the document they belong to.

    Args:
        annotations: Iterable of non-empty annotation lists; all annotations
            of one list are expected to share the same ``doc_id``.

    Returns:
        A dict mapping ``doc_id`` (taken from the first annotation of each
        list) to that list. If several lists have the same ``doc_id``, the
        last one wins.
    """
    # Mapping: doc_id -> List[Annotation]
    return {group[0].doc_id: group for group in annotations}
def main(args):
    """Convert the notes and their annotations into brat documents.

    Reads the records from ``args.notes_file`` and the annotations from
    ``args.phi_file``, then writes every record together with its annotations
    (an empty list if there are none for that record) to ``args.output_dir``
    via ``brat.write_brat_document``.

    Args:
        args: Object with the attributes ``notes_file``, ``phi_file`` and
            ``output_dir``.
    """
    notes = documents_iter(args.notes_file)
    # All annotations are loaded up front so they can be looked up by
    # document name while the notes are streamed.
    annotations_by_doc = _map_annotations(annotations_iter(args.phi_file))

    for note in notes:
        brat.write_brat_document(
            args.output_dir,
            doc_name=note.name,
            text=note.text,
            annotations=annotations_by_doc.get(note.name, []),
        )
def arg_parser():
    """Parse the command line of the script.

    Three positional arguments are required, in this order: ``notes_file``,
    ``phi_file`` and ``output_dir``.

    Returns:
        The ``argparse.Namespace`` produced from ``sys.argv``.
    """
    positionals = (
        ("notes_file", "Full path to raw notes file (notes-raw.txt)"),
        ("phi_file", "Full path to annotations file (id-phi.phrase)"),
        ("output_dir", "Path to output directory."),
    )

    cli = argparse.ArgumentParser()
    for dest, description in positionals:
        cli.add_argument(dest, help=description)
    return cli.parse_args()
if __name__ == '__main__':
    # Script entry point: parse the command line, make sure the output
    # directory exists, then run the conversion.
    ARGS = arg_parser()
    # exist_ok=True keeps an already existing output directory from being
    # treated as an error.
    os.makedirs(name=ARGS.output_dir, exist_ok=True)
    main(ARGS)