# deidentify/dataset/nursing2brat.py
1
import argparse
2
import os
3
import re
4
from typing import Dict, List
5
6
from deidentify.base import Annotation, Document
7
from deidentify.dataset import brat
8
9
10
def readlines(filename):
    """Return all lines of a text file as a list.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one string per line; line endings are kept.
    """
    with open(filename) as file:
        return file.readlines()
14
15
16
def documents_iter(notes):
    """Yield one Document per record found in the raw notes file.

    A record opens with a line starting with ``START_OF_RECORD``; the two
    integers on that line are taken as patient id and record id. It closes
    with a line starting with ``||||END_OF_RECORD``. All lines in between
    make up the document text.

    Args:
        notes: Path to the raw notes file.

    Yields:
        Document named ``note-<patient_id>-<record_id>`` whose text is the
        concatenated record lines with trailing whitespace removed and whose
        annotation list is empty.

    Raises:
        ValueError: If a start line does not contain exactly two integers,
            or if an end marker appears before any start marker.
    """
    patient_id = record_id = None
    record_lines = []
    for line in readlines(notes):
        if line.startswith('START_OF_RECORD'):
            # A new record begins: discard anything collected so far.
            record_lines = []
            patient_id, record_id = re.findall(r'\d+', line)
        elif line.startswith('||||END_OF_RECORD'):
            if patient_id is None:
                # Without a preceding start marker there are no ids to name
                # the document with; fail with an explicit message.
                raise ValueError(
                    'END_OF_RECORD encountered before any START_OF_RECORD '
                    'in {}'.format(notes)
                )
            yield Document(
                name='note-{}-{}'.format(patient_id, record_id),
                text=''.join(record_lines).rstrip(),
                annotations=[]
            )
        else:
            record_lines.append(line)
32
33
34
def annotations_iter(annotations):
    """Yield the annotations of the PHI file grouped by document.

    Each non-blank line is split on whitespace into six fields:
    ``pid rid start end tag text``, where ``text`` is the remainder of the
    line. Consecutive lines sharing the same ``(pid, rid)`` pair form one
    group. Within a group the annotation ids are numbered ``T1``, ``T2``, ...

    Args:
        annotations: Path to the annotations file.

    Yields:
        A non-empty list of Annotation objects for one run of lines with the
        same ``(pid, rid)``. Nothing is yielded for an empty file.

    Raises:
        ValueError: If a non-blank line has fewer than six fields or its
            start/end fields are not integers.
    """
    current_key = None
    doc_annotations = []

    for line in readlines(annotations):
        fields = line.strip().split(maxsplit=5)
        if not fields:
            # Blank line (e.g. trailing newline at end of file): nothing to parse.
            continue

        pid, rid, start, end, tag, text = fields
        key = (pid, rid)
        if key != current_key:
            # The document changed: emit the finished group, start a new one.
            if doc_annotations:
                yield doc_annotations
            doc_annotations = []
            current_key = key

        doc_annotations.append(Annotation(
            text=text,
            start=int(start),
            end=int(end),
            tag=tag,
            # Ids restart at T1 for every group.
            ann_id='T{}'.format(len(doc_annotations) + 1),
            doc_id='note-{}-{}'.format(pid, rid)
        ))

    if doc_annotations:
        yield doc_annotations
61
62
63
def _map_annotations(annotations):
    """Index per-document annotation groups by document id.

    Args:
        annotations: Iterable of annotation lists; all items of one list are
            expected to share the same ``doc_id``.

    Returns:
        Dict mapping the ``doc_id`` of each group's first annotation to the
        group itself. Empty groups are skipped. If several groups carry the
        same ``doc_id``, the last one wins.
    """
    # Mapping: doc_id -> List[Annotation]
    mapping: Dict[str, List[Annotation]] = {}

    for doc_anns in annotations:
        if not doc_anns:
            # An empty group has no first element to take the doc_id from.
            continue
        mapping[doc_anns[0].doc_id] = doc_anns

    return mapping
71
72
73
def main(args):
    """Convert the notes and PHI annotation files into brat documents.

    Reads documents from ``args.notes_file`` and annotations from
    ``args.phi_file``, then writes every document to ``args.output_dir``
    via ``brat.write_brat_document``. Documents without a matching
    annotation group are written with an empty annotation list.

    Args:
        args: Object with ``notes_file``, ``phi_file`` and ``output_dir``
            attributes, as returned by ``arg_parser``.
    """
    documents = documents_iter(args.notes_file)
    annotations = annotations_iter(args.phi_file)

    # All annotation groups are loaded up front so each document can be
    # matched by name while the documents themselves are streamed.
    doc_annotations_mapping = _map_annotations(annotations)

    for doc in documents:
        anns = doc_annotations_mapping.get(doc.name, [])
        brat.write_brat_document(args.output_dir, doc_name=doc.name,
                                 text=doc.text, annotations=anns)
83
84
85
def arg_parser(argv=None):
    """Parse the command-line arguments of the conversion script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the
            default), argparse reads the arguments from ``sys.argv``.

    Returns:
        An ``argparse.Namespace`` with the attributes ``notes_file``,
        ``phi_file`` and ``output_dir``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("notes_file", help="Full path to raw notes file (notes-raw.txt)")
    parser.add_argument("phi_file", help="Full path to annotations file (id-phi.phrase)")
    parser.add_argument("output_dir", help="Path to output directory.")
    return parser.parse_args(argv)
91
92
93
if __name__ == '__main__':
    ARGS = arg_parser()
    # Make sure the output directory exists before any document is written;
    # exist_ok=True keeps reruns into the same directory from failing.
    os.makedirs(ARGS.output_dir, exist_ok=True)
    main(ARGS)