"""Convert a raw notes file plus its PHI annotation file into brat documents.

The script reads the notes file (``notes-raw.txt``) and the annotation file
(``id-phi.phrase``), pairs every note with its annotations through a shared
``note-<patient_id>-<record_id>`` name, and writes one brat document per note
into the output directory.
"""
import argparse
import os
import re
from typing import Dict, Iterable, Iterator, List

from deidentify.base import Annotation, Document
from deidentify.dataset import brat


def readlines(filename):
    """Return all lines of ``filename`` (line endings kept) as a list."""
    with open(filename) as file:
        return file.readlines()


def documents_iter(notes) -> Iterator[Document]:
    """Yield one ``Document`` per record found in the notes file.

    A record opens with a line starting with ``START_OF_RECORD`` and closes
    with a line starting with ``||||END_OF_RECORD``; every line in between is
    part of the record text.

    Args:
        notes: Path to the raw notes file.

    Yields:
        Documents named ``note-<patient_id>-<record_id>`` whose text is the
        concatenated record lines with trailing whitespace removed and whose
        annotation list is empty.

    Raises:
        ValueError: If a ``START_OF_RECORD`` line does not contain exactly two
            digit groups (patient id and record id).
    """
    record_lines = []
    for line in readlines(notes):
        if line.startswith('START_OF_RECORD'):
            record_lines = []
            # The header line carries the two numeric ids of the record.
            patient_id, record_id = re.findall(r'\d+', line)
        elif line.startswith('||||END_OF_RECORD'):
            yield Document(
                name='note-{}-{}'.format(patient_id, record_id),
                text=''.join(record_lines).rstrip(),
                annotations=[]
            )
        else:
            record_lines.append(line)


def annotations_iter(annotations) -> Iterator[List[Annotation]]:
    """Yield the annotations of the PHI file grouped per document.

    Every non-blank line must hold six whitespace-separated fields:
    patient id, record id, start, end, tag and text. Only the first five
    separators are split on, so the text field may itself contain whitespace.
    Consecutive lines sharing the same (patient id, record id) pair form one
    group.

    Args:
        annotations: Path to the annotations file.

    Yields:
        Non-empty lists of ``Annotation`` objects, one list per run of lines
        with the same ids. Within a list the annotation ids count up from
        ``T1``. Nothing is yielded for a file without annotation lines.

    Raises:
        ValueError: If a non-blank line has fewer than six fields or its
            start/end fields are not integers.
    """
    doc_annotations: List[Annotation] = []
    current_key = None

    for line in readlines(annotations):
        line = line.strip()
        if not line:
            # Blank lines carry no annotation; skipping them also makes an
            # empty file produce no groups instead of raising.
            continue

        pid, rid, start, end, tag, text = line.split(maxsplit=5)
        key = (pid, rid)
        if key != current_key:
            # A new document starts: flush the group collected so far.
            if doc_annotations:
                yield doc_annotations
            doc_annotations = []
            current_key = key

        doc_annotations.append(Annotation(
            text=text,
            start=int(start),
            end=int(end),
            tag=tag,
            # Ids restart at T1 for every document.
            ann_id='T{}'.format(len(doc_annotations) + 1),
            doc_id='note-{}-{}'.format(pid, rid)
        ))

    if doc_annotations:
        yield doc_annotations


def _map_annotations(annotations: Iterable[List[Annotation]]) -> Dict[str, List[Annotation]]:
    """Build a ``doc_id -> List[Annotation]`` lookup from annotation groups.

    The key of each group is the ``doc_id`` of its first annotation. Empty
    groups are skipped because they have no annotation to take the id from.
    If the same ``doc_id`` occurs in several groups, the last group wins.
    """
    mapping: Dict[str, List[Annotation]] = {}

    for doc_anns in annotations:
        if doc_anns:
            mapping[doc_anns[0].doc_id] = doc_anns

    return mapping


def main(args):
    """Write one brat document per note found in ``args.notes_file``.

    Args:
        args: Namespace providing ``notes_file``, ``phi_file`` and
            ``output_dir``. Notes without an entry in the PHI file are
            written with an empty annotation list.
    """
    documents = documents_iter(args.notes_file)
    doc_annotations_mapping = _map_annotations(annotations_iter(args.phi_file))

    for doc in documents:
        anns = doc_annotations_mapping.get(doc.name, [])
        brat.write_brat_document(args.output_dir, doc_name=doc.name,
                                 text=doc.text, annotations=anns)


def arg_parser():
    """Parse the command line and return the resulting namespace."""
    parser = argparse.ArgumentParser()
    parser.add_argument("notes_file", help="Full path to raw notes file (notes-raw.txt)")
    parser.add_argument("phi_file", help="Full path to annotations file (id-phi.phrase)")
    parser.add_argument("output_dir", help="Path to output directory.")
    return parser.parse_args()


if __name__ == '__main__':
    ARGS = arg_parser()
    # Create the target directory up front so the writer can assume it exists.
    os.makedirs(ARGS.output_dir, exist_ok=True)
    main(ARGS)