[6c353a]: / medacy / tools / converters / add_ast_to_brat.py

Download this file

138 lines (103 with data), 4.9 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
"""
Contains a function for adding assertion annotations from i2b2 data
to pre-existing BRAT annotations.
Assertion annotations look like this:
c=”prostate cancer” 5:7 5:8||t=”problem”||a=”present”
The desired output looks like this:
T1 problem 55 70 prostate cancer
A1 present T1
"""
import argparse
import logging
import os
import re
from medacy.tools.converters.brat_to_con import switch_extension
from medacy.tools.converters.con_to_brat import get_absolute_index, line_to_dict
from medacy.tools.converters.conversion_tools.line import Line
from medacy.tools.entity import Entity
assertion_pattern = r'c="([^"]*)" \d+:\d+ \d+:\d+\|\|t="([^"]*)"\|\|a="([^"]*)"'
def is_valid_assert(sample):
"""Validates that a line is a valid assertion annotation"""
return isinstance(sample, str) and re.fullmatch(assertion_pattern, sample)
def add_ast_to_brat(ast_file_path, ann_file_path, txt_file_path):
"""
Adds the assertion annotations to a given ann file
:param ast_file_path: The assertion file to get the assertion annotations from
:param ann_file_path: The ann file to add the assertion annotations to
:param txt_file_path: The text file that the previous two are annotating
:return: None
"""
with open(txt_file_path) as f:
text = f.read()
text_lines = Line.init_lines(text)
with open(ast_file_path) as f:
ast_text = f.read()
if ast_text == "":
logging.info(f"There were no assertions in file {ast_file_path}, no conversion was performed")
return
assertions = ast_text.split('\n')
entities = Entity.init_from_doc(ann_file_path)
a = 1 # used to keep track of the assertion number
add_to_ann = ""
for line in assertions:
if not is_valid_assert(line):
logging.warning(f"Invalid line of ast text in file {ast_file_path} was skipped: {line}")
continue
# Get the part of the assertion annotation that is an entity (up to the '||a')
print(line)
a_part_index = line.index('||a')
assertion_text = line[a_part_index + 5:-1]
entity_part = line[:a_part_index]
# Break up entity_part into named substrings
ent_dict = line_to_dict(entity_part)
# Get the BRAT-formatted (relative to the start of the document) index for the start of the entity
ent_text = ent_dict['data_item']
ent_type = ent_dict['data_type']
start_ind = get_absolute_index(text_lines, ent_dict['start_ind'], ent_text)
end_ind = start_ind + len(ent_text)
# Get the text of the entity as it appears in the document, since it might not match what's provided
# in the assertion file
real_ent_text = text[start_ind:end_ind]
if real_ent_text != ent_text:
logging.info(f"Enity text in document {ast_file_path} didn't match; expected '{ent_text}', actual {real_ent_text}")
ent_text = real_ent_text
ent = Entity(ent_type, start_ind, end_ind, ent_text)
ent_match = None
# See if the entity already exists in the ann file
for e in entities:
if ent == e:
ent_match = e
break
# If not, add it to the new entities
if ent_match is None:
add_to_ann += str(ent) + '\n'
else:
# If the ent does have a match, we will use that from now on instead of the one we made
ent = ent_match
add_to_ann += f"A{a}\t{assertion_text} T{ent.num}\n"
a += 1
# End for
with open(ann_file_path, 'a') as f:
print("WRITING", add_to_ann)
f.write(add_to_ann)
def main():
description = """Add assertion annotations to BRAT ann files. Files that refer to the same document must all have
the same name, except for the file extensions."""
parser = argparse.ArgumentParser(prog='Add Assert to BRAT', description=description)
parser.add_argument('-a', '--ast_file_dir', required=True, help='Directory containing the assertion files.')
parser.add_argument('-t', '--txt_file_dir', required=True, help='Directory containing the text files.')
parser.add_argument('-b', '--ann_file_dir', required=True, help='Directory containing the BRAT ann files.')
args = parser.parse_args()
file_tuples = []
ast_files = os.listdir(args.ast_file_dir)
for file_name in ast_files:
txt_file_name = switch_extension(file_name, ".txt")
txt_file_path = os.path.join(args.txt_file_dir, txt_file_name)
ann_file_name = switch_extension(file_name, ".ann")
ann_file_path = os.path.join(args.ann_file_dir, ann_file_name)
ast_file_path = os.path.join(args.ast_file_dir, file_name)
file_tuples.append((ast_file_path, ann_file_path, txt_file_path))
for a, b, t in file_tuples:
add_ast_to_brat(a, b, t)
if __name__ == '__main__':
main()