[0eda78]: / utils / jsonl_to_conll.py

Download this file

85 lines (69 with data), 4.0 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
import json
import re
# Tokenizer: a run of word characters, the possessive clitic "'s", or one
# punctuation mark from a small fixed set. Characters matching none of these
# (whitespace, brackets, hyphens, ...) never appear as tokens.
_TOKEN_PATTERN = re.compile(r"\w+|'s|['\".,!?;]", re.UNICODE)


def _tag_tokens(text, entities, annotation_type):
    """Yield ``(token, tag)`` pairs for *text* using B-/I-/O tags.

    *entities* must be sorted by ``start_offset``; each one is a dict with
    integer ``start_offset`` and ``end_offset`` character positions in *text*.
    A token is attributed to the current entity when their character ranges
    overlap; the first such token gets ``B-<annotation_type>`` and the
    following ones ``I-<annotation_type>``.
    """
    entity_index = 0
    began = False  # True once the current entity has received its B- tag
    for match in _TOKEN_PATTERN.finditer(text):
        start_offset, end_offset = match.span()

        # Leave behind every entity that ends at or before this token.
        # Advancing on position (rather than on an exact end-offset match)
        # keeps the cursor from stalling when a span ends on a character the
        # tokenizer drops, e.g. ")" or trailing whitespace.
        while (entity_index < len(entities)
               and entities[entity_index]['end_offset'] <= start_offset):
            entity_index += 1
            began = False

        tag = 'O'
        if entity_index < len(entities):
            entity = entities[entity_index]
            overlaps = (start_offset < entity['end_offset']
                        and end_offset > entity['start_offset'])
            if overlaps:
                tag = ('I-' if began else 'B-') + annotation_type
                began = True
        yield match.group(), tag


def jsonl_to_conll(input_filepath, output_filepath, annotation_type, annotation_type_name):
    """Convert a JSONL annotation file into a two-column CoNLL file.

    Every non-blank input line is parsed as a JSON object with a ``text``
    string and an ``entities`` list whose items carry ``label``,
    ``start_offset`` and ``end_offset``. Only entities whose ``label`` equals
    *annotation_type_name* are used.

    Args:
        input_filepath: Path of the JSONL file to read (UTF-8).
        output_filepath: Path of the CoNLL file to write (UTF-8, overwritten).
        annotation_type: Suffix for the emitted tags, e.g. ``SYMPTOM`` gives
            ``B-SYMPTOM`` / ``I-SYMPTOM``.
        annotation_type_name: Value of the ``label`` field selecting which
            entities are tagged.

    Output format: one ``"<token> <tag>"`` line per token, and an empty line
    after each record.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If a record or entity lacks one of the fields named above.
    """
    # The whole input is parsed before the output is opened, so a malformed
    # input line fails without leaving a partially written output file.
    with open(input_filepath, 'r', encoding='utf-8') as file:
        jsonl_data = [json.loads(line) for line in file if line.strip()]

    with open(output_filepath, 'w', encoding='utf-8') as file:
        for entry in jsonl_data:
            # One-for-one replacement, so entity character offsets stay valid.
            text = entry['text'].replace('\n', ' ')
            entities = sorted(
                (e for e in entry['entities'] if e['label'] == annotation_type_name),
                key=lambda e: e['start_offset'],
            )
            file.writelines(
                f"{token} {tag}\n"
                for token, tag in _tag_tokens(text, entities, annotation_type)
            )
            file.write("\n")
import argparse

# Maps each accepted --type value to
# (tag suffix written to the CoNLL file, entity label looked up in the JSONL).
_ANNOTATION_TYPES = {
    'Medical Condition': ('MEDCOND', 'Medical Condition'),
    'Symptom': ('SYMPTOM', 'Symptom'),
    'Medication': ('MEDICATION', 'Medication/Treatment'),
    'Vital Statistic': ('VITALSTAT', 'Vital Statistic'),
    'Measurement Value': ('MEASVAL', 'Measurement Value'),
    'Negation Cue': ('NEGATION', 'Negation Cue'),
    'Medical Procedure': ('PROCEDURE', 'Medical Procedure'),
}
_TYPE_MESSAGE = ('Type of annotation needs to be one of the following: '
                 + ', '.join(_ANNOTATION_TYPES))


def main(argv=None):
    """Parse command-line options and run the JSONL-to-CoNLL conversion.

    Args:
        argv: Argument list to parse; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Raises:
        ValueError: If the output path does not end in ``.conll``, the input
            path does not end in ``.jsonl``, or ``--type`` is not one of the
            supported annotation types (checked in that order).
    """
    parser = argparse.ArgumentParser(
        description='This script is used to convert JSONL data into CONLL format.')
    parser.add_argument('-o', '--output', type=str, default="all.conll",
                        help='Choose where to save the file after modifying.')
    parser.add_argument('-i', '--input', type=str, default="all.jsonl",
                        help='Choose the file to modify.')
    parser.add_argument('-t', '--type', type=str, required=True,
                        help='Specify the type of annotation to process. ' + _TYPE_MESSAGE)
    args = parser.parse_args(argv)

    if not args.output.endswith('.conll'):
        raise ValueError('Output file needs to be defined as a CONLL-file')
    if not args.input.endswith('.jsonl'):
        raise ValueError('Input file needs to be defined as a JSONL-file')
    if args.type not in _ANNOTATION_TYPES:
        raise ValueError(_TYPE_MESSAGE)

    annotation_type, annotation_type_name = _ANNOTATION_TYPES[args.type]
    jsonl_to_conll(args.input, args.output, annotation_type, annotation_type_name)
    print(f"Conversion of {args.input} to {args.output} for type \"{annotation_type_name}\" completed.")


# Only run the CLI when executed as a script, so the module can be imported
# for jsonl_to_conll without parsing sys.argv.
if __name__ == "__main__":
    main()