Disease-Detection-NLP / Git / [0eda78] /utils/jsonl_to

Models:
philipB/
Disease-Detection-NLP
Downloads: 1
[0eda78]: / utils / jsonl_to_conll.py
History
Download this file
85 lines (69 with data), 4.0 kB

import json
import re

def jsonl_to_conll(input_filepath, output_filepath, annotation_type, annotation_type_name):
    """Convert a JSONL annotation file into a CoNLL-style token/tag file.

    Every non-blank line of the input is parsed as a JSON object that must
    provide a ``text`` string and an ``entities`` list. Each entity is a
    mapping with ``label``, ``start_offset`` and ``end_offset`` keys; the
    offsets are character positions in ``text`` and the end offset is
    compared as exclusive. Only entities whose ``label`` equals
    *annotation_type_name* are used.

    For each entry the text is tokenized (newlines are first replaced by
    single spaces so offsets stay aligned) and one ``"<token> <tag>"`` line
    is written per token, followed by a blank line that terminates the entry.
    Tags are:

    * ``B-<annotation_type>`` when the token starts exactly at the start
      offset of the current entity,
    * ``I-<annotation_type>`` when the token starts after the entity start
      and ends no later than the entity end,
    * ``O`` otherwise.

    Entries that contain no entity of the requested label are still written,
    with every token tagged ``O``.

    Args:
        input_filepath: Path of the JSONL file to read (UTF-8).
        output_filepath: Path of the CoNLL file to create or overwrite (UTF-8).
        annotation_type: Suffix appended to ``B-`` / ``I-`` in the output.
        annotation_type_name: Entity label to select from the input data.

    Raises:
        json.JSONDecodeError: If a non-blank input line is not valid JSON.
        KeyError: If an entry or a selected entity lacks a required key.
    """
    # The pattern is kept exactly as it was. Note that its second alternative
    # can never be chosen: the leading ``\w+`` alternative already matches
    # whenever it would. Characters matched by no alternative (whitespace,
    # brackets, dashes, ...) are simply not emitted as tokens.
    token_pattern = re.compile(r"\w+|\w+(?='s)|'s|['\".,!?;]", re.UNICODE)

    with open(input_filepath, 'r', encoding='utf-8') as file:
        # Skip blank lines (e.g. a trailing empty line) instead of failing on them.
        jsonl_data = [json.loads(line) for line in file if line.strip()]

    with open(output_filepath, 'w', encoding='utf-8') as file:
        for entry in jsonl_data:
            # One-for-one replacement keeps the annotation offsets valid.
            text = entry['text'].replace('\n', ' ')
            entities = sorted(
                (e for e in entry['entities'] if e['label'] == annotation_type_name),
                key=lambda e: e['start_offset'],
            )

            entity_index = 0
            for match in token_pattern.finditer(text):
                start_offset, end_offset = match.span()

                # Move past every entity that ends at or before this token.
                # Advancing on position rather than on an exact end-offset
                # match keeps later entities reachable when an entity ends
                # on a character the tokenizer does not emit.
                while (entity_index < len(entities)
                       and entities[entity_index]['end_offset'] <= start_offset):
                    entity_index += 1

                tag = 'O'
                if entity_index < len(entities):
                    entity = entities[entity_index]
                    if start_offset == entity['start_offset']:
                        tag = 'B-' + annotation_type
                    elif start_offset > entity['start_offset'] and end_offset <= entity['end_offset']:
                        tag = 'I-' + annotation_type

                # Written unconditionally so entries without matching
                # entities are not silently reduced to an empty block.
                file.write(f"{match.group()} {tag}\n")

            file.write("\n")

import argparse

# Command-line driver: read the options, validate them, resolve the requested
# annotation type and run the conversion.
# NOTE: these statements live at module level, so they run whenever the file
# is executed or imported.

# Maps each accepted --type value to a pair:
# (tag suffix written after B-/I-, entity label looked up in the JSONL data).
_ANNOTATION_TYPES = {
    'Medical Condition': ('MEDCOND', 'Medical Condition'),
    'Symptom': ('SYMPTOM', 'Symptom'),
    'Medication': ('MEDICATION', 'Medication/Treatment'),
    'Vital Statistic': ('VITALSTAT', 'Vital Statistic'),
    'Measurement Value': ('MEASVAL', 'Measurement Value'),
    'Negation Cue': ('NEGATION', 'Negation Cue'),
    'Medical Procedure': ('PROCEDURE', 'Medical Procedure'),
}

parser = argparse.ArgumentParser(
    description='This script is used to convert JSONL data into CONLL format.',
)
parser.add_argument(
    '-o', '--output',
    type=str,
    default="all.conll",
    help='Choose where to save the file after modifying.',
)
parser.add_argument(
    '-i', '--input',
    type=str,
    default="all.jsonl",
    help='Choose the file to modify.',
)
parser.add_argument(
    '-t', '--type',
    type=str,
    required=True,
    help='Specify the type of annotation to process. Type of annotation needs to be one of the following: Medical Condition, Symptom, Medication, Vital Statistic, Measurement Value, Negation Cue, Medical Procedure',
)

args = parser.parse_args()

# File names are validated by extension only, output first, then input.
if not args.output.endswith('.conll'):
    raise ValueError('Output file needs to be defined as a CONLL-file')

if not args.input.endswith('.jsonl'):
    raise ValueError('Input file needs to be defined as a JSONL-file')

# A single table lookup both validates the type and resolves its two names.
_resolved_type = _ANNOTATION_TYPES.get(args.type)
if _resolved_type is None:
    raise ValueError('Type of annotation needs to be one of the following: Medical Condition, Symptom, Medication, Vital Statistic, Measurement Value, Negation Cue, Medical Procedure')

annotation_type, annotation_type_name = _resolved_type

jsonl_to_conll(args.input, args.output, annotation_type, annotation_type_name)

print(f'Conversion of {args.input} to {args.output} for type "{annotation_type_name}" completed.')