import json


def extract_text_entities(input_file_path, text_file_path, entities_file_path):
    """Split a JSONL annotation file into a raw-text file and an entity list.

    Each non-blank line of ``input_file_path`` is parsed as a JSON object
    with a ``'text'`` string and an ``'entities'`` list whose items carry
    ``'start_offset'`` and ``'end_offset'`` keys. The offsets are used as
    slice bounds into ``'text'``.

    Args:
        input_file_path: Path of the JSONL file to read.
        text_file_path: Path of the file that receives every record's text.
            Texts are appended back to back, with no separator added.
        entities_file_path: Path of the file that receives the extracted
            entity strings, one per line.

    Raises:
        FileNotFoundError: If ``input_file_path`` does not exist.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record or entity lacks one of the expected keys.

    Both output files are opened in append mode, so calling this function
    repeatedly (or re-running the script) adds to existing content rather
    than replacing it.
    """
    # Open every file exactly once and stream the input, instead of loading
    # all lines up front and reopening both outputs for each record.
    # The encoding is explicit so non-ASCII text does not depend on the
    # platform's default locale.
    with open(input_file_path, 'r', encoding='utf-8') as source, \
            open(text_file_path, 'a', encoding='utf-8') as text_file, \
            open(entities_file_path, 'a', encoding='utf-8') as entities_file:
        for line in source:
            # Blank lines (e.g. a trailing empty line) are not records;
            # feeding them to json.loads would raise JSONDecodeError.
            if not line.strip():
                continue

            jsonl_entry = json.loads(line)
            text_content = jsonl_entry['text']
            entities_content = jsonl_entry['entities']

            # Resolve all entity spans before writing anything for this
            # record, so a malformed entity fails before partial output.
            extracted_entities = [
                text_content[entity['start_offset']:entity['end_offset']]
                for entity in entities_content
            ]

            text_file.write(text_content)
            entities_file.writelines(
                entity + '\n' for entity in extracted_entities
            )


# Define file paths
input_jsonl_file_path = '../datasets/labelled_data/all.jsonl'
output_text_file_path = './text.txt'
output_entities_file_path = './entities.txt'


if __name__ == '__main__':
    # Run the extraction only when executed as a script, so importing this
    # module does not touch the filesystem.
    extract_text_entities(
        input_jsonl_file_path,
        output_text_file_path,
        output_entities_file_path,
    )