import argparse
import json
from tqdm import tqdm
from datetime import datetime
from collections import defaultdict
import pdb
import pandas as pd
"""
Script to parse JSON file exported from md.ai
JSON structure is detailed in https://docs.md.ai/data/json/
Output is either:
(1) a dict mapping series (SeriesInstanceUID) to a dict mapping image (SOPInstanceUID) to a list of annotation data
OR
(2) dataset object JSON with annotation dict for each image series
Notes:
- JSON: ['id', 'createdAt', 'updatedAt', 'name', 'description', 'isPrivate', 'users', 'labelGroups', 'datasets']
- meta['labelGroups']: list of dicts with keys:
['id','createdAt', 'updatedAt', 'name', 'description', 'type',
'labels': [{'id', 'parentId', 'createdAt', 'updatedAt', 'name', 'shortName',
'description', 'color', 'type', 'scope', 'annotationMode', 'radlexTagIds': []}] ]
- meta['datasets']: list of dicts: ['id', 'type', 'createdAt', 'updatedAt', 'name', 'description', 'studies', 'annotations']
- meta['datasets'][i]['annotations']: list of dicts for each annotation (image-level):
['id', 'parentId', 'createdAt', 'createdById', 'updatedAt', 'updatedById', 'modelId', 'StudyInstanceUID', 'SeriesInstanceUID',
'SOPInstanceUID', 'labelId', 'annotationNumber', 'height', 'width', 'data', 'note', 'radlexTagIds', 'isImported',
'reviewsPositiveCount', 'reviewsNegativeCount']
- date format: %Y-%m-%dT%H:%M:%S.%fZ
- for box annotations: x,y are upper left corner
- 'SOPInstanceUID': slice id (single image)
- 'SeriesInstanceUID': series id (single volume)
- 'StudyInstanceUID': exam id (multiple volumes)
- 'height', 'width': image h, w (not annotation)
"""
# Cut-off date (October 8, 2020): annotations made before INCLUDE_AFTER were
# only used to test md.ai, so the script keeps annotations created after it.
INCLUDE_AFTER = datetime(year=2020, month=10, day=8)

# Table of per-series comments, loaded once at import time; the script reads
# its 'series_uid' and 'comments for Peter' columns.
ANNOT_COMMENTS = pd.read_csv(
    '/Mounts/rbg-storage1/datasets/NLST/mdai/annotation_comments_12062020.csv'
)
def scale_annotations(annotation, annotation_meta):
    '''
    Express a bounding box as fractions of the image size.

    The 'x' and 'width' entries of `annotation` are divided by
    annotation_meta['width'], and 'y' and 'height' by annotation_meta['height'].
    `annotation` is modified in place and the same dict is returned.
    '''
    # Each pair is (box field to rescale, image dimension to divide by);
    # the order matches x, width, y, height.
    scaling_plan = (
        ('x', 'width'),
        ('width', 'width'),
        ('y', 'height'),
        ('height', 'height'),
    )
    for box_field, image_dim in scaling_plan:
        annotation[box_field] /= annotation_meta[image_dim]
    return annotation
def _build_arg_parser():
    '''
    Create the command-line parser for this script.

    Flags: --annotation_json_path (md.ai export to read), --output_json_path
    (file to write, or dataset JSON to update) and --output_is_dataset_obj
    (switch between the two output modes).
    '''
    cli = argparse.ArgumentParser()
    cli.add_argument(
        '--annotation_json_path',
        type=str,
        default='/Mounts/rbg-storage1/datasets/NLST/mdai/mdai_mit_project_poBGbqle_annotations_labelgroup_all_2020-11-25-030811.json',
        help='JSON exported from md.ai',
    )
    cli.add_argument(
        '--output_json_path',
        type=str,
        help='Where to export parsed annotations data',
    )
    cli.add_argument(
        '--output_is_dataset_obj',
        action='store_true',
        default=False,
        help='Whether incorporating annotations into dataset json directly',
    )
    return cli


parser = _build_arg_parser()
if __name__ == "__main__":
    # Script entry point: read the md.ai export, group scaled annotations by
    # series and image, then either write them to a standalone JSON file or
    # attach them to an existing dataset JSON, which is rewritten in place.
    args = parser.parse_args()
    if args.output_json_path is None:
        # Both output modes need this path; fail before doing any work
        # instead of crashing inside open() after all the processing.
        parser.error('--output_json_path is required')

    with open(args.annotation_json_path, 'r') as annotation_file:
        annotation_metadata_json = json.load(annotation_file)

    # series id -> image id -> list of scaled annotation dicts
    annotation_dict = {}
    USERS = {user['id']: user['name'] for user in annotation_metadata_json['users']}

    for dataset_dict in annotation_metadata_json['datasets']:
        for annotation_meta_dict in tqdm(dataset_dict['annotations']):
            # Entries whose 'data' is None have nothing to scale.
            if annotation_meta_dict['data'] is None:
                continue

            # Keep only annotations created strictly after the cut-off date.
            date = datetime.strptime(annotation_meta_dict['createdAt'], '%Y-%m-%dT%H:%M:%S.%fZ')
            if date <= INCLUDE_AFTER:
                continue

            img_id = annotation_meta_dict['SOPInstanceUID']
            series_id = annotation_meta_dict['SeriesInstanceUID']

            if series_id not in annotation_dict:
                annotation_dict[series_id] = defaultdict(list)

            # scale_annotations works in place; the annotating user's name is
            # stored next to the scaled coordinates.
            annotation_meta_dict['data'] = scale_annotations(annotation_meta_dict['data'], annotation_meta_dict)
            annotation_meta_dict['data']['user'] = USERS[annotation_meta_dict['createdById']]
            annotation_dict[series_id][img_id].append(annotation_meta_dict['data'])

    # Series whose comment contains 'FF'. The comment values are checked
    # directly rather than via str() of a pandas Series, whose printed form
    # can be truncated depending on pandas display options. The set is built
    # once so the membership test below is not repeated work per series.
    ff_series_ids = {
        series_uid
        for series_uid, comment in zip(ANNOT_COMMENTS['series_uid'], ANNOT_COMMENTS['comments for Peter'])
        if 'FF' in str(comment)
    }

    # For those series, keep only annotations whose user name contains
    # 'fintelmann'. Images left with no annotations keep an empty list.
    for series_id, image_annotations in annotation_dict.items():
        if series_id not in ff_series_ids:
            continue
        for img_id, annotations in image_annotations.items():
            image_annotations[img_id] = [a for a in annotations if 'fintelmann' in a['user']]

    if not args.output_is_dataset_obj:
        with open(args.output_json_path, 'w') as output_file:
            json.dump(annotation_dict, output_file)
    else:
        # Dataset-object mode: the output path is an existing dataset JSON
        # that is loaded, updated, and written back to the same path.
        with open(args.output_json_path, 'r') as dataset_file:
            output_json = json.load(dataset_file)

        for mrn_row in tqdm(output_json):
            for exam_dict in mrn_row['accessions']:
                for series_id, series_dict in exam_dict['image_series'].items():
                    if series_id in annotation_dict:
                        series_dict['annotations'] = annotation_dict[series_id]

        # Must be opened for writing; read mode makes json.dump fail.
        with open(args.output_json_path, 'w') as dataset_file:
            json.dump(output_json, dataset_file)