Sybil / Git / [d9566e] /scripts/data/parse_mdai

Models:
RichardZick/
Sybil
Downloads: 1
[d9566e]: / scripts / data / parse_mdai_annotations.py
History
Download this file
103 lines (84 with data), 5.2 kB

import argparse
import json
from tqdm import tqdm
from datetime import datetime
from collections import defaultdict
import pdb
import pandas as pd

"""
Script to parse JSON file exported from md.ai
JSON structure is detailed in https://docs.md.ai/data/json/ 

Output is either:
    (1) a dict mapping image (SOPInstanceUID) to list of annotation data
    OR
    (2) dataset object JSON with annotation dict for each image series

Notes:
    - JSON: ['id', 'createdAt', 'updatedAt', 'name', 'description', 'isPrivate', 'users', 'labelGroups', 'datasets']
    
    - meta['labelGroups']: list of dicts with keys: 
        ['id','createdAt', 'updatedAt', 'name', 'description', 'type', 
            'labels': [{'id', 'parentId', 'createdAt', 'updatedAt', 'name', 'shortName', 
            'description', 'color', 'type', 'scope', 'annotationMode', 'radlexTagIds': []}] ]
    
    - meta['datasets']: list of dicts: ['id', 'type', 'createdAt', 'updatedAt', 'name', 'description', 'studies', 'annotations']
    
    - meta['datasets'][i]['annotations']: list of dicts for each annotation (image-level):
        ['id', 'parentId', 'createdAt', 'createdById', 'updatedAt', 'updatedById', 'modelId', 'StudyInstanceUID', 'SeriesInstanceUID', 
            'SOPInstanceUID', 'labelId', 'annotationNumber', 'height', 'width', 'data', 'note', 'radlexTagIds', 'isImported', 
            'reviewsPositiveCount', 'reviewsNegativeCount']

    - date format: %Y-%m-%dT%H:%M:%S.%fZ
    - for box annotations: x,y are upper left corner 
    - 'SOPInstanceUID': slice id (single image)
    - 'SeriesInstanceUID': series id (single volume) 
    - 'StudyInstanceUID': exam id (multiple volumes)
    - 'height', 'width': image h, w (not annotation)
"""

# Cut-off moment (midnight, 10-08-2020): annotations made before INCLUDE_AFTER
# were used to test md.ai, so the script skips any annotation whose creation
# time is not strictly later than this.
INCLUDE_AFTER = datetime(year=2020, month=10, day=8)

# Table of annotation comments, loaded once when the module is imported; the
# script reads its 'series_uid' and 'comments for Peter' columns.
ANNOT_COMMENTS = pd.read_csv(
    '/Mounts/rbg-storage1/datasets/NLST/mdai/annotation_comments_12062020.csv'
)

def scale_annotations(annotation, annotation_meta):
    '''
    Normalise a bounding box by the size of the image it was drawn on.

    annotation: dict with numeric 'x', 'y', 'width' and 'height' entries;
        it is modified in place.
    annotation_meta: dict whose 'width' and 'height' entries hold the image
        dimensions used as divisors.

    Returns the same `annotation` dict, with horizontal values divided by the
    image width and vertical values divided by the image height.
    '''
    # (annotation key, image dimension it is divided by), kept in the order
    # x, width, y, height.
    divisors = (
        ('x', 'width'),
        ('width', 'width'),
        ('y', 'height'),
        ('height', 'height'),
    )
    for box_key, image_key in divisors:
        annotation[box_key] /= annotation_meta[image_key]
    return annotation

# Command-line interface: where to read the md.ai export, where to write the
# parsed result, and whether the output path is an existing dataset JSON that
# the annotations should be merged into.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--annotation_json_path',
    default='/Mounts/rbg-storage1/datasets/NLST/mdai/mdai_mit_project_poBGbqle_annotations_labelgroup_all_2020-11-25-030811.json',
    help='JSON exported from md.ai',
    type=str,
)
parser.add_argument(
    '--output_json_path',
    help='Where to export parsed annotations data',
    type=str,
)
parser.add_argument(
    '--output_is_dataset_obj',
    default=False,
    help='Whether incorporating annotations into dataset json directly',
    action='store_true',
)

if __name__ == "__main__":
    # Script entry point.
    #   1. Load the md.ai export and collect annotation data, grouped as
    #      series id -> image id -> list of scaled annotation dicts.
    #   2. For series whose comment contains 'FF', keep only annotations
    #      whose user name contains 'fintelmann'.
    #   3. Either write that mapping to --output_json_path, or (with
    #      --output_is_dataset_obj) load the dataset JSON found at that path,
    #      attach the annotations to the matching series and write it back.
    args = parser.parse_args()
    if args.output_json_path is None:
        # Both output modes open this path; fail before doing any work
        # instead of crashing later inside open(None, ...).
        parser.error('--output_json_path is required')

    with open(args.annotation_json_path, 'r') as annotation_file:
        annotation_metadata_json = json.load(annotation_file)

    # md.ai user id -> user name
    USERS = {user['id']: user['name'] for user in annotation_metadata_json['users']}

    annotation_dict = {}
    for dataset_dict in annotation_metadata_json['datasets']:
        for annotation_meta_dict in tqdm(dataset_dict['annotations']):
            if annotation_meta_dict['data'] is None:
                continue
            date = datetime.strptime(annotation_meta_dict['createdAt'], '%Y-%m-%dT%H:%M:%S.%fZ')
            if date <= INCLUDE_AFTER:
                # Only annotations created strictly after the cut-off are kept.
                continue
            img_id = annotation_meta_dict['SOPInstanceUID']
            series_id = annotation_meta_dict['SeriesInstanceUID']
            if series_id not in annotation_dict:
                annotation_dict[series_id] = defaultdict(list)
            # scale_annotations divides the 'x', 'y', 'width' and 'height'
            # entries of 'data' by the image size stored on the annotation
            # record; the scaled dict is written back into the record.
            annotation_data = scale_annotations(annotation_meta_dict['data'], annotation_meta_dict)
            annotation_data['user'] = USERS[annotation_meta_dict['createdById']]
            annotation_meta_dict['data'] = annotation_data
            annotation_dict[series_id][img_id].append(annotation_data)

    # Series flagged with 'FF' in the comments table. The set is built once,
    # and the cell values are tested directly rather than the printed form of
    # the column (which pandas may abbreviate for long text or many rows).
    has_ff = ANNOT_COMMENTS['comments for Peter'].astype(str).str.contains('FF', regex=False)
    ff_series_ids = set(ANNOT_COMMENTS.loc[has_ff, 'series_uid'])

    for series_id, image_annotations in annotation_dict.items():
        if series_id not in ff_series_ids:
            continue
        for img_id, image_boxes in image_annotations.items():
            # Reassigning existing keys does not change the dict size, so it
            # is safe while iterating over items().
            image_annotations[img_id] = [box for box in image_boxes if 'fintelmann' in box['user']]

    if not args.output_is_dataset_obj:
        with open(args.output_json_path, 'w') as output_file:
            json.dump(annotation_dict, output_file)

    else:
        # The output path doubles as the input here: read the dataset JSON,
        # add annotations to matching series, then overwrite the same file.
        with open(args.output_json_path, 'r') as dataset_file:
            output_json = json.load(dataset_file)

        for mrn_row in tqdm(output_json):
            for exam_dict in mrn_row['accessions']:
                for series_id, series_dict in exam_dict['image_series'].items():
                    if series_id in annotation_dict:
                        series_dict['annotations'] = annotation_dict[series_id]

        # Must be opened for writing; a read-mode handle cannot be dumped to.
        with open(args.output_json_path, 'w') as dataset_file:
            json.dump(output_json, dataset_file)