a b/scripts/data/parse_mdai_annotations.py
1
import argparse
2
import json
3
from tqdm import tqdm
4
from datetime import datetime
5
from collections import defaultdict
6
import pdb
7
import pandas as pd
8
9
"""
Script to parse JSON file exported from md.ai
JSON structure is detailed in https://docs.md.ai/data/json/

Output is either:
    (1) a dict mapping each series (SeriesInstanceUID) to a dict mapping each image (SOPInstanceUID) to a list of annotation data
    OR
    (2) dataset object JSON with annotation dict for each image series

Notes:
    - JSON: ['id', 'createdAt', 'updatedAt', 'name', 'description', 'isPrivate', 'users', 'labelGroups', 'datasets']

    - meta['labelGroups']: list of dicts with keys:
        ['id', 'createdAt', 'updatedAt', 'name', 'description', 'type',
            'labels': [{'id', 'parentId', 'createdAt', 'updatedAt', 'name', 'shortName',
            'description', 'color', 'type', 'scope', 'annotationMode', 'radlexTagIds': []}] ]

    - meta['datasets']: list of dicts: ['id', 'type', 'createdAt', 'updatedAt', 'name', 'description', 'studies', 'annotations']

    - meta['datasets'][i]['annotations']: list of dicts for each annotation (image-level):
        ['id', 'parentId', 'createdAt', 'createdById', 'updatedAt', 'updatedById', 'modelId', 'StudyInstanceUID', 'SeriesInstanceUID',
            'SOPInstanceUID', 'labelId', 'annotationNumber', 'height', 'width', 'data', 'note', 'radlexTagIds', 'isImported',
            'reviewsPositiveCount', 'reviewsNegativeCount']

    - date format: %Y-%m-%dT%H:%M:%S.%fZ
    - for box annotations: x, y are the upper left corner
    - 'SOPInstanceUID': slice id (single image)
    - 'SeriesInstanceUID': series id (single volume)
    - 'StudyInstanceUID': exam id (multiple volumes)
    - 'height', 'width': image h, w (not annotation)
"""
40
41
# Cut-off date (midnight, 8 Oct 2020): annotations made before INCLUDE_AFTER
# were used to test md.ai, so only annotations created strictly after it are kept.
INCLUDE_AFTER = datetime(year=2020, month=10, day=8)

# Table of per-series comments, loaded once at import time. The script reads
# its 'series_uid' and 'comments for Peter' columns.
ANNOT_COMMENTS = pd.read_csv(
    '/Mounts/rbg-storage1/datasets/NLST/mdai/annotation_comments_12062020.csv'
)
43
44
def scale_annotations(annotation, annotation_meta):
    """
    Normalize a bounding-box annotation by the size of its image.

    'x' and 'width' are divided by annotation_meta['width']; 'y' and 'height'
    are divided by annotation_meta['height']. For a box that lies inside the
    image this yields values in [0, 1].

    The annotation dict is modified in place and the same object is returned.
    """
    # (annotation key, image dimension used as divisor), applied in this order.
    scale_by = (
        ('x', 'width'),
        ('width', 'width'),
        ('y', 'height'),
        ('height', 'height'),
    )
    for box_key, image_dim in scale_by:
        annotation[box_key] /= annotation_meta[image_dim]
    return annotation
53
54
# Command-line interface of the script: where to read the md.ai export from,
# where to write the result, and which of the two output modes to use.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--annotation_json_path',
    default='/Mounts/rbg-storage1/datasets/NLST/mdai/mdai_mit_project_poBGbqle_annotations_labelgroup_all_2020-11-25-030811.json',
    type=str,
    help='JSON exported from md.ai',
)
parser.add_argument(
    '--output_json_path',
    type=str,
    help='Where to export parsed annotations data',
)
parser.add_argument(
    '--output_is_dataset_obj',
    default=False,
    action='store_true',
    help='Whether incorporating annotations into dataset json directly',
)
58
59
def load_json(path):
    """Read and return the JSON document stored at ``path``."""
    with open(path, 'r') as json_file:
        return json.load(json_file)


def dump_json(obj, path):
    """Serialize ``obj`` as JSON to ``path``, overwriting any existing file."""
    with open(path, 'w') as json_file:
        json.dump(obj, json_file)


def collect_annotations(annotation_metadata_json, include_after):
    """
    Group the annotations of an md.ai export by series and image.

    Args:
        annotation_metadata_json: parsed md.ai export; its 'users' and
            'datasets' entries are read.
        include_after: datetime cut-off; only annotations whose 'createdAt'
            is strictly later are kept.

    Returns:
        dict mapping SeriesInstanceUID -> {SOPInstanceUID -> list of
        annotation 'data' dicts}. Each data dict is scaled by
        scale_annotations and gets a 'user' key holding the name of the user
        that created it. The data dicts of the input are modified in place.

    Raises:
        KeyError: if an annotation's 'createdById' is not listed in 'users'.
    """
    users = {user['id']: user['name'] for user in annotation_metadata_json['users']}
    annotation_dict = {}

    for dataset_dict in annotation_metadata_json['datasets']:
        for annotation_meta_dict in tqdm(dataset_dict['annotations']):
            data = annotation_meta_dict['data']
            if data is None:
                # Nothing to scale or export for this annotation.
                continue
            created_at = datetime.strptime(annotation_meta_dict['createdAt'], '%Y-%m-%dT%H:%M:%S.%fZ')
            if created_at <= include_after:
                continue

            data = scale_annotations(data, annotation_meta_dict)
            data['user'] = users[annotation_meta_dict['createdById']]
            annotation_meta_dict['data'] = data

            series_annotations = annotation_dict.setdefault(
                annotation_meta_dict['SeriesInstanceUID'], defaultdict(list)
            )
            series_annotations[annotation_meta_dict['SOPInstanceUID']].append(data)

    return annotation_dict


def keep_only_flagged_reviewer(annotation_dict, comments, flag='FF', reviewer='fintelmann'):
    """
    For series whose comment contains ``flag``, keep only ``reviewer``'s annotations.

    Args:
        annotation_dict: series id -> {image id -> list of annotation dicts},
            each annotation dict carrying a 'user' string. Modified in place.
        comments: table with 'series_uid' and 'comments for Peter' columns
            (the script passes the ANNOT_COMMENTS DataFrame).
        flag: substring that marks a series in the comment column.
            NOTE(review): 'FF' presumably refers to the reviewer's initials - confirm.
        reviewer: substring of the user name whose annotations are kept.

    Returns:
        The same ``annotation_dict`` object.
    """
    # Test every cell value itself rather than the printed form of a pandas
    # Series: the printed form truncates long cells, so a flag that appears
    # late in a long comment would be missed. Building the set once also
    # avoids rescanning the whole table for every series.
    flagged_series = {
        series_uid
        for series_uid, comment in zip(comments['series_uid'], comments['comments for Peter'])
        if flag in str(comment)
    }

    for series_id, image_annotations in annotation_dict.items():
        if series_id not in flagged_series:
            continue
        for img_id in image_annotations:
            image_annotations[img_id] = [
                annotation for annotation in image_annotations[img_id]
                if reviewer in annotation['user']
            ]
    return annotation_dict


def attach_annotations(dataset_rows, annotation_dict):
    """
    Store each annotated series' annotations under its 'annotations' key.

    Args:
        dataset_rows: iterable of dataset rows; every row has an 'accessions'
            list whose items hold an 'image_series' dict keyed by series id.
            The series dicts are modified in place.
        annotation_dict: series id -> annotations for that series. Series
            that are not in this dict are left untouched.
    """
    for mrn_row in dataset_rows:
        for exam_dict in mrn_row['accessions']:
            for series_id, series_dict in exam_dict['image_series'].items():
                if series_id in annotation_dict:
                    series_dict['annotations'] = annotation_dict[series_id]


if __name__ == "__main__":
    args = parser.parse_args()
    if args.output_json_path is None:
        # Fail before doing any work instead of crashing at the final write.
        parser.error('--output_json_path is required')

    annotation_dict = collect_annotations(load_json(args.annotation_json_path), INCLUDE_AFTER)
    keep_only_flagged_reviewer(annotation_dict, ANNOT_COMMENTS)

    if not args.output_is_dataset_obj:
        dump_json(annotation_dict, args.output_json_path)
    else:
        # In this mode the output path is an existing dataset JSON that is
        # read, updated and written back to the same location.
        output_json = load_json(args.output_json_path)
        attach_annotations(tqdm(output_json), annotation_dict)
        dump_json(output_json, args.output_json_path)