[a9a70b]: / examples / irhythm / build_datasets.py

Download this file

152 lines (129 with data), 4.9 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
from __future__ import division
from __future__ import print_function
import argparse
import fnmatch
import glob
import json
import numpy as np
import os
import random
import tqdm
# Granularity of the label grid: episode onsets are snapped to this step and
# one label is emitted per STEP units of episode length.
# NOTE(review): units are presumably ECG samples -- confirm against the data.
STEP = 256

# Rhythm names that are folded into a coarser class before labelling; any
# rhythm name not listed here is kept unchanged.
RELABEL = {
    "NSR": "SINUS",
    "SUDDEN_BRADY": "AVB",
    "AVB_TYPE2": "AVB",
    "AFIB": "AF",
    "AFL": "AF",
}
def get_all_records(path, blacklist=None):
    """Recursively collect the absolute paths of all ``*.ecg`` files.

    Args:
        path: Directory to walk. A missing directory yields no records.
        blacklist: Optional container of patient ids to skip. A file is
            skipped when ``patient_id(filename)`` is in this container.
            Defaults to ``None``, meaning nothing is excluded.

    Returns:
        A list of absolute file paths, in ``os.walk`` order.
    """
    # A None sentinel replaces the former ``set()`` default so that no
    # mutable object is shared between calls.
    excluded = frozenset() if blacklist is None else blacklist

    records = []
    for root, _dirnames, filenames in os.walk(path):
        for filename in fnmatch.filter(filenames, '*.ecg'):
            if patient_id(filename) not in excluded:
                records.append(os.path.abspath(
                    os.path.join(root, filename)))
    return records
def patient_id(record):
    """Return the patient id encoded in a record's file name.

    The id is everything in the base name before the first underscore; if
    there is no underscore the whole base name is returned.
    """
    filename = os.path.basename(record)
    pid, _sep, _rest = filename.partition("_")
    return pid
def round_to_step(n, step):
    """Snap ``n`` to the nearest value of the form ``k * step + 1``.

    The grid is 1-based (1, step + 1, 2 * step + 1, ...). A value exactly
    half a step past a grid point is rounded up to the next grid point.
    """
    past_grid = (n - 1) % step
    if past_grid >= (step / 2):
        # Closer to (or tied with) the next grid point: round up.
        return n + (step - past_grid)
    return n - past_grid
def load_episodes(record, epi_ext):
    """Load, sort and normalise the rhythm episodes annotated for a record.

    The annotation file is located by replacing the record's extension with
    ``epi_ext`` (which may contain glob wildcards) and taking the first
    match. Each returned episode dict gains:

    * ``onset_round``: its ``onset`` snapped to the ``STEP`` grid;
    * ``offset_round``: one less than the next episode's ``onset_round``,
      or its own raw ``offset`` for the last episode;
    * ``rhythm_name`` rewritten through ``RELABEL`` where a mapping exists.

    Args:
        record: Path of the record file.
        epi_ext: Suffix (or glob pattern) of the episodes JSON file.

    Returns:
        The list of episode dicts sorted by ``onset``.

    Raises:
        IndexError: If no file matches the episodes pattern.
    """
    pattern = os.path.splitext(record)[0] + epi_ext
    json_path = glob.glob(pattern)[0]
    with open(json_path, 'r') as fid:
        loaded = json.load(fid)['episodes']
    episodes = sorted(loaded, key=lambda ep: ep['onset'])

    # First pass: every onset must be rounded before offsets are derived,
    # because each offset depends on the *next* episode's rounded onset.
    for ep in episodes:
        ep['onset_round'] = round_to_step(ep['onset'], STEP)
        name = ep['rhythm_name']
        ep['rhythm_name'] = RELABEL.get(name, name)

    # Second pass: an episode ends right before its successor begins.
    for current, following in zip(episodes, episodes[1:]):
        current['offset_round'] = following['onset_round'] - 1
    if episodes:
        # The final episode has no successor; keep its unrounded offset.
        episodes[-1]['offset_round'] = episodes[-1]['offset']
    return episodes
def make_labels(episodes):
    """Expand episodes into a flat list with one rhythm label per STEP.

    Each episode contributes ``int(length / STEP)`` copies of its
    ``rhythm_name``, where ``length`` spans ``onset_round`` through
    ``offset_round`` inclusive. The result is then truncated to
    ``int(last_offset / STEP)`` labels, using the last episode's raw
    ``offset``.

    Args:
        episodes: Episode dicts carrying ``onset_round``, ``offset_round``,
            ``rhythm_name`` and ``offset`` keys, ordered by onset.

    Returns:
        A list of rhythm-name labels. An empty ``episodes`` list yields an
        empty list, so the caller's own zero-label check can report the
        problem instead of an ``IndexError`` being raised here.
    """
    if not episodes:
        # Previously ``episodes[-1]`` below raised IndexError on empty input.
        return []

    labels = []
    for episode in episodes:
        rhythm_len = episode['offset_round'] - episode['onset_round'] + 1
        rhythm_labels = int(rhythm_len / STEP)
        labels.extend([episode['rhythm_name']] * rhythm_labels)

    # Never emit more labels than the last raw offset allows.
    trunc_samp = int(episodes[-1]['offset'] / STEP)
    return labels[:trunc_samp]
def build_blacklist(blacklist_paths):
    """Collect the patient ids of every record under the given directories.

    Prints a header line and then each directory as it is scanned.

    Args:
        blacklist_paths: Iterable of directories to scan for records.

    Returns:
        A set of patient ids found under those directories.
    """
    print('Building blacklist...')
    excluded_ids = set()
    for directory in blacklist_paths:
        print(directory)
        excluded_ids.update(
            patient_id(record) for record in get_all_records(directory))
    return excluded_ids
def construct_dataset(records, epi_ext='.episodes.json'):
    """Pair every record with its label sequence.

    Args:
        records: Iterable of record paths; progress is shown with tqdm.
        epi_ext: Suffix (or glob pattern) of each record's episodes file.

    Returns:
        A list of ``(record, labels)`` tuples in input order.

    Raises:
        AssertionError: If a record produces no labels.
    """
    dataset = []
    for record in tqdm.tqdm(records):
        episodes = load_episodes(record, epi_ext)
        labels = make_labels(episodes)
        assert len(labels) > 0, "Zero labels?"
        dataset.append((record, labels))
    return dataset
def stratify(records, dev_frac):
    """Randomly split records into train and dev sets at patient level.

    All records of one patient land in the same split, so no patient
    appears in both. Despite the name, no class stratification is done.

    Args:
        records: Sequence of record paths.
        dev_frac: Fraction of distinct patients assigned to the dev set;
            the patient count is truncated with ``int``.

    Returns:
        A ``(train, dev)`` tuple of record lists, each in input order.
    """
    # Resolve each record's patient id once instead of once per split.
    record_pids = [patient_id(record) for record in records]

    # Sort before shuffling: set iteration order depends on string-hash
    # randomisation, which would make the split differ between processes
    # even when the ``random`` module has been seeded.
    pids = sorted(set(record_pids))
    random.shuffle(pids)

    cut = int(len(pids) * dev_frac)
    dev_pids = set(pids[:cut])

    train, dev = [], []
    for record, pid in zip(records, record_pids):
        (dev if pid in dev_pids else train).append(record)
    return train, dev
def load_train(data_path, dev_frac, blacklist_paths):
    """Build the train and dev datasets from the records under data_path.

    Records whose patient id occurs under any blacklist directory are
    dropped before the patient-level train/dev split.

    Args:
        data_path: Root directory of the candidate records.
        dev_frac: Fraction of patients assigned to the dev set.
        blacklist_paths: Directories whose patients must be excluded.

    Returns:
        A ``(train, dev)`` tuple of ``(record, labels)`` lists.
    """
    excluded = build_blacklist(blacklist_paths)
    usable_records = get_all_records(data_path, excluded)
    train_records, dev_records = stratify(usable_records, dev_frac)

    print("Constructing train...")
    train_set = construct_dataset(train_records)
    print("Constructing dev...")
    dev_set = construct_dataset(dev_records)
    return train_set, dev_set
def load_rev_id(record, epi_ext):
    """Return the ``reviewer_id`` stored in a record's episodes JSON file.

    The file is located by replacing the record's extension with
    ``epi_ext`` (which may contain glob wildcards); the first match wins.

    Raises:
        IndexError: If no file matches the episodes pattern.
    """
    stem, _ext = os.path.splitext(record)
    matches = glob.glob(stem + epi_ext)
    with open(matches[0], 'r') as fid:
        payload = json.load(fid)
    return payload['reviewer_id']
def load_test(data_path, epi_ext):
    """Build a test dataset that also carries each record's reviewer id.

    Args:
        data_path: Root directory of the test records.
        epi_ext: Suffix (or glob pattern) of each record's episodes file.

    Returns:
        A list of ``(record, labels, reviewer_id)`` tuples.
    """
    records = get_all_records(data_path)
    print("Constructing test...")
    labelled = construct_dataset(records, epi_ext)

    # Look up who reviewed each record, then attach it to the matching
    # (record, labels) pair; both lists follow the order of ``records``.
    reviewer_ids = [load_rev_id(record, epi_ext) for record in records]
    return [(ecg, labels, reviewer)
            for (ecg, labels), reviewer in zip(labelled, reviewer_ids)]
def make_json(save_path, dataset):
    """Write a dataset to ``save_path`` as one JSON object per line.

    Args:
        save_path: Output file path; an existing file is overwritten.
        dataset: Iterable of ``(ecg, labels)`` or
            ``(ecg, labels, reviewer)`` tuples.
    """
    with open(save_path, 'w') as fid:
        for entry in dataset:
            datum = {'ecg': entry[0], 'labels': entry[1]}
            # Only three-element entries carry a reviewer id.
            if len(entry) == 3:
                datum['reviewer'] = entry[2]
            fid.write(json.dumps(datum) + '\n')
if __name__ == "__main__":
    # Root of the dataset on the shared filesystem.
    data_dir = "/deep/group/med/irhythm/ecg/clean_30sec_recs/"

    # Patients found under any of these directories are kept out of
    # train/dev.
    blacklist_paths = [
        os.path.join(data_dir, subdir)
        for subdir in ("label_review/CARDIOL_MAY_2017/",
                       "batches/kids_blacklist",
                       "batches/vf_blacklist")
    ]

    data_path = os.path.join(data_dir, "batches")
    dev_frac = 0.1

    # Train / dev: patient-level split of the batch records.
    train, dev = load_train(data_path, dev_frac, blacklist_paths)
    make_json("train.json", train)
    make_json("dev.json", dev)

    # Test set built from the "_grp*" episode files.
    test_dir = os.path.join(data_dir, "label_review/CARDIOL_UNIQ_P/")
    test = load_test(test_dir, '_grp*.episodes.json')
    make_json("test.json", test)

    # One additional test file per reviewer index 0..5.
    for i in range(6):
        test = load_test(test_dir, "_rev{}.episodes.json".format(i))
        make_json("test_rev{}.json".format(i), test)