Switch to unified view

a b/notebooks/stratification_checks_v2.py
1
# %%
2
from collections import defaultdict
3
import json
4
import os
5
6
import numpy as np
7
import pandas as pd
8
9
from matplotlib.pyplot import hist
10
from skmultilearn.model_selection import iterative_train_test_split
11
12
# enable lib loading even if not installed as a pip package or in PYTHONPATH
13
# also convenient for relative paths in example config files
14
from pathlib import Path
15
os.chdir(Path(__file__).resolve().parent.parent)
16
17
# %%
18
from adpkd_segmentation.data.data_utils import (  # noqa
19
    display_sample,
20
    get_labeled,
21
    get_y_Path,
22
    make_dcmdicts,
23
    path_2dcm_int16,
24
    path_2label,
25
)
26
from adpkd_segmentation.data.data_utils import (  # noqa
27
    PATIENT,
28
    SEQUENCE,
29
    KIDNEY_PIXELS,
30
    MR,
31
    VOXEL_VOLUME,
32
)
33
34
# Key under which each DICOM attribute dict stores its study-level TKV.
STUDY_TKV = "study_tkv"

# %%
# Gather the labeled DICOM paths in sorted order, then build the per-DICOM
# attribute lookup and the per-patient lookup from them.
dcm_paths = sorted(get_labeled())
dcm2attribs, patient2dcm = make_dcmdicts(tuple(dcm_paths))
# assumes patient2dcm is a dict keyed by patient ID -- TODO confirm
all_patient_IDS = list(patient2dcm)
40
41
42
# TKV checks
43
# %%
44
def TKV_update(dcm2attribs):
    """Total the kidney volume per study and record it on every entry.

    A study is identified by the ``(PATIENT, MR)`` pair of an attribute
    dict.  Each entry contributes ``KIDNEY_PIXELS * VOXEL_VOLUME`` to the
    total of its study.

    Args:
        dcm2attribs: mapping whose values are attribute dicts.  The dicts
            are modified in place: each gains a ``STUDY_TKV`` entry holding
            the total of the study it belongs to.

    Returns:
        ``(studies, dcm2attribs)``: a ``defaultdict`` mapping each study id
        to its summed volume, and the same (now updated) input mapping.
    """

    def study_key(attribs):
        return attribs[PATIENT], attribs[MR]

    studies = defaultdict(int)

    # First pass: accumulate the per-entry volumes into their study.
    for attribs in dcm2attribs.values():
        volume = attribs[KIDNEY_PIXELS] * attribs[VOXEL_VOLUME]
        studies[study_key(attribs)] += volume

    # Second pass: write the finished totals back onto every entry.
    for attribs in dcm2attribs.values():
        attribs[STUDY_TKV] = studies[study_key(attribs)]

    return studies, dcm2attribs
55
56
57
# %%
58
# Attach the per-study TKV to every entry and plot its distribution.
studies, dcm2attribs = TKV_update(dcm2attribs)
hist(studies.values(), bins=40)

# %%
# The same distribution on a natural-log scale.
study_tkvs = list(studies.values())
hist(np.log(study_tkvs), bins=40)

# %%
# Studies whose log-TKV is below 11 are collected as outliers.
outliers = [
    (study, tkv) for study, tkv in studies.items() if np.log(tkv) < 11
]

print(outliers)
72
73
# %%
74
# Collect the slices of the first outlier study for visual inspection.
# Only ``outliers[0]`` is examined.  When no study was flagged the target is
# None, which never equals a (patient, MR) tuple, so the lists simply stay
# empty instead of the cell failing with IndexError.
outlier_study = outliers[0][0] if outliers else None
images = []
labels = []
sequences = set()
for dcm_path, attribs in dcm2attribs.items():
    study_id = (attribs[PATIENT], attribs[MR])
    # Keep slices of that study with a non-zero KIDNEY_PIXELS count.
    if study_id == outlier_study and attribs[KIDNEY_PIXELS] > 0:
        im = path_2dcm_int16(dcm_path)
        label = path_2label(get_y_Path(dcm_path))
        images.append(im)
        labels.append(label)
        sequences.add(attribs[SEQUENCE])

# %%
for im, label in zip(images, labels):
    display_sample((im, label))

# %%
print("Outlier sequence: ", sequences)
92
93
# %%
94
# Patient info
95
96
# One (patient, sequence, MR, study TKV) tuple per distinct combination;
# building a set collapses the repeats coming from individual entries.
patient_info = {
    (attribs[PATIENT], attribs[SEQUENCE], attribs[MR], attribs[STUDY_TKV])
    for attribs in dcm2attribs.values()
}

print(patient_info)
105
106
107
# %%
108
# Tabulate the patient records, indexed by patient ID and sorted on it.
records = list(patient_info)
df = pd.DataFrame.from_records(
    records,
    columns=[PATIENT, SEQUENCE, MR, STUDY_TKV],
    index=PATIENT,
)
df = df.sort_index()

# %%
df.to_csv("./notebooks/patients_2020_09_06.csv")
116
117
# %%
118
# Number of table rows per patient ID.
print(df.index.value_counts())

# %%
# Columns are looked up through the same constants that named them when the
# frame was built, instead of attribute access with hard-coded names, so the
# summaries keep working if a constant's value changes.
print(df[SEQUENCE].value_counts())

# %%
print(df[STUDY_TKV].describe())

print(np.log(df[STUDY_TKV]).describe())
127
128
# %%
129
# Patient IDs, without the outliers.
# Each outlier is ((patient, MR), tkv); only the patient part is needed.
outlier_ids = {study[0] for study, _ in outliers}
print(outlier_ids)
all_ids = [pid for pid in all_patient_IDS if pid not in outlier_ids]
133
134
135
# %%
136
def create_label_arrays(
    patient_info, all_ids, log_tkv_edges=(13.7, 14.2, 14.8)
):
    """Build one multi-hot stratification label vector per patient.

    Each vector has ``2 + len(log_tkv_edges) + 1`` entries (6 with the
    default edges):

    * index 0: the patient has a sequence whose name contains "SSFSE".
    * index 1: the patient has any other sequence.  FIESTA and everything
      else share this slot ("other" used to be a separate label).
    * index 2 onwards: natural-log TKV bins.  Bin ``k`` is selected when
      exactly ``k`` of the edges are ``<= log(tkv)``, so the first bin is
      everything below the first edge and the last bin is everything at or
      above the last edge.

    A patient with several records gets every matching entry set.

    Args:
        patient_info: iterable of ``(patient, seq, mr, tkv)`` tuples.
        all_ids: patient IDs to create vectors for.  Records of any other
            patient (e.g. excluded outliers) are ignored.
        log_tkv_edges: ascending bin boundaries on the natural log of TKV.
            The defaults reproduce the previously hard-coded thresholds.

    Returns:
        Dict mapping every ID in ``all_ids`` to a ``np.uint8`` array.
    """
    n_seq_labels = 2
    n_labels = n_seq_labels + len(log_tkv_edges) + 1
    patient_to_label = {
        id_: np.zeros(n_labels, dtype=np.uint8) for id_ in all_ids
    }

    for patient, seq, _mr, tkv in patient_info:
        label = patient_to_label.get(patient)
        if label is None:
            # Patient is not in all_ids (e.g. an outlier): skip the record.
            continue

        # Sequence labeling; the same patient can have more than one.
        label[0 if "SSFSE" in seq else 1] = 1

        # Log-TKV category.
        log_tkv = np.log(tkv)
        if np.isnan(log_tkv):
            # No bin applies to NaN; leave the TKV entries unset.
            continue
        tkv_bin = sum(1 for edge in log_tkv_edges if log_tkv >= edge)
        label[n_seq_labels + tkv_bin] = 1

    return patient_to_label
171
172
173
# %%
174
# Stratification label vectors keyed by patient ID.
patient2label = create_label_arrays(patient_info, all_ids)

# %%
# X holds the patient IDs as a single column; y stacks the label vectors
# in the same patient order.
labels = [patient2label[pid] for pid in all_ids]
X = np.expand_dims(np.array(all_ids), axis=-1)
y = np.stack(labels)

print(y.shape)
print(len(all_ids))
print(X.shape)
184
185
# %%
186
# Split to train, val, test
TRAIN = 0.7
VAL = 0.15
TEST = 0.15
# Seed NumPy's global RNG by calling the function.  Assigning to
# ``np.random.seed`` only rebinds the name to an int and seeds nothing.
# NOTE(review): this makes the split reproducible only if the stratifier
# draws from the global NumPy RNG -- confirm against scikit-multilearn.
np.random.seed(42)

# Carve off the test set first.
X_train_val, y_train_val, X_test, y_test = iterative_train_test_split(
    X, y, test_size=TEST
)

# VAL is a fraction of the full dataset, so rescale it relative to the
# train+val remainder for the second split.
X_train, y_train, X_val, y_val = iterative_train_test_split(
    X_train_val, y_train_val, test_size=VAL / (TRAIN + VAL)
)
199
200
# %%
201
# Number of patients in each split.
print("Sizes: ", *(len(part) for part in (X_train, X_val, X_test)))

# %%
# Label matrices of each split, in train / val / test order.
for split_labels in (y_train, y_val, y_test):
    print(split_labels)

# %%
# Table rows of the patients assigned to the test and validation splits.
for split_ids in (X_test, X_val):
    in_split = df.index.isin(split_ids.squeeze())
    print(df[in_split])
211
212
# %%
213
# Flatten each ID array into a plain Python list.  ravel() keeps a
# one-patient split iterable (squeeze() would collapse it to a 0-d array
# that list() cannot iterate), and tolist() yields native Python values,
# which json can serialize even when the IDs are numeric.
split_dict = {
    "train": X_train.ravel().tolist(),
    "val": X_val.ravel().tolist(),
    "test": X_test.ravel().tolist(),
}

# %%
with open("./stratification/strat_split_2020_09_06.json", "w") as f:
    json.dump(split_dict, f, indent=4)
222
223
# %%