preprocess/prepare-data-2d.py

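# Prepare the 2D slice dataset: parse the nodule slice filenames, build a
# slice-level dataframe, keep slices that all annotators segmented, attach and
# normalize the measurements, split nodules into train/valid(/test), and write
# center-cropped (or copied) images plus a labels.csv into per-split directories.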
import os, shutil, sys
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
import re
from PIL import Image
sys.path.append("./")
from preprocessingutils import pwr_transform

from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument('--out-dir', default='', help='place to put the slices (leave empty for default)')
parser.add_argument('--in-dir', default='', help='input directory (leave empty for default)')
parser.add_argument('--min-size', default=20, type=int, help='minimal size of nodules in mm')
parser.add_argument('--splits', default='train,valid', help='which splits to create, separate with ,')
parser.add_argument('--manual-seed', default=12345, type=int, help='seed for generating splits')
parser.add_argument('--silent', action='store_false', dest='verbose', help="don't print stuff")
parser.add_argument('--valid_min', default=1000, type=int, help='minimal number of validation and test imgs, otherwise use the split proportion')
parser.add_argument('--no-imgs', action='store_false', dest='copy_imgs', help="don't copy images, only update the dataframe")
parser.add_argument('--out-size', default=70, type=int, help='out size, uses center crop; if None, no resizing will be done')
parser.add_argument('--test', action='store_true', help='do only a few imgs for testing the code')
parser.set_defaults(verbose=True, copy_imgs=True, test=False)

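# Example invocation (illustrative): leaving --in-dir/--out-dir empty falls back
# to the defaults set under __main__
#   python prepare-data-2d.py --min-size 20 --splits train,valid
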
def get_center_crop_bbox(in_size, out_size):
    """
    get bounding box for center crop
    in_size is (width, height), as used by PIL.Image
    """

    center = np.array(in_size) / 2
    left = int(center[0] - out_size / 2)
    right = int(left + out_size)
    upper = int(center[1] - out_size / 2)
    lower = int(upper + out_size)

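    # PIL's crop box is (left, upper, right, lower); e.g. for a 512x512 input
    # and out_size=70 this works out to (221, 221, 291, 291), a 70x70 box
    # around the image center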
    return (left, upper, right, lower)


def main(args):
    # find location for resources
    resourcedir = Path.cwd().parent / 'resources'

    # load dataframe with annotation data per nodule, made in the step 'lidc-preprocessing'
    df_ann = pd.read_csv(resourcedir / "annotation_df.csv")

    # show source files
    imgs = os.listdir(os.path.join(args.in_dir, "imgs"))
    imgs = [x for x in imgs if x.endswith(".png")]
    if args.verbose:
        print(f"found {len(imgs)} files")

    # img files are like 0001n01a2s086.png
    # imgs = imgs[:10]
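    # e.g. "0001n01a2s086.png" yields pid "0001", nodule_idx "01",
    # annotation_idx "2", zval "086"; the derived ids are annotation_id
    # "0001n01a2", nodule_id "0001n01", slice_id "0001n01a2s086" and
    # nodule_slice_id "0001n01s086"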
    pids = [x.split("n")[0] for x in imgs]
    nods = [re.search(r"(?<=n)\d+", x).group() for x in imgs]
    anns = [re.search(r"(?<=a)\d", x).group() for x in imgs]
    zvals = [re.search(r"(?<=s)\d+", x).group() for x in imgs]
    ann_ids = [x.split("s")[0] for x in imgs]
    nod_ids = [x.split("a")[0] for x in imgs]
    slice_ids = [x.split(".png")[0] for x in imgs]
    nodule_slice_ids = [f"{nod_id}s{zval}" for nod_id, zval in zip(nod_ids, zvals)]

    slice_df = pd.DataFrame({
        'in_name': imgs,
        'pid': pids,
        'nodule_idx': nods,
        "annotation_idx": anns,
        "annotation_id": ann_ids,
        "nodule_id": nod_ids,
        "zval": zvals,
        "slice_id": slice_ids,
        "nodule_slice_id": nodule_slice_ids
    })

    # add max number of annotations per nodule
    annotation_counts = df_ann.groupby('nodule_id').nodule_id.count().reset_index(name="annotation_count")
    slice_df = pd.merge(slice_df, annotation_counts, on="nodule_id")
    max_annotation_count_pid = slice_df.groupby("pid").annotation_count.max().reset_index(name='max_ann_count_per_pid')
    slice_df = pd.merge(slice_df, max_annotation_count_pid, on="pid")

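    # count how many annotation slices exist per (nodule, z-position) and mark a
    # slice as agreed on when that count equals the maximum annotation count
    # seen for the patient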
    slice_counts = slice_df.groupby(["nodule_id", "zval"]).size().reset_index(name="slice_count")
    slice_df = pd.merge(slice_df, slice_counts, on=["nodule_id", "zval"])
    slice_df["all_anns_agree"] = slice_df.slice_count == slice_df.max_ann_count_per_pid

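    # save the full slice-level dataframe to the resources dir before any filtering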
    slice_df.to_csv(resourcedir / "slice_df.csv", index=False)

    # keep only those slices where all annotators included the slice in their segmentation
    df = slice_df[slice_df.all_anns_agree]

    # import measurements
    measurements = pd.read_csv(os.path.join(args.in_dir, "measurements.csv"))
    df = pd.merge(df, measurements, left_on="in_name", right_on="name")

    # keep only slices greater than the cutoff
    df = df[df["size"] > args.min_size]

    print(f"number of slices left: {len(df)}")

    slices_per_pid = len(df) / len(df.pid.unique())
    # divide by 4 because each nodule has up to 4 annotations and only 1 of them gets selected
    slices_per_nodule = (len(df) / len(df.nodule_id.unique())) / 4

    np.random.seed(args.manual_seed)
    VALID_PROP = 0.3
    TEST_PROP = 0.0

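    # split at the nodule level (uid = nodule_id) so that slices of the same
    # nodule never end up in different splits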
df["uid"] = df.nodule_id |
|
|
109 |
# df.set_index("slice_id", drop=False, inplace=True) |
|
|
110 |
uids = df['uid'].unique().tolist() |
|
|
111 |
|
|
|
112 |
# valid_size = int(min(args.valid_min, int(len(uids) * VALID_PROP * slices_per_pid / 4)) / (slices_per_pid / 4)) |
|
|
113 |
# test_size = int(min(args.valid_min, int(len(uids) * TEST_PROP * slices_per_pid / 4)) / (slices_per_pid / 4)) |
|
|
114 |
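    # translate the target number of images (args.valid_min or the split
    # proportion, whichever gives fewer) into a number of nodules, using the
    # average number of slices per nodule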
    valid_size = int(min(args.valid_min, int(len(uids) * VALID_PROP * slices_per_nodule)) / slices_per_nodule)
    test_size = int(min(args.valid_min, int(len(uids) * TEST_PROP * slices_per_nodule)) / slices_per_nodule)

    test_uids = list(np.random.choice(uids, replace=False, size=test_size))
    valid_uids = list(np.random.choice(list(set(uids) - set(test_uids)), replace=False, size=valid_size))
    train_uids = list(set(uids) - set(valid_uids + test_uids))
    split_dict = dict(zip(train_uids + valid_uids + test_uids,
                          ["train"] * len(train_uids) + ["valid"] * len(valid_uids) + ["test"] * len(test_uids)))

    df["split"] = df.uid.map(split_dict)

    # normalize continuous variables
    cont_vars = ["size", "variance", "min", "max", "mean"]
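    # positions of the training rows; passed so the power transform can be fit
    # on the training split only (see preprocessingutils.pwr_transform)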
    train_idxs = np.where(df.uid.isin(train_uids))
    df[cont_vars] = df[cont_vars].apply(pwr_transform, train_ids=train_idxs)

    # average measurements over annotations, pick single slice per measurement
    df = df.groupby("nodule_slice_id").agg({
        'size': 'mean',
        'variance': 'mean',
        "min": 'mean',
        "max": 'mean',
        "mean": 'mean',
        'in_name': 'first',
        'split': 'first',
    })

    df["name"] = df.in_name.apply(lambda x: os.path.join("imgs", x))

    if args.test:
        df = df.iloc[:10,]

    if args.out_size and args.copy_imgs:
print("resizing and saving images") |
|
|
148 |
|
|
|
149 |
# create output directories |
|
|
150 |
for split in args.splits.split(","): |
|
|
151 |
for subdir in ["imgs", "masks"]: |
|
|
152 |
if not os.path.isdir(os.path.join(args.out_dir, split, subdir)): |
|
|
153 |
os.makedirs(os.path.join(args.out_dir, split, subdir)) |
|
|
154 |
|
|
|
155 |
# crop and copy images |
|
|
156 |
for slice_id, row in tqdm(df.iterrows()): |
|
|
157 |
img = Image.open(os.path.join(args.in_dir, 'imgs', row['in_name']), 'r') |
|
|
158 |
img_crop = img.crop(get_center_crop_bbox(img.size, args.out_size)) |
|
|
159 |
img_crop.save(os.path.join(args.out_dir, row["split"], "imgs", row["in_name"])) |
|
|
160 |
|
|
|
161 |
mask = Image.open(os.path.join(args.in_dir, 'masks', row['in_name']), 'r') |
|
|
162 |
mask_crop = mask.crop(get_center_crop_bbox(mask.size, args.out_size)) |
|
|
163 |
mask_crop.save(os.path.join(args.out_dir, row["split"], "masks", row["in_name"])) |
|
|
164 |
|
|
|
165 |
else: |
|
|
166 |
if args.copy_imgs: |
|
|
167 |
print("copying images") |
|
|
168 |
for split in args.splits.split(","): |
|
|
169 |
for subdir in ["imgs", "masks"]: |
|
|
170 |
if not os.path.isdir(os.path.join(args.out_dir, split, subdir)): |
|
|
171 |
os.makedirs(os.path.join(args.out_dir, split, subdir)) |
|
|
172 |
|
|
|
173 |
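            # copy images and masks unchanged into the per-split directories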
            for slice_id, row in tqdm(df.iterrows()):
                shutil.copy(os.path.join(args.in_dir, 'imgs', row["in_name"]),
                            os.path.join(args.out_dir, row["split"], "imgs", row["in_name"]))
                shutil.copy(os.path.join(args.in_dir, 'masks', row["in_name"]),
                            os.path.join(args.out_dir, row["split"], 'masks', row["in_name"]))

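    # write the per-slice labels (averaged measurements, split and image path)
    # next to the split directories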
    df.to_csv(os.path.join(args.out_dir, "labels.csv"), index=False)
    print(df.split.value_counts())


if __name__ == "__main__":
    args = parser.parse_args()
    if args.out_dir == '':
        args.out_dir = (Path.cwd().parent) / 'data' / 'slices'
    if args.in_dir == '':
        args.in_dir = (Path.cwd().parent) / 'data' / 'nodules2d'
    main(args)