preprocess/prepare-data-2d.py
import os, shutil, sys
from pathlib import Path
import numpy as np
import pandas as pd
from tqdm import tqdm
import re
from PIL import Image
sys.path.append("./")
from preprocessingutils import pwr_transform

from argparse import ArgumentParser

parser = ArgumentParser()
parser.add_argument('--out-dir', default='', help='place to put the slices (leave empty for default)')
parser.add_argument('--in-dir', default='', help='input directory (leave empty for default)')
parser.add_argument('--min-size', default=20, type=int, help='minimal size of nodules in mm')
parser.add_argument('--splits', default='train,valid', help='which splits to create, separated by commas')
parser.add_argument('--manual-seed', default=12345, type=int, help='seed for generating splits')
parser.add_argument('--silent', action='store_false', dest='verbose', help="don't print progress messages")
parser.add_argument('--valid_min', default=1000, type=int, help='cap on the number of validation and test imgs (otherwise a fixed proportion is used)')
parser.add_argument('--no-imgs', action='store_false', dest='copy_imgs', help="don't copy images, only update the dataframe")
parser.add_argument('--out-size', default=70, type=int, help='output size, uses a center crop; if None, no resizing is done')
parser.add_argument('--test', action='store_true', help='process only a few images, for testing the code')
parser.set_defaults(verbose=True, copy_imgs=True, test=False)
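
# Example invocation (run from the preprocess/ directory so the default paths resolve;
# the flags shown just make the defaults explicit):
#   python prepare-data-2d.py --in-dir ../data/nodules2d --out-dir ../data/slices --min-size 20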

def get_center_crop_bbox(in_size, out_size):
    """
    Get the bounding box for a center crop.
    in_size is (width, height), as given by PIL.Image.size.
    """
    center = np.array(in_size) / 2
    left   = int(center[0] - out_size / 2)
    right  = int(left + out_size)
    upper  = int(center[1] - out_size / 2)  # center[1]: vertical offset comes from the height axis
    lower  = int(upper + out_size)

    return (left, upper, right, lower)
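
# sanity check for a hypothetical 180x120 image cropped to 70x70:
# get_center_crop_bbox((180, 120), 70) == (55, 25, 125, 95)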


def main(args):
    # find the location of the resources directory
    resourcedir = Path.cwd().parent / 'resources'

    # load the dataframe with per-nodule annotation data, made in the 'lidc-preprocessing' step
    df_ann = pd.read_csv(resourcedir / "annotation_df.csv")

    # list the source image files
    imgs = os.listdir(os.path.join(args.in_dir, "imgs"))
    imgs = [x for x in imgs if x.endswith(".png")]
    if args.verbose:
        print(f"found {len(imgs)} files")

    # img file names look like 0001n01a2s086.png, i.e. <pid>n<nodule_idx>a<annotation_idx>s<zval>.png
    pids      = [x.split("n")[0] for x in imgs]
    nods      = [re.search(r"(?<=n)\d+", x).group() for x in imgs]
    anns      = [re.search(r"(?<=a)\d", x).group() for x in imgs]
    zvals     = [re.search(r"(?<=s)\d+", x).group() for x in imgs]
    ann_ids   = [x.split("s")[0] for x in imgs]
    nod_ids   = [x.split("a")[0] for x in imgs]
    slice_ids = [x.split(".png")[0] for x in imgs]
    nodule_slice_ids = [f"{nod_id}s{zval}" for nod_id, zval in zip(nod_ids, zvals)]
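    # for the example name 0001n01a2s086.png this yields pid "0001", nodule_idx "01",
    # annotation_idx "2", zval "086", annotation_id "0001n01a2", nodule_id "0001n01",
    # slice_id "0001n01a2s086" and nodule_slice_id "0001n01s086"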

    slice_df = pd.DataFrame({
        'in_name': imgs,
        'pid': pids,
        'nodule_idx': nods,
        "annotation_idx": anns,
        "annotation_id": ann_ids,
        "nodule_id": nod_ids,
        "zval": zvals,
        "slice_id": slice_ids,
        "nodule_slice_id": nodule_slice_ids
    })

    # add the annotation count per nodule, and the max annotation count per patient
    annotation_counts = df_ann.groupby('nodule_id').nodule_id.count().reset_index(name="annotation_count")
    slice_df = pd.merge(slice_df, annotation_counts, on="nodule_id")
    max_annotation_count_pid = slice_df.groupby("pid").annotation_count.max().reset_index(name='max_ann_count_per_pid')
    slice_df = pd.merge(slice_df, max_annotation_count_pid, on="pid")

    # flag slices on which all annotators agree: the number of annotations that include
    # this z-slice equals the patient's max annotation count
    slice_counts = slice_df.groupby(["nodule_id", "zval"]).size().reset_index(name="slice_count")
    slice_df = pd.merge(slice_df, slice_counts, on=["nodule_id", "zval"])
    slice_df["all_anns_agree"] = slice_df.slice_count == slice_df.max_ann_count_per_pid

    slice_df.to_csv(resourcedir / "slice_df.csv", index=False)

    # keep only those slices where all annotators included the slice in their segmentation
    df = slice_df[slice_df.all_anns_agree]

    # import the per-slice measurements
    measurements = pd.read_csv(os.path.join(args.in_dir, "measurements.csv"))
    df = pd.merge(df, measurements, left_on="in_name", right_on="name")

    # keep only slices whose nodule size exceeds the cutoff
    df = df[df["size"] > args.min_size]

    print(f"number of slices left: {len(df)}")

    slices_per_pid    = len(df) / len(df.pid.unique())
    # divide by 4 because only one of the (up to four) annotations gets selected per slice
    slices_per_nodule = (len(df) / len(df.nodule_id.unique())) / 4

    np.random.seed(args.manual_seed)
    VALID_PROP = 0.3
    TEST_PROP  = 0.0

    # split at the nodule level, so slices of the same nodule never land in different splits
    df["uid"] = df.nodule_id
    uids = df['uid'].unique().tolist()

    # split sizes are expressed in uids (nodules): cap the number of validation/test slices
    # at args.valid_min, then convert that back to a nodule count
    valid_size  = int(min(args.valid_min, int(len(uids) * VALID_PROP * slices_per_nodule)) / slices_per_nodule)
    test_size   = int(min(args.valid_min, int(len(uids) * TEST_PROP * slices_per_nodule)) / slices_per_nodule)
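    # e.g. with 2000 uids and slices_per_nodule == 3 (hypothetical numbers):
    # valid_size = int(min(1000, int(2000 * 0.3 * 3)) / 3) = int(1000 / 3) = 333 nodules,
    # which corresponds to roughly 1000 validation slices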

    test_uids  = list(np.random.choice(uids, replace=False, size=test_size))
    valid_uids = list(np.random.choice(list(set(uids) - set(test_uids)), replace=False, size=valid_size))
    train_uids = list(set(uids) - set(valid_uids + test_uids))
    split_dict = dict(zip(train_uids + valid_uids + test_uids,
                          ["train"] * len(train_uids) + ["valid"] * len(valid_uids) + ["test"] * len(test_uids)))

    df["split"] = df.uid.map(split_dict)

    # normalize continuous variables
    cont_vars = ["size", "variance", "min", "max", "mean"]
    train_idxs = np.where(df.uid.isin(train_uids))
    df[cont_vars] = df[cont_vars].apply(pwr_transform, train_ids=train_idxs)
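    # note: pwr_transform comes from preprocessingutils; given the train_ids argument it is
    # assumed to fit the transform on the training rows only and then apply it to all rows,
    # so no statistics leak in from the validation/test splits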

    # average the measurements over annotations, keep one row (and image) per nodule slice
    df = df.groupby("nodule_slice_id").agg({
        'size': 'mean',
        'variance': 'mean',
        "min": 'mean',
        "max": 'mean',
        "mean": 'mean',
        'in_name': 'first',  # one representative annotation's file name and split
        'split': 'first',
    })

    df["name"] = df.in_name.apply(lambda x: os.path.join("imgs", x))

    if args.test:
        df = df.iloc[:10]

    if args.out_size:
        print("resizing and saving images")

        # create the output directories
        for split in args.splits.split(","):
            for subdir in ["imgs", "masks"]:
                os.makedirs(os.path.join(args.out_dir, split, subdir), exist_ok=True)

        # center-crop and save the images and masks
        for slice_id, row in tqdm(df.iterrows(), total=len(df)):
            img = Image.open(os.path.join(args.in_dir, 'imgs', row['in_name']), 'r')
            img_crop = img.crop(get_center_crop_bbox(img.size, args.out_size))
            img_crop.save(os.path.join(args.out_dir, row["split"], "imgs", row["in_name"]))

            mask = Image.open(os.path.join(args.in_dir, 'masks', row['in_name']), 'r')
            mask_crop = mask.crop(get_center_crop_bbox(mask.size, args.out_size))
            mask_crop.save(os.path.join(args.out_dir, row["split"], "masks", row["in_name"]))

    else:
        if args.copy_imgs:
            print("copying images")
            for split in args.splits.split(","):
                for subdir in ["imgs", "masks"]:
                    os.makedirs(os.path.join(args.out_dir, split, subdir), exist_ok=True)

            for slice_id, row in tqdm(df.iterrows(), total=len(df)):
                shutil.copy(os.path.join(args.in_dir, 'imgs', row["in_name"]),
                            os.path.join(args.out_dir, row["split"], "imgs", row["in_name"]))
                shutil.copy(os.path.join(args.in_dir, 'masks', row["in_name"]),
                            os.path.join(args.out_dir, row["split"], 'masks', row["in_name"]))

    df.to_csv(os.path.join(args.out_dir, "labels.csv"), index=False)
    print(df.split.value_counts())


if __name__ == "__main__":
    args = parser.parse_args()
    if args.out_dir == '':
        args.out_dir = Path.cwd().parent / 'data' / 'slices'
    if args.in_dir == '':
        args.in_dir = Path.cwd().parent / 'data' / 'nodules2d'
    main(args)