Diff of /code/utils_processing.py [000000] .. [594161]

Switch to unified view

a b/code/utils_processing.py
1
"""
2
DeepSlide
3
Contains all functions for processing.
4
5
Authors: Jason Wei, Behnaz Abdollahi, Saeed Hassanpour
6
"""
7
8
import functools
9
import itertools
10
import math
11
import time
12
from multiprocessing import (Process, Queue, RawArray)
13
from pathlib import Path
14
from shutil import copyfile
15
from typing import (Callable, Dict, List, Tuple)
16
17
import numpy as np
18
from PIL import Image
19
from imageio import (imsave, imread)
20
from skimage.measure import block_reduce
21
22
from utils import (get_all_image_paths, get_image_names, get_image_paths,
23
                   get_subfolder_paths)
24
25
Image.MAX_IMAGE_PIXELS = None
26
27
28
def is_purple(crop: np.ndarray, purple_threshold: int,
29
              purple_scale_size: int) -> bool:
30
    """
31
    Determines if a given portion of an image is purple.
32
33
    Args:
34
        crop: Portion of the image to check for being purple.
35
        purple_threshold: Number of purple points for region to be considered purple.
36
        purple_scale_size: Scalar to use for reducing image to check for purple.
37
38
    Returns:
39
        A boolean representing whether the image is purple or not.
40
    """
41
    block_size = (crop.shape[0] // purple_scale_size,
42
                  crop.shape[1] // purple_scale_size, 1)
43
    pooled = block_reduce(image=crop, block_size=block_size, func=np.average)
44
45
    # Calculate boolean arrays for determining if portion is purple.
46
    r, g, b = pooled[..., 0], pooled[..., 1], pooled[..., 2]
47
    cond1 = r > g - 10
48
    cond2 = b > g - 10
49
    cond3 = ((r + b) / 2) > g + 20
50
51
    # Find the indexes of pooled satisfying all 3 conditions.
52
    pooled = pooled[cond1 & cond2 & cond3]
53
    num_purple = pooled.shape[0]
54
55
    return num_purple > purple_threshold
56
57
58
###########################################
59
#         GENERATING TRAINING DATA        #
60
###########################################
61
62
63
def get_folder_size_and_num_images(folder: Path) -> Tuple[float, int]:
64
    """
65
    Finds the number and size of images in a folder path.
66
    Used to decide how much to slide windows.
67
68
    Args:
69
        folder: Folder containing images.
70
71
    Returns:
72
        A tuple containing the total size of the images and the number of images.
73
    """
74
    image_paths = get_image_paths(folder=folder)
75
76
    file_size = 0
77
    for image_path in image_paths:
78
        file_size += image_path.stat().st_size
79
80
    file_size_mb = file_size / 1e6
81
    return file_size_mb, len(image_paths)
82
83
84
def get_subfolder_to_overlap(subfolders: List[Path],
85
                             desired_crops_per_class: int
86
                             ) -> Dict[Path, float]:
87
    """
88
    Find how much the inverse overlap factor should be for each folder so that
89
    the class distributions are approximately equal.
90
91
    Args:
92
        subfolders: Subfolders to calculate the overlap factors for.
93
        desired_crops_per_class: Desired number of patches per class.
94
95
    Returns:
96
        A dictionary mapping subfolder paths to inverse overlap factor.
97
    """
98
    subfolder_to_overlap_factor = {}
99
    for subfolder in subfolders:
100
        subfolder_size, subfolder_num_images = get_folder_size_and_num_images(
101
            folder=subfolder)
102
103
        # Each image is 13KB = 0.013MB, idk I just added two randomly.
104
        overlap_factor = max(
105
            1.0,
106
            math.pow(
107
                math.sqrt(desired_crops_per_class / (subfolder_size / 0.013)),
108
                1.5))
109
        subfolder_to_overlap_factor[subfolder] = overlap_factor
110
        print(f"{subfolder}: {subfolder_size}MB, "
111
              f"{subfolder_num_images} images, "
112
              f"overlap_factor={overlap_factor:.2f}")
113
114
    return subfolder_to_overlap_factor
115
116
117
def gen_train_patches(input_folder: Path, output_folder: Path,
118
                      num_train_per_class: int, num_workers: int,
119
                      patch_size: int, purple_threshold: int,
120
                      purple_scale_size: int, image_ext: str,
121
                      type_histopath: bool) -> None:
122
    """
123
    Generates all patches for subfolders in the training set.
124
125
    Args:
126
        input_folder: Folder containing the subfolders containing WSI.
127
        output_folder: Folder to save the patches to.
128
        num_train_per_class: The desired number of training patches per class.
129
        num_workers: Number of workers to use for IO.
130
        patch_size: Size of the patches extracted from the WSI.
131
        purple_threshold: Number of purple points for region to be considered purple.
132
        purple_scale_size: Scalar to use for reducing image to check for purple.
133
        image_ext: Image extension for saving patches.
134
        type_histopath: Only look for purple histopathology images and filter whitespace.
135
    """
136
    # Find the subfolders and how much patches should overlap for each.
137
    subfolders = get_subfolder_paths(folder=input_folder)
138
    print(f"{subfolders} subfolders found from {input_folder}")
139
    subfolder_to_overlap_factor = get_subfolder_to_overlap(
140
        subfolders=subfolders, desired_crops_per_class=num_train_per_class)
141
142
    # Produce the patches.
143
    for input_subfolder in subfolders:
144
        produce_patches(input_folder=input_subfolder,
145
                        output_folder=output_folder.joinpath(
146
                            input_subfolder.name),
147
                        inverse_overlap_factor=subfolder_to_overlap_factor[
148
                            input_subfolder],
149
                        by_folder=False,
150
                        num_workers=num_workers,
151
                        patch_size=patch_size,
152
                        purple_threshold=purple_threshold,
153
                        purple_scale_size=purple_scale_size,
154
                        image_ext=image_ext,
155
                        type_histopath=type_histopath)
156
157
    print("\nfinished all folders\n")
158
159
160
def gen_val_patches(input_folder: Path, output_folder: Path,
161
                    overlap_factor: float, num_workers: int, patch_size: int,
162
                    purple_threshold: int, purple_scale_size: int,
163
                    image_ext: str, type_histopath: bool) -> None:
164
    """
165
    Generates all patches for subfolders in the validation set.
166
167
    Args:
168
        input_folder: Folder containing the subfolders containing WSI.
169
        output_folder: Folder to save the patches to.
170
        overlap_factor: The amount of overlap between patches.
171
        num_workers: Number of workers to use for IO.
172
        patch_size: Size of the patches extracted from the WSI.
173
        purple_threshold: Number of purple points for region to be considered purple.
174
        purple_scale_size: Scalar to use for reducing image to check for purple.
175
        image_ext: Image extension for saving patches.
176
        type_histopath: Only look for purple histopathology images and filter whitespace.
177
    """
178
    # Find the subfolders and how much patches should overlap for each.
179
    subfolders = get_subfolder_paths(folder=input_folder)
180
    print(f"{len(subfolders)} subfolders found from {input_folder}")
181
182
    # Produce the patches.
183
    for input_subfolder in subfolders:
184
        produce_patches(input_folder=input_subfolder,
185
                        output_folder=output_folder.joinpath(
186
                            input_subfolder.name),
187
                        inverse_overlap_factor=overlap_factor,
188
                        by_folder=False,
189
                        num_workers=num_workers,
190
                        patch_size=patch_size,
191
                        purple_threshold=purple_threshold,
192
                        purple_scale_size=purple_scale_size,
193
                        image_ext=image_ext,
194
                        type_histopath=type_histopath)
195
196
    print("\nfinished all folders\n")
197
198
199
###########################################
200
#       BALANCING CLASS DISTRIBUTION      #
201
###########################################
202
203
204
def duplicate_until_n(image_paths: List[Path], n: int) -> None:
205
    """
206
    Duplicate the underrepresented classes to balance class distributions.
207
208
    Args:
209
        image_paths: Image paths to check for balance.
210
        n: Desired number of images.
211
    """
212
    num_dupls = n - len(image_paths)
213
214
    print(f"balancing {image_paths[0].parent} by duplicating {num_dupls}")
215
216
    for i in range(num_dupls):
217
        image_path = image_paths[i % len(image_paths)]
218
219
        xys = image_path.name.split("_")
220
        x = xys[:-2]
221
        y = xys[-2:]
222
223
        copyfile(src=image_path,
224
                 dst=Path(
225
                     image_path.parent, f"{'_'.join(x)}dup"
226
                     f"{(i // len(image_paths)) + 2}_"
227
                     f"{'_'.join(y)}"))
228
229
230
def balance_classes(training_folder: Path) -> None:
231
    """
232
    Balancing class distribution so that training isn't skewed.
233
234
    Args:
235
        training_folder: Folder containing the subfolders to be balanced.
236
    """
237
    subfolders = get_subfolder_paths(folder=training_folder)
238
    subfolder_to_images = {
239
        subfolder: get_image_paths(folder=subfolder)
240
        for subfolder in subfolders
241
    }
242
243
    # Find the class with the most images.
244
    biggest_size = max({
245
        subfolder: len(subfolder_to_images[subfolder])
246
        for subfolder in subfolders
247
    }.values())
248
249
    for subfolder in subfolder_to_images:
250
        duplicate_until_n(image_paths=subfolder_to_images[subfolder],
251
                          n=biggest_size)
252
253
    print(f"balanced all training classes to have {biggest_size} images\n")
254
255
256
def find_patch_mp(func: Callable[[Tuple[int, int]], int], in_queue: Queue,
257
                  out_queue: Queue) -> None:
258
    """
259
    Find the patches from the WSI using multiprocessing.
260
    Helper function to ensure values are sent to each process
261
    correctly.
262
263
    Args:
264
        func: Function to call in multiprocessing.
265
        in_queue: Queue containing input data.
266
        out_queue: Queue to put output in.
267
    """
268
    while True:
269
        xy = in_queue.get()
270
        if xy is None:
271
            break
272
        out_queue.put(obj=func(xy))
273
274
275
def find_patch(xy_start: Tuple[int, int], output_folder: Path,
276
               image: np.ndarray, by_folder: bool, image_loc: Path,
277
               patch_size: int, image_ext: str, type_histopath: bool,
278
               purple_threshold: int, purple_scale_size: int) -> int:
279
    """
280
    Find the patches for a WSI.
281
282
    Args:
283
        output_folder: Folder to save the patches to.
284
        image: WSI to extract patches from.
285
        xy_start: Starting coordinates of the patch.
286
        by_folder: Whether to generate the patches by folder or by image.
287
        image_loc: Location of the image to use for creating output filename.
288
        patch_size: Size of the patches extracted from the WSI.
289
        image_ext: Image extension for saving patches.
290
        type_histopath: Only look for purple histopathology images and filter whitespace.
291
        purple_threshold: Number of purple points for region to be considered purple.
292
        purple_scale_size: Scalar to use for reducing image to check for purple.
293
294
    Returns:
295
        The number 1 if the image was saved successfully and a 0 otherwise.
296
        Used to determine the number of patches produced per WSI.
297
    """
298
    x_start, y_start = xy_start
299
300
    patch = image[x_start:x_start + patch_size, y_start:y_start +
301
                  patch_size, :]
302
    # Sometimes the images are RGBA instead of RGB. Only keep RGB channels.
303
    patch = patch[..., [0, 1, 2]]
304
305
    if by_folder:
306
        output_subsubfolder = output_folder.joinpath(
307
            Path(image_loc.name).with_suffix(""))
308
        output_subsubfolder = output_subsubfolder.joinpath(
309
            output_subsubfolder.name)
310
        output_subsubfolder.mkdir(parents=True, exist_ok=True)
311
        output_path = output_subsubfolder.joinpath(
312
            f"{str(x_start).zfill(5)};{str(y_start).zfill(5)}.{image_ext}")
313
    else:
314
        output_path = output_folder.joinpath(
315
            f"{image_loc.stem}_{x_start}_{y_start}.{image_ext}")
316
317
    if type_histopath:
318
        if is_purple(crop=patch,
319
                     purple_threshold=purple_threshold,
320
                     purple_scale_size=purple_scale_size):
321
            imsave(uri=output_path, im=patch)
322
        else:
323
            return 0
324
    else:
325
        imsave(uri=output_path, im=patch)
326
    return 1
327
328
329
def produce_patches(input_folder: Path, output_folder: Path,
330
                    inverse_overlap_factor: float, by_folder: bool,
331
                    num_workers: int, patch_size: int, purple_threshold: int,
332
                    purple_scale_size: int, image_ext: str,
333
                    type_histopath: bool) -> None:
334
    """
335
    Produce the patches from the WSI in parallel.
336
337
    Args:
338
        input_folder: Folder containing the WSI.
339
        output_folder: Folder to save the patches to.
340
        inverse_overlap_factor: Overlap factor used in patch creation.
341
        by_folder: Whether to generate the patches by folder or by image.
342
        num_workers: Number of workers to use for IO.
343
        patch_size: Size of the patches extracted from the WSI.
344
        purple_threshold: Number of purple points for region to be considered purple.
345
        purple_scale_size: Scalar to use for reducing image to check for purple.
346
        image_ext: Image extension for saving patches.
347
        type_histopath: Only look for purple histopathology images and filter whitespace.
348
    """
349
    output_folder.mkdir(parents=True, exist_ok=True)
350
    image_locs = get_all_image_paths(
351
        master_folder=input_folder) if by_folder else get_image_names(
352
            folder=input_folder)
353
    outputted_patches = 0
354
355
    print(f"\ngetting small crops from {len(image_locs)} "
356
          f"images in {input_folder} "
357
          f"with inverse overlap factor {inverse_overlap_factor:.2f} "
358
          f"outputting in {output_folder}")
359
360
    start_time = time.time()
361
362
    for image_loc in image_locs:
363
        image = imread(
364
            uri=(image_loc if by_folder else input_folder.joinpath(image_loc)))
365
366
        # Sources:
367
        # 1. https://research.wmz.ninja/articles/2018/03/on-sharing-large-arrays-when-using-pythons-multiprocessing.html
368
        # 2. https://stackoverflow.com/questions/33247262/the-corresponding-ctypes-type-of-a-numpy-dtype
369
        # 3. https://stackoverflow.com/questions/7894791/use-numpy-array-in-shared-memory-for-multiprocessing
370
        img = RawArray(
371
            typecode_or_type=np.ctypeslib.as_ctypes_type(dtype=image.dtype),
372
            size_or_initializer=image.size)
373
        img_np = np.frombuffer(buffer=img,
374
                               dtype=image.dtype).reshape(image.shape)
375
        np.copyto(dst=img_np, src=image)
376
377
        # Number of x starting points.
378
        x_steps = int((image.shape[0] - patch_size) / patch_size *
379
                      inverse_overlap_factor) + 1
380
        # Number of y starting points.
381
        y_steps = int((image.shape[1] - patch_size) / patch_size *
382
                      inverse_overlap_factor) + 1
383
        # Step size, same for x and y.
384
        step_size = int(patch_size / inverse_overlap_factor)
385
386
        # Create the queues for passing data back and forth.
387
        in_queue = Queue()
388
        out_queue = Queue(maxsize=-1)
389
390
        # Create the processes for multiprocessing.
391
        processes = [
392
            Process(target=find_patch_mp,
393
                    args=(functools.partial(
394
                        find_patch,
395
                        output_folder=output_folder,
396
                        image=img_np,
397
                        by_folder=by_folder,
398
                        image_loc=image_loc,
399
                        purple_threshold=purple_threshold,
400
                        purple_scale_size=purple_scale_size,
401
                        image_ext=image_ext,
402
                        type_histopath=type_histopath,
403
                        patch_size=patch_size), in_queue, out_queue))
404
            for __ in range(num_workers)
405
        ]
406
        for p in processes:
407
            p.daemon = True
408
            p.start()
409
410
        # Put the (x, y) coordinates in the input queue.
411
        for xy in itertools.product(range(0, x_steps * step_size, step_size),
412
                                    range(0, y_steps * step_size, step_size)):
413
            in_queue.put(obj=xy)
414
415
        # Store num_workers None values so the processes exit when not enough jobs left.
416
        for __ in range(num_workers):
417
            in_queue.put(obj=None)
418
419
        num_patches = sum([out_queue.get() for __ in range(x_steps * y_steps)])
420
421
        # Join the processes as they finish.
422
        for p in processes:
423
            p.join(timeout=1)
424
425
        if by_folder:
426
            print(f"{image_loc}: num outputted windows: {num_patches}")
427
        else:
428
            outputted_patches += num_patches
429
430
    if not by_folder:
431
        print(
432
            f"finished patches from {input_folder} "
433
            f"with inverse overlap factor {inverse_overlap_factor:.2f} in {time.time() - start_time:.2f} seconds "
434
            f"outputting in {output_folder} "
435
            f"for {outputted_patches} patches")