|
a |
|
b/code/utils_processing.py |
|
|
1 |
""" |
|
|
2 |
DeepSlide |
|
|
3 |
Contains all functions for processing. |
|
|
4 |
|
|
|
5 |
Authors: Jason Wei, Behnaz Abdollahi, Saeed Hassanpour |
|
|
6 |
""" |
|
|
7 |
|
|
|
8 |
import functools |
|
|
9 |
import itertools |
|
|
10 |
import math |
|
|
11 |
import time |
|
|
12 |
from multiprocessing import (Process, Queue, RawArray) |
|
|
13 |
from pathlib import Path |
|
|
14 |
from shutil import copyfile |
|
|
15 |
from typing import (Callable, Dict, List, Tuple) |
|
|
16 |
|
|
|
17 |
import numpy as np |
|
|
18 |
from PIL import Image |
|
|
19 |
from imageio import (imsave, imread) |
|
|
20 |
from skimage.measure import block_reduce |
|
|
21 |
|
|
|
22 |
from utils import (get_all_image_paths, get_image_names, get_image_paths, |
|
|
23 |
get_subfolder_paths) |
|
|
24 |
|
|
|
25 |
Image.MAX_IMAGE_PIXELS = None |
|
|
26 |
|
|
|
27 |
|
|
|
28 |
def is_purple(crop: np.ndarray, purple_threshold: int, |
|
|
29 |
purple_scale_size: int) -> bool: |
|
|
30 |
""" |
|
|
31 |
Determines if a given portion of an image is purple. |
|
|
32 |
|
|
|
33 |
Args: |
|
|
34 |
crop: Portion of the image to check for being purple. |
|
|
35 |
purple_threshold: Number of purple points for region to be considered purple. |
|
|
36 |
purple_scale_size: Scalar to use for reducing image to check for purple. |
|
|
37 |
|
|
|
38 |
Returns: |
|
|
39 |
A boolean representing whether the image is purple or not. |
|
|
40 |
""" |
|
|
41 |
block_size = (crop.shape[0] // purple_scale_size, |
|
|
42 |
crop.shape[1] // purple_scale_size, 1) |
|
|
43 |
pooled = block_reduce(image=crop, block_size=block_size, func=np.average) |
|
|
44 |
|
|
|
45 |
# Calculate boolean arrays for determining if portion is purple. |
|
|
46 |
r, g, b = pooled[..., 0], pooled[..., 1], pooled[..., 2] |
|
|
47 |
cond1 = r > g - 10 |
|
|
48 |
cond2 = b > g - 10 |
|
|
49 |
cond3 = ((r + b) / 2) > g + 20 |
|
|
50 |
|
|
|
51 |
# Find the indexes of pooled satisfying all 3 conditions. |
|
|
52 |
pooled = pooled[cond1 & cond2 & cond3] |
|
|
53 |
num_purple = pooled.shape[0] |
|
|
54 |
|
|
|
55 |
return num_purple > purple_threshold |
|
|
56 |
|
|
|
57 |
|
|
|
58 |
########################################### |
|
|
59 |
# GENERATING TRAINING DATA # |
|
|
60 |
########################################### |
|
|
61 |
|
|
|
62 |
|
|
|
63 |
def get_folder_size_and_num_images(folder: Path) -> Tuple[float, int]: |
|
|
64 |
""" |
|
|
65 |
Finds the number and size of images in a folder path. |
|
|
66 |
Used to decide how much to slide windows. |
|
|
67 |
|
|
|
68 |
Args: |
|
|
69 |
folder: Folder containing images. |
|
|
70 |
|
|
|
71 |
Returns: |
|
|
72 |
A tuple containing the total size of the images and the number of images. |
|
|
73 |
""" |
|
|
74 |
image_paths = get_image_paths(folder=folder) |
|
|
75 |
|
|
|
76 |
file_size = 0 |
|
|
77 |
for image_path in image_paths: |
|
|
78 |
file_size += image_path.stat().st_size |
|
|
79 |
|
|
|
80 |
file_size_mb = file_size / 1e6 |
|
|
81 |
return file_size_mb, len(image_paths) |
|
|
82 |
|
|
|
83 |
|
|
|
84 |
def get_subfolder_to_overlap(subfolders: List[Path], |
|
|
85 |
desired_crops_per_class: int |
|
|
86 |
) -> Dict[Path, float]: |
|
|
87 |
""" |
|
|
88 |
Find how much the inverse overlap factor should be for each folder so that |
|
|
89 |
the class distributions are approximately equal. |
|
|
90 |
|
|
|
91 |
Args: |
|
|
92 |
subfolders: Subfolders to calculate the overlap factors for. |
|
|
93 |
desired_crops_per_class: Desired number of patches per class. |
|
|
94 |
|
|
|
95 |
Returns: |
|
|
96 |
A dictionary mapping subfolder paths to inverse overlap factor. |
|
|
97 |
""" |
|
|
98 |
subfolder_to_overlap_factor = {} |
|
|
99 |
for subfolder in subfolders: |
|
|
100 |
subfolder_size, subfolder_num_images = get_folder_size_and_num_images( |
|
|
101 |
folder=subfolder) |
|
|
102 |
|
|
|
103 |
# Each image is 13KB = 0.013MB, idk I just added two randomly. |
|
|
104 |
overlap_factor = max( |
|
|
105 |
1.0, |
|
|
106 |
math.pow( |
|
|
107 |
math.sqrt(desired_crops_per_class / (subfolder_size / 0.013)), |
|
|
108 |
1.5)) |
|
|
109 |
subfolder_to_overlap_factor[subfolder] = overlap_factor |
|
|
110 |
print(f"{subfolder}: {subfolder_size}MB, " |
|
|
111 |
f"{subfolder_num_images} images, " |
|
|
112 |
f"overlap_factor={overlap_factor:.2f}") |
|
|
113 |
|
|
|
114 |
return subfolder_to_overlap_factor |
|
|
115 |
|
|
|
116 |
|
|
|
117 |
def gen_train_patches(input_folder: Path, output_folder: Path, |
|
|
118 |
num_train_per_class: int, num_workers: int, |
|
|
119 |
patch_size: int, purple_threshold: int, |
|
|
120 |
purple_scale_size: int, image_ext: str, |
|
|
121 |
type_histopath: bool) -> None: |
|
|
122 |
""" |
|
|
123 |
Generates all patches for subfolders in the training set. |
|
|
124 |
|
|
|
125 |
Args: |
|
|
126 |
input_folder: Folder containing the subfolders containing WSI. |
|
|
127 |
output_folder: Folder to save the patches to. |
|
|
128 |
num_train_per_class: The desired number of training patches per class. |
|
|
129 |
num_workers: Number of workers to use for IO. |
|
|
130 |
patch_size: Size of the patches extracted from the WSI. |
|
|
131 |
purple_threshold: Number of purple points for region to be considered purple. |
|
|
132 |
purple_scale_size: Scalar to use for reducing image to check for purple. |
|
|
133 |
image_ext: Image extension for saving patches. |
|
|
134 |
type_histopath: Only look for purple histopathology images and filter whitespace. |
|
|
135 |
""" |
|
|
136 |
# Find the subfolders and how much patches should overlap for each. |
|
|
137 |
subfolders = get_subfolder_paths(folder=input_folder) |
|
|
138 |
print(f"{subfolders} subfolders found from {input_folder}") |
|
|
139 |
subfolder_to_overlap_factor = get_subfolder_to_overlap( |
|
|
140 |
subfolders=subfolders, desired_crops_per_class=num_train_per_class) |
|
|
141 |
|
|
|
142 |
# Produce the patches. |
|
|
143 |
for input_subfolder in subfolders: |
|
|
144 |
produce_patches(input_folder=input_subfolder, |
|
|
145 |
output_folder=output_folder.joinpath( |
|
|
146 |
input_subfolder.name), |
|
|
147 |
inverse_overlap_factor=subfolder_to_overlap_factor[ |
|
|
148 |
input_subfolder], |
|
|
149 |
by_folder=False, |
|
|
150 |
num_workers=num_workers, |
|
|
151 |
patch_size=patch_size, |
|
|
152 |
purple_threshold=purple_threshold, |
|
|
153 |
purple_scale_size=purple_scale_size, |
|
|
154 |
image_ext=image_ext, |
|
|
155 |
type_histopath=type_histopath) |
|
|
156 |
|
|
|
157 |
print("\nfinished all folders\n") |
|
|
158 |
|
|
|
159 |
|
|
|
160 |
def gen_val_patches(input_folder: Path, output_folder: Path, |
|
|
161 |
overlap_factor: float, num_workers: int, patch_size: int, |
|
|
162 |
purple_threshold: int, purple_scale_size: int, |
|
|
163 |
image_ext: str, type_histopath: bool) -> None: |
|
|
164 |
""" |
|
|
165 |
Generates all patches for subfolders in the validation set. |
|
|
166 |
|
|
|
167 |
Args: |
|
|
168 |
input_folder: Folder containing the subfolders containing WSI. |
|
|
169 |
output_folder: Folder to save the patches to. |
|
|
170 |
overlap_factor: The amount of overlap between patches. |
|
|
171 |
num_workers: Number of workers to use for IO. |
|
|
172 |
patch_size: Size of the patches extracted from the WSI. |
|
|
173 |
purple_threshold: Number of purple points for region to be considered purple. |
|
|
174 |
purple_scale_size: Scalar to use for reducing image to check for purple. |
|
|
175 |
image_ext: Image extension for saving patches. |
|
|
176 |
type_histopath: Only look for purple histopathology images and filter whitespace. |
|
|
177 |
""" |
|
|
178 |
# Find the subfolders and how much patches should overlap for each. |
|
|
179 |
subfolders = get_subfolder_paths(folder=input_folder) |
|
|
180 |
print(f"{len(subfolders)} subfolders found from {input_folder}") |
|
|
181 |
|
|
|
182 |
# Produce the patches. |
|
|
183 |
for input_subfolder in subfolders: |
|
|
184 |
produce_patches(input_folder=input_subfolder, |
|
|
185 |
output_folder=output_folder.joinpath( |
|
|
186 |
input_subfolder.name), |
|
|
187 |
inverse_overlap_factor=overlap_factor, |
|
|
188 |
by_folder=False, |
|
|
189 |
num_workers=num_workers, |
|
|
190 |
patch_size=patch_size, |
|
|
191 |
purple_threshold=purple_threshold, |
|
|
192 |
purple_scale_size=purple_scale_size, |
|
|
193 |
image_ext=image_ext, |
|
|
194 |
type_histopath=type_histopath) |
|
|
195 |
|
|
|
196 |
print("\nfinished all folders\n") |
|
|
197 |
|
|
|
198 |
|
|
|
199 |
########################################### |
|
|
200 |
# BALANCING CLASS DISTRIBUTION # |
|
|
201 |
########################################### |
|
|
202 |
|
|
|
203 |
|
|
|
204 |
def duplicate_until_n(image_paths: List[Path], n: int) -> None: |
|
|
205 |
""" |
|
|
206 |
Duplicate the underrepresented classes to balance class distributions. |
|
|
207 |
|
|
|
208 |
Args: |
|
|
209 |
image_paths: Image paths to check for balance. |
|
|
210 |
n: Desired number of images. |
|
|
211 |
""" |
|
|
212 |
num_dupls = n - len(image_paths) |
|
|
213 |
|
|
|
214 |
print(f"balancing {image_paths[0].parent} by duplicating {num_dupls}") |
|
|
215 |
|
|
|
216 |
for i in range(num_dupls): |
|
|
217 |
image_path = image_paths[i % len(image_paths)] |
|
|
218 |
|
|
|
219 |
xys = image_path.name.split("_") |
|
|
220 |
x = xys[:-2] |
|
|
221 |
y = xys[-2:] |
|
|
222 |
|
|
|
223 |
copyfile(src=image_path, |
|
|
224 |
dst=Path( |
|
|
225 |
image_path.parent, f"{'_'.join(x)}dup" |
|
|
226 |
f"{(i // len(image_paths)) + 2}_" |
|
|
227 |
f"{'_'.join(y)}")) |
|
|
228 |
|
|
|
229 |
|
|
|
230 |
def balance_classes(training_folder: Path) -> None: |
|
|
231 |
""" |
|
|
232 |
Balancing class distribution so that training isn't skewed. |
|
|
233 |
|
|
|
234 |
Args: |
|
|
235 |
training_folder: Folder containing the subfolders to be balanced. |
|
|
236 |
""" |
|
|
237 |
subfolders = get_subfolder_paths(folder=training_folder) |
|
|
238 |
subfolder_to_images = { |
|
|
239 |
subfolder: get_image_paths(folder=subfolder) |
|
|
240 |
for subfolder in subfolders |
|
|
241 |
} |
|
|
242 |
|
|
|
243 |
# Find the class with the most images. |
|
|
244 |
biggest_size = max({ |
|
|
245 |
subfolder: len(subfolder_to_images[subfolder]) |
|
|
246 |
for subfolder in subfolders |
|
|
247 |
}.values()) |
|
|
248 |
|
|
|
249 |
for subfolder in subfolder_to_images: |
|
|
250 |
duplicate_until_n(image_paths=subfolder_to_images[subfolder], |
|
|
251 |
n=biggest_size) |
|
|
252 |
|
|
|
253 |
print(f"balanced all training classes to have {biggest_size} images\n") |
|
|
254 |
|
|
|
255 |
|
|
|
256 |
def find_patch_mp(func: Callable[[Tuple[int, int]], int], in_queue: Queue, |
|
|
257 |
out_queue: Queue) -> None: |
|
|
258 |
""" |
|
|
259 |
Find the patches from the WSI using multiprocessing. |
|
|
260 |
Helper function to ensure values are sent to each process |
|
|
261 |
correctly. |
|
|
262 |
|
|
|
263 |
Args: |
|
|
264 |
func: Function to call in multiprocessing. |
|
|
265 |
in_queue: Queue containing input data. |
|
|
266 |
out_queue: Queue to put output in. |
|
|
267 |
""" |
|
|
268 |
while True: |
|
|
269 |
xy = in_queue.get() |
|
|
270 |
if xy is None: |
|
|
271 |
break |
|
|
272 |
out_queue.put(obj=func(xy)) |
|
|
273 |
|
|
|
274 |
|
|
|
275 |
def find_patch(xy_start: Tuple[int, int], output_folder: Path, |
|
|
276 |
image: np.ndarray, by_folder: bool, image_loc: Path, |
|
|
277 |
patch_size: int, image_ext: str, type_histopath: bool, |
|
|
278 |
purple_threshold: int, purple_scale_size: int) -> int: |
|
|
279 |
""" |
|
|
280 |
Find the patches for a WSI. |
|
|
281 |
|
|
|
282 |
Args: |
|
|
283 |
output_folder: Folder to save the patches to. |
|
|
284 |
image: WSI to extract patches from. |
|
|
285 |
xy_start: Starting coordinates of the patch. |
|
|
286 |
by_folder: Whether to generate the patches by folder or by image. |
|
|
287 |
image_loc: Location of the image to use for creating output filename. |
|
|
288 |
patch_size: Size of the patches extracted from the WSI. |
|
|
289 |
image_ext: Image extension for saving patches. |
|
|
290 |
type_histopath: Only look for purple histopathology images and filter whitespace. |
|
|
291 |
purple_threshold: Number of purple points for region to be considered purple. |
|
|
292 |
purple_scale_size: Scalar to use for reducing image to check for purple. |
|
|
293 |
|
|
|
294 |
Returns: |
|
|
295 |
The number 1 if the image was saved successfully and a 0 otherwise. |
|
|
296 |
Used to determine the number of patches produced per WSI. |
|
|
297 |
""" |
|
|
298 |
x_start, y_start = xy_start |
|
|
299 |
|
|
|
300 |
patch = image[x_start:x_start + patch_size, y_start:y_start + |
|
|
301 |
patch_size, :] |
|
|
302 |
# Sometimes the images are RGBA instead of RGB. Only keep RGB channels. |
|
|
303 |
patch = patch[..., [0, 1, 2]] |
|
|
304 |
|
|
|
305 |
if by_folder: |
|
|
306 |
output_subsubfolder = output_folder.joinpath( |
|
|
307 |
Path(image_loc.name).with_suffix("")) |
|
|
308 |
output_subsubfolder = output_subsubfolder.joinpath( |
|
|
309 |
output_subsubfolder.name) |
|
|
310 |
output_subsubfolder.mkdir(parents=True, exist_ok=True) |
|
|
311 |
output_path = output_subsubfolder.joinpath( |
|
|
312 |
f"{str(x_start).zfill(5)};{str(y_start).zfill(5)}.{image_ext}") |
|
|
313 |
else: |
|
|
314 |
output_path = output_folder.joinpath( |
|
|
315 |
f"{image_loc.stem}_{x_start}_{y_start}.{image_ext}") |
|
|
316 |
|
|
|
317 |
if type_histopath: |
|
|
318 |
if is_purple(crop=patch, |
|
|
319 |
purple_threshold=purple_threshold, |
|
|
320 |
purple_scale_size=purple_scale_size): |
|
|
321 |
imsave(uri=output_path, im=patch) |
|
|
322 |
else: |
|
|
323 |
return 0 |
|
|
324 |
else: |
|
|
325 |
imsave(uri=output_path, im=patch) |
|
|
326 |
return 1 |
|
|
327 |
|
|
|
328 |
|
|
|
329 |
def produce_patches(input_folder: Path, output_folder: Path, |
|
|
330 |
inverse_overlap_factor: float, by_folder: bool, |
|
|
331 |
num_workers: int, patch_size: int, purple_threshold: int, |
|
|
332 |
purple_scale_size: int, image_ext: str, |
|
|
333 |
type_histopath: bool) -> None: |
|
|
334 |
""" |
|
|
335 |
Produce the patches from the WSI in parallel. |
|
|
336 |
|
|
|
337 |
Args: |
|
|
338 |
input_folder: Folder containing the WSI. |
|
|
339 |
output_folder: Folder to save the patches to. |
|
|
340 |
inverse_overlap_factor: Overlap factor used in patch creation. |
|
|
341 |
by_folder: Whether to generate the patches by folder or by image. |
|
|
342 |
num_workers: Number of workers to use for IO. |
|
|
343 |
patch_size: Size of the patches extracted from the WSI. |
|
|
344 |
purple_threshold: Number of purple points for region to be considered purple. |
|
|
345 |
purple_scale_size: Scalar to use for reducing image to check for purple. |
|
|
346 |
image_ext: Image extension for saving patches. |
|
|
347 |
type_histopath: Only look for purple histopathology images and filter whitespace. |
|
|
348 |
""" |
|
|
349 |
output_folder.mkdir(parents=True, exist_ok=True) |
|
|
350 |
image_locs = get_all_image_paths( |
|
|
351 |
master_folder=input_folder) if by_folder else get_image_names( |
|
|
352 |
folder=input_folder) |
|
|
353 |
outputted_patches = 0 |
|
|
354 |
|
|
|
355 |
print(f"\ngetting small crops from {len(image_locs)} " |
|
|
356 |
f"images in {input_folder} " |
|
|
357 |
f"with inverse overlap factor {inverse_overlap_factor:.2f} " |
|
|
358 |
f"outputting in {output_folder}") |
|
|
359 |
|
|
|
360 |
start_time = time.time() |
|
|
361 |
|
|
|
362 |
for image_loc in image_locs: |
|
|
363 |
image = imread( |
|
|
364 |
uri=(image_loc if by_folder else input_folder.joinpath(image_loc))) |
|
|
365 |
|
|
|
366 |
# Sources: |
|
|
367 |
# 1. https://research.wmz.ninja/articles/2018/03/on-sharing-large-arrays-when-using-pythons-multiprocessing.html |
|
|
368 |
# 2. https://stackoverflow.com/questions/33247262/the-corresponding-ctypes-type-of-a-numpy-dtype |
|
|
369 |
# 3. https://stackoverflow.com/questions/7894791/use-numpy-array-in-shared-memory-for-multiprocessing |
|
|
370 |
img = RawArray( |
|
|
371 |
typecode_or_type=np.ctypeslib.as_ctypes_type(dtype=image.dtype), |
|
|
372 |
size_or_initializer=image.size) |
|
|
373 |
img_np = np.frombuffer(buffer=img, |
|
|
374 |
dtype=image.dtype).reshape(image.shape) |
|
|
375 |
np.copyto(dst=img_np, src=image) |
|
|
376 |
|
|
|
377 |
# Number of x starting points. |
|
|
378 |
x_steps = int((image.shape[0] - patch_size) / patch_size * |
|
|
379 |
inverse_overlap_factor) + 1 |
|
|
380 |
# Number of y starting points. |
|
|
381 |
y_steps = int((image.shape[1] - patch_size) / patch_size * |
|
|
382 |
inverse_overlap_factor) + 1 |
|
|
383 |
# Step size, same for x and y. |
|
|
384 |
step_size = int(patch_size / inverse_overlap_factor) |
|
|
385 |
|
|
|
386 |
# Create the queues for passing data back and forth. |
|
|
387 |
in_queue = Queue() |
|
|
388 |
out_queue = Queue(maxsize=-1) |
|
|
389 |
|
|
|
390 |
# Create the processes for multiprocessing. |
|
|
391 |
processes = [ |
|
|
392 |
Process(target=find_patch_mp, |
|
|
393 |
args=(functools.partial( |
|
|
394 |
find_patch, |
|
|
395 |
output_folder=output_folder, |
|
|
396 |
image=img_np, |
|
|
397 |
by_folder=by_folder, |
|
|
398 |
image_loc=image_loc, |
|
|
399 |
purple_threshold=purple_threshold, |
|
|
400 |
purple_scale_size=purple_scale_size, |
|
|
401 |
image_ext=image_ext, |
|
|
402 |
type_histopath=type_histopath, |
|
|
403 |
patch_size=patch_size), in_queue, out_queue)) |
|
|
404 |
for __ in range(num_workers) |
|
|
405 |
] |
|
|
406 |
for p in processes: |
|
|
407 |
p.daemon = True |
|
|
408 |
p.start() |
|
|
409 |
|
|
|
410 |
# Put the (x, y) coordinates in the input queue. |
|
|
411 |
for xy in itertools.product(range(0, x_steps * step_size, step_size), |
|
|
412 |
range(0, y_steps * step_size, step_size)): |
|
|
413 |
in_queue.put(obj=xy) |
|
|
414 |
|
|
|
415 |
# Store num_workers None values so the processes exit when not enough jobs left. |
|
|
416 |
for __ in range(num_workers): |
|
|
417 |
in_queue.put(obj=None) |
|
|
418 |
|
|
|
419 |
num_patches = sum([out_queue.get() for __ in range(x_steps * y_steps)]) |
|
|
420 |
|
|
|
421 |
# Join the processes as they finish. |
|
|
422 |
for p in processes: |
|
|
423 |
p.join(timeout=1) |
|
|
424 |
|
|
|
425 |
if by_folder: |
|
|
426 |
print(f"{image_loc}: num outputted windows: {num_patches}") |
|
|
427 |
else: |
|
|
428 |
outputted_patches += num_patches |
|
|
429 |
|
|
|
430 |
if not by_folder: |
|
|
431 |
print( |
|
|
432 |
f"finished patches from {input_folder} " |
|
|
433 |
f"with inverse overlap factor {inverse_overlap_factor:.2f} in {time.time() - start_time:.2f} seconds " |
|
|
434 |
f"outputting in {output_folder} " |
|
|
435 |
f"for {outputted_patches} patches") |