Diff of /tile_generator.py [000000] .. [bf1564]

Switch to unified view

a b/tile_generator.py
1
# System
2
import json
3
import multiprocessing
4
import os
5
import warnings
6
7
# Advanced
8
import xml.etree.ElementTree as ET
9
from argparse import ArgumentParser
10
from pathlib import Path
11
import pandas as pd
12
import cv2
13
import matplotlib.pyplot as plt
14
15
# Numpy
16
import numpy as np
17
18
# Image Processing
19
from PIL import Image
20
21
# # Fix to get the dlls to load properly under python >= 3.8 and windows
22
script_dir = os.path.dirname(os.path.realpath(__file__))
23
try:
24
    openslide_dll_path = os.path.join(script_dir, "..", "openslide-win64-20171122", "bin")
25
    os.add_dll_directory(openslide_dll_path)
26
    # print(openslide_dll_path)
27
28
except Exception as e:
29
    pass
30
31
# noinspection PyPep8
32
import openslide
33
34
# Custom
35
# noinspection PyPep8
36
import tissue_detection
37
38
_MULTIPROCESS = True
39
40
global lock
41
42
43
class WSIHandler:
44
    def __init__(self, config_path="resources/config.json"):
45
        self.slide = None
46
        self.output_path = None
47
        self.total_width = 0
48
        self.total_height = 0
49
        self.levels = 0
50
        self.current_level = 0
51
        self.annotation_list = None
52
        self.annotation_dict = None
53
        self.config = self.load_config(config_path)
54
        assert "save_annotated_only" in self.config.keys()
55
        self.annotated_only = self.config["save_annotated_only"]
56
        self.scanner = None
57
58
        self.res_x = None
59
        self.res_y = None
60
61
        self.validate_label_dict()
62
63
    def validate_label_dict(self):
64
        self.check_at_most_one_unannotated_label()
65
        self.check_unannotated_label_first()
66
67
    def check_at_most_one_unannotated_label(self):
68
        label_dict = self.config["label_dict"]
69
        unannotated_labels = []
70
        for label, label_config in label_dict.items():
71
            if not label_config["annotated"]:
72
                unannotated_labels.append(label)
73
        assert len(unannotated_labels) < 2, (f"More than one label (=tissue type) is marked as unannotated in the "
74
                                             f"config.label_dict. Please make sure that at most one type (usually "
75
                                             f"non-tumor) is marked as unannotated. The labels in question are "
76
                                             f"{unannotated_labels}.")
77
78
    def check_unannotated_label_first(self):
79
        label_dict = self.config["label_dict"]
80
        for label in list(label_dict)[1:]:
81
            assert label_dict[label]["annotated"], (f"WSIHandler requires the unannotated label to be located in the "
82
                                                    f"first position in config.label_dict. Please move the unannotated "
83
                                                    f"tissue type '{label}' to the first position.")
84
85
    def print_and_log_slide_error(self, slide_name, error_msg, method_name):
86
        print(f"Error in slide {slide_name}. The error is: {type(error_msg).__name__}: {error_msg} in method: "
87
              f"{method_name}.")
88
        with lock:
89
            with open(os.path.join(self.config["output_path"], "error_log.txt"), "a") as f:
90
                f.write(f"Error in slide {slide_name}. The error is: {type(error_msg).__name__}: {error_msg} in "
91
                        f"method: {method_name}.")
92
93
    @staticmethod
94
    def load_config(config_path):
95
        assert os.path.exists(config_path), "Cannot find " + config_path
96
        with open(config_path) as json_file:
97
            config = json.load(json_file)
98
99
        assert 1 >= config["tissue_coverage"] >= 0, "Tissue coverage must be between 1 and 0"
100
        assert config["blocked_threads"] >= 0
101
        assert config["patches_per_tile"] >= 1, "Patches per tile must be >= 1"
102
        assert 0 <= config["overlap"] < 1, "Overlap must be between 1 and 0"
103
        assert config["annotation_overlap"] >= 0 and config["overlap"] < 1, "Annotation overlap must be between 1 and 0"
104
105
        return config
106
107
    def load_slide(self, slide_path):
108
109
        self.slide = openslide.OpenSlide(slide_path)
110
        self.total_width = self.slide.dimensions[0]
111
        self.total_height = self.slide.dimensions[1]
112
        self.levels = self.slide.level_count - 1
113
114
        processing_level = self.config["processing_level"]
115
116
        if self.levels < self.config["processing_level"]:
117
            print("###############################################")
118
            print(
119
                "WARNING: Processing level above highest available slide level. Maximum slide level is "
120
                + str(self.levels)
121
                + ", processing level is "
122
                + str(self.config["processing_level"])
123
                + ". Setting processing level to "
124
                + str(self.levels)
125
            )
126
            print("###############################################")
127
            processing_level = self.levels
128
129
        return processing_level
130
131
    def load_annotation(self, annotation_path):
132
        annotation_dict = {}
133
        file_format = Path(annotation_path).suffix
134
135
        # QuPath exports
136
        if file_format == ".geojson" or file_format == ".txt":
137
            with open(annotation_path) as annotation_file:
138
                annotations = json.load(annotation_file)
139
140
            for polygon_nb in range(len(annotations["features"])):
141
                if annotations["features"][polygon_nb]["geometry"]["type"] == "Polygon":
142
                    if (annotations["features"][polygon_nb]["properties"]["classification"]["name"] in
143
                            self.config["label_dict"].keys()):
144
                        annotation_dict.update({polygon_nb: {
145
                            "coordinates": annotations["features"][polygon_nb]["geometry"]["coordinates"][0],
146
                            "tissue_type": annotations["features"][polygon_nb]["properties"]["classification"][
147
                                "name"]}})
148
                    else:
149
                        warnings.warn(f'Unknown annotation type in file {annotation_file.name}: The annotation label '
150
                                      f'"{annotations["features"][polygon_nb]["properties"]["classification"]["name"]}"'
151
                                      f' is not part of the provided label dictionary '
152
                                      f'(keys: {list(self.config["label_dict"].keys())}. Skipping.')
153
                else:
154
                    warnings.warn(f'Not implemented warning in file {annotation_file.name}: The handling of the QuPath '
155
                                  f'annotation type {annotations["features"][polygon_nb]["geometry"]["type"]} '
156
                                  f'(id:{annotations["features"][polygon_nb]["id"]}) has not been implemented, yet. '
157
                                  f'Skipping.')
158
        # xml for CAMELYON17
159
        elif file_format == ".xml":
160
            tree = ET.parse(annotation_path)
161
            root = tree.getroot()
162
163
            for elem in root:
164
                polygon_nb = 0
165
                for subelem in elem:
166
                    items = subelem.attrib
167
                    if "Type" in items.keys():
168
                        if items["Type"] == "Polygon":
169
                            polygon_list = []
170
                            for coordinates in subelem:
171
                                for coord in coordinates:
172
                                    polygon_list.append([float(coord.attrib["X"]), float(coord.attrib["Y"])])
173
                            # all annotationy in CAMELYON17 are tumor, so this is a pseudo label
174
                            annotation_dict.update({polygon_nb: {"coordinates": polygon_list, "tissue_type": "Tumor"}})
175
                            polygon_nb += 1
176
        else:
177
            return None
178
179
        return annotation_dict
180
181
    def get_img(self, level=None, show=False):
182
        if level is None:
183
            level = self.levels
184
185
        dims = self.slide.level_dimensions[level]
186
        image = np.array(self.slide.read_region((0, 0), level, dims))
187
188
        if show:
189
            # Katja: fix for Wayland issue on my Ubuntu:
190
            # run 'export QT_QPA_PLATFORM=xcb' before opening pycharm (in the same terminal)
191
            plt.imshow(image)
192
            plt.title("Slide image")
193
            plt.show()
194
195
        return image, level
196
197
    def apply_tissue_detection(self, level=None, show=False):
198
199
        if level is not None:
200
            image, level = self.get_img(level, show)
201
        else:
202
            image, level = self.get_img(show=show)
203
204
        tissue_mask = tissue_detection.tissue_detection(image, remove_top_percentage=0)
205
206
        if show:
207
            plt.imshow(tissue_mask)
208
            plt.title("Tissue Mask")
209
            plt.show()
210
211
        return tissue_mask, level
212
213
    def determine_tile_size(self, level):
214
215
        if self.config["calibration"]["use_non_pixel_lengths"]:
216
            tile_size_0 = (self.config["calibration"]["patch_size_microns"] / self.res_x) * self.config[
217
                "patches_per_tile"
218
            ]
219
        else:
220
            tile_size_0 = self.config["patches_per_tile"] * self.config["patch_size"]
221
222
        downscale_factor = int(self.slide.level_downsamples[level])
223
        tile_size = int(tile_size_0 / downscale_factor)
224
225
        assert self.config["patches_per_tile"] >= 1, "Patches per tile must be greater than 1."
226
227
        return tile_size
228
229
    def get_relevant_tiles(self, tissue_mask, tile_size, min_coverage, level, show=False):
230
231
        rows, row_residue = divmod(tissue_mask.shape[0], tile_size)
232
        cols, col_residue = divmod(tissue_mask.shape[1], tile_size)
233
234
        if row_residue:
235
            rows += 1
236
        if col_residue:
237
            cols += 1
238
239
        if self.config["use_tissue_detection"]:
240
            colored = cv2.cvtColor(tissue_mask, cv2.COLOR_GRAY2RGB)
241
242
        if self.annotation_dict is not None:
243
            annotation_mask = np.zeros(shape=(tissue_mask.shape[0], tissue_mask.shape[1]))
244
            scaling_factor = self.slide.level_downsamples[level]
245
            scaled_list = [
246
                [[point[0] / scaling_factor, point[1] / scaling_factor]
247
                 for point in self.annotation_dict[polygon]["coordinates"]]
248
                for polygon in self.annotation_dict
249
            ]
250
251
            for polygon in scaled_list:
252
                cv2.fillPoly(annotation_mask, [np.array(polygon).astype(np.int32)], 1)
253
254
        relevant_tiles_dict = {}
255
        tile_nb = 0
256
257
        # +1 to solve border issues
258
        for row in range(rows):
259
            for col in range(cols):
260
261
                tile = tissue_mask[
262
                       row * tile_size: row * tile_size + tile_size, col * tile_size: col * tile_size + tile_size
263
                       ]
264
                tissue_coverage = np.count_nonzero(tile) / tile.size
265
                annotated = False
266
267
                if self.annotation_dict is not None:
268
                    if (np.count_nonzero(annotation_mask[row * tile_size: row * tile_size + tile_size,
269
                                         col * tile_size: col * tile_size + tile_size, ]) > 0):
270
                        annotated = True
271
272
                if (tissue_coverage >= min_coverage or
273
                        (self.config["keep_annotated_tiles_despite_too_little_tissue_coverage"] and annotated)):
274
                    relevant_tiles_dict.update(
275
                        {
276
                            tile_nb: {
277
                                "x": col * tile_size,
278
                                "y": row * tile_size,
279
                                "size": tile_size,
280
                                "level": level,
281
                                "annotated": annotated,
282
                            }
283
                        }
284
                    )
285
                    if self.config["use_tissue_detection"]:
286
                        if annotated:
287
                            colored = cv2.rectangle(
288
                                colored,
289
                                (col * tile_size, row * tile_size),
290
                                (col * tile_size + tile_size, row * tile_size + tile_size),
291
                                (0, 255, 0),
292
                                3,
293
                            )
294
                        else:
295
                            colored = cv2.rectangle(
296
                                colored,
297
                                (col * tile_size, row * tile_size),
298
                                (col * tile_size + tile_size, row * tile_size + tile_size),
299
                                (255, 0, 0),
300
                                1,
301
                            )
302
303
                    tile_nb += 1
304
305
        if show and self.config["use_tissue_detection"]:
306
            plt.imshow(colored)
307
            plt.title("Tiled image")
308
            plt.show()
309
310
        return relevant_tiles_dict
311
312
    @staticmethod
313
    def tissue_percentage_over_threshold(label, label_dict, percentage):
314
        if label_dict[label]["type"] == "==":
315
            if label_dict[label]["threshold"] == percentage:
316
                return label, percentage
317
        elif label_dict[label]["type"] == ">=":
318
            if percentage >= label_dict[label]["threshold"]:
319
                return label, percentage
320
        elif label_dict[label]["type"] == ">":
321
            if percentage > label_dict[label]["threshold"]:
322
                return label, percentage
323
        elif label_dict[label]["type"] == "<=":
324
            if percentage <= percentage[label]["threshold"]:
325
                return label, percentage
326
        elif label_dict[label]["type"] == "<":
327
            if percentage < label_dict[label]["threshold"]:
328
                return label, percentage
329
330
        return None, None
331
332
    @staticmethod
333
    def check_tissue_percentage_over_threshold(label, label_dict, percentage):
334
        if label_dict[label]["type"] == "==":
335
            if label_dict[label]["threshold"] == percentage:
336
                return True
337
        elif label_dict[label]["type"] == ">=":
338
            if percentage >= label_dict[label]["threshold"]:
339
                return True
340
        elif label_dict[label]["type"] == ">":
341
            if percentage > label_dict[label]["threshold"]:
342
                return True
343
        elif label_dict[label]["type"] == "<=":
344
            if percentage <= percentage[label]["threshold"]:
345
                return True
346
        elif label_dict[label]["type"] == "<":
347
            if percentage < label_dict[label]["threshold"]:
348
                return True
349
        return False
350
351
    @staticmethod
352
    def get_unique_nonzero_entries(ndarray):
353
        return np.unique(ndarray[np.nonzero(ndarray)]).astype(int)
354
355
    def get_possible_labels(self, annotation_mask):
356
        if self.get_unique_nonzero_entries(annotation_mask).size >= 1:
357
            return self.get_unique_nonzero_entries(annotation_mask).tolist()
358
        else:
359
            return [0]  # completely unlabeled patch -> only non-tumor
360
361
    @staticmethod
362
    def is_non_tumor(label_ids):  # non-tumor tissue is unannotated tissue that's left after tissue detection
363
        return label_ids[0] == 0
364
365
    def calculate_label_percentages(self, label_ids, annotation_mask):
366
        if self.is_non_tumor(label_ids):
367
            label_percentages = [(np.count_nonzero(np.max(annotation_mask, axis=(-1)) == 0) /
368
                                  annotation_mask[:, :, 0].size)]
369
        else:
370
            label_percentages = []
371
            for label_id in label_ids:
372
                label_percentages.append((np.count_nonzero(annotation_mask[:, :, label_id] == label_id) /
373
                                          annotation_mask[:, :, label_id].size))
374
        return label_percentages
375
376
    def get_labels_with_enough_tissue_annotated(self, label_dict, annotation_mask):
377
        label_ids = self.get_possible_labels(annotation_mask)
378
        label_percentages = self.calculate_label_percentages(label_ids, annotation_mask)
379
380
        labels_with_enough_tissue_including_non_tumor = []
381
        for (label_id, label_percentage) in zip(label_ids, label_percentages):
382
            label = list(label_dict)[label_id]
383
            if self.check_tissue_percentage_over_threshold(label, label_dict, label_percentage):
384
                labels_with_enough_tissue_including_non_tumor.append(label)
385
386
        return labels_with_enough_tissue_including_non_tumor
387
388
    def update_overlapping_annotations_file(self, slide_name, verbose):
389
        with open(os.path.join(self.config["output_path"],
390
                               "overlapping_annotations_present_in_slides.json"), "r") as file:
391
            overlapping_annotations_present = json.load(file)
392
        if not overlapping_annotations_present[slide_name]:
393
            overlapping_annotations_present[slide_name] = True
394
            with (open(os.path.join(self.config["output_path"],
395
                                    "overlapping_annotations_present_in_slides.json"), "w")
396
                  as file):
397
                json.dump(overlapping_annotations_present, file, indent=4)
398
            if verbose:
399
                print(f"There are overlapping annotations in slide {slide_name}.")
400
401
    @staticmethod
402
    def normalize_to_tile_size_px(point, tile_size_px):
403
        if point < 0:
404
            return 0
405
        elif point >= tile_size_px:
406
            return tile_size_px - 1.0
407
        else:
408
            return point
409
410
    def translate_world_coordinates_to_tile_coordinates(self, point, tile_x, tile_y, tile_size_px):
411
        # the shrinkage of the coordinates to tile size is necessary as cv2.fillPoly only works if the annotation is
412
        # completely within the tile, so I set any points larger than the tile coordinates to the closest (valid)
413
        # tile coordinates
414
        return [self.normalize_to_tile_size_px(point[0] - tile_x, tile_size_px),
415
                self.normalize_to_tile_size_px(point[1] - tile_y, tile_size_px)]
416
417
    def extract_calibrated_patches(
418
            self,
419
            tile_dict,
420
            level,
421
            annotations,
422
            label_dict,
423
            overlap=0,
424
            annotation_overlap=0,
425
            slide_name=None,
426
            output_format="png",
427
    ):
428
429
        scaling_factor = int(self.slide.level_downsamples[level])
430
431
        patch_dict = {}
432
        patch_nb = 0
433
        for tile_key in tile_dict:
434
            tile_x = tile_dict[tile_key]["x"] * scaling_factor
435
            tile_y = tile_dict[tile_key]["y"] * scaling_factor
436
437
            tile_size_px = tile_dict[tile_key]["size"] * scaling_factor
438
439
            patch_size_px_x = int(np.round(self.config["calibration"]["patch_size_microns"] / self.res_x))
440
            patch_size_px_y = int(np.round(self.config["calibration"]["patch_size_microns"] / self.res_y))
441
442
            tile = np.array(self.slide.read_region((tile_x, tile_y), level=0, size=(tile_size_px, tile_size_px)))
443
            tile = tile[:, :, 0:3]
444
445
            if tile_dict[tile_key]["annotated"]:
446
                px_overlap_x = int(patch_size_px_x * annotation_overlap)
447
                px_overlap_y = int(patch_size_px_y * annotation_overlap)
448
449
            else:
450
                px_overlap_x = int(patch_size_px_x * overlap)
451
                px_overlap_y = int(patch_size_px_y * overlap)
452
453
            rows = int(np.ceil(tile_size_px / (patch_size_px_y - px_overlap_y)))
454
            cols = int(np.ceil(tile_size_px / (patch_size_px_x - px_overlap_x)))
455
456
            # create annotation mask
457
            if annotations is not None:
458
                # Translate from world coordinates to tile coordinates
459
                tile_annotation_list = [
460
                    [self.translate_world_coordinates_to_tile_coordinates(point, tile_x, tile_y, tile_size_px)
461
                     for point in annotations[polygon]["coordinates"]] for polygon in annotations]
462
463
                tile_annotation_list = list(zip(tile_annotation_list, [annotations[polygon]["tissue_type"]
464
                                                                       for polygon in annotations]))
465
466
                # Create mask from polygons
467
                tile_annotation_mask = np.zeros(shape=(tile_size_px, tile_size_px, len(self.config["label_dict"])))
468
469
                annotated_tissue_types = {}
470
                tissue_type_number = 1
471
                for tissue_type, tissue_details in label_dict.items():
472
                    if tissue_details["annotated"]:
473
                        annotated_tissue_types.update({tissue_type: tissue_type_number})
474
                        tissue_type_number += 1
475
476
                for polygon in tile_annotation_list:
477
                    # note: the casting to a contiguous array is due to OpenCV requiring C-order (row major) for
478
                    # implementation purposes, compare the answer by vvolhejn here
479
                    # https://stackoverflow.com/questions/23830618/python-opencv-typeerror-layout-of-the-output-array-incompatible-with-cvmat
480
                    # basically: many (all?) copy operations in numpy do this, ascontiguousarray is one of the more
481
                    # verbose ones
482
                    tile_annotation_mask[:, :, annotated_tissue_types[polygon[1]]] = (
483
                        cv2.fillPoly(np.ascontiguousarray(
484
                            tile_annotation_mask[:, :, annotated_tissue_types[polygon[1]]]),
485
                            [np.array(polygon[0]).astype(np.int32)], annotated_tissue_types[polygon[1]]))
486
487
            stop_y = False
488
489
            for row in range(rows):
490
                stop_x = False
491
492
                for col in range(cols):
493
494
                    # Calculate patch coordinates
495
                    patch_x = int(col * (patch_size_px_x - px_overlap_x))
496
                    patch_y = int(row * (patch_size_px_y - px_overlap_y))
497
498
                    if patch_y + patch_size_px_y >= tile_size_px:
499
                        stop_y = True
500
                        patch_y = tile_size_px - patch_size_px_y
501
502
                    if patch_x + patch_size_px_x >= tile_size_px:
503
                        stop_x = True
504
                        patch_x = tile_size_px - patch_size_px_x
505
506
                    global_x = patch_x + tile_x
507
                    global_y = patch_y + tile_y
508
509
                    patch = tile[patch_y: patch_y + patch_size_px_y, patch_x: patch_x + patch_size_px_x, :]
510
511
                    if np.sum(patch) == 0:
512
                        break
513
514
                    # check if the patch is annotated
515
                    annotated = False
516
517
                    if annotations is not None:
518
                        patch_mask = tile_annotation_mask[patch_y: patch_y + patch_size_px_y,
519
                                     patch_x: patch_x + patch_size_px_x, :]
520
                        labels = self.get_labels_with_enough_tissue_annotated(label_dict, patch_mask)
521
                        if labels is not None:
522
                            if len(labels) > 1:
523
                                self.update_overlapping_annotations_file(
524
                                    slide_name, verbose=self.config["overlapping_annotations_verbose"])
525
526
                            for label in labels:
527
                                # this check is done to ensure that non-tumor tissue (unannotated) is handled properly
528
                                if self.config["label_dict"][label]["annotated"]:
529
                                    annotated = True
530
531
                    else:
532
                        labels = "unlabeled"
533
534
                    if labels is not None:
535
                        if self.annotated_only and annotated or not self.annotated_only:
536
537
                            file_name = slide_name + "_" + str(global_x) + "_" + str(global_y) + "." + output_format
538
539
                            if self.config["calibration"]["resize"]:
540
                                patch = cv2.resize(patch, (self.config["patch_size"], self.config["patch_size"]))
541
542
                            patch = Image.fromarray(patch)
543
                            for label in labels:
544
                                patch.save(os.path.join(self.output_path, label, file_name), format=output_format)
545
546
                                patch_dict.update(
547
                                    {
548
                                        patch_nb: {
549
                                            "slide_name": slide_name,
550
                                            "patch_path": os.path.join(label, file_name),
551
                                            "label": label,
552
                                            "x_pos": global_x,
553
                                            "y_pos": global_y,
554
                                            "patch_size": patch_size_px_x,
555
                                            "resized": self.config["calibration"]["resize"],
556
                                        }
557
                                    }
558
                                )
559
                                patch_nb += 1
560
                    if stop_x:
561
                        break
562
                if stop_y:
563
                    break
564
565
        return patch_dict
566
567
    def make_dirs(self, output_path, slide_name, label_dict, annotated):
568
        try:
569
            slide_path = os.path.join(output_path, slide_name)
570
            if not os.path.exists(slide_path):
571
                os.makedirs(slide_path)
572
            if not annotated:
573
                unlabeled_path = os.path.join(slide_path, "unlabeled")
574
                if not os.path.exists(unlabeled_path):
575
                    os.makedirs(unlabeled_path)
576
            else:
577
                for label in label_dict:
578
                    sub_path = os.path.join(slide_path, label)
579
                    if not os.path.exists(sub_path):
580
                        os.makedirs(sub_path)
581
                    for patch in os.listdir(sub_path):
582
                        os.remove(os.path.join(sub_path, patch))
583
            self.output_path = slide_path
584
585
        except Exception as e:
586
            self.print_and_log_slide_error(slide_name, e, "make_dirs")
587
588
    def extract_patches(
589
            self,
590
            tile_dict,
591
            level,
592
            annotations,
593
            label_dict,
594
            overlap=0,
595
            annotation_overlap=0,
596
            patch_size=256,
597
            slide_name=None,
598
            output_format="png",
599
    ):
600
        patch_dict = {}
601
602
        scaling_factor = int(self.slide.level_downsamples[level])
603
        patch_nb = 0
604
605
        for tile_key in tile_dict:
606
            # skip unannotated tiles in case only annotated patches should be saved
607
            if self.annotated_only and not tile_dict[tile_key]["annotated"]:
608
                pass
609
            else:
610
                # ToDo: rows and cols arent calculated correctly, instead a quick fix by using breaks was applied
611
612
                tile_x = tile_dict[tile_key]["x"] * scaling_factor
613
                tile_y = tile_dict[tile_key]["y"] * scaling_factor
614
                tile_size = tile_dict[tile_key]["size"] * scaling_factor
615
                tile = np.array(self.slide.read_region((tile_x, tile_y), level=0, size=(tile_size, tile_size)))
616
                tile = tile[:, :, 0:3]
617
618
                # overlap separately  for annotated and unannotated patches
619
                if tile_dict[tile_key]["annotated"]:
620
                    px_overlap = int(patch_size * annotation_overlap)
621
                    rows = int(np.ceil(tile_size / (patch_size - px_overlap)))
622
                    cols = int(np.ceil(tile_size / (patch_size - px_overlap)))
623
624
                else:
625
                    px_overlap = int(patch_size * overlap)
626
                    rows = int(np.ceil(tile_size / (patch_size - px_overlap)))
627
                    cols = int(np.ceil(tile_size / (patch_size - px_overlap)))
628
629
                # create annotation mask
630
                if annotations is not None:
631
                    # Translate from world coordinates to tile coordinates
632
                    tile_annotation_list = [
633
                        [self.translate_world_coordinates_to_tile_coordinates(point, tile_x, tile_y, tile_size)
634
                         for point in annotations[polygon]["coordinates"]] for polygon in annotations]
635
636
                    tile_annotation_list = list(zip(tile_annotation_list, [annotations[polygon]["tissue_type"]
637
                                                                           for polygon in annotations]))
638
639
                    # Create mask from polygons
640
                    tile_annotation_mask = np.zeros(shape=(tile_size, tile_size, len(self.config["label_dict"])))
641
642
                    annotated_tissue_types = {}
643
                    tissue_type_number = 1
644
                    for tissue_type, tissue_details in label_dict.items():
645
                        if tissue_details["annotated"]:
646
                            annotated_tissue_types.update({tissue_type: tissue_type_number})
647
                            tissue_type_number += 1
648
649
                    for polygon in tile_annotation_list:
650
                        # note: the casting to a contiguous array is due to OpenCV requiring C-order (row major) for
651
                        # implementation purposes, compare the answer by vvolhejn here
652
                        # https://stackoverflow.com/questions/23830618/python-opencv-typeerror-layout-of-the-output-array-incompatible-with-cvmat
653
                        # basically: many (all?) copy operations in numpy do this, ascontiguousarray is one of the more
654
                        # verbose ones
655
                        tile_annotation_mask[:, :, annotated_tissue_types[polygon[1]]] = (
656
                            cv2.fillPoly(np.ascontiguousarray(
657
                                tile_annotation_mask[:, :, annotated_tissue_types[polygon[1]]]),
658
                                [np.array(polygon[0]).astype(np.int32)], annotated_tissue_types[polygon[1]]))
659
660
                stop_y = False
661
662
                for row in range(rows):
663
                    stop_x = False
664
665
                    for col in range(cols):
666
667
                        # Calculate patch coordinates
668
                        patch_x = int(col * (patch_size - px_overlap))
669
                        patch_y = int(row * (patch_size - px_overlap))
670
671
                        if patch_y + patch_size >= tile_size:
672
                            stop_y = True
673
                            patch_y = tile_size - patch_size
674
675
                        if patch_x + patch_size >= tile_size:
676
                            stop_x = True
677
                            patch_x = tile_size - patch_size
678
679
                        global_x = patch_x + tile_x
680
                        global_y = patch_y + tile_y
681
682
                        patch = tile[patch_y: patch_y + patch_size, patch_x: patch_x + patch_size, :]
683
684
                        if np.sum(patch) == 0:
685
                            break
686
687
                        # check if the patch is annotated
688
                        annotated = False
689
                        if annotations is not None:
690
                            patch_mask = tile_annotation_mask[
691
                                         patch_y: patch_y + patch_size, patch_x: patch_x + patch_size
692
                                         ]
693
694
                            labels = self.get_labels_with_enough_tissue_annotated(label_dict, patch_mask)
695
                            if labels is not None:
696
                                if len(labels) > 1:
697
                                    self.update_overlapping_annotations_file(
698
                                        slide_name, verbose=self.config["overlapping_annotations_verbose"])
699
700
                                for label in labels:
701
                                    # this check is done to ensure that non-tumor tissue (unannotated) is handled
702
                                    # properly
703
                                    if self.config["label_dict"][label]["annotated"]:
704
                                        annotated = True
705
706
                        else:
707
                            labels = "unlabeled"
708
709
                        if labels is not None:
710
                            if self.annotated_only and annotated or not self.annotated_only:
711
                                if slide_name is not None:
712
713
                                    file_name = (
714
                                            slide_name + "_" + str(global_x) + "_" + str(global_y) + "." + output_format
715
                                    )
716
                                else:
717
                                    file_name = (
718
                                            str(patch_nb) + "_" + str(global_x) + "_" + str(global_y) + "." +
719
                                            output_format
720
                                    )
721
722
                                patch = Image.fromarray(patch)
723
724
                                for label in labels:
725
                                    patch.save(os.path.join(self.output_path, label, file_name), format=output_format)
726
727
                                    patch_dict.update(
728
                                        {
729
                                            patch_nb: {
730
                                                "slide_name": slide_name,
731
                                                "patch_path": os.path.join(label, file_name),
732
                                                "label": label,
733
                                                "x_pos": global_x,
734
                                                "y_pos": global_y,
735
                                                "patch_size": patch_size,
736
                                            }
737
                                        }
738
                                    )
739
                                    patch_nb += 1
740
                        if stop_x:
741
                            break
742
                    if stop_y:
743
                        break
744
745
        return patch_dict
746
747
    def export_dict(self, dictionary, metadata_format, filename):
748
749
        if metadata_format == "json":
750
            file = os.path.join(self.output_path, filename + ".json")
751
            with open(file, "w") as json_file:
752
                json.dump(dictionary, json_file, indent=4)
753
        elif metadata_format == "csv":
754
            df = pd.DataFrame(dictionary.values())
755
            file = os.path.join(self.output_path, filename + ".csv")
756
            df.to_csv(file, index=False)
757
        else:
758
            print("Could not write metadata. Metadata format has to be json or csv")
759
760
    def save_thumbnail(self, mask, slide_name, level, output_format="png"):
761
762
        remap_color = ((0, 0, 0), (255, 255, 255))
763
764
        process_level = level
765
        img = self.slide.read_region([0, 0], process_level, self.slide.level_dimensions[process_level])
766
767
        # Remove Alpha
768
        img = np.array(img)[:, :, 0:3]
769
770
        if remap_color is not None:
771
            indizes = np.all(img == remap_color[0], axis=2)
772
            img[indizes] = remap_color[1]
773
774
            copy_img = img[mask.astype(bool), :]
775
776
            median_filtered_img = cv2.medianBlur(img, 11)
777
            median_filtered_img[mask.astype(bool)] = copy_img
778
779
            img = median_filtered_img
780
781
        file_name = os.path.join(self.config["output_path"], slide_name, "thumbnail." + output_format)
782
        plt.imsave(file_name, img, format=output_format)
783
784
    def init_generic_tiff(self):
785
786
        unit_dict = {"milimeter": 1000, "centimeter": 10000, "meter": 1000000}
787
        self.scanner = "generic-tiff"
788
789
        assert self.slide.properties["tiff.ResolutionUnit"] in unit_dict.keys(), (
790
                "Unknown unit " + self.slide.properties["tiff.ResolutionUnit"]
791
        )
792
793
        factor = unit_dict[self.slide.properties["tiff.ResolutionUnit"]]
794
795
        # convert to mpp
796
        self.res_x = factor / float(self.slide.properties["tiff.XResolution"])
797
        self.res_y = factor / float(self.slide.properties["tiff.YResolution"])
798
799
    def init_aperio(self):
800
        self.scanner = "aperio"
801
802
        self.res_x = float(self.slide.properties["openslide.mpp-x"])
803
        self.res_y = float(self.slide.properties["openslide.mpp-y"])
804
805
    def init_mirax(self):
806
        self.scanner = "mirax"
807
        self.res_x = float(self.slide.properties["openslide.mpp-x"])
808
        self.res_y = float(self.slide.properties["openslide.mpp-y"])
809
810
    def init_unknown(self):
811
        try:
812
            self.scanner = self.slide.properties["openslide.vendor"]
813
            self.res_x = float(self.slide.properties["openslide.mpp-x"])
814
            self.res_y = float(self.slide.properties["openslide.mpp-y"])
815
        except Exception as e:
816
            print(e)
817
818
    def init_patch_calibration(self):
819
820
        # check scanner type
821
        if self.slide.properties["openslide.vendor"] == "aperio":
822
            self.init_aperio()
823
        elif self.slide.properties["openslide.vendor"] == "generic-tiff":
824
            self.init_generic_tiff()
825
        elif self.slide.properties["openslide.vendor"] == "mirax":
826
            self.init_mirax()
827
        else:
828
            self.init_unknown()
829
        # future vendors
830
        # elif ...
831
832
        assert self.scanner, "Not integrated scanner type, aborting"
833
834
    def process_slide(self, slide):
835
        slide_name = os.path.basename(slide)
836
        slide_name = os.path.splitext(slide_name)[0]
837
838
        print("Processing", slide_name, "process id is", os.getpid())
839
840
        try:
841
            annotation_path = os.path.join(
842
                self.config["annotation_dir"], slide_name + "." + self.config["annotation_file_format"]
843
            )
844
            if os.path.exists(annotation_path):
845
846
                annotated = True
847
                self.annotation_dict = self.load_annotation(annotation_path)
848
            else:
849
                annotated = False
850
                self.annotation_dict = None
851
852
        except Exception as e:
853
            self.print_and_log_slide_error(slide_name, e, "process_slide - load_annotations")
854
            return 0
855
856
        self.make_dirs(
857
            output_path=self.config["output_path"],
858
            slide_name=slide_name,
859
            label_dict=self.config["label_dict"],
860
            annotated=annotated,
861
        )
862
863
        slide_path = os.path.join(self.config["slides_dir"], slide)
864
        try:
865
            level = self.load_slide(slide_path)
866
        except Exception as e:
867
            self.print_and_log_slide_error(slide_name, e, "load_slide")
868
            return 0
869
870
        if self.config["calibration"]["use_non_pixel_lengths"]:
871
            try:
872
                self.init_patch_calibration()
873
            except Exception as e:
874
                self.print_and_log_slide_error(slide_name, e, "init_patch_calibration")
875
                return 0
876
877
        if self.config["use_tissue_detection"]:
878
            mask, level = self.apply_tissue_detection(level=level, show=self.config["show_mode"])
879
        else:
880
            mask = np.ones(shape=self.slide.level_dimensions[level]).transpose()
881
        try:
882
            tile_size = self.determine_tile_size(level)
883
        except Exception as e:
884
            self.print_and_log_slide_error(slide_name, e, "determine_tile_size")
885
            return 0
886
        try:
887
            tile_dict = self.get_relevant_tiles(
888
                mask,
889
                tile_size=tile_size,
890
                min_coverage=self.config["tissue_coverage"],
891
                level=level,
892
                show=self.config["show_mode"],
893
            )
894
        except Exception as e:
895
            self.print_and_log_slide_error(slide_name, e, "get_relevant_tiles")
896
            return 0
897
898
        # Calibrated or non calibrated patch sizes
899
        if self.config["calibration"]["use_non_pixel_lengths"]:
900
            try:
901
                patch_dict = self.extract_calibrated_patches(
902
                    tile_dict,
903
                    level,
904
                    self.annotation_dict,
905
                    self.config["label_dict"],
906
                    overlap=self.config["overlap"],
907
                    annotation_overlap=self.config["annotation_overlap"],
908
                    slide_name=slide_name,
909
                    output_format=self.config["output_format"]
910
                )
911
            except Exception as e:
912
                self.print_and_log_slide_error(slide_name, e, "extract_calibrated_patches")
913
                return 0
914
        else:
915
            try:
916
                patch_dict = self.extract_patches(
917
                    tile_dict,
918
                    level,
919
                    self.annotation_dict,
920
                    self.config["label_dict"],
921
                    overlap=self.config["overlap"],
922
                    annotation_overlap=self.config["annotation_overlap"],
923
                    patch_size=self.config["patch_size"],
924
                    slide_name=slide_name,
925
                    output_format=self.config["output_format"],
926
                )
927
            except Exception as e:
928
                self.print_and_log_slide_error(slide_name, e, "extract_patches")
929
                return 0
930
931
        self.export_dict(patch_dict, self.config["metadata_format"], "tile_information")
932
        try:
933
            self.save_thumbnail(mask, level=level, slide_name=slide_name,
934
                                output_format=self.config["output_format"])
935
            print("Finished slide ", slide_name)
936
937
        except Exception as e:
938
            self.print_and_log_slide_error(slide_name, e, "save_thumbnail")
939
            return 0
940
941
    @staticmethod
942
    def read_slide_file(slide_file_path, ext_list):
943
944
        slide_list = []
945
946
        with open(slide_file_path) as file:
947
            lines = file.read().splitlines()
948
949
        for line in lines:
950
            if os.path.isdir(line):
951
                for ext in ext_list:
952
                    for file in Path(line).resolve().glob("**/*" + ext):
953
                        slide = str(file)
954
            else:
955
                slide = line
956
957
            slide_list.append(slide)
958
959
        return slide_list
960
961
    @staticmethod
962
    def init(l):
963
        global lock
964
        lock = l
965
966
    @staticmethod
967
    def get_slide_name_from_slide_path(slide_path):
968
        return os.path.splitext(os.path.basename(slide_path))[0]
969
970
    def slides2patches(self):
971
972
        l = multiprocessing.Lock()
973
974
        extensions = [".tif", ".svs", ".mrxs"]
975
        slide_list = []
976
977
        if self.config["slides_file"] is not None:
978
            print("Using slide file: " + self.config["slides_file"])
979
            slide_list = self.read_slide_file(self.config["slides_file"], extensions)
980
        else:
981
            for extension in extensions:
982
                for file in Path(self.config["slides_dir"]).resolve().glob("**/*" + extension):
983
                    slide_list.append(file)
984
985
        self.annotation_list = []
986
        if os.path.exists(self.config["annotation_dir"]):
987
            annotation_list = os.listdir(self.config["annotation_dir"])
988
            self.annotation_list = [os.path.splitext(annotation)[0] for annotation in annotation_list]
989
990
        missing_annotations = []
991
        annotated_slides = [
992
            name
993
            if os.path.splitext(os.path.basename(name))[0] in self.annotation_list
994
            else missing_annotations.append(os.path.splitext(os.path.basename(name))[0])
995
            for name in slide_list
996
        ]
997
        annotated_slides = list(filter(lambda slide: True if slide is not None else False, annotated_slides))
998
999
        print("###############################################")
1000
        print("Found", len(annotated_slides), "annotated slides")
1001
        print("###############################################")
1002
        print("Found", len(missing_annotations), "unannotated slides")
1003
        print("###############################################")
1004
        if not self.config["use_tissue_detection"]:
1005
            print("Tissue detection deactivated")
1006
            print("###############################################")
1007
1008
        if self.config["skip_unlabeled_slides"]:
1009
            slide_list = annotated_slides
1010
            print("Processing annotated slides only")
1011
1012
        if not os.path.exists(self.config["output_path"]):
1013
            os.makedirs(self.config["output_path"])
1014
1015
        with open(os.path.join(self.config["output_path"], "error_log.txt"), "w"):
1016
            pass
1017
1018
        if not len(slide_list) == 0:
1019
            slide_list = sorted(slide_list)
1020
1021
            # writing this to file to work somewhat elegantly around multiprocessing without much restructuring of the
1022
            # existing code
1023
            with (open(os.path.join(self.config["output_path"], "overlapping_annotations_present_in_slides.json"), "w")
1024
                  as file):
1025
                json.dump(dict.fromkeys(list(map(self.get_slide_name_from_slide_path, slide_list)), False),
1026
                          file, indent=4)
1027
1028
            if _MULTIPROCESS:
1029
                available_threads = multiprocessing.cpu_count() - self.config["blocked_threads"]
1030
                pool = multiprocessing.Pool(processes=available_threads, initializer=self.init, initargs=(l,))
1031
                pool.map(self.process_slide, slide_list)
1032
1033
            else:
1034
                for slide in slide_list:
1035
                    self.process_slide(slide)
1036
1037
            slide_dict = {}
1038
1039
            if len(annotated_slides) == 0:
1040
                for i in range(len(slide_list)):
1041
                    slide = slide_list[i]
1042
                    slide_name = self.get_slide_name_from_slide_path(slide)
1043
                    slide_dict.update({i: {"slide_name": slide_name,
1044
                                           "slide_path": slide,
1045
                                           }
1046
                                       })
1047
                self.output_path = self.config["output_path"]
1048
                self.export_dict(slide_dict, self.config["metadata_format"], "slide_information")
1049
            else:
1050
                # Save label proportion per slide
1051
                labels = list(self.config["label_dict"].keys())
1052
1053
                with (open(os.path.join(self.config["output_path"],
1054
                                        "overlapping_annotations_present_in_slides.json"), "r") as file):
1055
                    overlapping_annotations_present_in_slide = json.load(file)
1056
                for i in range(len(annotated_slides)):
1057
                    slide = slide_list[i]
1058
                    slide_name = self.get_slide_name_from_slide_path(slide)
1059
                    slide_path = os.path.join(self.config["output_path"], slide_name)
1060
1061
                    n_labeled_tiles = 0
1062
                    n_labels = {}
1063
                    for label in labels:
1064
                        n_label = len(os.listdir(os.path.join(slide_path, label)))
1065
                        n_labels.update({label: n_label})
1066
                        n_labeled_tiles += n_label
1067
1068
                    slide_dict_entry = {}
1069
                    slide_dict_entry.update({"slide_name": slide_name,
1070
                                             "slide_contains_overlapping_annotations":
1071
                                                 overlapping_annotations_present_in_slide[slide_name]})
1072
                    fracs = {}
1073
                    for label, n_label in n_labels.items():
1074
                        slide_dict_entry.update({label: n_label})
1075
                        fracs.update({label: n_label / n_labeled_tiles * 100})
1076
                    slide_dict_entry.update({"total": n_labeled_tiles, "frac": fracs})
1077
                    slide_dict.update({i: slide_dict_entry})
1078
1079
                    self.output_path = self.config["output_path"]
1080
                    self.export_dict(slide_dict, self.config["metadata_format"], "slide_information")
1081
1082
            # Save used config file
1083
            file = os.path.join(self.config["output_path"], "config.json")
1084
            with open(file, "w") as json_file:
1085
                json.dump(self.config, json_file, indent=4)
1086
1087
            print("Finished tiling process!")
1088
1089
        else:
1090
            print("###############################################")
1091
            print("WARNING: No slides processed!")
1092
            print("###############################################")
1093
1094
1095
if __name__ == "__main__":
1096
    parser = ArgumentParser()
1097
    parser.add_argument("--config_path", default=script_dir + "/resources/config.json")
1098
    args = parser.parse_args()
1099
1100
    slide_handler = WSIHandler(config_path=args.config_path)
1101
    slide_handler.slides2patches()