a b/code/z_preprocessing/4_anonymize.py
1
import os
2
import random
3
from os import listdir
4
from os.path import isfile, join, isdir
5
6
7
def get_image_paths(folder):
    """Return the paths of the regular files located directly in ``folder``.

    Sub-directories are skipped, and so is every entry whose name contains
    '.DS_Store'. No check on the file type is made beyond that.

    Args:
        folder: path of the directory to scan (not scanned recursively).

    Returns:
        A list of ``join(folder, name)`` paths, in the order ``listdir``
        yields the entries (that order is not sorted here).
    """
    collected = []
    for entry in listdir(folder):
        # macOS Finder metadata files are never wanted
        if '.DS_Store' in entry:
            continue
        candidate = join(folder, entry)
        if isfile(candidate):
            collected.append(candidate)
    return collected
10
11
12
def get_alpha_name(number):
    """Encode an integer as a three-letter lowercase code.

    The code is the base-26 representation of ``number`` using 'a'..'z' as
    digits, left-padded with 'a' to exactly three letters:
    0 -> 'aaa', 25 -> 'aaz', 26 -> 'aba', 17575 -> 'zzz'.

    Args:
        number: integer in the range 0 .. 26**3 - 1 (i.e. 0 .. 17575).

    Returns:
        The three-character code.

    Raises:
        ValueError: if ``number`` is negative or too large for three letters.
    """
    alphas = "abcdefghijklmnopqrstuvwxyz"
    base = len(alphas)

    # Reject out-of-range values explicitly: a negative number would otherwise
    # wrap around and collide with the code of a valid index, and a number
    # >= 26**3 has no three-letter representation.
    if not 0 <= number < base ** 3:
        raise ValueError(
            'number must be between 0 and %d, got %r' % (base ** 3 - 1, number)
        )

    # integer arithmetic only; no detour through floating point
    first, remainder = divmod(number, base * base)
    second, third = divmod(remainder, base)
    return alphas[first] + alphas[second] + alphas[third]
18
19
20
# create an output folder if it does not already exist
def confirm_output_folder(output_folder):
    """Make sure the directory ``output_folder`` exists.

    Missing parent directories are created as well. Calling this on a
    directory that already exists is a no-op.

    Args:
        output_folder: path of the directory to create.

    Raises:
        FileExistsError: if the path exists but is not a directory.
    """
    # exist_ok=True avoids the race between a separate existence check and
    # the creation (another process may create the folder in between).
    os.makedirs(output_folder, exist_ok=True)
24
25
26
# parameters
input_folder = 'wsi_all_3x'   # root folder; its sub-folders are scanned for images
output_folder = 'wsi_3x'      # root used when building the new image paths
train_split = 0.5             # fraction of patients assigned to train
val_split = 0.15              # fraction of patients assigned to val
test_split = 0.35             # NOTE(review): not read by the code in this file;
                              # the test set is whatever remains after train + val

# get all the subfolders (direct children of input_folder that are directories)
all_input_folders = []
for entry in listdir(input_folder):
    subfolder = join(input_folder, entry)
    if isdir(subfolder):
        all_input_folders.append(subfolder)

# get list of all images: the files of every subfolder, concatenated
all_image_paths = [
    path
    for subfolder in all_input_folders
    for path in get_image_paths(subfolder)
]
40
41
# important dictionaries
image_path_to_pid = {}    # image path -> patient id parsed from the file name
pid_to_image_paths = {}   # patient id -> list of that patient's image paths


def _parse_patient_id(image_name):
    """Extract the patient id from an image file name.

    The id starts at the first 'S' in the name and ends just before the last
    ASCII letter that precedes the first '.', e.g. 'S1234a.jpg' -> 'S1234'.
    (Presumably the trailing letter is a per-slide suffix -- confirm against
    the naming scheme of the data.)

    Raises:
        ValueError: if the name contains no 'S' or no '.', or if no letter is
            found between the 'S' and the '.'.
    """
    start = image_name.index('S')
    end = image_name.index('.')
    # walk backwards from the '.' to the closest letter, never past the 'S'
    while end > start and image_name[end].lower() not in 'abcdefghijklmnopqrstuvwxyz':
        end -= 1
    if end <= start:
        # explicit error instead of an assert, which is stripped under -O
        raise ValueError('cannot parse a patient id from %r' % image_name)
    return image_name[start:end]


# fill the dictionaries
for image_path in all_image_paths:
    # basename works with whatever separator os.path.join produced
    image_name = os.path.basename(image_path)

    # parse the image_name for the patient id
    pid = _parse_patient_id(image_name)

    # add to dictionaries
    image_path_to_pid[image_path] = pid
    pid_to_image_paths.setdefault(pid, []).append(image_path)

print(len(image_path_to_pid), 'images from', len(pid_to_image_paths), 'patients')
66
# shuffle the patients with a fixed seed before splitting
patient_list = list(pid_to_image_paths)
random.seed(0)
random.shuffle(patient_list)
num_patients = len(patient_list)

# partition data: split by patient so that all images of one patient end up
# in the same set; test receives everything after the train and val slices
train_end = int(train_split*num_patients)
val_end = int((train_split+val_split)*num_patients)
train_patients = patient_list[:train_end]
val_patients = patient_list[train_end:val_end]
test_patients = patient_list[val_end:]

# flatten each patient list into the list of that set's image paths
train_images = [path for patient in train_patients for path in pid_to_image_paths[patient]]
val_images = [path for patient in val_patients for path in pid_to_image_paths[patient]]
test_images = [path for patient in test_patients for path in pid_to_image_paths[patient]]

for set_name, patients, images in (
    ('train_patients:', train_patients, train_images),
    ('val_patients:', val_patients, val_images),
    ('test_patients:', test_patients, test_images),
):
    print(set_name, len(patients), 'with', len(images), 'images')
91
92
# encode image names: every image gets a three-letter code derived from its
# position in the train + val + test ordering
all_image_paths_ordered = train_images + val_images + test_images
image_path_to_code = {
    image_path: get_alpha_name(i)
    for i, image_path in enumerate(all_image_paths_ordered)
}

# number of images per destination folder (output_folder/set/label)
folder_to_count = {}

# Write the mapping from each anonymized path to its original path, one
# "<new_path>\t<original_path>" row per image. Nothing is copied here; only
# the mapping file is produced. The with-block guarantees the file is flushed
# and closed before the script continues.
with open(join(input_folder, 'image_to_pid.tsv'), 'w') as writer:
    for image_set, set_type in zip([train_images, val_images, test_images], ['train', 'val', 'test']):
        for image_path in image_set:
            # the label is the name of the folder that contains the image
            label = os.path.basename(os.path.dirname(image_path))

            folder = '/'.join([output_folder, set_type, label])
            folder_to_count[folder] = folder_to_count.get(folder, 0) + 1

            new_path = '/'.join([folder, image_path_to_code[image_path] + '.jpg'])
            writer.write(new_path + '\t' + image_path + '\n')

print(folder_to_count)
119
120
# Read the list of test image names, one per line. The file is closed as soon
# as it has been read.
with open(join(input_folder, 'new_test_images.txt'), 'r') as test_list_handle:
    test_img_file = test_list_handle.readlines()

# Strip only the line terminator (so a last line without a trailing newline
# keeps its final character) and ignore blank lines.
test_imgs = [x.rstrip('\n') for x in test_img_file if x.strip()]

# three-letter image code -> patient id
img_to_pid = {image_path_to_code[img_path]: pid for img_path, pid in image_path_to_pid.items()}

# The first three characters of each listed name are the image code.
# NOTE: this rebinds test_patients, replacing the list built by the split.
test_patients = [img_to_pid[test_img[:3]] for test_img in test_imgs]

# number of distinct patients behind the listed test images
print(len(set(test_patients)))