|
a |
|
b/code/z_preprocessing/4_anonymize.py |
|
|
1 |
import os |
|
|
2 |
import random |
|
|
3 |
from os import listdir |
|
|
4 |
from os.path import isfile, join, isdir |
|
|
5 |
|
|
|
6 |
|
|
|
7 |
def get_image_paths(folder):
    """Return the paths of the regular files found directly in ``folder``.

    Sub-directories are skipped, and so is every entry whose name contains
    ``.DS_Store``. Each result is ``folder`` joined with the entry name, in
    the order ``listdir`` reports the entries.
    """
    paths = []
    for entry in listdir(folder):
        # Finder metadata files are never images
        if '.DS_Store' in entry:
            continue
        candidate = join(folder, entry)
        if isfile(candidate):
            paths.append(candidate)
    return paths
|
|
10 |
|
|
|
11 |
|
|
|
12 |
def get_alpha_name(number):
    """Encode ``number`` as a three-letter lowercase code.

    The code is the base-26 representation of ``number`` with ``a``-``z`` as
    the digits, most significant letter first: 0 -> ``aaa``, 25 -> ``aaz``,
    26 -> ``aba``, 17575 -> ``zzz``.

    Args:
        number: integer in the range ``0 .. 26**3 - 1``.

    Returns:
        The three-character code.

    Raises:
        IndexError: if ``number`` is outside the encodable range. Without
            this check a negative number would silently map onto the code
            of a different, valid number (e.g. -1 -> ``aaz``).
    """
    alphas = "abcdefghijklmnopqrstuvwxyz"
    base = len(alphas)
    if not 0 <= number < base ** 3:
        raise IndexError(
            'number %r cannot be encoded in three letters (valid range is 0..%d)'
            % (number, base ** 3 - 1)
        )
    # exact integer arithmetic instead of float division followed by int()
    rest, third = divmod(number, base)
    first, second = divmod(rest, base)
    return alphas[first] + alphas[second] + alphas[third]
|
|
18 |
|
|
|
19 |
|
|
|
20 |
# create an output folder if it does not already exist
def confirm_output_folder(output_folder):
    """Make sure the directory ``output_folder`` exists.

    Missing parent directories are created as well. Calling this for a
    directory that already exists is a no-op.

    Raises:
        FileExistsError: if ``output_folder`` exists but is not a directory.
    """
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the same folder in between
    os.makedirs(output_folder, exist_ok=True)
|
|
24 |
|
|
|
25 |
|
|
|
26 |
# parameters
input_folder = 'wsi_all_3x'
output_folder = 'wsi_3x'
train_split = 0.5
val_split = 0.15
# NOTE: test_split is never read below; the test set is simply whatever is
# left after the train and validation slices have been taken.
test_split = 0.35

# get all the subfolders
# NOTE(review): listdir() order is arbitrary, and both the patient shuffle and
# the three-letter codes assigned further down depend on the order in which
# images are discovered. Do not sort here without regenerating every artifact
# that stores those codes (e.g. new_test_images.txt) -- confirm before changing.
all_input_folders = [join(input_folder, f) for f in listdir(input_folder) if isdir(join(input_folder, f))]

# get list of all images
all_image_paths = []
for folder in all_input_folders:
    all_image_paths += get_image_paths(folder)

# important dictionaries
image_path_to_pid = {}
pid_to_image_paths = {}

# fill the dictionaries
for image_path in all_image_paths:

    # os.path handles the platform's separator, unlike split('/')
    image_name = os.path.basename(image_path)

    # parse the image_name for the patient id: it runs from the first 'S' up
    # to (but not including) the last letter that precedes the first '.'
    start = image_name.index('S')
    end = image_name.index('.')
    while image_name[end].lower() not in 'qwertyuiopasdfghjklzxcvbnm':
        end -= 1
    # explicit check rather than assert, which is stripped under `python -O`
    if end <= start:
        raise ValueError('cannot parse a patient id from image name %r' % image_name)
    pid = image_name[start:end]

    # add to dictionaries
    image_path_to_pid[image_path] = pid
    pid_to_image_paths.setdefault(pid, []).append(image_path)

print(len(image_path_to_pid), 'images from', len(pid_to_image_paths), 'patients')
patient_list = list(pid_to_image_paths.keys())
# fixed seed so the shuffle is repeatable for a given discovery order
random.seed(0)
random.shuffle(patient_list)
num_patients = len(patient_list)

# partition data by patient, so all images of one patient share a split
train_end = int(train_split*num_patients)
val_end = int((train_split+val_split)*num_patients)
train_patients = patient_list[:train_end]
val_patients = patient_list[train_end:val_end]
test_patients = patient_list[val_end:]

train_images = []
for pid in train_patients:
    train_images += pid_to_image_paths[pid]

val_images = []
for pid in val_patients:
    val_images += pid_to_image_paths[pid]

test_images = []
for pid in test_patients:
    test_images += pid_to_image_paths[pid]

print('train_patients:', len(train_patients), 'with', len(train_images), 'images')
print('val_patients:', len(val_patients), 'with', len(val_images), 'images')
print('test_patients:', len(test_patients), 'with', len(test_images), 'images')

# encode image names: the code is the image's position in train+val+test order
all_image_paths_ordered = train_images + val_images + test_images
image_path_to_code = {
    image_path: get_alpha_name(i)
    for i, image_path in enumerate(all_image_paths_ordered)
}

folder_to_count = {}

# write one "<new path>\t<original path>" line per image and count how many
# images land in each output folder; no image file is copied or moved here
with open(join(input_folder, 'image_to_pid.tsv'), 'w') as writer:
    for image_set, set_type in zip([train_images, val_images, test_images], ['train', 'val', 'test']):
        for image_path in image_set:
            # the class label is the name of the folder holding the image
            label = os.path.basename(os.path.dirname(image_path))

            folder = '/'.join([output_folder, set_type, label])
            folder_to_count[folder] = folder_to_count.get(folder, 0) + 1

            new_path = '/'.join([folder, image_path_to_code[image_path]+'.jpg'])
            writer.write(new_path + '\t' + image_path + '\n')

print(folder_to_count)

# count the distinct patients behind an externally supplied list of test images
with open(join(input_folder, 'new_test_images.txt'), 'r') as test_img_reader:
    test_img_file = test_img_reader.readlines()
# strip only the newline (a final line without one keeps its last character)
# and ignore blank lines
test_imgs = [x.rstrip('\n') for x in test_img_file if x.strip()]

img_to_pid = {image_path_to_code[img_path]: image_path_to_pid[img_path] for img_path in image_path_to_pid}

# NOTE: this rebinds test_patients, replacing the split-derived list above.
# presumably every line starts with the image's three-letter code -- verify
# against whatever produces new_test_images.txt
test_patients = [img_to_pid[test_img[:3]] for test_img in test_imgs]

print(len(set(test_patients)))