deepslide / Git / [594161] /code/z_preprocessing/4

Models:
DanielG/
deepslide
Downloads: 1
[594161]: / code / z_preprocessing / 4_anonymize.py
History
Download this file
130 lines (96 with data), 3.9 kB

import os
import random
from os import listdir
from os.path import isfile, join, isdir


def get_image_paths(folder):
    image_paths = [join(folder, f) for f in listdir(folder) if isfile(join(folder, f)) and '.DS_Store' not in f]
    return image_paths


def get_alpha_name(number):
    alphas = "abcdefghijklmnopqrstuvwxyz"
    first = int(number / 26 / 26)
    second = int(number / 26) % 26
    third = number % 26
    return alphas[first] + alphas[second] + alphas[third]


# create an output folder if it does not already exist
def confirm_output_folder(output_folder):
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)


# parameters
input_folder = 'wsi_all_3x'
output_folder = 'wsi_3x'
train_split = 0.5
val_split = 0.15
test_split = 0.35

# get all the subfolders
all_input_folders = [join(input_folder, f) for f in listdir(input_folder) if isdir(join(input_folder, f))]

# get list of all images
all_image_paths = []
for folder in all_input_folders:
    all_image_paths += get_image_paths(folder)

# important dictionaries
image_path_to_pid = {}
pid_to_image_paths = {}

# fill the dictionaries
for image_path in all_image_paths:

    image_name = image_path.split('/')[-1]

    # parse the image_name for the patient id
    start = image_name.index('S')
    end = image_name.index('.')
    while image_name[end].lower() not in 'qwertyuiopasdfghjklzxcvbnm':
        end -= 1
    assert end > start
    pid = image_name[start:end]

    # add to dictionaries
    image_path_to_pid[image_path] = pid
    if pid in pid_to_image_paths:
        pid_to_image_paths[pid].append(image_path)
    else:
        pid_to_image_paths[pid] = [image_path]

print(len(image_path_to_pid), 'images from', len(pid_to_image_paths), 'patients')
patient_list = list(pid_to_image_paths.keys())
random.seed(0)
random.shuffle(patient_list)
num_patients = len(patient_list)

# partition data
train_patients = patient_list[:int(train_split*num_patients)]
val_patients = patient_list[int(train_split*num_patients):int((train_split+val_split)*num_patients)]
test_patients = patient_list[int((train_split+val_split)*num_patients):]

train_images = []
for pid in train_patients:
    train_images += pid_to_image_paths[pid]

val_images = []
for pid in val_patients:
    val_images += pid_to_image_paths[pid]

test_images = []
for pid in test_patients:
    test_images += pid_to_image_paths[pid]

print('train_patients:', len(train_patients), 'with', len(train_images), 'images')
print('val_patients:', len(val_patients), 'with', len(val_images), 'images')
print('test_patients:', len(test_patients), 'with', len(test_images), 'images')

# encode image names
all_image_paths_ordered = train_images + val_images + test_images
image_path_to_code = {}
for i, image_path in enumerate(all_image_paths_ordered):
    image_path_to_code[image_path] = get_alpha_name(i)

folder_to_count = {}

writer = open(join(input_folder, 'image_to_pid.tsv'), 'w')
# copy all the images into the new locations!
for image_set, set_type in zip([train_images, val_images, test_images], ['train', 'val', 'test']):
    for image_path in image_set:
        label = image_path.split('/')[-2]

        folder = '/'.join([output_folder, set_type, label])

        if folder in folder_to_count:
            folder_to_count[folder] += 1
        else:
            folder_to_count[folder] = 1

        new_path = '/'.join([output_folder, set_type, label, image_path_to_code[image_path]+'.jpg'])

        write_line = new_path + '\t' + image_path
        writer.write(write_line + '\n')

print(folder_to_count)

test_img_file = open('wsi_all_3x/new_test_images.txt', 'r').readlines()
test_imgs = [x[:-1] for x in test_img_file]
test_patients = []

img_to_pid = {image_path_to_code[img_path]: image_path_to_pid[img_path] for img_path in image_path_to_pid}

for test_img in test_imgs:
    test_patients.append(img_to_pid[test_img[:3]])

print(len(set(test_patients)))