[0a4821]: / DataPreparation.py

Download this file

232 lines (174 with data), 9.2 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 10 13:14:31 2020
@author: Billy
"""
import slideio
import glob
import os
import time
import numpy as np
import gc
import progressbar
from PIL import Image
from skimage import filters
class DataPreparation:
    """Turn a folder of .svs slides into cropped .png images.

    The slides are down-sampled until their magnification is at most
    ``max_mag`` (x20 by default), split into separate objects, and each
    object is cropped to cut away as much background as possible before
    being saved as a .png.
    """

    def __init__(self, svs_loc, png_loc=None):
        """Locate the .svs slides and prepare the output folder.

        Args:
            svs_loc: Path of an existing directory that contains .svs files.
            png_loc: Directory the .png images are written to. Defaults to a
                ``Prepared_SVS`` folder that is a sibling of ``svs_loc``.

        Raises:
            TypeError: If ``svs_loc`` is not a string.
            ValueError: If ``svs_loc`` is not an existing directory, or it
                contains no .svs files.
        """
        if not isinstance(svs_loc, str):
            raise TypeError(
                f"svs_loc must be a str path, got {type(svs_loc).__name__}"
            )
        if not os.path.isdir(svs_loc):
            raise ValueError(
                f"SVS directory is illegitimate: {svs_loc}. Please enter the full "
                "filepath of an existing directory containing .svs files.\n"
            )

        # Resolve to an absolute path up front instead of os.chdir()-ing into
        # the folder: relative inputs stay valid for later calls, a trailing
        # separator no longer changes where the sibling output folder lands,
        # and the caller's working directory is left untouched.
        svs_loc = os.path.abspath(svs_loc)
        pattern = os.path.join(glob.escape(svs_loc), "*.svs")
        self.svs_images = sorted(os.path.basename(p) for p in glob.glob(pattern))
        if not self.svs_images:
            raise ValueError(
                f"Directory {svs_loc} exists, but there are no .svs images in this location."
            )
        self.svs_loc = svs_loc
        print("Found the following outputs:", self.svs_images, "\n")

        if png_loc is None:
            png_loc = os.path.join(os.path.dirname(svs_loc), "Prepared_SVS")
        else:
            png_loc = os.path.abspath(png_loc)
        os.makedirs(png_loc, exist_ok=True)
        self.png_fold = png_loc

    def AutocropAll(self, svs_loc=None, png_loc=None, max_mag=20):
        """Crop every slide found at construction time and save the objects as .png.

        For each slide a sub-folder named after the slide is created in
        ``png_loc``. A ``log.txt`` describing the geometry of each
        down-sampled slide is written to ``png_loc`` as well.

        Args:
            svs_loc: Folder holding the .svs files; defaults to the folder
                given to the constructor.
            png_loc: Output folder; defaults to the constructor's output folder.
            max_mag: The slide is halved in size until its magnification is
                no greater than this value.

        Raises:
            ValueError: If ``svs_loc`` does not exist.
        """
        if svs_loc is None:
            svs_loc = self.svs_loc
        if png_loc is None:
            png_loc = self.png_fold

        if not os.path.exists(svs_loc):
            raise ValueError(
                f"SVS directory is illegitimate: {svs_loc}. Please enter the full "
                "filepath of an existing directory containing .svs files.\n"
            )
        if not os.path.exists(png_loc):
            print("File Location", png_loc, "does not exist. Making this directory...")
            os.makedirs(png_loc)
            print("Successfully create .png save directory.")

        widgets = [
            'Cropping: ', progressbar.Percentage(),
            ' ', progressbar.AnimatedMarker(),
            ' ', progressbar.ETA(),
        ]
        bar = progressbar.ProgressBar(
            widgets=widgets,
            maxval=len(self.svs_images)).start()

        log_loc = os.path.join(png_loc, 'log.txt')
        with open(log_loc, 'w') as filetowrite:
            for i, svs_name in enumerate(self.svs_images):
                bar.update(i)
                stem, _ext = os.path.splitext(svs_name)

                slide = slideio.open_slide(os.path.join(svs_loc, svs_name), 'SVS')
                scene = slide.get_scene(0)
                mag = scene.magnification
                pixel_size = scene.resolution[0]
                _, _, width, height = scene.rect

                img_fold = os.path.join(png_loc, stem)
                os.makedirs(img_fold, exist_ok=True)

                # Halve the requested output size until the effective
                # magnification drops to max_mag or below.
                while mag > max_mag:
                    width = int(np.round(width / 2))
                    height = int(np.round(height / 2))
                    mag = mag / 2
                    pixel_size = pixel_size * 2

                # NOTE(review): the array is assumed to be laid out as
                # (rows, cols, channels) -- confirm against slideio's docs.
                image = scene.read_block(scene.rect, (width, height))
                image_data_bw = image.min(axis=2)

                information = {
                    'ImageName': stem,
                    'Magnification': mag,
                    'ImagePixelHeight': height,
                    'ImagePixelWidth': width,
                    'PixelSizeMeters': pixel_size,
                }
                filetowrite.write(str(information))
                filetowrite.write(' \n ')
                filetowrite.write('#####')
                filetowrite.write(' \n ')

                # First split into column strips, then split each strip into
                # row ranges; every (row range, column range) is one object.
                column_pairs = self.ObjectSplitter(image_data_bw, axis=0)
                for j, (col_start, col_stop) in enumerate(column_pairs):
                    # Use every row of the strip. The previous code bounded
                    # the rows by the column count, silently dropping the
                    # lower part of slides that are taller than wide.
                    strip = image_data_bw[:, col_start:col_stop]
                    row_pairs = self.ObjectSplitter(strip, axis=1)
                    for k, (row_start, row_stop) in enumerate(row_pairs):
                        # The first object of a strip keeps the historical
                        # file name; later ones get the extra index k so they
                        # no longer collide with it and get skipped.
                        object_tag = str(j) if k == 0 else str(j) + "_" + str(k)
                        full_path = os.path.join(
                            img_fold,
                            stem + "_" + object_tag + "_mag" + str(int(mag)) + ".png",
                        )
                        # Existing files are left alone so a run can be resumed.
                        if not os.path.exists(full_path):
                            self.BackgroundReducer(
                                image[row_start:row_stop, col_start:col_stop],
                                full_path,
                            )
        bar.finish()

    def ObjectSplitter(self, image_arr, percentage_threshold=2, axis=0):
        """Return index pairs between which the image should be cropped.

        Pixels darker than the Otsu threshold are counted along ``axis``.
        A line (column for ``axis=0``, row for ``axis=1``) counts as occupied
        when at least ``percentage_threshold`` percent of its pixels are dark.
        Consecutive occupied lines form an object.

        Args:
            image_arr: 2-D greyscale array.
            percentage_threshold: Minimum percentage of dark pixels in a line
                for the line to count as occupied (default 2%).
            axis: Axis the pixels are summed along.

        Returns:
            List of ``(start, stop)`` tuples (stop exclusive) along the
            remaining axis, one per detected object.
        """
        val = filters.threshold_otsu(image_arr)
        dark_counts = np.sum(image_arr < val, axis=axis)
        # The percentage refers to the number of pixels that were summed,
        # i.e. the length of `axis`, not the length of the resulting profile.
        pixels_per_line = image_arr.shape[axis]
        occupied = np.where(
            dark_counts < (percentage_threshold / 100) * pixels_per_line, 0, 1
        )
        return self._runs_to_pairs(occupied)

    def _runs_to_pairs(self, data):
        """Run-length encode a 0/1 profile, absorb short runs, return the 1-runs.

        A run shorter than 2% of the profile length is "short". The first
        run is never merged. A short run is absorbed into the previous run
        if that run is an object (value 1) longer than 5% of the profile;
        otherwise, unless it is the last run, into the next run under the
        same condition; otherwise into the previous run whatever it is.

        Args:
            data: 1-D array of 0/1 values.

        Returns:
            List of ``(start, stop)`` tuples (stop exclusive) for runs of 1.
        """
        data = np.asarray(data)
        n = data.shape[0]
        if n == 0:
            return []

        # Mark the first element of every run.
        loc_run_start = np.empty(n, dtype=bool)
        loc_run_start[0] = True
        np.not_equal(data[:-1], data[1:], out=loc_run_start[1:])
        run_starts = np.nonzero(loc_run_start)[0].tolist()
        run_values = data[loc_run_start].tolist()
        run_lengths = np.diff(np.append(run_starts, n)).tolist()

        short_limit = 0.02 * n
        long_limit = 0.05 * n

        removed = 0  # number of runs popped so far; keeps idx aligned
        for i in range(len(run_starts)):
            idx = i - removed
            if run_lengths[idx] >= short_limit:
                continue
            if idx == 0:
                continue
            if run_lengths[idx - 1] > long_limit and run_values[idx - 1] == 1:
                run_lengths[idx - 1] += run_lengths[idx]
                run_lengths.pop(idx)
                run_starts.pop(idx)
                run_values.pop(idx)
                removed += 1
                continue
            if idx >= len(run_starts) - 1:
                continue
            if run_lengths[idx + 1] > long_limit and run_values[idx + 1] == 1:
                run_lengths[idx + 1] += run_lengths[idx]
                run_lengths.pop(idx)
                # The merged run begins where the short run began, so drop
                # the *next* run's start, not the short run's. Dropping the
                # short run's start shifted the pair and made it overshoot.
                run_starts.pop(idx + 1)
                run_values.pop(idx)
                removed += 1
                continue
            run_lengths[idx - 1] += run_lengths[idx]
            run_lengths.pop(idx)
            run_starts.pop(idx)
            run_values.pop(idx)
            removed += 1

        return [
            (start, start + length)
            for start, length, value in zip(run_starts, run_lengths, run_values)
            if value == 1
        ]

    @staticmethod
    def _longest_consecutive(indices, stepsize=1):
        """Return the longest run of consecutive values in a sorted 1-D index array."""
        indices = np.asarray(indices)
        if indices.size == 0:
            return indices
        breaks = np.where(np.diff(indices) != stepsize)[0] + 1
        return max(np.split(indices, breaks), key=len)

    def _crop_box(self, image, true_boundary):
        """Compute the inclusive crop box of the main object in ``image``.

        A pixel is non-empty when its darkest channel is below 220. A column
        (row) is kept when more than ``true_boundary`` of its pixels are
        non-empty; the longest consecutive stretch of kept columns (rows)
        defines the box.

        Returns:
            ``(row_min, row_max, col_min, col_max)`` with inclusive bounds,
            or ``None`` when no row or no column qualifies.
        """
        non_empty = image.min(axis=2) < 220
        n_rows, n_cols = non_empty.shape
        # np.where returns a tuple; take element 0 so the run search works
        # on a 1-D index array. A column holds n_rows pixels and a row holds
        # n_cols pixels, so those are the totals the fraction refers to.
        cols = np.where(non_empty.sum(axis=0) > true_boundary * n_rows)[0]
        rows = np.where(non_empty.sum(axis=1) > true_boundary * n_cols)[0]
        cols = self._longest_consecutive(cols)
        rows = self._longest_consecutive(rows)
        if rows.size == 0 or cols.size == 0:
            return None
        return (int(rows.min()), int(rows.max()), int(cols.min()), int(cols.max()))

    def BackgroundReducer(self, image, png_save, true_boundary=0.01):
        """Crop ``image`` to its main object and save it as a .png.

        Args:
            image: Array indexed as (rows, cols, channels).
            png_save: Path of the .png file to write.
            true_boundary: Fraction of non-empty pixels a row or column needs
                to be considered part of the object.

        Returns:
            None. When no object is found a message is printed and nothing
            is written.
        """
        crop_box = self._crop_box(image, true_boundary)
        # Explicit collections are kept from the original implementation,
        # presumably to release large temporaries between slides.
        gc.collect()
        if crop_box is None:
            print("Improper Object found. Moving on...")
            return
        row_min, row_max, col_min, col_max = crop_box
        Image.fromarray(
            image[row_min:row_max + 1, col_min:col_max + 1, :]
        ).save(png_save)
        gc.collect()
if __name__ == '__main__':
    # Script entry point: prepare the slides in the author's local data
    # folder, pause for two seconds, then crop every slide found there.
    preparer = DataPreparation("C:\\Users\\Billy\\Downloads\\Data")
    time.sleep(2)
    preparer.AutocropAll()