DataPreparation.py
# -*- coding: utf-8 -*-
"""
Created on Fri Jul 10 13:14:31 2020

@author: Billy
"""

import slideio
import glob
import os
import time
import numpy as np
import gc
import progressbar
from PIL import Image
from skimage import filters


class DataPreparation:

    # This function initialises a data cleaner.
    # The job of this class is to generate x20-magnification .png images from .svs slides,
    # whilst cutting out as much background and non-epithelial area as possible.
    #
    # This function needs the location of the folder that contains the .svs images as an input.
    def __init__(self, svs_loc, png_loc=None):
        assert isinstance(svs_loc, str)

        if not os.path.exists(svs_loc):
            raise ValueError("SVS directory is illegitimate: " + svs_loc +
                             ". Please enter the full filepath of an existing directory containing .svs files.\n")
        else:
            os.chdir(svs_loc)
            self.svs_images = glob.glob("*.svs")
            if len(self.svs_images) == 0:
                raise ValueError("Directory " + svs_loc + " exists, but there are no .svs images in this location.")
            self.svs_loc = svs_loc

        print("Found the following .svs files:", self.svs_images, "\n")

        if png_loc is None:
            parent_dir, dir_ = os.path.split(svs_loc)
            png_loc = os.path.join(parent_dir, "Prepared_SVS")

        if not os.path.exists(png_loc):
            os.mkdir(png_loc)

        self.png_fold = png_loc

    # This function receives the file location of a folder that contains .svs images,
    # and generates a sibling subfolder populated with .png images.
    #
    # It also saves a log file, to inform the user about each new image's geometric properties.
    def AutocropAll(self, svs_loc=None, png_loc=None, max_mag=20):

        if svs_loc is None:
            svs_loc = self.svs_loc
        if png_loc is None:
            png_loc = self.png_fold

        if not os.path.exists(svs_loc):
            raise ValueError("SVS directory is illegitimate: " + svs_loc +
                             ". Please enter the full filepath of an existing directory containing .svs files.\n")

        if not os.path.exists(png_loc):
            print("File location", png_loc, "does not exist. Making this directory...")
            os.mkdir(png_loc)
            print("Successfully created .png save directory.")

        # Note: this local helper is not used in this method; BackgroundReducer defines its own copy.
        def consecutive(data, stepsize=1):
            arr_consec = np.split(data, np.where(np.diff(data) != stepsize)[0] + 1)
            return max(arr_consec, key=len)

        widgets = [
            'Cropping: ', progressbar.Percentage(),
            ' ', progressbar.AnimatedMarker(),
            ' ', progressbar.ETA(),
        ]

        bar = progressbar.ProgressBar(
            widgets=widgets,
            maxval=len(self.svs_images)).start()
        log_loc = os.path.join(png_loc, 'log.txt')

        with open(log_loc, 'w') as filetowrite:
            for i in range(len(self.svs_images)):
                bar.update(i)
                information = {}

                file, svs = os.path.splitext(self.svs_images[i])
                pic1_loc = os.path.join(svs_loc, self.svs_images[i])

                slide = slideio.open_slide(pic1_loc, 'SVS')
                scene = slide.get_scene(0)
                mag = scene.magnification
                pixel_size = scene.resolution[0]
                _, _, width, height = scene.rect

                img_fold = os.path.join(png_loc, file)
                if not os.path.exists(img_fold):
                    os.makedirs(img_fold)

                # halve the resolution until the slide is at (or below) the requested magnification
                while mag > max_mag:
                    width = int(np.round(width / 2))
                    height = int(np.round(height / 2))
                    mag = mag / 2
                    pixel_size = pixel_size * 2

                image = scene.read_block(scene.rect, (width, height))
                image_data_bw = image.min(axis=2)

                information['ImageName'] = file
                information['Magnification'] = mag
                information['ImagePixelHeight'] = height
                information['ImagePixelWidth'] = width
                information['PixelSizeMeters'] = pixel_size
                filetowrite.write(str(information))
                filetowrite.write(' \n ')
                filetowrite.write('#####')
                filetowrite.write(' \n ')

                # split the slide into vertical strips (column ranges) that contain tissue
                object_h = self.ObjectSplitter(image_data_bw, axis=0)
                height, width = np.shape(image_data_bw)  # numpy shape order is (rows, cols)
                for j, indices_v in enumerate(object_h):
                    # within each strip, find the row ranges that contain tissue
                    real_objects = self.ObjectSplitter(image_data_bw[0:height, indices_v[0]:indices_v[1]], axis=1)

                    for k, indices_h in enumerate(real_objects):
                        # k is included in the name so multiple objects in one strip do not overwrite each other
                        full_path = os.path.join(img_fold, file + "_" + str(j) + "_" + str(k) + "_mag" + str(int(mag)) + ".png")
                        if not os.path.exists(full_path):
                            self.BackgroundReducer(image[indices_h[0]:indices_h[1], indices_v[0]:indices_v[1]], full_path)
        bar.finish()

    # Return the index splitting pairs of an image array based on the percentage of pixel 'completion'
    # along a given axis. That is, this function generates the indices that an image should be cropped
    # between, either vertically or horizontally, depending on the percentage of white background along
    # that axis. The default percentage threshold is 2%.
    # (A worked example is given in the comments after this method.)
    def ObjectSplitter(self, image_arr, percentage_threshold=2, axis=0):

        val = filters.threshold_otsu(image_arr)
        data = np.sum(image_arr < val, axis=axis)
        n = data.shape[0]
        # a line counts as an object if at least percentage_threshold % of the pixels summed along `axis` are dark
        data = np.where(data < (percentage_threshold / 100) * image_arr.shape[axis], 0, 1)

        # run-length encode the resulting 0/1 profile
        loc_run_start = np.empty(n, dtype=bool)
        loc_run_start[0] = True
        np.not_equal(data[:-1], data[1:], out=loc_run_start[1:])
        run_starts = np.nonzero(loc_run_start)[0].tolist()

        # find run values
        run_values = data[loc_run_start].tolist()

        # find run lengths
        run_lengths = np.diff(np.append(run_starts, n)).tolist()

        # merge runs shorter than 2% of the profile into a neighbouring run
        counter = 0
        for i in range(len(run_starts)):
            idx = i - counter
            if run_lengths[idx] < 0.02 * n:
                if idx == 0:
                    continue
                if run_lengths[idx - 1] > 0.05 * n and run_values[idx - 1] == 1:
                    run_lengths[idx - 1] += run_lengths[idx]
                    run_lengths.pop(idx)
                    run_starts.pop(idx)
                    run_values.pop(idx)
                    counter += 1
                    continue

                if idx >= len(run_starts) - 1:
                    continue
                if run_lengths[idx + 1] > 0.05 * n and run_values[idx + 1] == 1:
                    run_lengths[idx + 1] += run_lengths[idx]
                    run_starts[idx + 1] = run_starts[idx]  # keep the merged run anchored at the gap's start
                    run_lengths.pop(idx)
                    run_starts.pop(idx)
                    run_values.pop(idx)
                    counter += 1
                    continue

                run_lengths[idx - 1] += run_lengths[idx]
                run_lengths.pop(idx)
                run_starts.pop(idx)
                run_values.pop(idx)
                counter += 1

        # keep only the runs that correspond to objects (value 1)
        object_pairs = []
        for i in range(len(run_values)):
            if run_values[i] != 1:
                continue
            object_pairs.append((run_starts[i], run_starts[i] + run_lengths[i]))

        return object_pairs
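
    # Illustrative walkthrough of ObjectSplitter (hypothetical numbers, not taken from a real slide):
    # suppose the thresholded 0/1 profile along the chosen axis works out as
    #     data = [0, 0, 1, 1, 1, 0, 0, 1, 1, 0]          (n = 10)
    # Run-length encoding gives run_starts = [0, 2, 5, 7, 9], run_values = [0, 1, 0, 1, 0] and
    # run_lengths = [2, 3, 2, 2, 1]. No run is shorter than 0.02 * n, so nothing is merged, and the
    # method returns the object pairs [(2, 5), (7, 9)], i.e. two bands of tissue to crop out.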

    # This function uses the index splitting pairs to crop the input images and save them as .png files.
    def BackgroundReducer(self, image, png_save, true_boundary=0.01):

        # return the longest run of consecutive indices in `data`
        # (an illustrative example follows this helper)
        def consecutive(data, stepsize=1):
            arr_consec = np.split(data, np.where(np.diff(data) != stepsize)[0] + 1)
            return max(arr_consec, key=len)
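
        # For example (hypothetical indices): consecutive(np.array([3, 4, 5, 9, 10])) splits the
        # input into [3, 4, 5] and [9, 10] and returns the longer run, array([3, 4, 5]).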

        image_data_bw = image.min(axis=2)

        gc.collect()
        non_empty = np.where(image_data_bw < 220, True, False)
        # keep a column/row if more than true_boundary (1% by default) of its pixels are non-background
        non_empty_columns = np.where(np.sum(non_empty, axis=0) > true_boundary * np.shape(non_empty)[0])[0]
        non_empty_rows = np.where(np.sum(non_empty, axis=1) > true_boundary * np.shape(non_empty)[1])[0]

        non_empty_cols_consec = consecutive(non_empty_columns)
        non_empty_rows_consec = consecutive(non_empty_rows)

        try:
            cropBox = (np.min(non_empty_rows_consec), np.max(non_empty_rows_consec),
                       np.min(non_empty_cols_consec), np.max(non_empty_cols_consec))
        except ValueError:
            print("Improper object found. Moving on...")
            gc.collect()
            return
        Image.fromarray(image[cropBox[0]:cropBox[1] + 1, cropBox[2]:cropBox[3] + 1, :]).save(png_save)

        gc.collect()


if __name__ == '__main__':
    a = DataPreparation("C:\\Users\\Billy\\Downloads\\Data")
    time.sleep(2)
    a.AutocropAll()
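
    # Quick illustrative check of ObjectSplitter on synthetic data (purely hypothetical values, reusing
    # the instance `a` created above; it does not touch the .svs files): a white 50x100 strip with two
    # darker bands should come back as two (start, stop) column pairs, roughly [(20, 40), (60, 90)].
    demo = np.full((50, 100), 255, dtype=np.uint8)              # all-white dummy image
    demo[:, 20:40] = np.random.randint(0, 100, size=(50, 20))   # first dark "object"
    demo[:, 60:90] = np.random.randint(0, 100, size=(50, 30))   # second dark "object"
    print("Demo object pairs:", a.ObjectSplitter(demo, axis=0))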