# pathflowai/cli_preprocessing.py
import argparse
import os
from os.path import join
from pathflowai.utils import run_preprocessing_pipeline, generate_patch_pipeline, img2npy_, create_zero_mask
import click
import dask
import time
# Shared click context: accept -h as a help alias and widen help-text output.
CONTEXT_SETTINGS = {'help_option_names': ['-h', '--help'], 'max_content_width': 90}
@click.group(context_settings= CONTEXT_SETTINGS)
@click.version_option(version='0.1')
def preprocessing():
    # Root command group for the preprocessing CLI; subcommands attach via
    # @preprocessing.command(). Deliberately no docstring: click would surface
    # it as the group's help text, and current help output is empty.
    pass
def output_if_exists(filename):
    """Return ``filename`` unchanged when it exists on disk, else ``None``.

    Parameters
    ----------
    filename : str
        Path to check.

    Returns
    -------
    str or None
        The same path when it exists, otherwise ``None``.
    """
    return filename if os.path.exists(filename) else None
@preprocessing.command()
@click.option('-npy', '--img2npy', is_flag=True, help='Image to numpy for faster read.', show_default=True)
@click.option('-b', '--basename', default='A01', help='Basename of patches.', type=click.Path(exists=False), show_default=True)
@click.option('-i', '--input_dir', default='./inputs/', help='Input directory for patches.', type=click.Path(exists=False), show_default=True)
@click.option('-a', '--annotations', default=[], multiple=True, help='Annotations in image in order.', type=click.Path(exists=False), show_default=True)
@click.option('-pr', '--preprocess', is_flag=True, help='Run preprocessing pipeline.', show_default=True)
@click.option('-pa', '--patches', is_flag=True, help='Add patches to SQL.', show_default=True)
@click.option('-t', '--threshold', default=0.05, help='Threshold to remove non-purple slides.',  show_default=True)
@click.option('-ps', '--patch_size', default=224, help='Patch size.',  show_default=True)
@click.option('-it', '--intensity_threshold', default=100., help='Intensity threshold to rate a pixel as non-white.',  show_default=True)
@click.option('-g', '--generate_finetune_segmentation', is_flag=True, help='Generate patches for one segmentation mask class for targeted finetuning.', show_default=True)
@click.option('-tc', '--target_segmentation_class', default=0, help='Segmentation Class to finetune on, output patches to another db.',  show_default=True)
@click.option('-tt', '--target_threshold', default=0., help='Threshold to include target for segmentation if saving one class.',  show_default=True)
@click.option('-odb', '--out_db', default='./patch_info.db', help='Output patch database.', type=click.Path(exists=False), show_default=True)
@click.option('-am', '--adjust_mask', is_flag=True, help='Remove additional background regions from annotation mask.', show_default=True)
@click.option('-nn', '--n_neighbors', default=5, help='If adjusting mask, number of neighbors connectivity to remove.',  show_default=True)
@click.option('-bp', '--basic_preprocess', is_flag=True, help='Basic preprocessing pipeline, annotation areas are not saved. Used for benchmarking tool against comparable pipelines', show_default=True)
@click.option('-ei', '--entire_image', is_flag=True, help='Store entire image in central db rather than patches.', show_default=True)
@click.option('-nz', '--no_zarr', is_flag=True, help='Don\'t save zarr format file.', show_default=True)
@click.option('-pka', '--pkl_annot', is_flag=True, help='Look for .annot.pkl pickle files instead of xml annotations.', show_default=True)
@click.option('-ta', '--transpose_annotations', is_flag=True, help='Transpose annotations.', show_default=True)
@click.option('-gtm', '--get_tissue_mask', is_flag=True, help='Build tissue mask instead of intensity thresholding.', show_default=True)
@click.option('-ot', '--otsu', is_flag=True, help='Utilize otsu method to decide intensity threshold.', show_default=True)
@click.option('-cm', '--compression', default=8., help='If find tissue mask, how much to downsample image.',  show_default=True)
@click.option('-ch', '--return_convex_hull', is_flag=True, help='Return convex hull of tissue mask.', show_default=True)
@click.option('-kh', '--keep_holes', is_flag=True, help='Keep holes tissue mask.', show_default=True)
@click.option('-mhs', '--max_hole_size', default=0, help='If removing holes, what is maximum allowed size to remain.',  show_default=True)
@click.option('-gbc', '--gray_before_close', is_flag=True, help='Filter grays before binary closing operation.', show_default=True)
@click.option('-kl', '--kernel', default=61, help='Binary closing kernel.',  show_default=True)
@click.option('-mos', '--min_object_size', default=100000, help='Remove all connected components smaller than this size.',  show_default=True)
@click.option('-bs', '--blur_size', default=0, help='How much to blur tissue mask.',  show_default=True)
def preprocess_pipeline(img2npy,basename,input_dir,annotations,preprocess,patches,threshold,patch_size, intensity_threshold, generate_finetune_segmentation, target_segmentation_class, target_threshold, out_db, adjust_mask, n_neighbors, basic_preprocess, entire_image, no_zarr, pkl_annot, transpose_annotations,get_tissue_mask,otsu,compression,return_convex_hull, keep_holes, max_hole_size, gray_before_close, kernel, min_object_size, blur_size):
    """Preprocessing pipeline that accomplishes 3 things. 1: storage into ZARR format, 2: optional mask adjustment, 3: storage of patch-level information into SQL DB"""

    # Locate the slide image for this basename: the first extension that
    # resolves to an existing file wins.
    svs_file = None
    for ext in ['.npy', '.svs', '.tiff', '.tif', '.vms', '.vmu', '.ndpi', '.scn', '.mrxs', '.svslide', '.bif', '.jpeg', '.png', '.h5']:
        svs_file = output_if_exists(join(input_dir, '{}{}'.format(basename, ext)))
        if svs_file is not None:
            break
    # BUGFIX: previously a missing image crashed later (None.endswith) or was
    # silently handed to the pipeline as None; fail fast with a clear message.
    if svs_file is None:
        raise FileNotFoundError('No image found for basename {} in {}'.format(basename, input_dir))

    # Optionally convert the image to .npy once for faster subsequent reads.
    if img2npy and not svs_file.endswith('.npy'):
        svs_file = img2npy_(input_dir, basename, svs_file)

    # Annotation sources: either an XML/pickle annotation file or a raw npy mask.
    xml_file = output_if_exists(join(input_dir, '{}{}'.format(basename, ".xml" if not pkl_annot else ".annot.pkl")))
    npy_mask = output_if_exists(join(input_dir, '{}_mask.npy'.format(basename)))
    out_zarr = join(input_dir, '{}.zarr'.format(basename))
    out_pkl = join(input_dir, '{}_mask.pkl'.format(basename))
    adj_npy = ''  # path of the adjusted mask, filled in only when -am is set

    start = time.time()
    # Stage 1: dump the image (and annotations) into zarr + pickle.
    if preprocess:
        run_preprocessing_pipeline(svs_file=svs_file,
                               xml_file=xml_file,
                               npy_mask=npy_mask,
                               annotations=annotations,
                               out_zarr=out_zarr,
                               out_pkl=out_pkl,
                               no_zarr=no_zarr,
                               transpose_annotations=transpose_annotations)

    # No annotations of any kind: create an all-zero mask so downstream patch
    # extraction still works, and force single-class finetune mode.
    if npy_mask is None and xml_file is None:
        print('Generating Zero Mask')
        npy_mask = join(input_dir, '{}_mask.npz'.format(basename))
        target_segmentation_class = 1
        generate_finetune_segmentation = True
        create_zero_mask(npy_mask, out_zarr if not no_zarr else svs_file, out_pkl)

    preprocess_point = time.time()
    print('Data dump took {}'.format(preprocess_point - start))

    # Stage 2 (optional): remove extra background regions from the mask.
    if adjust_mask:
        # NOTE: aliased on import so the 'adjust_mask' flag parameter is not
        # shadowed by the helper function of the same name.
        from pathflowai.utils import adjust_mask as adjust_mask_fn
        adj_dir = join(input_dir, 'adjusted_masks')
        adj_npy = join(adj_dir, os.path.basename(npy_mask))
        os.makedirs(adj_dir, exist_ok=True)
        if not os.path.exists(adj_npy):  # reuse a previously adjusted mask
            adjust_mask_fn(npy_mask, out_zarr if not no_zarr else svs_file, adj_npy, n_neighbors)
    adjust_point = time.time()
    print('Adjust took {}'.format(adjust_point - preprocess_point))

    # Stage 3 (optional): export patch-level info to SQL; table name is the patch size.
    if patches:
        generate_patch_pipeline(basename,
                            input_dir=input_dir,
                            annotations=annotations,
                            threshold=threshold,
                            patch_size=patch_size,
                            out_db=out_db,
                            generate_finetune_segmentation=generate_finetune_segmentation,
                            target_class=target_segmentation_class,
                            intensity_threshold=intensity_threshold,
                            target_threshold=target_threshold,
                            adj_mask=adj_npy,
                            basic_preprocess=basic_preprocess,
                            entire_image=entire_image,
                            svs_file=svs_file,
                            transpose_annotations=transpose_annotations,
                            get_tissue_mask=get_tissue_mask,
                            otsu=otsu,
                            compression=compression,
                            return_convex_hull=return_convex_hull,
                            keep_holes=keep_holes,
                            max_hole_size=max_hole_size,
                            gray_before_close=gray_before_close,
                            kernel=kernel,
                            min_object_size=min_object_size,
                            blur_size=blur_size)
    patch_point = time.time()
    print('Patches took {}'.format(patch_point - adjust_point))
@preprocessing.command()
@click.option('-i', '--mask_dir', default='./inputs/', help='Input directory for masks.', type=click.Path(exists=False), show_default=True)
@click.option('-o', '--output_dir', default='./outputs/', help='Output directory for new masks.', type=click.Path(exists=False), show_default=True)
@click.option('-fr', '--from_annotations', default=[], multiple=True, help='Annotations to switch from.', show_default=True)
@click.option('-to', '--to_annotations', default=[], multiple=True, help='Annotations to switch to.', show_default=True)
def alter_masks(mask_dir, output_dir, from_annotations, to_annotations):
    """Map list of values to other values in mask."""
    import glob
    from pathflowai.utils import npy2da
    import numpy as np
    from dask.distributed import Client
    assert len(from_annotations) == len(to_annotations), 'Must pass the same number of -fr and -to values'
    from_annotations = list(map(int, from_annotations))
    to_annotations = list(map(int, to_annotations))
    os.makedirs(output_dir, exist_ok=True)
    from_to = list(zip(from_annotations, to_annotations))
    # BUGFIX: the dask client was created but never closed; the context manager
    # guarantees shutdown even if a mask fails to process.
    with Client():
        for mask in glob.glob(join(mask_dir, '*_mask.npy')):
            output_mask = join(output_dir, os.path.basename(mask))
            arr = npy2da(mask)  # lazy dask array over the saved mask
            for fr, to in from_to:
                # Order of the (from, to) pairs matters: later remaps see the
                # results of earlier ones.
                arr[arr == fr] = to
            np.save(output_mask, arr.compute())
@preprocessing.command()
@click.option('-i', '--input_patch_db', default='patch_info_input.db', help='Input db.', type=click.Path(exists=False), show_default=True)
@click.option('-o', '--output_patch_db', default='patch_info_output.db', help='Output db.', type=click.Path(exists=False), show_default=True)
@click.option('-b', '--basename', default='A01', help='Basename.', type=click.Path(exists=False), show_default=True)
@click.option('-ps', '--patch_size', default=224, help='Patch size.',  show_default=True)
def remove_basename_from_db(input_patch_db, output_patch_db, basename, patch_size):
    """Removes basename/ID from SQL DB."""
    import sqlite3
    import pandas as pd
    # BUGFIX: the old code used output_patch_db[:output_patch_db.rfind('/')],
    # which chops the last character off a bare filename (rfind returns -1)
    # and created a bogus directory. os.path.dirname handles both cases.
    out_dir = os.path.dirname(output_patch_db)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    # Patch tables are named after their patch size.
    conn = sqlite3.connect(input_patch_db)
    try:
        df = pd.read_sql('select * from "{}";'.format(patch_size), con=conn)
    finally:
        conn.close()
    # Drop every patch row belonging to the given slide basename/ID.
    df = df.loc[df['ID'] != basename]
    conn = sqlite3.connect(output_patch_db)
    try:
        df.set_index('index').to_sql(str(patch_size), con=conn, if_exists='replace')
    finally:
        conn.close()
@preprocessing.command()
@click.option('-i', '--input_patch_db', default='patch_info_input.db', help='Input db.', type=click.Path(exists=False), show_default=True)
@click.option('-o', '--output_patch_db', default='patch_info_output.db', help='Output db.', type=click.Path(exists=False), show_default=True)
@click.option('-fr', '--from_annotations', default=[], multiple=True, help='Annotations to switch from.', show_default=True)
@click.option('-to', '--to_annotations', default=[], multiple=True, help='Annotations to switch to.', show_default=True)
@click.option('-ps', '--patch_size', default=224, help='Patch size.',  show_default=True)
@click.option('-rb', '--remove_background_annotation', default='', help='If selected, removes 100\% background patches based on this annotation.', type=click.Path(exists=False), show_default=True)
@click.option('-ma', '--max_background_area', default=0.05, help='Max background area before exclusion.',  show_default=True)
def collapse_annotations(input_patch_db, output_patch_db, from_annotations, to_annotations, patch_size, remove_background_annotation, max_background_area):
    """Adds annotation classes areas to other annotation classes in SQL DB when getting rid of some annotation classes."""
    import sqlite3
    import numpy as np, pandas as pd
    assert len(from_annotations) == len(to_annotations), 'Must pass the same number of -fr and -to values'
    from_annotations = list(map(str, from_annotations))
    to_annotations = list(map(str, to_annotations))
    # BUGFIX: was output_patch_db[:output_patch_db.rfind('/')], which mangles a
    # bare filename (rfind returns -1 and slices off the last character).
    out_dir = os.path.dirname(output_patch_db)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    conn = sqlite3.connect(input_patch_db)
    df = pd.read_sql('select * from "{}";'.format(patch_size), con=conn)
    conn.close()
    from_to = zip(from_annotations, to_annotations)
    # Optionally drop patches whose background annotation area exceeds the cap.
    if remove_background_annotation:
        df = df.loc[df[remove_background_annotation] <= (1. - max_background_area)]
    # Fold each removed class's area into its target class, then drop the
    # source columns.
    for fr, to in from_to:
        df.loc[:, to] += df[fr]
    df = df[[col for col in list(df) if col not in from_annotations]]
    # NOTE(review): columns 6+ are assumed to be the annotation-area columns --
    # this depends on the schema written by the patch pipeline; verify if the
    # schema changes. Classes are renumbered 0..k-1 after collapsing.
    annotations = list(df.iloc[:, 6:])
    df = df.rename(columns={annot: str(i) for i, annot in enumerate(annotations)})
    annotations = list(df.iloc[:, 6:])
    # Re-label each patch with the class holding the largest area.
    df.loc[:, 'annotation'] = np.vectorize(lambda i: annotations[df.iloc[i, 6:].values.argmax()])(np.arange(df.shape[0]))
    df.loc[:, 'index'] = np.arange(df.shape[0])
    conn = sqlite3.connect(output_patch_db)
    df.set_index('index').to_sql(str(patch_size), con=conn, if_exists='replace')
    conn.close()
if __name__ == '__main__':
    from dask.distributed import Client
    # NOTE(review): these config keys mix '_' and '-' separators; dask's
    # canonical keys use '-'. Confirm they actually take effect.
    dask.config.set({'temporary_dir': 'tmp/',
                    'distributed.worker.local_dir': 'tmp/',
                    'distributed.scheduler.allowed-failures': 20})
    c = Client(processes=False)
    try:
        preprocessing()
    finally:
        # BUGFIX: click's standalone mode exits via SystemExit, so without the
        # finally the client was never closed.
        c.close()