Switch to side-by-side view

--- a
+++ b/pathflowai/cli_preprocessing.py
@@ -0,0 +1,233 @@
+import argparse
+import os
+from os.path import join
+from pathflowai.utils import run_preprocessing_pipeline, generate_patch_pipeline, img2npy_, create_zero_mask
+import click
+import dask
+import time
+
+# Shared click context settings: accept -h as well as --help, and limit rendered help to 90 columns.
+CONTEXT_SETTINGS = {'help_option_names': ['-h', '--help'], 'max_content_width': 90}
+
+# Root command group; every subcommand in this file attaches itself via @preprocessing.command().
+# No docstring is added on purpose: click would display it as the group's help text.
+@click.group(context_settings=CONTEXT_SETTINGS)
+@click.version_option(version='0.1')
+def preprocessing():
+	# The group has no behavior of its own; it only dispatches to subcommands.
+	pass
+
+def output_if_exists(filename):
+	"""Pass a path through only when it is present on disk.
+
+	Parameters
+	----------
+	filename : str
+		Path to check.
+
+	Returns
+	-------
+	str or None
+		``filename`` unchanged when ``os.path.exists`` reports it, otherwise None.
+
+	"""
+	# Single conditional expression instead of an if/return pair.
+	return filename if os.path.exists(filename) else None
+
+@preprocessing.command()
+@click.option('-npy', '--img2npy', is_flag=True, help='Image to numpy for faster read.', show_default=True)
+@click.option('-b', '--basename', default='A01', help='Basename of patches.', type=click.Path(exists=False), show_default=True)
+@click.option('-i', '--input_dir', default='./inputs/', help='Input directory for patches.', type=click.Path(exists=False), show_default=True)
+@click.option('-a', '--annotations', default=[], multiple=True, help='Annotations in image in order.', type=click.Path(exists=False), show_default=True)
+@click.option('-pr', '--preprocess', is_flag=True, help='Run preprocessing pipeline.', show_default=True)
+@click.option('-pa', '--patches', is_flag=True, help='Add patches to SQL.', show_default=True)
+@click.option('-t', '--threshold', default=0.05, help='Threshold to remove non-purple slides.',  show_default=True)
+@click.option('-ps', '--patch_size', default=224, help='Patch size.',  show_default=True)
+@click.option('-it', '--intensity_threshold', default=100., help='Intensity threshold to rate a pixel as non-white.',  show_default=True)
+@click.option('-g', '--generate_finetune_segmentation', is_flag=True, help='Generate patches for one segmentation mask class for targeted finetuning.', show_default=True)
+@click.option('-tc', '--target_segmentation_class', default=0, help='Segmentation Class to finetune on, output patches to another db.',  show_default=True)
+@click.option('-tt', '--target_threshold', default=0., help='Threshold to include target for segmentation if saving one class.',  show_default=True)
+@click.option('-odb', '--out_db', default='./patch_info.db', help='Output patch database.', type=click.Path(exists=False), show_default=True)
+@click.option('-am', '--adjust_mask', is_flag=True, help='Remove additional background regions from annotation mask.', show_default=True)
+@click.option('-nn', '--n_neighbors', default=5, help='If adjusting mask, number of neighbors connectivity to remove.',  show_default=True)
+@click.option('-bp', '--basic_preprocess', is_flag=True, help='Basic preprocessing pipeline, annotation areas are not saved. Used for benchmarking tool against comparable pipelines', show_default=True)
+@click.option('-ei', '--entire_image', is_flag=True, help='Store entire image in central db rather than patches.', show_default=True)
+@click.option('-nz', '--no_zarr', is_flag=True, help='Don\'t save zarr format file.', show_default=True)
+@click.option('-pka', '--pkl_annot', is_flag=True, help='Look for .annot.pkl pickle files instead of xml annotations.', show_default=True)
+@click.option('-ta', '--transpose_annotations', is_flag=True, help='Transpose annotations.', show_default=True)
+@click.option('-gtm', '--get_tissue_mask', is_flag=True, help='Build tissue mask instead of intensity thresholding.', show_default=True)
+@click.option('-ot', '--otsu', is_flag=True, help='Utilize otsu method to decide intensity threshold.', show_default=True)
+@click.option('-cm', '--compression', default=8., help='If find tissue mask, how much to downsample image.',  show_default=True)
+@click.option('-ch', '--return_convex_hull', is_flag=True, help='Return convex hull of tissue mask.', show_default=True)
+@click.option('-kh', '--keep_holes', is_flag=True, help='Keep holes tissue mask.', show_default=True)
+@click.option('-mhs', '--max_hole_size', default=0, help='If removing holes, what is maximum allowed size to remain.',  show_default=True)
+@click.option('-gbc', '--gray_before_close', is_flag=True, help='Filter grays before binary closing operation.', show_default=True)
+@click.option('-kl', '--kernel', default=61, help='Binary closing kernel.',  show_default=True)
+@click.option('-mos', '--min_object_size', default=100000, help='Remove all connected components smaller than this size.',  show_default=True)
+@click.option('-bs', '--blur_size', default=0, help='How much to blur tissue mask.',  show_default=True)
+def preprocess_pipeline(img2npy,basename,input_dir,annotations,preprocess,patches,threshold,patch_size, intensity_threshold, generate_finetune_segmentation, target_segmentation_class, target_threshold, out_db, adjust_mask, n_neighbors, basic_preprocess, entire_image, no_zarr, pkl_annot, transpose_annotations,get_tissue_mask,otsu,compression,return_convex_hull, keep_holes, max_hole_size, gray_before_close, kernel, min_object_size, blur_size):
+	"""Preprocessing pipeline that accomplishes 3 things. 1: storage into ZARR format, 2: optional mask adjustment, 3: storage of patch-level information into SQL DB"""
+	# The docstring above is what click prints for --help, so it is kept verbatim;
+	# maintainer notes live in comments instead.
+	#
+	# Flow (each stage is gated by its own flag, and its wall-clock time is printed):
+	#   1. locate <input_dir>/<basename><ext>, optionally converting it to .npy (--img2npy)
+	#   2. --preprocess: run_preprocessing_pipeline on the image and its annotations
+	#   3. if neither a mask .npy nor an annotation file exists, create a zero mask
+	#   4. --adjust_mask: write an adjusted mask under <input_dir>/adjusted_masks
+	#   5. --patches: generate_patch_pipeline, which writes patch info to out_db
+	#
+	# Raises click.ClickException when --img2npy is given but no image file is found,
+	# or when --adjust_mask is given but there is no mask file to adjust.
+
+	# Take the first supported extension for which the file exists on disk.
+	svs_file = None
+	for ext in ['.npy','.svs','.tiff','.tif', '.vms', '.vmu', '.ndpi', '.scn', '.mrxs', '.svslide', '.bif', '.jpeg', '.png', '.h5']:
+		svs_file = output_if_exists(join(input_dir,'{}{}'.format(basename,ext)))
+		if svs_file is not None:
+			break
+
+	if img2npy:
+		# Previously a missing image surfaced as AttributeError on None.endswith.
+		if svs_file is None:
+			raise click.ClickException('No image named {} with a supported extension was found in {}.'.format(basename,input_dir))
+		if not svs_file.endswith('.npy'):
+			svs_file = img2npy_(input_dir,basename, svs_file)
+
+	# Annotation source is either <basename>.xml or, with --pkl_annot, <basename>.annot.pkl.
+	annot_ext = ".annot.pkl" if pkl_annot else ".xml"
+	xml_file = output_if_exists(join(input_dir,'{}{}'.format(basename,annot_ext)))
+	npy_mask = output_if_exists(join(input_dir,'{}_mask.npy'.format(basename)))
+	out_zarr = join(input_dir,'{}.zarr'.format(basename))
+	out_pkl = join(input_dir,'{}_mask.pkl'.format(basename))
+	# Empty string is what gets passed to generate_patch_pipeline when no mask adjustment ran.
+	adj_npy=''
+	# Image source handed to the mask helpers: the zarr store unless --no_zarr was given.
+	image_source = svs_file if no_zarr else out_zarr
+
+	start=time.time()
+	if preprocess:
+		run_preprocessing_pipeline(svs_file=svs_file,
+							   xml_file=xml_file,
+							   npy_mask=npy_mask,
+							   annotations=annotations,
+							   out_zarr=out_zarr,
+							   out_pkl=out_pkl,
+							   no_zarr=no_zarr,
+							   transpose_annotations=transpose_annotations)
+
+	if npy_mask is None and xml_file is None:
+		# No mask and no annotations: fall back to a zero mask and force the
+		# finetune-segmentation settings to target class 1.
+		print('Generating Zero Mask')
+		npy_mask=join(input_dir,'{}_mask.npz'.format(basename))
+		target_segmentation_class=1
+		generate_finetune_segmentation=True
+		create_zero_mask(npy_mask,image_source,out_pkl)
+
+	preprocess_point = time.time()
+	print('Data dump took {}'.format(preprocess_point-start))
+
+	if adjust_mask:
+		# Imported under an alias so the helper does not rebind the ``adjust_mask`` flag parameter.
+		from pathflowai.utils import adjust_mask as run_adjust_mask
+		# Previously a missing mask surfaced as TypeError from os.path.basename(None).
+		if npy_mask is None:
+			raise click.ClickException('Cannot adjust mask: {}_mask.npy was not found in {}.'.format(basename,input_dir))
+		adj_dir=join(input_dir,'adjusted_masks')
+		adj_npy=join(adj_dir,os.path.basename(npy_mask))
+		os.makedirs(adj_dir,exist_ok=True)
+		# An already-adjusted mask is reused rather than recomputed.
+		if not os.path.exists(adj_npy):
+			run_adjust_mask(npy_mask, image_source, adj_npy, n_neighbors)
+	adjust_point = time.time()
+	print('Adjust took {}'.format(adjust_point-preprocess_point))
+
+	if patches: # ADD EXPORT TO SQL, TABLE NAME IS PATCH SIZE
+		generate_patch_pipeline(basename,
+							input_dir=input_dir,
+							annotations=annotations,
+							threshold=threshold,
+							patch_size=patch_size,
+							out_db=out_db,
+							generate_finetune_segmentation=generate_finetune_segmentation,
+							target_class=target_segmentation_class,
+							intensity_threshold=intensity_threshold,
+							target_threshold=target_threshold,
+							adj_mask=adj_npy,
+							basic_preprocess=basic_preprocess,
+							entire_image=entire_image,
+							svs_file=svs_file,
+							transpose_annotations=transpose_annotations,
+							get_tissue_mask=get_tissue_mask,
+							otsu=otsu,
+							compression=compression,
+							return_convex_hull=return_convex_hull,
+							keep_holes=keep_holes,
+							max_hole_size=max_hole_size,
+							gray_before_close=gray_before_close,
+							kernel=kernel,
+							min_object_size=min_object_size,
+							blur_size=blur_size)
+	patch_point = time.time()
+	print('Patches took {}'.format(patch_point-adjust_point))
+
+@preprocessing.command()
+@click.option('-i', '--mask_dir', default='./inputs/', help='Input directory for masks.', type=click.Path(exists=False), show_default=True)
+@click.option('-o', '--output_dir', default='./outputs/', help='Output directory for new masks.', type=click.Path(exists=False), show_default=True)
+@click.option('-fr', '--from_annotations', default=[], multiple=True, help='Annotations to switch from.', show_default=True)
+@click.option('-to', '--to_annotations', default=[], multiple=True, help='Annotations to switch to.', show_default=True)
+def alter_masks(mask_dir, output_dir, from_annotations, to_annotations):
+	"""Map list of values to other values in mask."""
+	# The docstring above doubles as the click help text and is kept verbatim.
+	#
+	# For every <mask_dir>/*_mask.npy, replaces each value in from_annotations with the
+	# value at the same position in to_annotations and saves the result under the same
+	# file name in output_dir.
+	# Raises click.UsageError when the two lists differ in length; int() raises
+	# ValueError for a non-integer entry.
+	import glob
+	from pathflowai.utils import npy2da
+	import numpy as np
+	from dask.distributed import Client
+	# Validate with a real exception: an assert would be stripped under ``python -O``.
+	if len(from_annotations)!=len(to_annotations):
+		raise click.UsageError('--from_annotations and --to_annotations must be given the same number of times.')
+	# Convert before starting the client so bad input cannot leave a client running.
+	from_annotations=list(map(int,from_annotations))
+	to_annotations=list(map(int,to_annotations))
+	from_to=list(zip(from_annotations,to_annotations))
+	os.makedirs(output_dir,exist_ok=True)
+	masks=glob.glob(join(mask_dir,'*_mask.npy'))
+	c=Client()
+	try:
+		for mask in masks:
+			output_mask=join(output_dir,os.path.basename(mask))
+			arr=npy2da(mask)
+			# Pairs are applied in order, so a value written by an earlier pair
+			# can be remapped again by a later pair.
+			for fr,to in from_to:
+				arr[arr==fr]=to
+			np.save(output_mask,arr.compute())
+	finally:
+		# The client was previously never closed.
+		c.close()
+
+@preprocessing.command()
+@click.option('-i', '--input_patch_db', default='patch_info_input.db', help='Input db.', type=click.Path(exists=False), show_default=True)
+@click.option('-o', '--output_patch_db', default='patch_info_output.db', help='Output db.', type=click.Path(exists=False), show_default=True)
+@click.option('-b', '--basename', default='A01', help='Basename.', type=click.Path(exists=False), show_default=True)
+@click.option('-ps', '--patch_size', default=224, help='Patch size.',  show_default=True)
+def remove_basename_from_db(input_patch_db, output_patch_db, basename, patch_size):
+	"""Removes basename/ID from SQL DB."""
+	# The docstring above doubles as the click help text and is kept verbatim.
+	#
+	# Reads the table named after patch_size from input_patch_db, drops every row whose
+	# 'ID' column equals basename, and writes the remainder to the same-named table in
+	# output_patch_db (replacing it if present).
+	import sqlite3
+	import pandas as pd
+	from contextlib import closing
+	# Slicing up to rfind('/') chopped the last character off a path without a slash
+	# (creating a stray directory such as 'patch_info_output.d') and produced '' for
+	# a path directly under '/'. dirname handles both; nothing to create when empty.
+	out_dir=os.path.dirname(output_patch_db)
+	if out_dir:
+		os.makedirs(out_dir,exist_ok=True)
+	# closing() guarantees the connection is released even if the read or write fails.
+	with closing(sqlite3.connect(input_patch_db)) as conn:
+		df=pd.read_sql('select * from "{}";'.format(patch_size),con=conn)
+	df=df.loc[df['ID']!=basename]
+	with closing(sqlite3.connect(output_patch_db)) as conn:
+		df.set_index('index').to_sql(str(patch_size), con=conn, if_exists='replace')
+
+
+@preprocessing.command()
+@click.option('-i', '--input_patch_db', default='patch_info_input.db', help='Input db.', type=click.Path(exists=False), show_default=True)
+@click.option('-o', '--output_patch_db', default='patch_info_output.db', help='Output db.', type=click.Path(exists=False), show_default=True)
+@click.option('-fr', '--from_annotations', default=[], multiple=True, help='Annotations to switch from.', show_default=True)
+@click.option('-to', '--to_annotations', default=[], multiple=True, help='Annotations to switch to.', show_default=True)
+@click.option('-ps', '--patch_size', default=224, help='Patch size.',  show_default=True)
+@click.option('-rb', '--remove_background_annotation', default='', help='If selected, removes 100% background patches based on this annotation.', type=click.Path(exists=False), show_default=True)
+@click.option('-ma', '--max_background_area', default=0.05, help='Max background area before exclusion.',  show_default=True)
+def collapse_annotations(input_patch_db, output_patch_db, from_annotations, to_annotations, patch_size, remove_background_annotation, max_background_area):
+	"""Adds annotation classes areas to other annotation classes in SQL DB when getting rid of some annotation classes."""
+	# The docstring above doubles as the click help text and is kept verbatim.
+	#
+	# Reads the table named after patch_size, optionally filters rows on the
+	# remove_background_annotation column, adds each from_annotations column into its
+	# paired to_annotations column, drops the from_annotations columns, renames the
+	# remaining columns from position 6 onward to '0', '1', ..., recomputes the
+	# 'annotation' column as the name of the largest of those columns per row,
+	# renumbers 'index', and writes the table to output_patch_db.
+	# Raises click.UsageError when the two annotation lists differ in length.
+	import sqlite3
+	import numpy as np, pandas as pd
+	from contextlib import closing
+	# Validate with a real exception: an assert would be stripped under ``python -O``.
+	if len(from_annotations)!=len(to_annotations):
+		raise click.UsageError('--from_annotations and --to_annotations must be given the same number of times.')
+	from_annotations=list(map(str,from_annotations))
+	to_annotations=list(map(str,to_annotations))
+	# Slicing up to rfind('/') chopped the last character off a path without a slash
+	# and created a stray directory; dirname is correct, and empty means nothing to create.
+	out_dir=os.path.dirname(output_patch_db)
+	if out_dir:
+		os.makedirs(out_dir,exist_ok=True)
+	with closing(sqlite3.connect(input_patch_db)) as conn:
+		df=pd.read_sql('select * from "{}";'.format(patch_size),con=conn)
+	if remove_background_annotation:
+		# Keep rows whose background column is at most 1 - max_background_area.
+		df=df.loc[df[remove_background_annotation]<=(1.-max_background_area)]
+	for fr,to in zip(from_annotations,to_annotations):
+		df.loc[:,to]+=df[fr]
+	df=df[[col for col in list(df) if col not in from_annotations]]
+	# Columns from position 6 onward are treated as the per-annotation columns.
+	annotations = list(df.iloc[:,6:])
+	df=df.rename(columns={annot:str(i) for i, annot in enumerate(annotations)})
+	annotations = list(df.iloc[:,6:])
+	# One row-wise argmax replaces the per-row np.vectorize/iloc lookup, which was
+	# slow and raised on a table with zero rows.
+	dominant=df.iloc[:,6:].values.argmax(axis=1)
+	df.loc[:,'annotation']=np.array(annotations)[dominant]
+	df.loc[:,'index']=np.arange(df.shape[0])
+	with closing(sqlite3.connect(output_patch_db)) as conn:
+		df.set_index('index').to_sql(str(patch_size), con=conn, if_exists='replace')
+
+
+if __name__ == '__main__':
+	# Script entry point: configure dask, start an in-process distributed client,
+	# then hand control to the click command group.
+	from dask.distributed import Client
+	# NOTE(review): these config key spellings are kept exactly as written; confirm they
+	# match the key names expected by the installed dask/distributed version.
+	dask.config.set({'temporary_dir':'tmp/',
+					'distributed.worker.local_dir':'tmp/',
+					'distributed.scheduler.allowed-failures':20})
+	c=Client(processes=False)
+	try:
+		preprocessing()
+	finally:
+		# click's standalone invocation ends by raising SystemExit, so a close() placed
+		# after the call was never reached; finally runs it on every exit path.
+		c.close()