pathflowai/cli_preprocessing.py

import os
import time
from os.path import join

import click
import dask

from pathflowai.utils import run_preprocessing_pipeline, generate_patch_pipeline, img2npy_, create_zero_mask

CONTEXT_SETTINGS = dict(help_option_names=['-h', '--help'], max_content_width=90)


@click.group(context_settings=CONTEXT_SETTINGS)
@click.version_option(version='0.1')
def preprocessing():
    pass


def output_if_exists(filename):
    """Return the file name if the file exists, otherwise None.

    Parameters
    ----------
    filename : str
        File in question.

    Returns
    -------
    str or None
        Filename if it exists, else None.
    """
    if os.path.exists(filename):
        return filename
    return None


@preprocessing.command()
@click.option('-npy', '--img2npy', is_flag=True, help='Convert image to NumPy format for faster reads.', show_default=True)
@click.option('-b', '--basename', default='A01', help='Basename of patches.', type=click.Path(exists=False), show_default=True)
@click.option('-i', '--input_dir', default='./inputs/', help='Input directory for patches.', type=click.Path(exists=False), show_default=True)
@click.option('-a', '--annotations', default=[], multiple=True, help='Annotations in image in order.', type=click.Path(exists=False), show_default=True)
@click.option('-pr', '--preprocess', is_flag=True, help='Run preprocessing pipeline.', show_default=True)
@click.option('-pa', '--patches', is_flag=True, help='Add patches to SQL.', show_default=True)
@click.option('-t', '--threshold', default=0.05, help='Threshold to remove non-purple slides.', show_default=True)
@click.option('-ps', '--patch_size', default=224, help='Patch size.', show_default=True)
@click.option('-it', '--intensity_threshold', default=100., help='Intensity threshold to rate a pixel as non-white.', show_default=True)
@click.option('-g', '--generate_finetune_segmentation', is_flag=True, help='Generate patches for one segmentation mask class for targeted fine-tuning.', show_default=True)
@click.option('-tc', '--target_segmentation_class', default=0, help='Segmentation class to fine-tune on; output patches to another db.', show_default=True)
@click.option('-tt', '--target_threshold', default=0., help='Threshold to include target for segmentation if saving one class.', show_default=True)
@click.option('-odb', '--out_db', default='./patch_info.db', help='Output patch database.', type=click.Path(exists=False), show_default=True)
@click.option('-am', '--adjust_mask', is_flag=True, help='Remove additional background regions from annotation mask.', show_default=True)
@click.option('-nn', '--n_neighbors', default=5, help='If adjusting mask, neighbor connectivity used when removing regions.', show_default=True)
@click.option('-bp', '--basic_preprocess', is_flag=True, help='Basic preprocessing pipeline; annotation areas are not saved. Used for benchmarking against comparable pipelines.', show_default=True)
@click.option('-ei', '--entire_image', is_flag=True, help='Store entire image in central db rather than patches.', show_default=True)
@click.option('-nz', '--no_zarr', is_flag=True, help='Don\'t save zarr format file.', show_default=True)
@click.option('-pka', '--pkl_annot', is_flag=True, help='Look for .annot.pkl pickle files instead of xml annotations.', show_default=True)
@click.option('-ta', '--transpose_annotations', is_flag=True, help='Transpose annotations.', show_default=True)
@click.option('-gtm', '--get_tissue_mask', is_flag=True, help='Build tissue mask instead of intensity thresholding.', show_default=True)
@click.option('-ot', '--otsu', is_flag=True, help='Utilize Otsu\'s method to decide the intensity threshold.', show_default=True)
@click.option('-cm', '--compression', default=8., help='If finding tissue mask, how much to downsample the image.', show_default=True)
@click.option('-ch', '--return_convex_hull', is_flag=True, help='Return convex hull of tissue mask.', show_default=True)
@click.option('-kh', '--keep_holes', is_flag=True, help='Keep holes in the tissue mask.', show_default=True)
@click.option('-mhs', '--max_hole_size', default=0, help='If removing holes, the maximum hole size allowed to remain.', show_default=True)
@click.option('-gbc', '--gray_before_close', is_flag=True, help='Filter grays before binary closing operation.', show_default=True)
@click.option('-kl', '--kernel', default=61, help='Binary closing kernel size.', show_default=True)
@click.option('-mos', '--min_object_size', default=100000, help='Remove all connected components smaller than this size.', show_default=True)
@click.option('-bs', '--blur_size', default=0, help='How much to blur the tissue mask.', show_default=True)
def preprocess_pipeline(img2npy, basename, input_dir, annotations, preprocess, patches, threshold, patch_size, intensity_threshold, generate_finetune_segmentation, target_segmentation_class, target_threshold, out_db, adjust_mask, n_neighbors, basic_preprocess, entire_image, no_zarr, pkl_annot, transpose_annotations, get_tissue_mask, otsu, compression, return_convex_hull, keep_holes, max_hole_size, gray_before_close, kernel, min_object_size, blur_size):
    """Preprocessing pipeline that accomplishes three things: 1) storage of the image into Zarr format, 2) optional adjustment of the annotation mask, 3) storage of patch-level information into a SQL DB."""
    # Locate the input image by trying each supported extension in order.
    svs_file = None
    for ext in ['.npy', '.svs', '.tiff', '.tif', '.vms', '.vmu', '.ndpi', '.scn', '.mrxs', '.svslide', '.bif', '.jpeg', '.png', '.h5']:
        svs_file = output_if_exists(join(input_dir, '{}{}'.format(basename, ext)))
        if svs_file is not None:
            break
    if svs_file is None:
        raise click.UsageError('No supported image found for basename {} in {}'.format(basename, input_dir))
    if img2npy and not svs_file.endswith('.npy'):
        svs_file = img2npy_(input_dir, basename, svs_file)
    xml_file = output_if_exists(join(input_dir, '{}{}'.format(basename, '.xml' if not pkl_annot else '.annot.pkl')))
    npy_mask = output_if_exists(join(input_dir, '{}_mask.npy'.format(basename)))
    out_zarr = join(input_dir, '{}.zarr'.format(basename))
    out_pkl = join(input_dir, '{}_mask.pkl'.format(basename))
    adj_npy = ''
    start = time.time()
    if preprocess:
        run_preprocessing_pipeline(svs_file=svs_file,
                                   xml_file=xml_file,
                                   npy_mask=npy_mask,
                                   annotations=annotations,
                                   out_zarr=out_zarr,
                                   out_pkl=out_pkl,
                                   no_zarr=no_zarr,
                                   transpose_annotations=transpose_annotations)
        if npy_mask is None and xml_file is None:
            # No annotation supplied: fall back to a zero mask over a single class.
            print('Generating Zero Mask')
            npy_mask = join(input_dir, '{}_mask.npz'.format(basename))
            target_segmentation_class = 1
            generate_finetune_segmentation = True
            create_zero_mask(npy_mask, out_zarr if not no_zarr else svs_file, out_pkl)
    preprocess_point = time.time()
    print('Data dump took {}'.format(preprocess_point - start))
    if adjust_mask:
        from pathflowai.utils import adjust_mask
        adj_dir = join(input_dir, 'adjusted_masks')
        adj_npy = join(adj_dir, os.path.basename(npy_mask))
        os.makedirs(adj_dir, exist_ok=True)
        if not os.path.exists(adj_npy):
            adjust_mask(npy_mask, out_zarr if not no_zarr else svs_file, adj_npy, n_neighbors)
    adjust_point = time.time()  # set unconditionally so the timing prints below never hit an undefined name
    print('Adjust took {}'.format(adjust_point - preprocess_point))
    if patches:  # export patch-level info to SQL; the table name is the patch size
        generate_patch_pipeline(basename,
                                input_dir=input_dir,
                                annotations=annotations,
                                threshold=threshold,
                                patch_size=patch_size,
                                out_db=out_db,
                                generate_finetune_segmentation=generate_finetune_segmentation,
                                target_class=target_segmentation_class,
                                intensity_threshold=intensity_threshold,
                                target_threshold=target_threshold,
                                adj_mask=adj_npy,
                                basic_preprocess=basic_preprocess,
                                entire_image=entire_image,
                                svs_file=svs_file,
                                transpose_annotations=transpose_annotations,
                                get_tissue_mask=get_tissue_mask,
                                otsu=otsu,
                                compression=compression,
                                return_convex_hull=return_convex_hull,
                                keep_holes=keep_holes,
                                max_hole_size=max_hole_size,
                                gray_before_close=gray_before_close,
                                kernel=kernel,
                                min_object_size=min_object_size,
                                blur_size=blur_size)
    patch_point = time.time()
    print('Patches took {}'.format(patch_point - adjust_point))
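
# A hypothetical invocation of the command above (basename, paths, and patch
# size are illustrative, not from the source): assuming a slide at
# ./inputs/A01.svs, run the full data dump plus patch extraction. Depending on
# the installed click version the command may be exposed as
# `preprocess-pipeline` rather than `preprocess_pipeline`.
#
#   python cli_preprocessing.py preprocess_pipeline -b A01 -i ./inputs/ -pr -pa -ps 256 -odb ./patch_info.db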


@preprocessing.command()
@click.option('-i', '--mask_dir', default='./inputs/', help='Input directory for masks.', type=click.Path(exists=False), show_default=True)
@click.option('-o', '--output_dir', default='./outputs/', help='Output directory for new masks.', type=click.Path(exists=False), show_default=True)
@click.option('-fr', '--from_annotations', default=[], multiple=True, help='Annotations to switch from.', show_default=True)
@click.option('-to', '--to_annotations', default=[], multiple=True, help='Annotations to switch to.', show_default=True)
def alter_masks(mask_dir, output_dir, from_annotations, to_annotations):
    """Map a list of annotation values to other values in each mask."""
    import glob
    import numpy as np
    from dask.distributed import Client
    from pathflowai.utils import npy2da
    assert len(from_annotations) == len(to_annotations)
    c = Client()
    from_annotations = list(map(int, from_annotations))
    to_annotations = list(map(int, to_annotations))
    os.makedirs(output_dir, exist_ok=True)
    masks = glob.glob(join(mask_dir, '*_mask.npy'))
    from_to = list(zip(from_annotations, to_annotations))
    for mask in masks:
        output_mask = join(output_dir, os.path.basename(mask))
        arr = npy2da(mask)
        for fr, to in from_to:
            arr[arr == fr] = to
        np.save(output_mask, arr.compute())
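
# A minimal sketch of the command above (directories and label values are
# hypothetical): remap annotation label 3 to 1 and label 4 to 2 in every
# *_mask.npy found under ./inputs/, writing the rewritten masks to ./outputs/.
# The repeated -fr/-to flags are paired in order.
#
#   python cli_preprocessing.py alter_masks -i ./inputs/ -o ./outputs/ -fr 3 -to 1 -fr 4 -to 2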


@preprocessing.command()
@click.option('-i', '--input_patch_db', default='patch_info_input.db', help='Input db.', type=click.Path(exists=False), show_default=True)
@click.option('-o', '--output_patch_db', default='patch_info_output.db', help='Output db.', type=click.Path(exists=False), show_default=True)
@click.option('-b', '--basename', default='A01', help='Basename.', type=click.Path(exists=False), show_default=True)
@click.option('-ps', '--patch_size', default=224, help='Patch size.', show_default=True)
def remove_basename_from_db(input_patch_db, output_patch_db, basename, patch_size):
    """Remove a basename/ID from the SQL DB."""
    import sqlite3
    import pandas as pd
    out_dir = os.path.dirname(output_patch_db)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    conn = sqlite3.connect(input_patch_db)
    df = pd.read_sql('select * from "{}";'.format(patch_size), con=conn)
    conn.close()
    df = df.loc[df['ID'] != basename]
    conn = sqlite3.connect(output_patch_db)
    df.set_index('index').to_sql(str(patch_size), con=conn, if_exists='replace')
    conn.close()
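
# Hypothetical usage of the command above (db file names are illustrative):
# drop every patch whose ID column equals A01 from the 224-pixel patch table.
#
#   python cli_preprocessing.py remove_basename_from_db -i patch_info_input.db -o patch_info_output.db -b A01 -ps 224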


@preprocessing.command()
@click.option('-i', '--input_patch_db', default='patch_info_input.db', help='Input db.', type=click.Path(exists=False), show_default=True)
@click.option('-o', '--output_patch_db', default='patch_info_output.db', help='Output db.', type=click.Path(exists=False), show_default=True)
@click.option('-fr', '--from_annotations', default=[], multiple=True, help='Annotations to switch from.', show_default=True)
@click.option('-to', '--to_annotations', default=[], multiple=True, help='Annotations to switch to.', show_default=True)
@click.option('-ps', '--patch_size', default=224, help='Patch size.', show_default=True)
@click.option('-rb', '--remove_background_annotation', default='', help='If set, removes patches whose area for this annotation exceeds 1 - max_background_area.', type=click.Path(exists=False), show_default=True)
@click.option('-ma', '--max_background_area', default=0.05, help='Max background area before exclusion.', show_default=True)
def collapse_annotations(input_patch_db, output_patch_db, from_annotations, to_annotations, patch_size, remove_background_annotation, max_background_area):
    """Collapse annotation classes in the SQL DB by adding their areas to other annotation classes."""
    import sqlite3
    import numpy as np
    import pandas as pd
    assert len(from_annotations) == len(to_annotations)
    from_annotations = list(map(str, from_annotations))
    to_annotations = list(map(str, to_annotations))
    out_dir = os.path.dirname(output_patch_db)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    conn = sqlite3.connect(input_patch_db)
    df = pd.read_sql('select * from "{}";'.format(patch_size), con=conn)
    conn.close()
    from_to = list(zip(from_annotations, to_annotations))
    if remove_background_annotation:
        df = df.loc[df[remove_background_annotation] <= (1. - max_background_area)]
    for fr, to in from_to:
        df.loc[:, to] += df[fr]
    df = df[[col for col in list(df) if col not in from_annotations]]
    annotations = list(df.iloc[:, 6:])  # annotation area columns start at column 6
    df = df.rename(columns={annot: str(i) for i, annot in enumerate(annotations)})
    annotations = list(df.iloc[:, 6:])
    df.loc[:, 'annotation'] = np.vectorize(lambda i: annotations[df.iloc[i, 6:].values.argmax()])(np.arange(df.shape[0]))
    df.loc[:, 'index'] = np.arange(df.shape[0])
    conn = sqlite3.connect(output_patch_db)
    df.set_index('index').to_sql(str(patch_size), con=conn, if_exists='replace')
    conn.close()
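
# Hypothetical usage of the command above (db file names and annotation
# columns are illustrative; this assumes area columns named "1" and "2" and a
# background column "0"): fold class "2" into class "1", and exclude patches
# whose background column exceeds 0.95 (i.e. 1 - max_background_area).
#
#   python cli_preprocessing.py collapse_annotations -i patch_info_input.db -o patch_info_output.db -fr 2 -to 1 -rb 0 -ma 0.05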


if __name__ == '__main__':
    from dask.distributed import Client
    dask.config.set({'temporary_dir': 'tmp/',
                     'distributed.worker.local_dir': 'tmp/',
                     'distributed.scheduler.allowed-failures': 20})
    c = Client(processes=False)
    preprocessing()
    c.close()
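
# When run directly, the __main__ block above creates an in-process dask
# Client before dispatching to click; listing the available subcommands is a
# quick smoke test:
#
#   python cli_preprocessing.py --help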