--- a +++ b/dl/utils/outlier.py @@ -0,0 +1,62 @@ +"""Functions for remove outliers""" +import numpy as np + +def remove_boxplot_outlier(array, whis=1.5, only_positive=True): + """remove outliers drawn from matplotlib.pyplot.boxplot + """ + if only_positive: + q1 = np.percentile(array[array>0], 25) + q3 = np.percentile(array[array>0], 75) + else: + q1 = np.percentile(array, 25) + q3 = np.percentile(array, 75) + iqr = q3 - q1 + a_min = q1 - whis*iqr + a_max = q3 + whis*iqr + return np.clip(array, a_min, a_max) + + +def log2_transformation(mat, forced=False, threshold=50): + """log2 transform + + Args: + mat: np.array + forced: if forced is True, then do log2 transformation immediately; + otherwise use threshold to decide if log2 transformation is necessary + threshold: float, default 50; + if range(mat) / interquartile range > threshold, then do transform + """ + mat = np.array(mat) # in case arg mat is a list + if forced: + return np.log2(mat - mat.min() + 1) + q1 = np.percentile(mat, 25) + q3 = np.percentile(mat, 75) + iqr = q3 - q1 + r = mat.max() - mat.min() + if (iqr==0 and r>0) or r/iqr > threshold: + mat = np.log2(mat - mat.min() + 1) + return mat + + +def normalization(mat, transform=True, forced=False, threshold=50, rm_outlier=True, whis=1.5, + only_positive=True, max_val=1, diagonal=1, symmetric=True): + """Normalize interaction/similarity matrix + + Args: + transform: if True, call log2_transform(mat, forced, threshold) + rm_outlier: if True, call remove_boxplot_outlier(mat, whis, only_positive) + max_val: if max_val=1, execute mat=mat/mat.max() + diagonal: if diagonal=1, make diagonal element to be 1 + symmetric: if True, execute mat = (mat+mat.T)/2 + """ + if transform: + mat = log2_transformation(mat, forced, threshold) + if rm_outlier: + mat = remove_boxplot_outlier(mat, whis, only_positive) + if max_val == 1: + mat = mat / mat.max() + if diagonal == 1: + mat[range(len(mat)), range(len(mat))] = 1 + if symmetric: + mat = (mat + mat.T) / 2 + return mat \ No newline at end of file