|
a |
|
b/dl/utils/outlier.py |
|
|
1 |
"""Functions for remove outliers""" |
|
|
2 |
import numpy as np |
|
|
3 |
|
|
|
4 |
def remove_boxplot_outlier(array, whis=1.5, only_positive=True):
    """Clip values that fall outside the box-plot whisker limits.

    The limits are ``q1 - whis * iqr`` and ``q3 + whis * iqr``, where ``q1``
    and ``q3`` are the 25th and 75th percentiles and ``iqr = q3 - q1``.
    Values beyond the limits are not dropped; they are clipped to the
    nearest limit, so the output has the same shape as the input.

    Args:
        array: array-like of numbers (lists are accepted).
        whis: whisker length as a multiple of the interquartile range.
        only_positive: if True, the quartiles are computed from the strictly
            positive entries only. The clipping itself is still applied to
            every entry, so non-positive entries are raised to the lower
            limit when that limit is above them.

    Returns:
        A new ``np.ndarray`` with the clipped values. The input is not
        modified. If there are no entries to compute quartiles from (empty
        input, or no positive entries with ``only_positive=True``), an
        unchanged copy is returned.
    """
    array = np.asarray(array)  # boolean masking below needs an ndarray
    reference = array[array > 0] if only_positive else array
    if reference.size == 0:
        # No data to derive quartiles from; clipping to NaN limits would
        # destroy the whole array, so leave the values as they are.
        return array.copy()
    q1, q3 = np.percentile(reference, [25, 75])
    iqr = q3 - q1
    return np.clip(array, q1 - whis * iqr, q3 + whis * iqr)
|
|
17 |
|
|
|
18 |
|
|
|
19 |
def log2_transformation(mat, forced=False, threshold=50):
    """Apply a shifted log2 transform, either always or only to skewed data.

    The transform is ``log2(mat - mat.min() + 1)``, which maps the minimum
    value to 0.

    Args:
        mat: array-like of numbers (lists are accepted).
        forced: if True, transform unconditionally; otherwise ``threshold``
            decides whether the transform is applied.
        threshold: float, default 50. When ``forced`` is False the transform
            is applied if ``range(mat) / interquartile range > threshold``,
            or if the interquartile range is 0 while the range is positive.

    Returns:
        A new ``np.ndarray``: the transformed values, or an untransformed
        copy of ``mat`` when no transform is needed.
    """
    mat = np.array(mat)  # copies the input and accepts plain lists
    if not forced:
        q1, q3 = np.percentile(mat, [25, 75])
        iqr = q3 - q1
        r = mat.max() - mat.min()
        # Test r > 0 first: a constant matrix has r == iqr == 0 and would
        # otherwise evaluate 0/0, emitting a RuntimeWarning for no reason.
        skewed = r > 0 and (iqr == 0 or r / iqr > threshold)
        if not skewed:
            return mat
    return np.log2(mat - mat.min() + 1)
|
|
39 |
|
|
|
40 |
|
|
|
41 |
def normalization(mat, transform=True, forced=False, threshold=50, rm_outlier=True, whis=1.5,
                  only_positive=True, max_val=1, diagonal=1, symmetric=True):
    """Normalize an interaction/similarity matrix.

    The steps run in this order, each one optional: log2 transform, outlier
    clipping, rescaling by the maximum, setting the diagonal, symmetrizing.

    Args:
        mat: array-like matrix (lists are accepted); it is never modified.
        transform: if True, call log2_transformation(mat, forced, threshold).
        forced: passed through to log2_transformation.
        threshold: passed through to log2_transformation.
        rm_outlier: if True, call
            remove_boxplot_outlier(mat, whis, only_positive).
        whis: passed through to remove_boxplot_outlier.
        only_positive: passed through to remove_boxplot_outlier.
        max_val: if max_val == 1, execute mat = mat / mat.max(). The division
            is skipped when the maximum is 0.
        diagonal: if diagonal == 1, set the diagonal elements to 1.
        symmetric: if True, execute mat = (mat + mat.T) / 2.

    Returns:
        The normalized matrix as a new ``np.ndarray``.
    """
    # Work on a private ndarray copy: lists have no .max()/.T, and the
    # diagonal assignment below must not write into the caller's array.
    mat = np.array(mat)
    if transform:
        mat = log2_transformation(mat, forced, threshold)
    if rm_outlier:
        mat = remove_boxplot_outlier(mat, whis, only_positive)
    if max_val == 1:
        peak = mat.max()
        if peak != 0:  # dividing by a zero maximum would fill the matrix with NaN
            mat = mat / peak
    if diagonal == 1:
        idx = np.arange(len(mat))
        mat[idx, idx] = 1
    if symmetric:
        mat = (mat + mat.T) / 2
    return mat