Diff of /dl/utils/outlier.py [000000] .. [4807fa]

Switch to unified view

a b/dl/utils/outlier.py
1
"""Functions for remove outliers"""
2
import numpy as np
3
4
def remove_boxplot_outlier(array, whis=1.5, only_positive=True):
  """Clip values lying outside the boxplot whiskers.

  Outliers are not dropped: every value below ``q1 - whis*iqr`` or above
  ``q3 + whis*iqr`` is replaced by the corresponding bound, mirroring the
  whisker rule used by matplotlib.pyplot.boxplot.

  Args:
    array: array-like of numbers (a list is accepted and converted).
    whis: whisker length as a multiple of the interquartile range.
    only_positive: if True, the quartiles are computed from the strictly
      positive elements only; the clipping is still applied to every element.

  Returns:
    A new np.ndarray with the same shape as the input; the input is not
    modified. If there is nothing to compute quartiles from (empty input, or
    no positive element while only_positive is True), an unclipped copy is
    returned.
  """
  array = np.asarray(array)  # lists do not support `array > 0`
  sample = array[array > 0] if only_positive else array
  if sample.size == 0:
    # No data to derive whiskers from, so there is nothing to clip against.
    return array.copy()
  q1, q3 = np.percentile(sample, [25, 75])
  iqr = q3 - q1
  a_min = q1 - whis * iqr
  a_max = q3 + whis * iqr
  return np.clip(array, a_min, a_max)
17
18
19
def log2_transformation(mat, forced=False, threshold=50):
  """Apply a shifted log2 transform, optionally only when the data is skewed.

  The transform is ``log2(mat - mat.min() + 1)``, so the smallest element
  maps to 0.

  Args:
    mat: array-like of numbers; it is copied with np.array, so the caller's
      object is never modified.
    forced: if True, always apply the transform; otherwise use threshold to
      decide whether the transform is necessary.
    threshold: float, default 50; the transform is applied when
      range(mat) / interquartile range > threshold, or when the
      interquartile range is 0 while the range is positive.

  Returns:
    np.ndarray: the transformed values, or the untouched copy of mat when no
    transform is applied (including empty and constant input).
  """
  mat = np.array(mat)  # in case arg mat is a list
  if mat.size == 0:
    # min()/percentile are undefined on empty input; nothing to transform.
    return mat
  lowest = mat.min()
  if forced:
    return np.log2(mat - lowest + 1)
  q1, q3 = np.percentile(mat, [25, 75])
  iqr = q3 - q1
  spread = mat.max() - lowest
  # Checking spread > 0 first avoids evaluating 0/0 for constant input, which
  # only produced a RuntimeWarning and a False comparison.
  if spread > 0 and (iqr == 0 or spread / iqr > threshold):
    mat = np.log2(mat - lowest + 1)
  return mat
39
40
41
def normalization(mat, transform=True, forced=False, threshold=50, rm_outlier=True, whis=1.5,
                  only_positive=True, max_val=1, diagonal=1, symmetric=True):
  """Normalize interaction/similarity matrix

  The steps below run in the listed order, each one only if enabled. The
  input is copied first, so the caller's matrix is never modified.

  Args:
    mat: 2-D array-like (a list of lists is accepted).
    transform: if True, call log2_transformation(mat, forced, threshold)
    forced: passed through to log2_transformation
    threshold: passed through to log2_transformation
    rm_outlier: if True, call remove_boxplot_outlier(mat, whis, only_positive)
    whis: passed through to remove_boxplot_outlier
    only_positive: passed through to remove_boxplot_outlier
    max_val: if max_val=1, execute mat=mat/mat.max(); skipped when the
      maximum is 0, because the division would only produce nan/inf
    diagonal: if diagonal=1, make diagonal element to be 1
    symmetric: if True, execute mat = (mat+mat.T)/2

  Returns:
    np.ndarray: the normalized matrix.
  """
  # Work on a private copy: the diagonal assignment below is in place and
  # would otherwise leak into the caller's array when earlier steps are off.
  mat = np.array(mat)
  if transform:
    mat = log2_transformation(mat, forced, threshold)
  if rm_outlier:
    mat = remove_boxplot_outlier(mat, whis, only_positive)
  if max_val == 1:
    peak = mat.max()
    if peak != 0:
      mat = mat / peak
  if diagonal == 1:
    idx = np.arange(len(mat))
    mat[idx, idx] = 1
  if symmetric:
    mat = (mat + mat.T) / 2
  return mat