Diff of /modas/phenorm.py [000000] .. [a43cea]

Switch to unified view

a b/modas/phenorm.py
1
import pandas as pd
2
import numpy as np
3
from scipy import stats
4
from scipy.stats import norm
5
from sklearn.preprocessing import MinMaxScaler
6
import warnings
7
import subprocess
8
import re
9
10
11
def abundance_filter(d, abundance):
12
    return d.loc[:, d.mean() >= abundance]
13
14
15
def isDigit(x):
16
    try:
17
        float(x)
18
        return True
19
    except ValueError:
20
        return False
21
22
23
def missing_filter(d, missing_ratio):
24
    if d.applymap(np.isreal).all().sum() == d.shape[1]:
25
        d = d.loc[:, d.applymap(np.isnan).sum() <= d.shape[0] * missing_ratio]
26
        d = d.fillna(0)
27
    else:
28
        d = d.loc[:, d.applymap(lambda x:isDigit(x)).sum() >= d.shape[0] * missing_ratio]
29
        d_array = d.values
30
        d_array[~d.applymap(lambda x:isDigit(x))] = 0
31
        d.loc[:, :] = d_array
32
        d = d.astype(float)
33
    #d = d.loc[:, (d == 0).sum() <= d.shape[0] * missing_ratio]
34
    return d
35
36
37
def log2_scale(d):
38
    return np.log2(d+1)
39
40
41
def ln_scale(d):
42
    return np.log(d+1)
43
44
45
def log10_scale(d):
46
    return np.log10(d+1)
47
48
49
def normalize_scale(d):
50
    d = d + 1
51
    d = d.apply(lambda x: stats.boxcox(x)[0])
52
    d.loc[:,:] = MinMaxScaler().fit_transform(d.values)
53
    return d
54
55
56
def ppoints(n, a=None):
57
    try:
58
        n = np.float64(len(n))
59
    except TypeError:
60
        n = np.float64(n)
61
    if a is None:
62
        a = 3.0/8 if(n <= 10) else 1.0/2
63
    return (np.arange(n) + 1 - a)/(n + 1 - 2*a)
64
65
66
def qqnorm(y):
67
    ina = np.isnan(y)
68
    if ina.sum() > 0:
69
        yN = y
70
        y = y[~ina]
71
    n = y.shape[0]
72
    if n == 0:
73
        print('y is empty or has only NAs')
74
        return np.array([])
75
    x = np.around(norm.ppf(ppoints(n)[np.argsort(np.argsort(y))]), decimals=15)
76
    if ina.sum() > 0:
77
        y = x
78
        x = yN
79
        x[~ina] = y
80
    return x
81
82
83
def trait_correct(pc, y):
84
    pc1 = pd.concat([pd.DataFrame(np.ones((y.shape[0], 1)), index=pc.index), pc], axis=1)
85
    vhat = np.dot(np.linalg.pinv(np.dot(pc1.T, pc1)), np.dot(pc1.T, y))
86
    if len(vhat.shape) == 1:
87
        y_corr = y - np.dot(pc, vhat[1:])
88
    else:
89
        y_corr = y - np.dot(pc, vhat[1:, :])
90
    return y_corr
91
92
93
def pc_calc(bed, pc_num):
94
    try:
95
        from rpy2.robjects.packages import importr
96
        from rpy2.robjects import pandas2ri
97
        from rpy2.rinterface_lib.embedded import RRuntimeError
98
        import rpy2.robjects as robjects
99
        pandas2ri.activate()
100
101
        warnings.filterwarnings("ignore")
102
        base = importr('base')
103
        utils = importr('utils')
104
        if not base.require('bigsnpr', quietly=True)[0]:
105
            utils_path = subprocess.check_output('locate modas/utils', shell=True, text=True, encoding='utf-8')
106
            # utils_path = '/'.join(re.search('\n(.*site-packages.*)\n', utils_path).group(1).split('/')[:-1])
107
            utils_path = re.search('\n(.*site-packages.*)\n', utils_path).group(1)
108
            if not utils_path.endswith('utils'):
109
                utils_path = '/'.join(utils_path.split('/')[:-1])
110
            utils.install_packages(utils_path + '/Matrix_1.6-5.tar.gz', repos=robjects.rinterface.NULL, type='source',
111
                                   quiet=True)
112
            utils.install_packages('bigsnpr', dependence=True, repos='https://cloud.r-project.org', quiet=True)
113
        robjects.r['options'](warn=-1)
114
        bigsnpr = importr('bigsnpr')
115
        bigstatsr = importr('bigstatsr')
116
117
        base.sink('/dev/null')
118
        g = bigsnpr.snp_readBed(bed, backingfile=base.tempfile())
119
        g = bigsnpr.snp_attach(g)
120
        # svd = bigsnpr.snp_autoSVD(g.rx2[0], infos_chr=g[2]['chromosome'], ncores=1,
121
        #                           infos_pos=g[2]['physical.pos'], thr_r2=np.nan, k=pc_num)
122
        svd = bigsnpr.snp_autoSVD(g.rx2('genotypes'), infos_chr=g.rx2('map').rx2('chromosome'),
123
                                  infos_pos=g.rx2('map').rx2('physical.pos'), thr_r2=np.nan, k=pc_num)
124
        base.sink()
125
    except RRuntimeError:
126
        return None
127
    else:
128
        pc = bigstatsr.predict_big_SVD(svd)
129
        # pc = base.data_frame(pc, row_names=g[1]['sample.ID'])
130
        #pc = base.cbind(pc.rownames, pc)
131
        pc = pd.DataFrame(pc, index=g.rx2('fam').rx2('sample.ID'))
132
        pc.columns = ['PC' + str(i) for i in range(1, pc_num+1)]
133
        return pc