Diff of /modas/phenorm.py [000000] .. [a43cea]

Switch to side-by-side view

--- a
+++ b/modas/phenorm.py
@@ -0,0 +1,133 @@
+import pandas as pd
+import numpy as np
+from scipy import stats
+from scipy.stats import norm
+from sklearn.preprocessing import MinMaxScaler
+import warnings
+import subprocess
+import re
+
+
+def abundance_filter(d, abundance):
+    return d.loc[:, d.mean() >= abundance]
+
+
+def isDigit(x):
+    try:
+        float(x)
+        return True
+    except ValueError:
+        return False
+
+
+def missing_filter(d, missing_ratio):
+    if d.applymap(np.isreal).all().sum() == d.shape[1]:
+        d = d.loc[:, d.applymap(np.isnan).sum() <= d.shape[0] * missing_ratio]
+        d = d.fillna(0)
+    else:
+        d = d.loc[:, d.applymap(lambda x:isDigit(x)).sum() >= d.shape[0] * missing_ratio]
+        d_array = d.values
+        d_array[~d.applymap(lambda x:isDigit(x))] = 0
+        d.loc[:, :] = d_array
+        d = d.astype(float)
+    #d = d.loc[:, (d == 0).sum() <= d.shape[0] * missing_ratio]
+    return d
+
+
+def log2_scale(d):
+    return np.log2(d+1)
+
+
+def ln_scale(d):
+    return np.log(d+1)
+
+
+def log10_scale(d):
+    return np.log10(d+1)
+
+
+def normalize_scale(d):
+    d = d + 1
+    d = d.apply(lambda x: stats.boxcox(x)[0])
+    d.loc[:,:] = MinMaxScaler().fit_transform(d.values)
+    return d
+
+
+def ppoints(n, a=None):
+    try:
+        n = np.float64(len(n))
+    except TypeError:
+        n = np.float64(n)
+    if a is None:
+        a = 3.0/8 if(n <= 10) else 1.0/2
+    return (np.arange(n) + 1 - a)/(n + 1 - 2*a)
+
+
+def qqnorm(y):
+    ina = np.isnan(y)
+    if ina.sum() > 0:
+        yN = y
+        y = y[~ina]
+    n = y.shape[0]
+    if n == 0:
+        print('y is empty or has only NAs')
+        return np.array([])
+    x = np.around(norm.ppf(ppoints(n)[np.argsort(np.argsort(y))]), decimals=15)
+    if ina.sum() > 0:
+        y = x
+        x = yN
+        x[~ina] = y
+    return x
+
+
+def trait_correct(pc, y):
+    pc1 = pd.concat([pd.DataFrame(np.ones((y.shape[0], 1)), index=pc.index), pc], axis=1)
+    vhat = np.dot(np.linalg.pinv(np.dot(pc1.T, pc1)), np.dot(pc1.T, y))
+    if len(vhat.shape) == 1:
+        y_corr = y - np.dot(pc, vhat[1:])
+    else:
+        y_corr = y - np.dot(pc, vhat[1:, :])
+    return y_corr
+
+
+def pc_calc(bed, pc_num):
+    try:
+        from rpy2.robjects.packages import importr
+        from rpy2.robjects import pandas2ri
+        from rpy2.rinterface_lib.embedded import RRuntimeError
+        import rpy2.robjects as robjects
+        pandas2ri.activate()
+
+        warnings.filterwarnings("ignore")
+        base = importr('base')
+        utils = importr('utils')
+        if not base.require('bigsnpr', quietly=True)[0]:
+            utils_path = subprocess.check_output('locate modas/utils', shell=True, text=True, encoding='utf-8')
+            # utils_path = '/'.join(re.search('\n(.*site-packages.*)\n', utils_path).group(1).split('/')[:-1])
+            utils_path = re.search('\n(.*site-packages.*)\n', utils_path).group(1)
+            if not utils_path.endswith('utils'):
+                utils_path = '/'.join(utils_path.split('/')[:-1])
+            utils.install_packages(utils_path + '/Matrix_1.6-5.tar.gz', repos=robjects.rinterface.NULL, type='source',
+                                   quiet=True)
+            utils.install_packages('bigsnpr', dependence=True, repos='https://cloud.r-project.org', quiet=True)
+        robjects.r['options'](warn=-1)
+        bigsnpr = importr('bigsnpr')
+        bigstatsr = importr('bigstatsr')
+
+        base.sink('/dev/null')
+        g = bigsnpr.snp_readBed(bed, backingfile=base.tempfile())
+        g = bigsnpr.snp_attach(g)
+        # svd = bigsnpr.snp_autoSVD(g.rx2[0], infos_chr=g[2]['chromosome'], ncores=1,
+        #                           infos_pos=g[2]['physical.pos'], thr_r2=np.nan, k=pc_num)
+        svd = bigsnpr.snp_autoSVD(g.rx2('genotypes'), infos_chr=g.rx2('map').rx2('chromosome'),
+                                  infos_pos=g.rx2('map').rx2('physical.pos'), thr_r2=np.nan, k=pc_num)
+        base.sink()
+    except RRuntimeError:
+        return None
+    else:
+        pc = bigstatsr.predict_big_SVD(svd)
+        # pc = base.data_frame(pc, row_names=g[1]['sample.ID'])
+        #pc = base.cbind(pc.rownames, pc)
+        pc = pd.DataFrame(pc, index=g.rx2('fam').rx2('sample.ID'))
+        pc.columns = ['PC' + str(i) for i in range(1, pc_num+1)]
+        return pc