Diff of /modas/mr.py [000000] .. [a43cea]

Switch to unified view

a b/modas/mr.py
1
import pandas as pd
2
import numpy as np
3
import pyranges as pr
4
from sklearn.linear_model import LinearRegression
5
from pandas_plink import read_plink1_bin
6
from scipy.stats import chi2
7
import modas.multiprocess as mp
8
import subprocess
9
import warnings
10
import shutil
11
import glob
12
import sys
13
import os
14
import re
15
16
17
utils_path = subprocess.check_output('locate modas/utils', shell=True, text=True, encoding='utf-8')
18
#utils_path = '/'.join(re.search('\n(.*site-packages.*)\n', utils_path).group(1).split('/')[:-1])
19
utils_path = re.search('\n(.*site-packages.*)\n', utils_path).group(1)
20
if not utils_path.endswith('utils'):
21
    utils_path = '/'.join(utils_path.split('/')[:-1])
22
23
24
def lm_res(y, X):
25
    idx = np.isnan(X) | np.isnan(y)
26
    X = X[~idx].reshape(-1, 1)
27
    y = y[~idx]
28
    lm = LinearRegression().fit(X, y)
29
    sigma2 = np.sum((y - lm.predict(X))**2)/(X.shape[0]-1)
30
    se = np.sqrt(np.diag(np.linalg.pinv(np.dot(X.T, X))))[-1] * sigma2
31
    effect = lm.coef_[-1]
32
    rsq = lm.score(X, y)
33
    return pd.Series(dict(zip(['effect', 'se', 'rsq'], [effect, se, rsq])))
34
35
36
def MR(mTrait, pTrait, g, pvalue_cutoff):
37
    p = pd.concat([mTrait.to_frame(), pTrait], axis=1)
38
    snp_lm = p.apply(lm_res, args=[g.values])
39
    mTrait_lm = p.apply(lm_res, args=[mTrait.values])
40
    pTrait_mTrait = snp_lm.loc['effect', :][1:] / snp_lm.loc['effect', :][0]
41
    var_upper = pTrait.var() * (1 - mTrait_lm.loc['rsq', :][1:])
42
    var_down = mTrait.shape[0] * mTrait.var() * snp_lm.loc['rsq', :][0]
43
    var = var_upper / var_down
44
    TMR = pTrait_mTrait**2 / var
45
    pvalue = 1 - chi2.cdf(TMR, 1)
46
    MR_res = pd.DataFrame(dict(zip(['snp', 'mTrait', 'pTrait', 'effect', 'TMR', 'pvalue'],
47
                        [[g.name]*pTrait.shape[1], [mTrait.name]*pTrait.shape[1], pTrait.columns, pTrait_mTrait, TMR, pvalue])))
48
    MR_res = MR_res.loc[(MR_res.pvalue<=pvalue_cutoff) & (MR_res.mTrait!=MR_res.pTrait),:]
49
    return MR_res
50
51
52
def MR_parallel(mTrait_qtl, mTrait, pTrait, geno, threads, pvalue_cutoff):
53
    args = list()
54
    for index, row in mTrait_qtl.iterrows():
55
        rs = row['SNP']
56
        mTrait_name = row['phe_name']
57
        args.append((mTrait.loc[:, mTrait_name], pTrait, geno.loc[:, rs], pvalue_cutoff))
58
    res = mp.parallel(MR, args, threads)
59
    res = pd.concat([i for i in res])
60
    return res
61
62
63
def var_bXY(bzx, bzx_se, bzy, bzy_se):
64
    varbXY = bzx_se**2*bzy**2/bzx**4 + bzy_se**2/bzx**2
65
    return varbXY
66
67
68
def generate_geno_batch(mTrait_qtl, mTrait, pTrait, geno, threads, bed_dir, rs_dir):
69
    if os.path.exists(bed_dir):
70
        shutil.rmtree(bed_dir)
71
    os.mkdir(bed_dir)
72
    if os.path.exists(rs_dir):
73
        shutil.rmtree(rs_dir)
74
    os.mkdir(rs_dir)
75
    plink_extract = utils_path + '/plink -bfile {} -extract {} --make-bed -out {}'
76
    geno_batch = list()
77
    for mTrait_name in mTrait_qtl.phe_name.unique():
78
        out_name = bed_dir.strip('/') + '/' + mTrait_name
79
        rs = mTrait_qtl.loc[mTrait_qtl.phe_name == mTrait_name, 'SNP']
80
        rs_name = rs_dir.strip('/') + '/' + '_'.join([mTrait_name,'rs.txt'])
81
        pd.Series(rs).to_frame().to_csv(rs_name, index=False, header=None)
82
        geno_batch.append((plink_extract.format(geno, rs_name, out_name),))
83
    out_name = bed_dir.strip('/') + '/pTrait'
84
    rs_name = rs_dir.strip('/') + '/pTrait_rs.txt'
85
    mTrait_qtl['SNP'].to_frame().to_csv(rs_name, index=False, header=None)
86
    geno_batch.append((plink_extract.format(geno, rs_name, out_name),))
87
    mp.parallel(mp.run, geno_batch, threads)
88
    for fn in glob.glob(bed_dir.strip('/')+'/*fam'):
89
        fam = pd.read_csv(fn, sep=' ', header=None)
90
        mTrait_name = fn.split('/')[-1].replace('.fam', '')
91
        if mTrait_name == 'pTrait':
92
            pTrait = pTrait.reindex(fam[0])
93
            fam.index = fam[0]
94
            fam = pd.concat([fam, pTrait], axis=1)
95
        else:
96
            fam.loc[:, 5] = mTrait.loc[:, mTrait_name].reindex(fam[0]).values
97
        fam.to_csv(fn, index=False, header=None, sep=' ', na_rep='NA')
98
99
100
def calc_MLM_effect(bed_dir, pTrait, threads, geno):
101
    args = list()
102
    geno_prefix = geno.split('/')[-1]
103
    fam = pd.read_csv(geno + '.fam', sep=r'\s+', header=None)
104
    fam[5] = 1
105
    fam.to_csv(geno_prefix + '.link.fam', sep='\t', na_rep='NA', header=None, index=False)
106
    if os.path.exists(geno_prefix + '.link.bed'):
107
        os.remove(geno_prefix + '.link.bed')
108
    if os.path.exists(geno_prefix + '.link.bim'):
109
        os.remove(geno_prefix + '.link.bim')
110
    os.symlink(geno + '.bed', geno_prefix + '.link.bed')
111
    os.symlink(geno + '.bim', geno_prefix + '.link.bim')
112
    related_matrix_cmd = utils_path + '/gemma -bfile {0}.link -gk 1 -o {1}'.format(geno_prefix, geno_prefix)
113
    s = mp.run(related_matrix_cmd)
114
    if s != 0:
115
        return None
116
    gemma_cmd_mTrait = utils_path + '/gemma -bfile {0} -k ./output/{1}.cXX.txt -lmm -n 1 -o {2}'
117
    gemma_cmd_pTrait = utils_path + '/gemma -bfile {0} -k ./output/{1}.cXX.txt -lmm -n {2} -o {3}'
118
    for i in glob.glob(bed_dir + '/*.bed'):
119
        i = i.replace('.bed', '')
120
        if i.split('/')[-1] != 'pTrait':
121
            prefix = i.split('/')[-1]
122
            args.append((gemma_cmd_mTrait.format(i, geno_prefix, 'mTrait_' + prefix),))
123
        else:
124
            for _, pTrait_name in enumerate(pTrait.columns):
125
                args.append((gemma_cmd_pTrait.format(i, geno_prefix, _ + 2, 'pTrait_' + pTrait_name),))
126
    s = mp.parallel(mp.run, args, threads)
127
    os.remove(geno_prefix + '.link.bed')
128
    os.remove(geno_prefix + '.link.bim')
129
    os.remove(geno_prefix + '.link.fam')
130
    return s
131
132
133
def get_MLM_effect(fn):
134
    assoc = pd.read_csv(fn, sep='\t')
135
    assoc.index = assoc['rs']
136
    return assoc[['beta', 'se']]
137
138
139
def get_MLM_effect_parallell(assoc_dir, mTrait, pTrait, threads):
140
    mTrait_effect = pd.DataFrame()
141
    args = []
142
    #pTrait_name = []
143
    # for fn in glob.glob(assoc_dir.strip('/') + '/mTrait*.assoc.txt'):
144
    #     mTrait_name = fn.split('/')[-1].split('_')[-1].replace('.assoc.txt', '')
145
    #     assoc = pd.read_csv(fn, sep='\t')
146
    #     assoc.index = mTrait_name+';' + assoc['rs']
147
    #     mTrait_effect = pd.concat([mTrait_effect, assoc[['beta', 'se']]])
148
    # for fn in glob.glob(assoc_dir.strip('/') + '/pTrait*assoc.txt'):
149
    #     pTrait_name.append(fn.split('/')[-1].split('_')[-1].replace('.assoc.txt', ''))
150
    #     args.append((fn,))
151
    for mTrait_name in mTrait.columns:
152
        fn = assoc_dir.strip('/') + '/mTrait_' + mTrait_name + '.assoc.txt'
153
        assoc = pd.read_csv(fn, sep='\t')
154
        assoc.index = mTrait_name+';' + assoc['rs']
155
        mTrait_effect = pd.concat([mTrait_effect, assoc[['beta', 'se']]])
156
    for pTrait_name in pTrait.columns:
157
        fn = assoc_dir.strip('/') + '/pTrait_' + pTrait_name + '.assoc.txt'
158
        args.append((fn,))
159
    pTrait_res = mp.parallel(get_MLM_effect, args, threads)
160
    pTrait_effect = pd.concat([i['beta'] for i in pTrait_res], axis=1)
161
    pTrait_effect.columns = pTrait.columns
162
    pTrait_se = pd.concat([i['se'] for i in pTrait_res], axis=1)
163
    pTrait_se.columns = pTrait.columns
164
    return mTrait_effect, pTrait_effect, pTrait_se
165
166
167
def MR_MLM(mTrait_effect_snp, pTrait_effect_snp, pTrait_se_snp, pvalue_cutoff):
168
    mTrait_name, rs = mTrait_effect_snp.name.split(';')
169
    bxy = pTrait_effect_snp / mTrait_effect_snp['beta']
170
    varbXY = var_bXY(mTrait_effect_snp['beta'], mTrait_effect_snp['se'], pTrait_effect_snp, pTrait_se_snp)
171
    TMR = bxy**2 / varbXY
172
    pvalue = 1 - chi2.cdf(TMR, 1)
173
    MR_res = pd.DataFrame(dict(zip(['snp', 'mTrait', 'pTrait', 'effect', 'TMR', 'pvalue'],
174
                                   [[rs] * pTrait_effect_snp.shape[0], [mTrait_name] * pTrait_effect_snp.shape[0],
175
                                    pTrait_effect_snp.index, bxy, TMR, pvalue])))
176
    MR_res = MR_res.loc[(MR_res.pvalue<=pvalue_cutoff) & (MR_res.mTrait!=MR_res.pTrait), :]
177
    return MR_res
178
179
180
def MR_MLM_parallel(mTrait_qtl, mTrait_effect, pTrait_effect, pTrait_se, threads, pvalue_cutoff):
181
    args = []
182
    for index, row in mTrait_qtl.iterrows():
183
        mTrait_name = row['phe_name']
184
        rs = row['SNP']
185
        args.append((mTrait_effect.loc[';'.join([mTrait_name, rs]),:], pTrait_effect.loc[rs, :], pTrait_se.loc[rs, :], pvalue_cutoff))
186
    res = mp.parallel(MR_MLM, args, threads)
187
    res = pd.concat([i for i in res])
188
    return res
189
190
191
def edge_weight(qtl, MR_res):
192
    MR_res = MR_res.loc[MR_res.mTrait!=MR_res.pTrait,:]
193
    MR_res = MR_res.sort_values(by='pvalue')
194
    qtl_peak_pos = qtl[['CHR', 'phe_name']]
195
    qtl_peak_pos.loc[:, 'start'] = qtl['SNP'].apply(lambda x: int(x.split('_')[-1]) - 1)
196
    qtl_peak_pos.loc[:, 'end'] = qtl['SNP'].apply(lambda x: int(x.split('_')[-1]) + 1)
197
    qtl_peak_pos = qtl_peak_pos[['CHR', 'start', 'end', 'phe_name']]
198
    qtl_peak_pos.columns = ['Chromosome', 'Start', 'End', 'phe_name']
199
    qtl_peak_range = qtl[['CHR', 'phe_name']]
200
    qtl_peak_range.loc[:, 'start'] = qtl['SNP'].apply(lambda x: int(x.split('_')[-1]) - 1000000)
201
    qtl_peak_range.loc[:, 'end'] = qtl['SNP'].apply(lambda x: int(x.split('_')[-1]) + 1000000)
202
    qtl_peak_range = qtl_peak_range[['CHR', 'start', 'end', 'phe_name']]
203
    qtl_peak_range.columns = ['Chromosome', 'Start', 'End', 'phe_name']
204
    qtl_peak_pos = pr.PyRanges(qtl_peak_pos)
205
    qtl_peak_range = pr.PyRanges(qtl_peak_range)
206
    coloc = qtl_peak_pos.join(qtl_peak_range)
207
208
    res = pd.DataFrame()
209
    for k in sorted(coloc.dfs.keys()):
210
        res = pd.concat([res, coloc.dfs[k]])
211
    res = res.loc[res.phe_name != res.phe_name_b, :]
212
    mr_coloc = pd.merge(MR_res, res, left_on=['mTrait', 'pTrait'], right_on=['phe_name', 'phe_name_b'])
213
    #coloc = coloc.sort_values(by='pvalue')
214
    mr_coloc_count = mr_coloc.pvalue.value_counts().sort_index()
215
    mr_count = MR_res.pvalue.value_counts().sort_index().cumsum()
216
    weight = list()
217
    for p in MR_res.pvalue.values:
218
        coloc_count = mr_coloc_count[mr_coloc_count.index <= p].sum()
219
        weight.append(coloc_count / mr_count[p])
220
    MR_res.loc[:, 'weight'] = weight
221
    edgeweight = MR_res[['mTrait', 'pTrait', 'weight']]
222
    edgeweight = edgeweight.loc[edgeweight.weight >= 0.2, :]
223
    edgeweight = edgeweight.sort_values(by=['mTrait', 'pTrait', 'weight'], ascending=False).drop_duplicates(subset=['mTrait', 'pTrait'])
224
    return edgeweight
225
226