|
a |
|
b/modas/mr.py |
|
|
1 |
import pandas as pd |
|
|
2 |
import numpy as np |
|
|
3 |
import pyranges as pr |
|
|
4 |
from sklearn.linear_model import LinearRegression |
|
|
5 |
from pandas_plink import read_plink1_bin |
|
|
6 |
from scipy.stats import chi2 |
|
|
7 |
import modas.multiprocess as mp |
|
|
8 |
import subprocess |
|
|
9 |
import warnings |
|
|
10 |
import shutil |
|
|
11 |
import glob |
|
|
12 |
import sys |
|
|
13 |
import os |
|
|
14 |
import re |
|
|
15 |
|
|
|
16 |
|
|
|
17 |
utils_path = subprocess.check_output('locate modas/utils', shell=True, text=True, encoding='utf-8') |
|
|
18 |
#utils_path = '/'.join(re.search('\n(.*site-packages.*)\n', utils_path).group(1).split('/')[:-1]) |
|
|
19 |
utils_path = re.search('\n(.*site-packages.*)\n', utils_path).group(1) |
|
|
20 |
if not utils_path.endswith('utils'): |
|
|
21 |
utils_path = '/'.join(utils_path.split('/')[:-1]) |
|
|
22 |
|
|
|
23 |
|
|
|
24 |
def lm_res(y, X): |
|
|
25 |
idx = np.isnan(X) | np.isnan(y) |
|
|
26 |
X = X[~idx].reshape(-1, 1) |
|
|
27 |
y = y[~idx] |
|
|
28 |
lm = LinearRegression().fit(X, y) |
|
|
29 |
sigma2 = np.sum((y - lm.predict(X))**2)/(X.shape[0]-1) |
|
|
30 |
se = np.sqrt(np.diag(np.linalg.pinv(np.dot(X.T, X))))[-1] * sigma2 |
|
|
31 |
effect = lm.coef_[-1] |
|
|
32 |
rsq = lm.score(X, y) |
|
|
33 |
return pd.Series(dict(zip(['effect', 'se', 'rsq'], [effect, se, rsq]))) |
|
|
34 |
|
|
|
35 |
|
|
|
36 |
def MR(mTrait, pTrait, g, pvalue_cutoff): |
|
|
37 |
p = pd.concat([mTrait.to_frame(), pTrait], axis=1) |
|
|
38 |
snp_lm = p.apply(lm_res, args=[g.values]) |
|
|
39 |
mTrait_lm = p.apply(lm_res, args=[mTrait.values]) |
|
|
40 |
pTrait_mTrait = snp_lm.loc['effect', :][1:] / snp_lm.loc['effect', :][0] |
|
|
41 |
var_upper = pTrait.var() * (1 - mTrait_lm.loc['rsq', :][1:]) |
|
|
42 |
var_down = mTrait.shape[0] * mTrait.var() * snp_lm.loc['rsq', :][0] |
|
|
43 |
var = var_upper / var_down |
|
|
44 |
TMR = pTrait_mTrait**2 / var |
|
|
45 |
pvalue = 1 - chi2.cdf(TMR, 1) |
|
|
46 |
MR_res = pd.DataFrame(dict(zip(['snp', 'mTrait', 'pTrait', 'effect', 'TMR', 'pvalue'], |
|
|
47 |
[[g.name]*pTrait.shape[1], [mTrait.name]*pTrait.shape[1], pTrait.columns, pTrait_mTrait, TMR, pvalue]))) |
|
|
48 |
MR_res = MR_res.loc[(MR_res.pvalue<=pvalue_cutoff) & (MR_res.mTrait!=MR_res.pTrait),:] |
|
|
49 |
return MR_res |
|
|
50 |
|
|
|
51 |
|
|
|
52 |
def MR_parallel(mTrait_qtl, mTrait, pTrait, geno, threads, pvalue_cutoff): |
|
|
53 |
args = list() |
|
|
54 |
for index, row in mTrait_qtl.iterrows(): |
|
|
55 |
rs = row['SNP'] |
|
|
56 |
mTrait_name = row['phe_name'] |
|
|
57 |
args.append((mTrait.loc[:, mTrait_name], pTrait, geno.loc[:, rs], pvalue_cutoff)) |
|
|
58 |
res = mp.parallel(MR, args, threads) |
|
|
59 |
res = pd.concat([i for i in res]) |
|
|
60 |
return res |
|
|
61 |
|
|
|
62 |
|
|
|
63 |
def var_bXY(bzx, bzx_se, bzy, bzy_se): |
|
|
64 |
varbXY = bzx_se**2*bzy**2/bzx**4 + bzy_se**2/bzx**2 |
|
|
65 |
return varbXY |
|
|
66 |
|
|
|
67 |
|
|
|
68 |
def generate_geno_batch(mTrait_qtl, mTrait, pTrait, geno, threads, bed_dir, rs_dir): |
|
|
69 |
if os.path.exists(bed_dir): |
|
|
70 |
shutil.rmtree(bed_dir) |
|
|
71 |
os.mkdir(bed_dir) |
|
|
72 |
if os.path.exists(rs_dir): |
|
|
73 |
shutil.rmtree(rs_dir) |
|
|
74 |
os.mkdir(rs_dir) |
|
|
75 |
plink_extract = utils_path + '/plink -bfile {} -extract {} --make-bed -out {}' |
|
|
76 |
geno_batch = list() |
|
|
77 |
for mTrait_name in mTrait_qtl.phe_name.unique(): |
|
|
78 |
out_name = bed_dir.strip('/') + '/' + mTrait_name |
|
|
79 |
rs = mTrait_qtl.loc[mTrait_qtl.phe_name == mTrait_name, 'SNP'] |
|
|
80 |
rs_name = rs_dir.strip('/') + '/' + '_'.join([mTrait_name,'rs.txt']) |
|
|
81 |
pd.Series(rs).to_frame().to_csv(rs_name, index=False, header=None) |
|
|
82 |
geno_batch.append((plink_extract.format(geno, rs_name, out_name),)) |
|
|
83 |
out_name = bed_dir.strip('/') + '/pTrait' |
|
|
84 |
rs_name = rs_dir.strip('/') + '/pTrait_rs.txt' |
|
|
85 |
mTrait_qtl['SNP'].to_frame().to_csv(rs_name, index=False, header=None) |
|
|
86 |
geno_batch.append((plink_extract.format(geno, rs_name, out_name),)) |
|
|
87 |
mp.parallel(mp.run, geno_batch, threads) |
|
|
88 |
for fn in glob.glob(bed_dir.strip('/')+'/*fam'): |
|
|
89 |
fam = pd.read_csv(fn, sep=' ', header=None) |
|
|
90 |
mTrait_name = fn.split('/')[-1].replace('.fam', '') |
|
|
91 |
if mTrait_name == 'pTrait': |
|
|
92 |
pTrait = pTrait.reindex(fam[0]) |
|
|
93 |
fam.index = fam[0] |
|
|
94 |
fam = pd.concat([fam, pTrait], axis=1) |
|
|
95 |
else: |
|
|
96 |
fam.loc[:, 5] = mTrait.loc[:, mTrait_name].reindex(fam[0]).values |
|
|
97 |
fam.to_csv(fn, index=False, header=None, sep=' ', na_rep='NA') |
|
|
98 |
|
|
|
99 |
|
|
|
100 |
def calc_MLM_effect(bed_dir, pTrait, threads, geno): |
|
|
101 |
args = list() |
|
|
102 |
geno_prefix = geno.split('/')[-1] |
|
|
103 |
fam = pd.read_csv(geno + '.fam', sep=r'\s+', header=None) |
|
|
104 |
fam[5] = 1 |
|
|
105 |
fam.to_csv(geno_prefix + '.link.fam', sep='\t', na_rep='NA', header=None, index=False) |
|
|
106 |
if os.path.exists(geno_prefix + '.link.bed'): |
|
|
107 |
os.remove(geno_prefix + '.link.bed') |
|
|
108 |
if os.path.exists(geno_prefix + '.link.bim'): |
|
|
109 |
os.remove(geno_prefix + '.link.bim') |
|
|
110 |
os.symlink(geno + '.bed', geno_prefix + '.link.bed') |
|
|
111 |
os.symlink(geno + '.bim', geno_prefix + '.link.bim') |
|
|
112 |
related_matrix_cmd = utils_path + '/gemma -bfile {0}.link -gk 1 -o {1}'.format(geno_prefix, geno_prefix) |
|
|
113 |
s = mp.run(related_matrix_cmd) |
|
|
114 |
if s != 0: |
|
|
115 |
return None |
|
|
116 |
gemma_cmd_mTrait = utils_path + '/gemma -bfile {0} -k ./output/{1}.cXX.txt -lmm -n 1 -o {2}' |
|
|
117 |
gemma_cmd_pTrait = utils_path + '/gemma -bfile {0} -k ./output/{1}.cXX.txt -lmm -n {2} -o {3}' |
|
|
118 |
for i in glob.glob(bed_dir + '/*.bed'): |
|
|
119 |
i = i.replace('.bed', '') |
|
|
120 |
if i.split('/')[-1] != 'pTrait': |
|
|
121 |
prefix = i.split('/')[-1] |
|
|
122 |
args.append((gemma_cmd_mTrait.format(i, geno_prefix, 'mTrait_' + prefix),)) |
|
|
123 |
else: |
|
|
124 |
for _, pTrait_name in enumerate(pTrait.columns): |
|
|
125 |
args.append((gemma_cmd_pTrait.format(i, geno_prefix, _ + 2, 'pTrait_' + pTrait_name),)) |
|
|
126 |
s = mp.parallel(mp.run, args, threads) |
|
|
127 |
os.remove(geno_prefix + '.link.bed') |
|
|
128 |
os.remove(geno_prefix + '.link.bim') |
|
|
129 |
os.remove(geno_prefix + '.link.fam') |
|
|
130 |
return s |
|
|
131 |
|
|
|
132 |
|
|
|
133 |
def get_MLM_effect(fn): |
|
|
134 |
assoc = pd.read_csv(fn, sep='\t') |
|
|
135 |
assoc.index = assoc['rs'] |
|
|
136 |
return assoc[['beta', 'se']] |
|
|
137 |
|
|
|
138 |
|
|
|
139 |
def get_MLM_effect_parallell(assoc_dir, mTrait, pTrait, threads): |
|
|
140 |
mTrait_effect = pd.DataFrame() |
|
|
141 |
args = [] |
|
|
142 |
#pTrait_name = [] |
|
|
143 |
# for fn in glob.glob(assoc_dir.strip('/') + '/mTrait*.assoc.txt'): |
|
|
144 |
# mTrait_name = fn.split('/')[-1].split('_')[-1].replace('.assoc.txt', '') |
|
|
145 |
# assoc = pd.read_csv(fn, sep='\t') |
|
|
146 |
# assoc.index = mTrait_name+';' + assoc['rs'] |
|
|
147 |
# mTrait_effect = pd.concat([mTrait_effect, assoc[['beta', 'se']]]) |
|
|
148 |
# for fn in glob.glob(assoc_dir.strip('/') + '/pTrait*assoc.txt'): |
|
|
149 |
# pTrait_name.append(fn.split('/')[-1].split('_')[-1].replace('.assoc.txt', '')) |
|
|
150 |
# args.append((fn,)) |
|
|
151 |
for mTrait_name in mTrait.columns: |
|
|
152 |
fn = assoc_dir.strip('/') + '/mTrait_' + mTrait_name + '.assoc.txt' |
|
|
153 |
assoc = pd.read_csv(fn, sep='\t') |
|
|
154 |
assoc.index = mTrait_name+';' + assoc['rs'] |
|
|
155 |
mTrait_effect = pd.concat([mTrait_effect, assoc[['beta', 'se']]]) |
|
|
156 |
for pTrait_name in pTrait.columns: |
|
|
157 |
fn = assoc_dir.strip('/') + '/pTrait_' + pTrait_name + '.assoc.txt' |
|
|
158 |
args.append((fn,)) |
|
|
159 |
pTrait_res = mp.parallel(get_MLM_effect, args, threads) |
|
|
160 |
pTrait_effect = pd.concat([i['beta'] for i in pTrait_res], axis=1) |
|
|
161 |
pTrait_effect.columns = pTrait.columns |
|
|
162 |
pTrait_se = pd.concat([i['se'] for i in pTrait_res], axis=1) |
|
|
163 |
pTrait_se.columns = pTrait.columns |
|
|
164 |
return mTrait_effect, pTrait_effect, pTrait_se |
|
|
165 |
|
|
|
166 |
|
|
|
167 |
def MR_MLM(mTrait_effect_snp, pTrait_effect_snp, pTrait_se_snp, pvalue_cutoff): |
|
|
168 |
mTrait_name, rs = mTrait_effect_snp.name.split(';') |
|
|
169 |
bxy = pTrait_effect_snp / mTrait_effect_snp['beta'] |
|
|
170 |
varbXY = var_bXY(mTrait_effect_snp['beta'], mTrait_effect_snp['se'], pTrait_effect_snp, pTrait_se_snp) |
|
|
171 |
TMR = bxy**2 / varbXY |
|
|
172 |
pvalue = 1 - chi2.cdf(TMR, 1) |
|
|
173 |
MR_res = pd.DataFrame(dict(zip(['snp', 'mTrait', 'pTrait', 'effect', 'TMR', 'pvalue'], |
|
|
174 |
[[rs] * pTrait_effect_snp.shape[0], [mTrait_name] * pTrait_effect_snp.shape[0], |
|
|
175 |
pTrait_effect_snp.index, bxy, TMR, pvalue]))) |
|
|
176 |
MR_res = MR_res.loc[(MR_res.pvalue<=pvalue_cutoff) & (MR_res.mTrait!=MR_res.pTrait), :] |
|
|
177 |
return MR_res |
|
|
178 |
|
|
|
179 |
|
|
|
180 |
def MR_MLM_parallel(mTrait_qtl, mTrait_effect, pTrait_effect, pTrait_se, threads, pvalue_cutoff): |
|
|
181 |
args = [] |
|
|
182 |
for index, row in mTrait_qtl.iterrows(): |
|
|
183 |
mTrait_name = row['phe_name'] |
|
|
184 |
rs = row['SNP'] |
|
|
185 |
args.append((mTrait_effect.loc[';'.join([mTrait_name, rs]),:], pTrait_effect.loc[rs, :], pTrait_se.loc[rs, :], pvalue_cutoff)) |
|
|
186 |
res = mp.parallel(MR_MLM, args, threads) |
|
|
187 |
res = pd.concat([i for i in res]) |
|
|
188 |
return res |
|
|
189 |
|
|
|
190 |
|
|
|
191 |
def edge_weight(qtl, MR_res): |
|
|
192 |
MR_res = MR_res.loc[MR_res.mTrait!=MR_res.pTrait,:] |
|
|
193 |
MR_res = MR_res.sort_values(by='pvalue') |
|
|
194 |
qtl_peak_pos = qtl[['CHR', 'phe_name']] |
|
|
195 |
qtl_peak_pos.loc[:, 'start'] = qtl['SNP'].apply(lambda x: int(x.split('_')[-1]) - 1) |
|
|
196 |
qtl_peak_pos.loc[:, 'end'] = qtl['SNP'].apply(lambda x: int(x.split('_')[-1]) + 1) |
|
|
197 |
qtl_peak_pos = qtl_peak_pos[['CHR', 'start', 'end', 'phe_name']] |
|
|
198 |
qtl_peak_pos.columns = ['Chromosome', 'Start', 'End', 'phe_name'] |
|
|
199 |
qtl_peak_range = qtl[['CHR', 'phe_name']] |
|
|
200 |
qtl_peak_range.loc[:, 'start'] = qtl['SNP'].apply(lambda x: int(x.split('_')[-1]) - 1000000) |
|
|
201 |
qtl_peak_range.loc[:, 'end'] = qtl['SNP'].apply(lambda x: int(x.split('_')[-1]) + 1000000) |
|
|
202 |
qtl_peak_range = qtl_peak_range[['CHR', 'start', 'end', 'phe_name']] |
|
|
203 |
qtl_peak_range.columns = ['Chromosome', 'Start', 'End', 'phe_name'] |
|
|
204 |
qtl_peak_pos = pr.PyRanges(qtl_peak_pos) |
|
|
205 |
qtl_peak_range = pr.PyRanges(qtl_peak_range) |
|
|
206 |
coloc = qtl_peak_pos.join(qtl_peak_range) |
|
|
207 |
|
|
|
208 |
res = pd.DataFrame() |
|
|
209 |
for k in sorted(coloc.dfs.keys()): |
|
|
210 |
res = pd.concat([res, coloc.dfs[k]]) |
|
|
211 |
res = res.loc[res.phe_name != res.phe_name_b, :] |
|
|
212 |
mr_coloc = pd.merge(MR_res, res, left_on=['mTrait', 'pTrait'], right_on=['phe_name', 'phe_name_b']) |
|
|
213 |
#coloc = coloc.sort_values(by='pvalue') |
|
|
214 |
mr_coloc_count = mr_coloc.pvalue.value_counts().sort_index() |
|
|
215 |
mr_count = MR_res.pvalue.value_counts().sort_index().cumsum() |
|
|
216 |
weight = list() |
|
|
217 |
for p in MR_res.pvalue.values: |
|
|
218 |
coloc_count = mr_coloc_count[mr_coloc_count.index <= p].sum() |
|
|
219 |
weight.append(coloc_count / mr_count[p]) |
|
|
220 |
MR_res.loc[:, 'weight'] = weight |
|
|
221 |
edgeweight = MR_res[['mTrait', 'pTrait', 'weight']] |
|
|
222 |
edgeweight = edgeweight.loc[edgeweight.weight >= 0.2, :] |
|
|
223 |
edgeweight = edgeweight.sort_values(by=['mTrait', 'pTrait', 'weight'], ascending=False).drop_duplicates(subset=['mTrait', 'pTrait']) |
|
|
224 |
return edgeweight |
|
|
225 |
|
|
|
226 |
|