Diff of /modas/regiongwas.py [000000] .. [a43cea]

import pandas as pd
import numpy as np
import modas.multiprocess as mp
import modas.gwas_cmd as gc
from sklearn.decomposition import PCA
from rpy2.robjects import pandas2ri
from rpy2.rinterface_lib.embedded import RRuntimeError
import rpy2.robjects as robjects
from rpy2.robjects.packages import importr
from rpy2.rinterface_lib.callbacks import logger as rpy2_logger
import subprocess
import logging
import glob
import os
import shutil
import re

pandas2ri.activate()
rpy2_logger.setLevel(logging.ERROR)
rMVP = importr('rMVP')
base = importr('base')
bigmemory = importr('bigmemory')

# Locate the packaged modas/utils directory (bundled gemma/plink binaries) via `locate`.
utils_path = subprocess.check_output('locate modas/utils', shell=True, text=True, encoding='utf-8')
#utils_path = '/'.join(re.search('\n(.*site-packages.*)\n', utils_path).group(1).split('/')[:-1])
utils_path = re.search(r'\n(.*site-packages.*)\n', utils_path).group(1)
if not utils_path.endswith('utils'):
    utils_path = '/'.join(utils_path.split('/')[:-1])


def region_gwas_parallel(bed_dir, threads, geno, gwas_model):
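    """Run region-based GWAS for every pseudo-genotype bed fileset in bed_dir.

    For GLM/MLM, a '.link' copy of the genotype with a constant phenotype
    column is prepared (and, for MLM, a GEMMA kinship matrix is computed
    first). Jobs are dispatched across `threads` workers via GEMMA (LM/MLM)
    or rMVP (GLM). Returns the per-job status list, or None if kinship
    estimation fails.
    """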
    region_gwas_args = list()
    geno_prefix = geno.split('/')[-1]
    if gwas_model == 'GLM' or gwas_model == 'MLM':
        fam = pd.read_csv(geno+'.fam', sep=r'\s+', header=None)
        fam[5] = 1
        fam.to_csv(geno_prefix+'.link.fam', sep='\t', na_rep='NA', header=None, index=False)
        if os.path.exists(geno_prefix+'.link.bed'):
            os.remove(geno_prefix+'.link.bed')
        if os.path.exists(geno_prefix+'.link.bim'):
            os.remove(geno_prefix+'.link.bim')
        os.symlink(geno+'.bed', geno_prefix+'.link.bed')
        os.symlink(geno+'.bim', geno_prefix+'.link.bim')
        if gwas_model == 'MLM':
            # Build the centered relatedness matrix (-gk 1) once for the whole panel.
            related_matrix_cmd = utils_path + '/gemma -bfile {0}.link -gk 1 -o {1}'.format(geno_prefix, geno_prefix)
            s = mp.run(related_matrix_cmd)
            if s != 0:
                return None
    # if gwas_model=='MLM':
    #     gemma_cmd = 'gemma.linux -bfile {0} -k ./output/{1}.cXX.txt -lmm -n 1 -o {2}'
    # elif gwas_model=='LM':
    #     gemma_cmd = 'gemma.linux -bfile {0} -lm  -o {1}'

    for i in glob.glob(bed_dir+'/*.bed'):
        i = i.replace('.bed', '')
        # metabolite ids use 'm.z' in file names in place of 'm/z'
        i = i.replace('m/z', 'm.z')
        prefix = i.split('/')[-1]
        if gwas_model == 'MLM':
            region_gwas_args.append((gc.gemma_cmd('MLM', i, geno_prefix, 1, prefix + '_plink'),))
            # region_gwas_args.append((gemma_cmd.format(i, geno_prefix, prefix+'_plink'),))
        elif gwas_model == 'LM':
            region_gwas_args.append((gc.gemma_cmd('LM', i, None, None, prefix + '_plink'),))
            # region_gwas_args.append((gemma_cmd.format(i, prefix+'_plink'),))
        else:
            phe = pd.read_csv(i+'.fam', sep=r'\s+', header=None)
            phe = phe.iloc[:, [0, -1]]
            phe.columns = ['Taxa', prefix]
            # region_gwas_args.append((phe, '../'+geno_prefix + '.link', '../'+i, 1))
            region_gwas_args.append(('GLM', geno_prefix+'.link', i, phe, 1, './output'))
    if gwas_model == 'LM' or gwas_model == 'MLM':
        s = mp.parallel(mp.run, region_gwas_args, threads)
    else:
        if not os.path.exists('./output'):
            os.mkdir('./output')
        # os.chdir('./output')
        # s = mp.parallel(glm_gwas, (region_gwas_args[0],), 1)
        # s = mp.parallel(glm_gwas, region_gwas_args[1:], threads)
        # os.chdir('../')
        # Run the first job alone so shared rMVP input files are created once
        # before the parallel batch starts.
        s = mp.parallel(gc.rmvp, (region_gwas_args[0],), 1)
        s = mp.parallel(gc.rmvp, region_gwas_args[1:], threads)
    if gwas_model == 'GLM' or gwas_model == 'MLM':
        os.remove(geno_prefix+'.link.bed')
        os.remove(geno_prefix+'.link.bim')
        os.remove(geno_prefix+'.link.fam')
    return s


def glm_gwas(omics_phe, pc_geno_prefix, geno_prefix, threads):
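    """GLM GWAS via rMVP for one phenotype table, using the top 3 PCs as covariates.

    Prepares rMVP binary genotype/PC files on first use. Returns 1 on
    success, 0 if R raises an error.
    """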
    try:
        # Silence R console output while rMVP runs.
        base.sink('/dev/null')
        if not os.path.exists(pc_geno_prefix + '.pc.desc'):
            rMVP.MVP_Data(fileBed=pc_geno_prefix, fileKin=False, filePC=False, out=pc_geno_prefix,
                          verbose=False)
            rMVP.MVP_Data_PC(True, mvp_prefix=pc_geno_prefix, pcs_keep=3, verbose=False)
        rMVP.MVP_Data(fileBed=geno_prefix, fileKin=False, filePC=False, out=geno_prefix, verbose=False)
        geno = bigmemory.attach_big_matrix(geno_prefix + '.geno.desc')
        map_file = pd.read_csv(geno_prefix + '.geno.map', sep='\t')
        Covariates_PC = bigmemory.as_matrix(bigmemory.attach_big_matrix(pc_geno_prefix + '.pc.desc'))
        # base.setwd('./output')
        rMVP.MVP(phe=omics_phe, geno=geno, map=map_file, CV_GLM=Covariates_PC, priority="speed",
                 ncpus=threads, maxLoop=10, threshold=0.05, method=['GLM'], file_output=True, verbose=False)
        base.sink()
        # gwas(omics_phe, pc_geno_prefix, geno_prefix, threads)
    except (RRuntimeError, ValueError):
        return 0
    else:
        return 1


def generate_qtl_batch(omics_phe, phe_sig_qtl, geno_name, threads, bed_dir, rs_dir):
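    """Extract per-phenotype QTL-region genotypes with plink.

    For each row of phe_sig_qtl, SNP ids inside the QTL interval are written
    to rs_dir and extracted into a bed fileset in bed_dir; the fam phenotype
    column is then filled with the matching omics phenotype values.
    """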
    plink_extract = utils_path + '/plink -bfile {} --extract {} --make-bed -out {}'
    bim = pd.read_csv(geno_name+'.bim', sep='\t', header=None)
    qtl_batch = list()
    rs = dict()
    # Collect, per phenotype, all SNP ids falling inside its significant QTL intervals.
    for _, row in phe_sig_qtl.iterrows():
        rs.setdefault(row['phe_name'], []).extend(bim.loc[(bim[0] == row['chr']) & (bim[3] >= row['start']) & (bim[3] <= row['end']), 1].values.tolist())
    for phe_name in rs:
        out_name = bed_dir.strip('/') + '/' + '_'.join(['tmp', phe_name])
        rs_name = rs_dir.strip('/') + '/' + '_'.join(['tmp', phe_name, 'rs.txt'])
        pd.Series(rs[phe_name]).to_frame().to_csv(rs_name, index=False, header=False)
        qtl_batch.append((plink_extract.format(geno_name, rs_name, out_name),))
    mp.parallel(mp.run, qtl_batch, threads)
    # Replace the placeholder phenotype column of each extracted fam file with the omics values.
    for fn in glob.glob(bed_dir.strip('/')+'/*fam'):
        fam = pd.read_csv(fn, sep=' ', header=None)
        phe_name = '_'.join(fn.split('/')[-1].split('_')[1:]).replace('m.z', 'm/z').replace('.fam', '')
        fam.loc[:, 5] = omics_phe.loc[:, phe_name].reindex(fam.loc[:, 0]).values
        fam.to_csv(fn, index=False, header=None, sep=' ', na_rep='NA')


# def generate_clump_input(dir,num_threads):
#     if os.path.exists('./clump_input'):
#         shutil.rmtree('./clump_input')
#     os.mkdir('./clump_input')
#     cmd = '''awk '{if(NR==1)print "SNP\\tP"; else print $2"\\t"$11}' '''
#     cmds = list()
#     fns = list()
#     for fn in glob.glob(dir.strip('/')+'/*_plink.assoc.txt'):
#         filename = fn.split('/')[-1]
#         cmds.append((cmd+'{0} > ./clump_input/{1}'.format(fn, filename.replace('_plink.assoc.txt', '.assoc')),))
#         fns.append(filename)
#     s = mp.parallel(mp.run, cmds, num_threads)
#     if sum(s) != 0:
#         print(','.join(list(np.array(fns)[s]))+' do not  successfully generated clump input file.')
#     return s

def generate_clump_input(dir, gwas_model):
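    """Convert association results to the two-column (SNP, P) format expected by plink --clump."""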
    if os.path.exists('./clump_input'):
        shutil.rmtree('./clump_input')
    os.mkdir('./clump_input')
    if gwas_model == 'LM' or gwas_model == 'MLM':
        # GEMMA output: keep the SNP id and Wald-test p value columns.
        for fn in glob.glob(dir.strip('/')+'/*_plink.assoc.txt'):
            filename = fn.split('/')[-1]
            assoc = pd.read_csv(fn, sep='\t')
            assoc = assoc[['rs', 'p_wald']]
            assoc.columns = ['SNP', 'P']
            assoc.to_csv('./clump_input/' + filename.replace('_plink.assoc.txt', '.assoc'), index=False, sep='\t')
    else:
        # rMVP GLM output: first column is the SNP id, last column the p value.
        for fn in glob.glob(dir.strip('/')+'/tmp_*GLM.csv'):
            filename = fn.split('/')[-1]
            assoc = pd.read_csv(fn)
            assoc = assoc.iloc[:, [0, -1]]
            assoc.columns = ['SNP', 'P']
            assoc.to_csv('./clump_input/' + filename.replace('.GLM.csv', '.assoc'), index=False, sep='\t')


def plink_clump(geno_path, p1, p2, num_threads):
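    """Clump association results with plink (--clump-kb 500, --clump-r2 0.2) in parallel."""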
    if os.path.exists('./clump_result'):
        shutil.rmtree('./clump_result')
    os.mkdir('./clump_result')
    cmd = utils_path + '/plink --bfile {0} --clump {1} --clump-p1 {2} --clump-p2 {3} --clump-kb {4} --clump-r2 0.2 --out {5}'
    cmds = list()
    ms = list()
    for fn in glob.glob('./clump_input/*'):
        phe_name = fn.split('/')[-1].replace('.assoc', '')
        cmds.append((cmd.format(geno_path+'/'+phe_name, fn, p1, p2, str(500), './clump_result/' + phe_name + '_' + str(500)),))
        ms.append(phe_name)
    s = mp.parallel(mp.run, cmds, num_threads)
    if sum(s) != 0:
        # Non-zero return codes mark failed plink runs; report the affected phenotypes.
        print(','.join(np.array(ms)[np.array(s) != 0]) + ' failed to generate clumped files.')
    return s


#def merge_qtl(qtl):
#    qtl = qtl.sort_values(by=['CHR','BP'])
#    merged_qtl = list()
#    for index,row in qtl.iterrows():
#        if not merged_qtl:
#            merged_qtl.append(row)
#        else:
#            if row['CHR'] != merged_qtl[-1]['CHR']:
#                merged_qtl.append(row)
#            else:
#                if row['BP'] - merged_qtl[-1]['BP'] <= 1000000:
#                    if row['P'] < merged_qtl[-1]['P']:
#                        merged_qtl[-1]['P'] = row['P']
#                        merged_qtl[-1]['BP'] = row['BP']
#                        merged_qtl[-1]['SNP'] = row['SNP']
#                    merged_qtl[-1]['SP2_num'] += row['SP2_num']
#                    merged_qtl[-1]['SP2'] += ',' + row['SP2']
#                else:
#                    merged_qtl.append(row)
#    merged_qtl = pd.DataFrame(merged_qtl)
#    return merged_qtl

def merge_qtl_phe(qtl):
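    """Merge one phenotype's clumped QTL whose intervals lie within 3 Mb of each other on the same chromosome."""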
    qtl = qtl.sort_values(by=['CHR', 'qtl_start'])
    merged_phe_qtl = list()
    for _, row in qtl.iterrows():
        if not merged_phe_qtl:
            merged_phe_qtl.append(row)
        else:
            if row['CHR'] != merged_phe_qtl[-1]['CHR']:
                merged_phe_qtl.append(row)
            else:
                # Merge intervals on the same chromosome that start within 3 Mb of the previous end.
                if row['qtl_start'] < merged_phe_qtl[-1]['qtl_end'] + 3000000:
                    if row['P'] < merged_phe_qtl[-1]['P']:
                        merged_phe_qtl[-1]['P'] = row['P']
                        merged_phe_qtl[-1]['SNP'] = row['SNP']
                    merged_phe_qtl[-1]['qtl_start'] = min(merged_phe_qtl[-1]['qtl_start'], row['qtl_start'])
                    merged_phe_qtl[-1]['qtl_end'] = max(merged_phe_qtl[-1]['qtl_end'], row['qtl_end'])
                    merged_phe_qtl[-1]['SP2_num'] += row['SP2_num']
                else:
                    merged_phe_qtl.append(row)
    merged_phe_qtl = pd.DataFrame(merged_phe_qtl)
    return merged_phe_qtl


def merge_qtl(qtl):
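    """Merge QTL across phenotypes when intervals overlap by at least 10% of the
    incoming QTL's length and the lead SNPs are within 2 Mb, pooling phe_name lists."""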
    qtl = qtl.sort_values(by=['CHR', 'qtl_start'])
    merged_qtl = pd.DataFrame()
    for _, row in qtl.iterrows():
        if merged_qtl.empty:
            merged_qtl = pd.concat([merged_qtl, row.to_frame().T])
        else:
            qtl_length = row['qtl_end'] - row['qtl_start']
            qtl_ratio = (merged_qtl['qtl_end'] - row['qtl_start']) / qtl_length
            # Candidate QTL on the same chromosome overlapping the current interval by at least 10%.
            qtl_index = (merged_qtl['CHR'] == row['CHR']) & (qtl_ratio >= 0.1)
            if qtl_index.sum() > 0:
                # Distance between lead-SNP positions (SNP ids encode the position after the last '_').
                peak_dis = (merged_qtl.loc[qtl_index, 'SNP'].apply(lambda x: int(x.split('_')[-1])) - int(row['SNP'].split('_')[-1])).abs()
                if (peak_dis <= 2000000).sum() == 0:
                    merged_qtl = pd.concat([merged_qtl, row.to_frame().T])
                else:
                    # peak_dis is already restricted to the candidate rows, so take its idxmin directly.
                    merged_qtl_index = peak_dis.idxmin()
                    if merged_qtl.loc[merged_qtl_index, 'P'] > row['P']:
                        merged_qtl.loc[merged_qtl_index, 'P'] = row['P']
                        merged_qtl.loc[merged_qtl_index, 'SNP'] = row['SNP']
                    merged_qtl.loc[merged_qtl_index, 'qtl_start'] = min(merged_qtl.loc[merged_qtl_index, 'qtl_start'], row['qtl_start'])
                    merged_qtl.loc[merged_qtl_index, 'qtl_end'] = max(merged_qtl.loc[merged_qtl_index, 'qtl_end'], row['qtl_end'])
                    merged_qtl.loc[merged_qtl_index, 'SP2_num'] += row['SP2_num']
                    merged_qtl.loc[merged_qtl_index, 'phe_name'] = merged_qtl.loc[merged_qtl_index, 'phe_name'] + ',' + row['phe_name']
            else:
                merged_qtl = pd.concat([merged_qtl, row.to_frame().T])
    return merged_qtl


def phe_cluster(phe, phe_labeled, n):
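    """Group correlated phenotypes (|r| >= 0.6 with their PC1) into clusters.

    Returns the cluster PC1 scores (raw phenotypes are kept for columns that
    do not cluster), the updated label table, and the next cluster index.
    """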
    pca = PCA(n_components=1)
    phe_pc1 = pca.fit_transform(phe)
    phe_corr = phe.corrwith(pd.Series(phe_pc1[:, 0], index=phe.index)).abs()
    if (phe_corr >= 0.6).all():
        # All phenotypes correlate well with PC1: the whole group collapses to one cluster.
        phe_labeled.loc[phe_corr.index, 'label'] = n
        return pd.DataFrame(phe_pc1, index=phe.index, columns=['cluster'+str(n)+'_PC1']), phe_labeled, n+1
    else:
        phe_pc1 = pd.DataFrame()
        # Iteratively peel off the PC1-correlated subset until every phenotype is assigned.
        while not phe.empty:
            phe_corr = phe.corrwith(pd.Series(pca.fit_transform(phe)[:, 0], index=phe.index)).abs()
            if (phe_corr < 0.6).sum() == 1:
                if phe_corr.shape[0] == 2:
                    phe_pc1 = pd.concat([phe_pc1, phe.loc[:, phe_corr.index]], axis=1)
                else:
                    phe_pc1 = pd.concat([phe_pc1, pd.DataFrame(pca.fit_transform(phe.loc[:, phe_corr >= 0.6]), index=phe.index, columns=['cluster'+str(n)+'_PC1'])], axis=1)
                    phe_labeled.loc[phe.loc[:, phe_corr >= 0.6].columns, 'label'] = n
                    n = n + 1
                    phe_pc1 = pd.concat([phe_pc1, phe.loc[:, phe_corr < 0.6]], axis=1)
                phe = pd.DataFrame()
            else:
                if (phe_corr >= 0.6).any():
                    if (phe_corr >= 0.6).sum() == 1:
                        phe_pc1 = pd.concat([phe_pc1, phe.loc[:, phe_corr >= 0.6]], axis=1)
                    else:
                        phe_pc1 = pd.concat([phe_pc1, pd.DataFrame(pca.fit_transform(phe.loc[:, phe_corr >= 0.6]), index=phe.index, columns=['cluster'+str(n)+'_PC1'])], axis=1)
                        phe_labeled.loc[phe.loc[:, phe_corr >= 0.6].columns, 'label'] = n
                        n = n + 1
                    phe = phe.loc[:, phe_corr < 0.6]
                else:
                    phe_pc1 = pd.concat([phe_pc1, phe], axis=1)
                    phe = pd.DataFrame()
                # phe_corr = phe.corrwith(pd.Series(pca.fit_transform(phe)[:,0],index=phe.index)).abs()
    return phe_pc1, phe_labeled, n


def generate_qtl(clump_result_dir, p2):
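    """Parse plink .clumped files into QTL tables.

    Returns (qtl_res, bad_qtl): QTL for phenotypes with few merged QTL
    (kept as reliable) and QTL for phenotypes with many merged QTL
    (flagged as unreliable).
    """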
    qtl_res = list()
    bad_qtl = list()
    for fn in glob.glob(clump_result_dir.strip('/')+'/*clumped'):
        phe_name = '_'.join(fn.split('/')[-1].split('_')[1:-1])
        clump_result = pd.read_csv(fn, sep=r'\s+')
        clump_result = clump_result.loc[clump_result.SP2 != 'NONE', :]
        qtl = clump_result[['CHR', 'BP', 'SNP', 'P', 'SP2']].copy()
        qtl.loc[:, 'SP2_num'] = qtl['SP2'].apply(lambda x: len(x.split(',')))
        qtl.loc[:, 'log10P'] = -np.log10(qtl['P'])
        if (qtl['SP2_num'] >= 10).sum() > 0:
            # QTL boundaries: positions of the first and last clumped SNP (encoded after '_' in the id).
            qtl['qtl_start'] = qtl['SP2'].apply(lambda x: int(re.findall(r'_(\d+)', x)[0]))
            qtl['qtl_end'] = qtl['SP2'].apply(lambda x: int(re.findall(r'_(\d+)', x)[-1]))
            qtl['phe_name'] = phe_name
            qtl_filter = qtl.loc[qtl.SP2_num >= 5, :]
            mer_qtl_filter = merge_qtl_phe(qtl_filter)
            mer_qtl_filter.loc[:, 'qtl_length'] = mer_qtl_filter['qtl_end'] - mer_qtl_filter['qtl_start'] + 1
            if mer_qtl_filter.shape[0] < 10:
                qtl = qtl.loc[(qtl.SP2_num >= 5) & (qtl.log10P >= -np.log10(p2)), ['CHR', 'qtl_start', 'qtl_end', 'SNP', 'P', 'SP2_num', 'phe_name']]
                mer_qtl = merge_qtl_phe(qtl)
                mer_qtl.loc[:, 'qtl_length'] = mer_qtl['qtl_end'] - mer_qtl['qtl_start'] + 1
                mer_qtl = mer_qtl.loc[:, ['CHR', 'qtl_start', 'qtl_end', 'SNP', 'P', 'SP2_num', 'qtl_length', 'phe_name']]
                if mer_qtl.shape[0] < 4:
                    qtl_res.append(mer_qtl)
                else:
                    bad_qtl.append(mer_qtl.loc[mer_qtl['SP2_num'] >= 10, :])
            else:
                bad_qtl.append(mer_qtl_filter.loc[mer_qtl_filter['SP2_num'] >= 10, ['CHR', 'qtl_start', 'qtl_end', 'SNP', 'P', 'SP2_num', 'qtl_length', 'phe_name']])
    qtl_res = pd.concat(qtl_res)
    qtl_res = qtl_res.loc[qtl_res['SP2_num'] >= 10, :]
    if not bad_qtl:
        bad_qtl = pd.DataFrame()
    else:
        bad_qtl = pd.concat(bad_qtl)
    return qtl_res, bad_qtl


def phe_PCA(omics_phe, qtl):
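    """Cluster phenotypes that share merged QTL and represent each cluster by its PC1.

    Returns the clustered phenotype matrix (cluster PC1s plus unclustered
    phenotypes) and the per-phenotype label table.
    """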
    omics_phe = omics_phe.fillna(omics_phe.mean())
    qtl_uniq = pd.DataFrame()
    # Keep one representative QTL per phenotype: the dominant one by SP2_num, otherwise the most significant.
    for phe_name in qtl['phe_name'].unique():
        qtl_sub = qtl.loc[qtl.phe_name == phe_name, :]
        if qtl_sub.shape[0] == 1:
            qtl_uniq = pd.concat([qtl_uniq, qtl_sub])
        else:
            qtl_sub = qtl_sub.sort_values(by=['SP2_num'], ascending=False)
            if qtl_sub.iloc[0, :]['SP2_num'] / qtl_sub.iloc[1, :]['SP2_num'] > 2:
                qtl_uniq = pd.concat([qtl_uniq, qtl_sub.iloc[0, :].to_frame().T])
            else:
                qtl_uniq = pd.concat([qtl_uniq, qtl_sub.loc[qtl_sub['P'].idxmin(), :].to_frame().T])
    merged_qtl_uniq = merge_qtl(qtl_uniq)
    merged_qtl_uniq.loc[:, 'phe_name_num'] = merged_qtl_uniq['phe_name'].apply(lambda x: len(x.split(',')))
    merged_qtl_uniq = merged_qtl_uniq.sort_values(by='phe_name_num', ascending=False)
    omics_sig_phe = omics_phe.loc[:, [p.replace('m.z', 'm/z') for p in qtl['phe_name'].unique()]]
    omics_sig_phe_labeled = omics_sig_phe.T
    omics_sig_phe_labeled.loc[:, 'label'] = 0
    omics_phe_pc = pd.DataFrame()
    n = 1
    # Cluster phenotypes that share a merged QTL and replace each cluster with its PC1.
    for phe_name in merged_qtl_uniq.loc[merged_qtl_uniq['phe_name_num'] >= 2, 'phe_name']:
        omics_phe_sub = omics_sig_phe.loc[:, [p.replace('m.z', 'm/z') for p in phe_name.split(',')]]
        omics_phe_sub_pc, omics_sig_phe_labeled, n = phe_cluster(omics_phe_sub, omics_sig_phe_labeled, n)
        omics_phe_pc = pd.concat([omics_phe_pc, omics_phe_sub_pc], axis=1)
    clustered_omics_phe = pd.merge(omics_phe_pc.loc[:, ~omics_phe_pc.columns.isin(omics_sig_phe.columns)], omics_sig_phe_labeled.loc[omics_sig_phe_labeled.label == 0, :].drop('label', axis=1).T, left_index=True, right_index=True)
    return clustered_omics_phe, omics_sig_phe_labeled