Diff of /modas/MODAS.py [000000] .. [a43cea]

Switch to unified view

a b/modas/MODAS.py
1
#!/usr/bin/env python
2
import sys, time
3
import pandas as pd
4
import numpy as np
5
import datetime
6
import argparse
7
import traceback
8
import subprocess
9
import warnings
10
import shutil
11
import glob
12
import os
13
import re
14
15
16
warnings.filterwarnings("ignore")
17
18
19
class Logger(object):
20
21
    def __init__(self, fn):
22
        self.log_file = open(fn, 'w')
23
24
    def log(self, msg):
25
        '''
26
        Print to log file and stdout with a single command.
27
28
        '''
29
        if isinstance(msg,list):
30
            msg = ' '.join(msg)
31
        else:
32
            msg = ('{T}: '+msg).format(T=time.ctime())
33
        print(msg,file=self.log_file)
34
        print(msg)
35
36
37
class ArgsError(Exception):
38
    def myexcepthook(type, value, tb):
39
        msg = ''.join(traceback.format_exception(type, value, tb))
40
        print(msg, end = '')
41
42
    sys.excepthook = myexcepthook
43
44
45
class FileTypeError(Exception):
46
    def myexcepthook(type, value, tb):
47
        msg = ''.join(traceback.format_exception(type, value, tb))
48
        print(msg, end = '')
49
50
    sys.excepthook = myexcepthook
51
52
53
def genoidx(args, log):
54
    import modas.genoidx as gi
55
56
    if not os.path.exists(args.g) and not os.path.exists(args.g+'.bed'):
57
        raise ArgsError('the genotype file {} is not exists.'.format(args.g))
58
    if args.convert:
59
        log.log('begin convert hapmap genotype file to plink bed format genotype file.')
60
        b = gi.hap2plink_bed(args.g, args.o)
61
        if b:
62
            args.g = args.o
63
            log.log('Successfully convert hapmap genotype file to plink bed format genotype file.')
64
        else:
65
            raise FileTypeError('{} is not a hapmap file or {} is empty.'.format(args.g, args.g))
66
    g = gi.read_genotype(args.g)
67
    if g is None:
68
            raise FileTypeError('{0}.bed is not a plink bed format file or {0}.bim file and {0}.fam file is not in the same folder with {0}.bed, or {0}.bed is empty.'.format(args.g))
69
    if args.clump:
70
        log.log('begin snp clumping...')
71
        if os.path.exists(args.o + '_clump.bed'):
72
            os.remove(args.o + '_clump.bed')
73
            os.remove(args.o + '_clump.bim')
74
            os.remove(args.o + '_clump.fam')
75
        g_clump_list = gi.snp_clumping(args.g+'.bed', 0.8, args.o + '_clump.bed')
76
        log.log('snp clumping done.')
77
        log.log('There are {0} snps in input genotype file, after clumping, {1} snp left.'.format(g_clump_list[0],g_clump_list[1]))
78
        log.log('clumped genotype file is saved as {0}_clump.bed, {0}_clump.bim and {0}_clump.fam.'.format(args.o))
79
    if args.genome_cluster:
80
        if args.w <= 0:
81
            raise ArgsError('the window size of chromosome should larger than zero')
82
        if args.s < 0:
83
            raise ArgsError('the step size should larger than zero')
84
        log.log('begin genome cluster analysis...')
85
        if args.clump:
86
            g_clump = gi.read_genotype(args.o + '_clump')
87
        else:
88
            g_clump = g
89
        #genome_cluster,variant_cluster = gi.genome_cluster(g_clump,args.w,args.s,args.p)
90
        genome_cluster = gi.genome_cluster(g_clump, args.w, args.s, args.p)
91
        if genome_cluster.empty:
92
            log.log('genome cluster analysis failed,Please check genotype file.')
93
            sys.exit()
94
        genome_cluster.to_csv(args.o+'.genome_cluster.csv')
95
        # variant_cluster.to_csv(args.o+'.variant_cluster.csv',index=False)
96
    log.log('genoidx is finishd!')
97
98
99
def phenorm(args, log):
100
    import modas.genoidx as gi
101
    import modas.phenorm as pn
102
103
    log.log('begin phenotype filter/normalize analysis...')
104
    if not os.path.exists(args.phe):
105
        raise ArgsError('the phenotype file {} is not exists.'.format(args.phe))
106
    phe = pd.read_csv(args.phe, index_col=0)
107
    if phe.empty:
108
        log.log('the phenotype file {} is empty and software is terminated.'.format(args.phe))
109
        sys.exit()
110
    if args.v is None and args.r is None and not args.log2 and not args.log10 and not args.ln and not args.norm and not args.qqnorm:
111
        log.log('No phenotype filter process or normalize process choosed, please select one of the argument in -a, -r, -log2, -log10, -ln, -norm, -qqnorm.')
112
        sys.exit()
113
    log.log('Read phenotype file finished,There are {} phenotypes in phenotype file.'.format(phe.shape[1]))
114
    if args.r is not None:
115
        if args.r < 0 or args.r > 1:
116
            raise ArgsError('the interval of missing ration is between 0 and 1.')
117
        log.log('filter phenotype missing ratio larger than {}'.format(args.r))
118
        phe = pn.missing_filter(phe, args.r)
119
        log.log('after filtered, There are {} phenotypes left.'.format(phe.shape[1]))
120
    if args.v is not None:
121
        if args.v <= 0:
122
            raise ArgsError('metabolite abundance cutoff should larger than zero.')
123
        log.log('filter phenotype values less than {}'.format(args.v))
124
        phe = pn.abundance_filter(phe, args.v)
125
        log.log('after filtered, There are {} phenotypes left.'.format(phe.shape[1]))
126
    phe = phe.loc[:, (phe.var()-0) >= 1e-6]
127
    if args.pca:
128
        log.log('correct trait by PCA')
129
        g = gi.read_genotype(args.g)
130
        if g is None:
131
            raise FileTypeError('{0}.bed is not a plink bed format file or {0}.bim file and {0}.fam file is not in the same folder with {0}.bed, or {0}.bed is empty.'.format(args.g))
132
        pca = pn.pc_calc(args.g + '.bed', 3)
133
        if pca is None:
134
            log.log('calculate PCA failed, phenotype correction analysis is not performed')
135
        else:
136
            pca_phe_idx = pca.index.intersection(phe.index)
137
            phe = pn.trait_correct(pca.reindex(pca_phe_idx), phe.reindex(pca_phe_idx))
138
            log.log('correcting phenotype is successed')
139
    if args.log2:
140
        log.log('log2 scale phenotype values.')
141
        phe = pn.log2_scale(phe)
142
    if args.log10:
143
        log.log('log10 scale phenotype values.')
144
        phe = pn.log10_scale(phe)
145
    if args.ln:
146
        log.log('ln scale phenotype values.')
147
        phe = pn.ln_scale(phe)
148
    if args.norm:
149
        log.log('normalize phenotype values')
150
        phe = pn.normalize_scale(phe)
151
    if args.qqnorm:
152
        log.log('normal quantile transform phenotype values')
153
        phe = phe.apply(pn.qqnorm)
154
    phe.to_csv(args.o+'.normalized_phe.csv', na_rep='NA')
155
    log.log('phenotype filter/normalize analysis is finished.')
156
157
158
def prescreen(args, log):
159
    import modas.prescreen as ps
160
161
    log.log('begin phenotype prescreen analysis...')
162
    if not os.path.exists(args.phe):
163
        raise ArgsError('the phenotype file {} is not exists.'.format(args.phe))
164
    phe = pd.read_csv(args.phe, index_col=0)
165
    if phe.empty:
166
        log.log('the phenotype file {} is empty and software is terminated.'.format(args.phe))
167
        sys.exit()
168
    log.log('Read phenotype file finished,There are {} phenotypes in phenotype file.'.format(phe.shape[1]))
169
    if not os.path.exists(args.genome_cluster):
170
        raise ArgsError('genome cluster file is not exists, can not run phenotype prescreen analysis,please assign a exist file for -genome_cluster.')
171
    if args.gwas_model == 'MLM':
172
        if args.g is None:
173
            log.log('the plink bed format genotype  not provided, Using LM(linear model) for pseudo-genotype file GWAS analysis.')
174
            args.gwas_model = 'LM'
175
        elif not os.path.exists(args.g+'.bed'):
176
            log.log('the plink bed format genotype file {}.bed is not exists. Using LM(linear model) for pseudo-genotype file GWAS analysis'.format(args.g))
177
            args.gwas_model = 'LM'
178
    genome_cluster = pd.read_csv(args.genome_cluster, index_col=0)
179
    if genome_cluster.empty:
180
        log.log('genome cluster file {} is empty and software is terminated.'.format(args.genome_cluster))
181
        sys.exit()
182
    if os.path.exists('tmp_omics_phe_bimbam'):
183
        shutil.rmtree('tmp_omics_phe_bimbam')
184
    os.mkdir('tmp_omics_phe_bimbam')
185
    if os.path.exists('output'):
186
        os.rename('output','output'+datetime.datetime.now().strftime('%Y-%m-%d_%H:%M:%S'))
187
    #if args.gwas_model =='MLM':
188
    #    fam = pd.read_csv(args.g+'.fam', sep=r'\s+', header=None,index_col=0)
189
    #    sample_id = fam.index.intersection(genome_cluster.index)
190
    #    genome_cluster = genome_cluster.reindex(sample_id)
191
    #    geno_prefix = args.g.split('/')[-1]
192
    #else:
193
    #    sample_id = genome_cluster.index
194
    #    geno_prefix = '.'.join(args.genome_cluster.split('/')[-1].split('.')[:-2])
195
    #phe = phe.reindex(sample_id)
196
    phe = phe.reindex(genome_cluster.index)
197
    geno_prefix = '.'.join(args.genome_cluster.split('/')[-1].split('.')[:-2])
198
    log.log('convert genome cluster file to bimbam genotype file.')
199
    a, g = ps.qtl_pc2bimbam(genome_cluster)
200
    a.to_csv('tmp_omics_phe_bimbam/'+geno_prefix+'_qtl_pc.anno.txt',index=False,header=None)
201
    g.to_csv('tmp_omics_phe_bimbam/'+geno_prefix+'_qtl_pc.geno.txt',index=False,header=None)
202
    if args.gwas_suggest_pvalue is None:
203
        args.gwas_suggest_pvalue = 1.0 / genome_cluster.shape[1]
204
    log.log('running pseudo-genotype GWAS for significant phenotype filter.')
205
    s = ps.qtl_pc_gwas_parallel(phe, 'tmp_omics_phe_bimbam', args.p, args.g, geno_prefix, args.gwas_model)
206
    # if args.lm_suggest_pvalue<1:
207
    #     log.log('run lm model for genome cluster pseudo-SNP filter.')
208
    #     s = ps.qtl_pc_lm_gwas_parallel(phe,'tmp_omics_phe_bimbam',args.p,args.g)
209
    #     ps.generate_omics_qtl_pc_bimbam(phe,a,g,args.lm_suggest_pvalue,args.p)
210
    #     log.log('run lmm model for Significant phenotype filter.')
211
    #     s = ps.qtl_pc_lmm_gwas_parallel(phe,'tmp_omics_phe_bimbam',args.p,args.g,sample_id)
212
    # else:
213
    #     log.log('run lmm model for Significant phenotype filter.')
214
    #     s = ps.qtl_pc_lmm_gwas_parallel(phe, 'tmp_omics_phe_bimbam', args.p, args.g, sample_id, args.lm_suggest_pvalue)
215
    sig_omics_phe, phe_sig_qtl = ps.prescreen(phe, args.gwas_suggest_pvalue)
216
    sig_omics_phe.to_csv(args.o+'.sig_omics_phe.csv', na_rep='NA')
217
    phe_sig_qtl.to_csv(args.o+'.phe_sig_qtl.csv', index=False)
218
    log.log('prescreened phenotype saved successfully')
219
220
221
def regiongwas(args, log):
222
    import modas.regiongwas as rg
223
224
    log.log('begin regiongwas analysis...')
225
    if not os.path.exists(args.phe):
226
        raise ArgsError('the phenotype file {} is not exists.'.format(args.phe))
227
    phe = pd.read_csv(args.phe, index_col=0)
228
    if phe.empty:
229
        log.log('the phenotype file {} is empty and software is terminated.'.format(args.phe))
230
        sys.exit()
231
    log.log('Read phenotype file finished,There are {} phenotypes in phenotype file.'.format(phe.shape[1]))
232
    if not os.path.exists(args.phe_sig_qtl):
233
        raise ArgsError('the phenotype significant qtl file {} is not exists.'.format(args.phe_sig_qtl))
234
    phe_sig_qtl = pd.read_csv(args.phe_sig_qtl)
235
    if phe_sig_qtl.empty:
236
        log.log('the phenotype significant qtl file {} is empty and software is terminated.'.format(args.phe))
237
        sys.exit()
238
    if not os.path.exists(args.g+'.bed'):
239
        raise ArgsError('the plink bed format genotype file {} is not exists.'.format(args.g))
240
    if os.path.exists('tmp_qtl_bed'):
241
        shutil.rmtree('tmp_qtl_bed')
242
    os.mkdir('tmp_qtl_bed')
243
    if os.path.exists('tmp_rs_dir'):
244
        shutil.rmtree('tmp_rs_dir')
245
    os.mkdir('tmp_rs_dir')
246
    log.log('begin generate phenotype significant gwas region SNP genotype file...')
247
    rg.generate_qtl_batch(phe,phe_sig_qtl,args.g,args.p,'tmp_qtl_bed','tmp_rs_dir')
248
    log.log('begin region gwas analysis of phenotype significant gwas region...')
249
    s = rg.region_gwas_parallel('tmp_qtl_bed', args.p,args.g,args.gwas_model)
250
    log.log('begin generate QTL of phenotypes...')
251
    s = rg.generate_clump_input('output', args.gwas_model)
252
    if args.p1 is None:
253
        bim = pd.read_csv(args.g+'.bim', header=None, sep='\s+')
254
        args.p1 = 1.0 / bim.shape[0]
255
    if args.p2 is None:
256
        args.p2 = args.p1 * 10
257
    s = rg.plink_clump('tmp_qtl_bed', args.p1, args.p2, args.p)
258
    qtl_res, bad_qtl = rg.generate_qtl('clump_result', args.p2)
259
    qtl_res.index = np.arange(qtl_res.shape[0])
260
    qtl_res.to_csv(args.o+'.region_gwas_qtl_res.csv', index=False)
261
    bad_qtl.to_csv(args.o+'.region_gwas_bad_qtl_res.csv', index=False)
262
    log.log('generate QTL done.')
263
    if args.cluster:
264
        log.log('begin cluster phenotype analysis...')
265
        clustered_phe, phe_labeled = rg.phe_PCA(phe, qtl_res)
266
        clustered_phe.to_csv(args.o+'.clustered_phe.csv')
267
        phe_labeled.to_csv(args.o+'.phe_labeled.csv')
268
        log.log('cluster phenotype analysis done')
269
    log.log('region gwas analysis is finished.')
270
271
272
def mr(args, log):
273
    import modas.genoidx as gi
274
    import modas.mr as MR
275
276
    utils_path = subprocess.check_output('locate modas/utils', shell=True, text=True, encoding='utf-8')
277
    # utils_path = '/'.join(re.search('\n(.*site-packages.*)\n', utils_path).group(1).split('/')[:-1])
278
    utils_path = re.search('\n(.*site-packages.*)\n', utils_path).group(1)
279
    if not utils_path.endswith('utils'):
280
        utils_path = '/'.join(utils_path.split('/')[:-1])
281
282
    log.log('begin Mendelian Randomization analysis...')
283
    if not os.path.exists(args.exposure):
284
        raise ArgsError('the exposure file {} is not exists.'.format(args.exposure))
285
    mTrait = pd.read_csv(args.exposure, index_col=0)
286
    mTrait.columns = [col.replace('m/z', 'm.z') for col in mTrait.columns]
287
    if mTrait.empty:
288
        log.log('the exposure file {} is empty and software is terminated.'.format(args.exposure))
289
        sys.exit()
290
    if not os.path.exists(args.outcome):
291
        raise ArgsError('the outcome file {} is not exists.'.format(args.outcome))
292
    pTrait = pd.read_csv(args.outcome, index_col=0)
293
    if pTrait.empty:
294
        log.log('the outcome file {} is empty and software is terminated.'.format(args.outcome))
295
        sys.exit()
296
    if not os.path.exists(args.qtl):
297
        raise ArgsError('the exposure QTL file {} is not exists.'.format(args.qtl))
298
    qtl = pd.read_csv(args.qtl)
299
    if qtl.empty:
300
        log.log('the exposure QTL file {} is empty and software is terminated.'.format(args.qtl))
301
        sys.exit()
302
    if not os.path.exists(args.g+'.bed'):
303
        raise ArgsError('the plink bed format genotype file {} is not exists.'.format(args.g))
304
    if args.lm and args.mlm:
305
        log.log('please select linear model or mixed linear model for Mendelian Randomization analysis.')
306
        log.log('software is terminated.')
307
        sys.exit()
308
    if not args.lm and not args.mlm:
309
        log.log('please select a model from linear model and mixed linear model for Mendelian Randomization analysis.')
310
        log.log('software is terminated.')
311
        sys.exit()
312
    if mTrait.columns.isin(qtl.phe_name).sum()>0:
313
        mTrait = mTrait.loc[:, mTrait.columns.isin(qtl.phe_name)]
314
    else:
315
        log.log('there is no mTrait in exposure QTL result, please check your QTL file or exposure file.')
316
        log.log('software is terminated.')
317
        sys.exit()
318
    if args.lm:
319
        log.log('perform Mendelian Randomization through linear model')
320
        g = gi.read_genotype(args.g)
321
        if g is None:
322
            raise FileTypeError('{0}.bed is not a plink bed format file or {0}.bim file and {0}.fam file is not in the same folder with {0}.bed, or {0}.bed is empty.'.format(args.g))
323
        g = g.where(g.snp.isin(qtl.SNP), drop=True)
324
        g = pd.DataFrame(g.values, index=g.sample, columns=g.snp.values)
325
        ril = g.index.intersection(mTrait.index.intersection(pTrait.index))
326
        g = g.reindex(ril)
327
        mTrait = mTrait.reindex(ril)
328
        pTrait = pTrait.reindex(ril)
329
        res = MR.MR_parallel(qtl, mTrait, pTrait, g, args.p, args.pvalue)
330
        res.to_csv(args.o + '.MR.csv', index=False)
331
        log.log('Successfully perform Mendelian randomization analysis using linear model')
332
    if args.mlm:
333
        log.log('perform Mendelian Randomization through mixed linear model')
334
        MR.generate_geno_batch(qtl, mTrait, pTrait, args.g, args.p, 'tmp_mr_bed', 'tmp_mr_rs')
335
        MR.calc_MLM_effect('tmp_mr_bed', pTrait, args.p, args.g)
336
        mTrait_effect, pTrait_effect, pTrait_se = MR.get_MLM_effect_parallell('./output', mTrait, pTrait, args.p)
337
        res = MR.MR_MLM_parallel(qtl, mTrait_effect, pTrait_effect, pTrait_se, args.p, args.pvalue)
338
        res.to_csv(args.o+'.MR.csv', index=False)
339
        log.log('Successfully perform Mendelian randomization analysis using mixed linear model')
340
    log.log('Mendelian Randomization analysis is successed')
341
    # res = pd.read_csv('chr_HAMP_kernel_eqtl_new.local_gwas_qtl_res.MR.csv')
342
    if args.net:
343
        log.log('Mendelian Randomization network analysis')
344
        edge_weight = MR.edge_weight(qtl, res)
345
        edge_weight = edge_weight.loc[edge_weight.weight >= 0.2, :]
346
        edge_weight = edge_weight.sort_values(by=['mTrait', 'pTrait', 'weight'], ascending=False).drop_duplicates(subset=['mTrait', 'pTrait'])
347
        edge_weight.to_csv(args.o+'.edgelist', sep='\t', header=None, index=False)
348
        subprocess.call('java -jar ' + utils_path +'/cluster_one-1.0.jar -f edge_list -F csv '+args.o+'.edgelist'+' >'+args.o+'.cluster_one.result.csv 2>/dev/null',shell=True)
349
        cluster_one_res = pd.read_csv(args.o+'.cluster_one.result.csv')
350
        cluster_one_res.loc[(cluster_one_res['P-value'] <= 0.05) & (cluster_one_res['Size'] >= 5), :].to_csv(args.o+'.sig.cluster_one.result.csv', index=False)
351
        log.log('Mendelian Randomization network analysis is successed')
352
353
354
def visual(args, log):
355
    import modas.genoidx as gi
356
    import modas.visual as vis
357
358
    utils_path = subprocess.check_output('locate modas/utils', shell=True, text=True, encoding='utf-8')
359
    # utils_path = '/'.join(re.search('\n(.*site-packages.*)\n', utils_path).group(1).split('/')[:-1])
360
    utils_path = re.search('\n(.*site-packages.*)\n', utils_path).group(1)
361
    if not utils_path.endswith('utils'):
362
        utils_path = '/'.join(utils_path.split('/')[:-1])
363
364
    log.log('begin genome-wide association analysis and visualization')
365
    if not os.path.exists(args.phe):
366
        raise ArgsError('the phenotype file {} is not exists.'.format(args.phe))
367
    phe = pd.read_csv(args.phe, index_col=0)
368
    if phe.empty:
369
        log.log('the phenotype file {} is empty and software is terminated.'.format(args.phe))
370
        sys.exit()
371
    if not os.path.exists(args.qtl):
372
        raise ArgsError('the phenotype QTL file {} is not exists.'.format(args.qtl))
373
    qtl = pd.read_csv(args.qtl)
374
    if qtl.empty:
375
        log.log('the phenotype QTL file {} is empty and software is terminated.'.format(args.qtl))
376
        sys.exit()
377
    phe = phe.loc[:, phe.columns.isin(qtl.phe_name)]
378
    if not os.path.exists(args.g + '.bed'):
379
        raise ArgsError('the plink bed format genotype file {} is not exists.'.format(args.g))
380
    g = gi.read_genotype(args.g)
381
    if g is None:
382
        raise FileTypeError('{0}.bed is not a plink bed format file or {0}.bim file and {0}.fam file is not in the same folder with {0}.bed, or {0}.bed is empty.'.format(args.g))
383
    log.log('begin GWAS analysis')
384
    s = vis.gwas(phe, args.g, args.p, args.phe, args.gwas_model)
385
    if s is None:
386
        log.log('calculated related_martix faild, please check your genotype file.')
387
        sys.exit()
388
    else:
389
        if not isinstance(s, list) and s == 1:
390
            log.log('GWAS analysis is failed, please check your genotype file or phenotype file.')
391
            sys.exit()
392
        status = np.array(s) == 0
393
        fail_phe = phe.columns[~status].astype(str)
394
        if not status.all():
395
            log.log('There are {0} omics traits GWAS is failed , and omics trait names are {1}'.format(np.sum(~status), ','.join(fail_phe)))
396
        else:
397
            log.log('All phenotype GWAS done.')
398
    if args.visual:
399
        if not os.path.exists(args.phe):
400
            raise ArgsError('the gene annotation file {} is not exists.'.format(args.anno))
401
        anno = pd.read_csv(args.anno, sep='\t')
402
        if anno.empty:
403
            log.log('the gene annotation file {} is empty and software is terminated.'.format(args.phe))
404
            sys.exit()
405
        qtl_anno = vis.qtl_anno(qtl, anno)
406
        qtl_anno.to_csv('test_anno.csv', index=False)
407
        if os.path.exists(args.o):
408
            shutil.rmtree(args.o)
409
        os.makedirs(args.o+'/manhattan_plot')
410
        os.makedirs(args.o+'/qqplot')
411
        os.makedirs(args.o+'/boxplot')
412
        shutil.copytree(utils_path + '/assets', args.o + '/assets')
413
        log.log('begin plot manhattan plot and qqplot')
414
        s = vis.gwas_plot_parallel(phe, 1e-2, args.p, 'jpg', args.phe, args.gwas_model)
415
        log.log('begin plot boxplot')
416
        vis.boxplot(phe, g, qtl)
417
        log.log('begin plot multi trait manhattan plot')
418
        vis.multi_trait_plot(phe, './output/', qtl, args.phe, 'multi_trait', 'jpg', args.gwas_model)
419
        for fn in glob.glob('Manhattan.*jpg'):
420
            if fn != 'Manhattan.multi_trait.jpg':
421
                shutil.move(fn, args.o+'/manhattan_plot')
422
        for fn in glob.glob('QQplot.*jpg'):
423
            shutil.move(fn, args.o+'/qqplot')
424
        for fn in glob.glob('*boxplot.jpg'):
425
            shutil.move(fn, args.o+'/boxplot')
426
        shutil.move('./Manhattan.multi_trait.jpg', args.o)
427
        fam = pd.read_csv(args.g+'.fam',sep='\s+')
428
        vis.generateHtml(qtl_anno, args.anno, args.o, fam.shape[0])
429
    log.log('Genome-wide association analysis and visualization is down')
430
431
432
def contrast(args, log):
433
    import modas.genoidx as gi
434
    import modas.phenorm as pn
435
    import modas.contrast as ct
436
437
    log.log('begin to construct stress response index and identify stress-responsive molecular QTL...')
438
    if not os.path.exists(args.stress_phe):
439
        raise ArgsError('the stress omics data file {} is not exists.'.format(args.stress_phe))
440
    stress_omics = pd.read_csv(args.stress_phe, index_col=0)
441
    stress_omics.columns = [col.replace('m/z', 'm.z') for col in stress_omics.columns]
442
    if stress_omics.empty:
443
        log.log('the stress omics data file {} is empty and software is terminated.'.format(args.stress_phe))
444
        sys.exit()
445
    if not os.path.exists(args.control_phe):
446
        raise ArgsError('the control omics data file {} is not exists.'.format(args.control_phe))
447
    control_omics = pd.read_csv(args.control_phe, index_col=0)
448
    control_omics.columns = [col.replace('m/z', 'm.z') for col in control_omics.columns]
449
    if control_omics.empty:
450
        log.log('the control omics data file {} is empty and software is terminated.'.format(args.control_phe))
451
        sys.exit()
452
    omics_scpca_phe, omics_scpca_test = ct.scpca_omics(stress_omics, control_omics, args.alpha, args.n_comp, args.beta_test_pvalue)
453
    omics_scpca_phe.to_csv(args.o+'.scpca_pc.phe.csv', na_rep='NA')
454
    omics_scpca_test.to_csv(args.o+'.scpca_pc.beta_test.csv', index=False)
455
    log.log('The construction of stress response index is completed.')
456
    if args.gwas:
457
        log.log('normal quantile transform phenotype values.')
458
        omics_scpca_phe = omics_scpca_phe.apply(pn.qqnorm)
459
        omics_scpca_phe.to_csv(args.o+'.scpca_pc.normalized_phe.csv', na_rep='NA')
460
        ps_args = argparse.Namespace()
461
        for arg, value in zip(('phe', 'genome_cluster', 'g', 'gwas_model', 'gwas_suggest_pvalue', 'p','o'), (args.o+'.scpca_pc.normalized_phe.csv', args.genome_cluster, args.g, args.gwas_model, None, args.p, args.o)):
462
            setattr(ps_args, arg, value)
463
        rg_args = argparse.Namespace()
464
        for arg, value in zip(('phe', 'phe_sig_qtl', 'g', 'gwas_model', 'p1', 'p2', 'p', 'cluster', 'o'),(args.o+'.sig_omics_phe.csv', args.o+'.phe_sig_qtl.csv', args.g, args.gwas_model, None, None, args.p, False, args.o)):
465
            setattr(rg_args, arg, value)
466
        prescreen(ps_args, log)
467
        regiongwas(rg_args, log)
468
        scpca_qtl_res = pd.read_csv(args.o+'.region_gwas_qtl_res.csv')
469
        geno = gi.read_genotype(args.g)
470
        var = pd.Series(geno.variant, index=geno.snp)
471
        geno = geno.sel(variant=var.reindex(scpca_qtl_res.SNP.unique()).values, drop=True)
472
        geno = pd.DataFrame(geno.values, index=geno.fid, columns=geno.snp)
473
        scpca_qtl_res = ct.anova(stress_omics, control_omics, scpca_qtl_res, geno)
474
        scpca_qtl_res.to_csv(args.o+'.region_gwas_qtl_res.anova.csv', index=False)
475
        os.remove(args.o + '.phe_sig_qtl.csv')
476
        os.remove(args.o + '.sig_omics_phe.csv')
477
        os.remove(args.o + '.region_gwas_qtl_res.csv')
478
        scpca_qtl_res.loc[scpca_qtl_res.pvalue<=0.05, :].to_csv(args.o+'.region_gwas_qtl_res.anova.sig.csv', index=False)
479
        log.log('two-way ANOVA analysis done')
480
        log.log('Identification of stress-responsive molecule QTL done.')
481
482
483
def coloc(args, log):
484
    import modas.genoidx as gi
485
    import modas.coloc as cl
486
487
    log.log('Begin co-localization analysis of multiple traits...')
488
    if not os.path.exists(args.qtl):
489
        raise ArgsError('the molecular QTL file {} is not exists.'.format(args.qtl))
490
    qtl = pd.read_csv(args.qtl)
491
    if not os.path.exists(args.g+'.bed'):
492
        raise ArgsError('the genotype file {} is not exists.'.format(args.g))
493
    g = gi.read_genotype(args.g)
494
    if not os.path.exists(args.gwas_dir):
495
        raise ArgsError('the gwas result directory {} is not exists.'.format(args.gwas_dir))
496
    qtl = cl.qtl_cluster(qtl)
497
    log.log('Start calculating the kinship matrix for each molecular feature based on the significantly associated SNP')
498
    kin_info = cl.get_kin_info(qtl, args.gwas_dir, g, args.pvalue)
499
    if not kin_info:
500
        log.log('There are no GWAS results for molecular features in the directory or the GWAS results for molecular features do not exceed the threshold')
501
        log.log('co-localization analysis is terminated.')
502
        sys.exit()
503
    log.log('start multi-trait co-localization analysis based on approximate image matching algorithm')
504
    coloc_res, coloc_pairwise_res, dis = cl.trait_coloc(kin_info, qtl, args.metric, args.eps, args.p)
505
    coloc_res.to_csv(args.o + '.coloc_res.csv', index=False)
506
    coloc_pairwise_res.to_csv(args.o + '.coloc_pairwise.csv', index=False)
507
    dis.to_csv(args.o + '.dis_res.csv', na_rep=1)
508
    log.log('multi-trait co-localization analysis is down')
509
510
511
def MODAS(args):
512
    if args.command == 'genoidx':
513
        log = Logger(args.o+'.genoidx.log.txt')
514
        log.log(sys.argv)
515
        genoidx(args, log)
516
    if args.command == 'phenorm':
517
        log = Logger(args.o+'.phenorm.log.txt')
518
        log.log(sys.argv)
519
        phenorm(args, log)
520
    if args.command == 'prescreen':
521
        log = Logger(args.o+'.prescreen.log.txt')
522
        log.log(sys.argv)
523
        prescreen(args, log)
524
    if args.command == 'regiongwas':
525
        log = Logger(args.o+'.regiongwas.log.txt')
526
        log.log(sys.argv)
527
        regiongwas(args, log)
528
    if args.command == 'mr':
529
        log = Logger(args.o+'.mr.log.txt')
530
        log.log(sys.argv)
531
        mr(args, log)
532
    if args.command == 'visual':
533
        log = Logger('.'.join(args.phe.split('/')[-1].split('.')[:-1])+'.visual.log.txt')
534
        log.log(sys.argv)
535
        visual(args, log)
536
    if args.command == 'contrast':
537
        log = Logger(args.o+'.contrast.log.txt')
538
        log.log(sys.argv)
539
        contrast(args, log)
540
    if args.command == 'coloc':
541
        log = Logger(args.o+'.coloc.log.txt')
542
        log.log(sys.argv)
543
        coloc(args, log)
544
545
546
def main():
547
    if len(sys.argv) <= 2:
548
        sys.argv.append('-h')
549
    args = parser.parse_args()
550
    MODAS(args)
551
552
553
USAGE = ' MODAS: Multi-omics data association study '
554
555
parser = argparse.ArgumentParser(usage='%(prog)s command [options]', description=USAGE)
556
subparsers = parser.add_subparsers(title='command', metavar='', dest='command', prog=parser.prog)
557
parser_genoidx = subparsers.add_parser('genoidx', help='generate a pseudo-genotype file', usage='%(prog)s [options]')
558
parser_genoidx.add_argument('-g', metavar='', help='Genotype file used for genome-wide genotype dimensionality reduction analysis, support plink bed format and hapmap format,the plink bed format file only needs to provide the file prefix, the hapmap format needs to be used together with the parameter -convert')
559
parser_genoidx.add_argument('-w', default=1000000, type=float, metavar='[default:1000000]', help='Window size for genome-wide genotype dimensionality reduction analysis(bp), default 1000000')
560
parser_genoidx.add_argument('-s', default=500000, type=float, metavar='[default:500000]', help='Step size for genome-wide genotype dimensionality reduction analysis(bp), default 500000')
561
parser_genoidx.add_argument('-genome_cluster', action='store_true', help='Perform genome-wide genotype dimensionality reduction analysis on genotype file')
562
parser_genoidx.add_argument('-convert', action='store_true', help='Convert hapmap genotype format to plink bed format genotype')
563
parser_genoidx.add_argument('-clump', action='store_true', help='Clumping analysis for genotype file, used to keep a subset of SNPs that are nearly uncorrelated with each other, thereby reducing the number of total SNPs to speed up genome-wide genotype dimensionality reduction analysis')
564
parser_genoidx.add_argument('-p', default=1, type=int, metavar='[default:1]', help='Number of threads for genome cluster analysis,max threads is total number of genome chromosomes')
565
parser_genoidx.add_argument('-o', default='MODAS_genoidx_output', metavar='[default:MODAS_genoidx_output]', help='The prefix of the output file, default MODAS_genoidx_output')
566
parser_phenorm = subparsers.add_parser('phenorm', help='phenotype normalization and transformation', usage='%(prog)s [options]')
567
parser_phenorm.add_argument('-phe', metavar='', help='Phenotype file for phenotype preprocessing and transformation, the file format is csv format, the first column and the first row are the names of inbred lines and phenotypes respectively')
568
parser_phenorm.add_argument('-g', metavar='', help='Genotype file in plink bed format for principal component analysis, used with the parameter -pca to correct differences in phenotype caused by population structure')
569
parser_phenorm.add_argument('-r', default=None, type=float, metavar='', help='Phenotype missing ratio filter cutoff')
570
parser_phenorm.add_argument('-v', default=None, type=float, metavar='', help='Phenotype value filter cutoff')
571
parser_phenorm.add_argument('-log2', action='store_true', help=' log2 transformation of phenotype')
572
parser_phenorm.add_argument('-log10', action='store_true', help='log10 trasform phenotype')
573
parser_phenorm.add_argument('-ln', action='store_true', help='ln transform phenotype')
574
parser_phenorm.add_argument('-norm', action='store_true', help='Boxcox transformation of phenotype')
575
parser_phenorm.add_argument('-qqnorm', action='store_true', help='Normal quantile transformation of phenotype')
576
parser_phenorm.add_argument('-pca', action='store_true', help='Correct the differences in phenotype caused by population structure through PCA')
577
parser_phenorm.add_argument('-o', default='MODAS_phenorm_output', metavar='[default:MODAS_phenorm_output]', help='output file prefix')
578
parser_prescreen = subparsers.add_parser('prescreen', help='prescreening of associated QTLs with lm/lmm', usage='%(prog)s [options]')
579
parser_prescreen.add_argument('-g', default=None, metavar='', help='Genotype file in plink bed format, used to calculate the kinship matrix for GWAS analysis')
580
parser_prescreen.add_argument('-genome_cluster', metavar='', help='Pseudo-genotype file for phenotype pre-screening, generated by subcommand genoidx')
581
parser_prescreen.add_argument('-phe', metavar='', help='Phenotype file for phenotype pre-screening analysis, the file format is csv format, the first column and the first row are the names of inbred lines and phenotypes respectively')
582
parser_prescreen.add_argument('-gwas_suggest_pvalue', default=None, type=float, metavar='[default:1/ genome cluster file converted pseudo-genotype number]', help='suggested GWAS P value for pseudo-genotype filtering in regional association analysis, default 1/ pseudo-genotype file pseudo-snp number')
583
parser_prescreen.add_argument('-gwas_model', default='MLM', type=str, metavar='[default:MLM]', help='model for pseudo-genotype GWAS analysis, supporting LM(linear model), GLM(general linear model) and MLM(mixed linear model) model, default MLM')
584
parser_prescreen.add_argument('-p', default=1, type=int, metavar='[default:1]', help='Number of threads for analysis in prescreen sub command')
585
parser_prescreen.add_argument('-o', default='MODAS_prescreen_output', metavar='[default:MODAS_prescreen_output]', help='The prefix of the output file, default MODAS_prescreen_output')
586
parser_regiongwas = subparsers.add_parser('regiongwas', help='perform regiongwas to identify QTLs', usage='%(prog)s [options]')
587
parser_regiongwas.add_argument('-g', metavar='', help='Genotype file in plink bed format, used to calculate the kinship matrix for regional gwas analysis and extract snp in significant association regions of phenotypes')
588
parser_regiongwas.add_argument('-phe', metavar='', help='Candidate phenotype file generated by subcommand prescreen')
589
parser_regiongwas.add_argument('-phe_sig_qtl', metavar='', help='Significant association regions of candidate phenotypes file generated by subcommand prescreen')
590
parser_regiongwas.add_argument('-p1', default=None, type=float, metavar='[default:1/snp number in genotype file]', help='Significance threshold for index SNPs, used to determine the phenotype with QTL and index snps of phenotype')
591
parser_regiongwas.add_argument('-p2', default=None, type=float, metavar='[default:10/snp number in genotype file]', help='Secondary significance threshold for clumped SNPs, Used to obtain the secondary significant SNP linked to index snp to determine the reliability of the QTL. MODAS outputs the QTL containing 10 secondary significant SNPs as the phenotypic QTL result')
592
parser_regiongwas.add_argument('-gwas_model', default='MLM', type=str, metavar='[default:MLM]', help='GWAS model for region association analysis, supporting LM, GLM and MLM, default MLM.')
593
parser_regiongwas.add_argument('-cluster', action='store_true', help='Cluster phenotype by QTL positon')
594
parser_regiongwas.add_argument('-p', default=1, type=int, metavar='[default:1]', help='Number of threads for regional gwas analysis, default 1')
595
parser_regiongwas.add_argument('-o', default='MODAS_regiongwas_out', metavar='[default:MODAS_regiongwas_out]', help='The prefix of the output file, default MODAS_regiongwas_out')
596
parser_mr = subparsers.add_parser('mr', help='perform Mendelian Randomization and identify function modules', usage='%(prog)s [options]')
597
parser_mr.add_argument('-g', metavar='', help='Genotype file in plink bed format')
598
parser_mr.add_argument('-exposure', metavar='', help='Exposure phenotype file, such as mTrait phenotype, the file format is csv format, the first column and the first row are the names of inbred lines and phenotypes respectively')
599
parser_mr.add_argument('-outcome', metavar='', help='Outcome phenotype file, such as pTrait phenotype, the file format is csv format, the first column and the first row are the names of inbred lines and phenotypes respectively')
600
parser_mr.add_argument('-qtl', metavar='', help='Exposure phenotype QTL file, generated by subcommand regiongwas')
601
parser_mr.add_argument('-lm', action='store_true', help='Perform Mendelian Randomization through linear model')
602
parser_mr.add_argument('-mlm', action='store_true', help='Perform Mendelian Randomization through mixed linear model')
603
parser_mr.add_argument('-p', default=1, type=int, metavar='[default:1]', help='Number of threads for Mendelian Randomization analysis')
604
parser_mr.add_argument('-pvalue', default=1, type=float, metavar='[default:1]', help='The pvalue cutoff of Mendelian randomization analysis result output,default 1')
605
parser_mr.add_argument('-net', action='store_true', help='Mendelian Randomization network analysis, used to identify functional modules')
606
parser_mr.add_argument('-o', default='MODAS_mr_out', metavar='[default:MODAS_mr_out]', help='The prefix of the output file, default MODAS_mr_out')
607
parser_visual = subparsers.add_parser('visual', help='whole genome-wide association analysis and visualization', usage='%(prog)s [options]')
608
parser_visual.add_argument('-g', metavar='', help='Genotype file in plink bed format for whole genome-wide association analysis')
609
parser_visual.add_argument('-phe', metavar='', help='Phenotype file for GWAS analysis and visualization')
610
parser_visual.add_argument('-gwas_model', default='gemma_MLM', type=str, metavar='[default:gemma_MLM]', help='model for GWAS analysis, supports the models provided by gemma, rMVP, and GAPIT software, including gemma_LM, gemma_MLM, rMVP_GLM, rMVP_MLM, rMVP_FarmCPU, GAPIT_GLM, GAPIT_MLM, GAPIT_CMLM, GAPIT_MLMM, GAPIT_FarmCPU, GAPIT_Blink. Default is gemma_MLM.')
611
parser_visual.add_argument('-qtl', metavar='', help='Phenotype QTL file generated by subcommand regiongwas')
612
parser_visual.add_argument('-visual', action='store_true', help='Generate web-based GWAS visualization results')
613
parser_visual.add_argument('-anno', metavar='', help='Gene annotation file for QTL annotation, used to display QTL information in visualized results')
614
parser_visual.add_argument('-p', default=1, type=int, metavar='[default:1]', help='Number of threads genome-wide association analysis and visualization')
615
parser_visual.add_argument('-o', default='MODAS_visual_out',metavar='[default:MODAS_visual_out]', help='The prefix of the output file,default MODAS_visual_out')
616
parser_contrast = subparsers.add_parser('contrast', help='construction of stress response index and identification of stress-responsive QTLs', usage='%(prog)s [options]')
617
parser_contrast.add_argument('-stress_phe', metavar='', help='omics dataset under stress treatment')
618
parser_contrast.add_argument('-control_phe', metavar='', help='omics dataset under control treatment')
619
parser_contrast.add_argument('-alpha', default=1, type=int, metavar='[default:1]', help='Contrast strength for contrastive PCA, it represents the trade-off between having the high stress variance and the low control variance, default 1')
620
parser_contrast.add_argument('-n_comp', default=2, type=int, metavar='[default:2]', help='Number of components for contrastive PCA, default 2')
621
#parser_contrast.add_argument('-cpca_phe', metavar='', help='stress response index file generated by contrastive PCA, it used to identify stress-responsive molecular QTL')
622
parser_contrast.add_argument('-beta_test_pvalue', default=None, type=float, metavar='[default:1/number of CPCA PC for beta test]', help='Significance threshold for stress-responsive CPCA PC, used to determine the CPCA PC representing stress response effect of molecular features, default 1/number of CPCA PC for beta test')
623
parser_contrast.add_argument('-gwas', action='store_true', help='perform GWAS on stress response index to identify stress-responsive molecular QTL')
624
parser_contrast.add_argument('-g', metavar='', help='Genotype file in plink bed format, used to perform GWAS on stress response index')
625
parser_contrast.add_argument('-genome_cluster', metavar='', help='Pseudo-genotype file for phenotype pre-screening, generated by subcommand genoidx')
626
parser_contrast.add_argument('-anova_pvalue', default=0.05, type=float, metavar='[default:0.05]', help='Significance threshold of two-way ANOVA, used to determine stress-responsvie molecular QTL, default 0.05')
627
parser_contrast.add_argument('-gwas_model', default='MLM', type=str, metavar='[default:MLM]', help='GWAS model for  association analysis, supporting LM, GLM and MLM, default MLM.')
628
parser_contrast.add_argument('-p', default=1, type=int, metavar='[default:1]', help='Number of threads for gwas analysis, default 1')
629
parser_contrast.add_argument('-o', default='MODAS_contrast_out', metavar='[default:MODAS_contrast_out]', help='The prefix of the output file, default MODAS_contrast_out')
630
parser_coloc = subparsers.add_parser('coloc', help='co-localization analysis of QTLs for multiple traits', usage='%(prog)s [options]')
631
parser_coloc.add_argument('-qtl', metavar='', help='molecular QTL file for co-localization analysis')
632
parser_coloc.add_argument('-g', metavar='', help='Genotype file in plink bed format, used to extract genotype of SNPs associated with molecular features')
633
parser_coloc.add_argument('-gwas_dir', metavar='', help='directory containing GWAS results for molecular features, used to extract SNPs associated with molecular features.It is recommended to use the association analysis results generated by the MODAS subcommand regiongwas')
634
parser_coloc.add_argument('-metric', default='calinski_harabasz', type=str, metavar='[default:calinski_harabasz]', help='metric for determining clustering patterns of kinship, it can be calinski_harabasz or silhouette, , defualt calinski_harabasz')
635
parser_coloc.add_argument('-eps', default=0.2, type=float, metavar='[default:0.2]', help='The maximum distance between two traits used to consider a trait to co-localize with the other trait, defualt 0.2')
636
parser_coloc.add_argument('-pvalue', default=1e-6, type=float, metavar='[defalut:1e-6]', help='Threshold of GWAS results for screening SNPs associated with molecular features, defualt 1e-6')
637
parser_coloc.add_argument('-p', default=1, type=int, metavar='[default:1]', help='Number of threads for co-localization analysis, default 1')
638
parser_coloc.add_argument('-o', default='MODAS_coloc_out', metavar='[default:MODAS_coloc_out]', help='The prefix of the output file, default MODAS_coloc_out')
639
640
641
if __name__ == "__main__":
642
    main()