Diff of /modas/visual.py [000000] .. [a43cea]

Switch to unified view

a b/modas/visual.py
1
import pandas as pd
2
import numpy as np
3
import modas.multiprocess as mp
4
from rpy2.robjects.packages import importr
5
from rpy2.robjects import pandas2ri
6
from rpy2.rinterface_lib.embedded import RRuntimeError
7
import rpy2.robjects as robjects
8
from collections import Counter
9
import modas.gwas_cmd as gc
10
import pyranges as pr
11
from yattag import Doc, indent
12
import subprocess
13
import shutil
14
import warnings
15
import glob
16
import os
17
import re
18
19
pandas2ri.activate()
20
data_table = importr('data.table')
21
base = importr('base')
22
robjects.r('options(datatable.showProgress = FALSE)')
23
24
warnings.filterwarnings("ignore")
25
26
utils_path = subprocess.check_output('locate modas/utils', shell=True, text=True, encoding='utf-8')
27
#utils_path = '/'.join(re.search('\n(.*site-packages.*)\n', utils_path).group(1).split('/')[:-1])
28
utils_path = re.search('\n(.*site-packages.*)\n', utils_path).group(1)
29
if not utils_path.endswith('utils'):
30
    utils_path = '/'.join(utils_path.split('/')[:-1])
31
32
33
# def gwas(phe, geno, num_threads, phe_fn):
34
#     geno_prefix = geno.split('/')[-1]
35
#     related_matrix_cmd = 'gemma.linux -bfile {0}.link -gk 1 -o {1}'.format(geno_prefix,geno_prefix)
36
#     gwas_cmd = 'gemma.linux -bfile {0}.link -k output/{0}.cXX.txt -lmm -n {1} -o {2}'
37
#     fam = pd.read_csv(geno+'.fam', sep=r'\s+', header=None)
38
#     fam[5] = 1
39
#     fam = pd.merge(fam, phe, left_on=0, right_index=True, how='left')
40
#     fam.to_csv(geno_prefix+'.link.fam', sep='\t', na_rep='NA', header=None, index=False)
41
#     if os.path.exists(geno_prefix+'.link.bed'):
42
#         os.remove(geno_prefix+'.link.bed')
43
#     if os.path.exists(geno_prefix+'.link.bim'):
44
#         os.remove(geno_prefix+'.link.bim')
45
#     os.symlink(geno+'.bed', geno_prefix+'.link.bed')
46
#     os.symlink(geno+'.bim', geno_prefix+'.link.bim')
47
#     values = list()
48
#     for _, p in enumerate(phe.columns):
49
#         p = p.replace('/', '.')
50
#         values.append((gwas_cmd.format(*[geno_prefix, _ + 2, '.'.join(phe_fn.split('/')[-1].split('.')[:-1])+ '_' + str(p)]),))
51
#     s = mp.run(related_matrix_cmd)
52
#     if s != 0:
53
#         return None
54
#     else:
55
#         s = mp.parallel(mp.run, values, num_threads)
56
#         os.remove(geno_prefix+'.link.bed')
57
#         os.remove(geno_prefix+'.link.bim')
58
#         os.remove(geno_prefix+'.link.fam')
59
#         return s
60
61
62
def gwas(phe, geno, num_threads, phe_fn, gwas_model):
63
    software, model = gwas_model.split('_')
64
    geno_prefix = geno.split('/')[-1]
65
    phe_fn = '.'.join(phe_fn.split('/')[-1].split('.')[:-1])
66
    if software == 'gemma' and model == 'MLM':
67
        geno_prefix = geno.split('/')[-1]
68
        related_matrix_cmd = utils_path+'/gemma.linux -bfile {0}.link -gk 1 -o {1}'.format(geno_prefix,geno_prefix)
69
        fam = pd.read_csv(geno + '.fam', sep=r'\s+', header=None)
70
        fam[5] = 1
71
        fam = pd.merge(fam, phe, left_on=0, right_index=True, how='left')
72
        fam.to_csv(geno_prefix+'.link.fam', sep='\t', na_rep='NA', header=None, index=False)
73
    if software != 'GAPIT' and gwas_model != 'gemma_LM':
74
        if os.path.exists(geno_prefix+'.link.bed'):
75
            os.remove(geno_prefix+'.link.bed')
76
        if os.path.exists(geno_prefix+'.link.bim'):
77
            os.remove(geno_prefix+'.link.bim')
78
        os.symlink(geno+'.bed', geno_prefix+'.link.bed')
79
        os.symlink(geno+'.bim', geno_prefix+'.link.bim')
80
    if software == 'rMVP':
81
        if os.path.exists(geno_prefix+'.link.fam'):
82
            os.remove(geno_prefix+'.link.fam')
83
        os.symlink(geno + '.fam', geno_prefix + '.link.fam')
84
        fam = pd.read_csv(geno + '.fam', sep=r'\s+', header=None)
85
        phe = phe.reindex(fam[0])
86
    if software == 'gemma' and model == "LM":
87
        if os.path.exists('./gemma_lm_geno'):
88
            shutil.rmtree('./gemma_lm_geno')
89
        os.mkdir('./gemma_lm_geno')
90
        for p in phe.columns:
91
            p = p.replace('m/z', 'm.z')
92
            os.symlink('../' + geno + '.bed', './gemma_lm_geno/' + geno_prefix + '_' + p + '.link.bed')
93
            os.symlink('../' + geno + '.bim', './gemma_lm_geno/' + geno_prefix + '_' + p + '.link.bim')
94
            fam = pd.read_csv(geno + '.fam', sep=r'\s+', header=None)
95
            fam = pd.merge(fam.iloc[:, :5], phe.loc[:, p.replace('m.z', 'm/z')].to_frame(), left_on=0, right_index=True, how='left')
96
            fam.to_csv('./gemma_lm_geno/' + geno_prefix + '_' + p + '.link.fam', sep='\t', na_rep='NA', header=None, index=False)
97
98
    values = list()
99
    for _, p in enumerate(phe.columns):
100
        p = p.replace('m/z', 'm.z')
101
        if gwas_model == 'gemma_LM':
102
            values.append(((gc.gemma_cmd(model, './gemma_lm_geno/' + geno_prefix + '_' + p + '.link', None, None, '_'.join([phe_fn, gwas_model, p]))),))
103
        if gwas_model == 'gemma_MLM':
104
            values.append(((gc.gemma_cmd(model, geno_prefix + '.link', geno_prefix, _ + 2, '_'.join([phe_fn, gwas_model, p]))),))
105
        if software == 'rMVP':
106
            if not os.path.exists('./output'):
107
                os.mkdir('./output')
108
            omics_phe = phe.loc[:, p.replace('m.z', 'm/z')].to_frame().reset_index()
109
            omics_phe.columns = ['Taxa', '_'.join([phe_fn, gwas_model, p])]
110
            values.append((model, geno_prefix+'.link', geno_prefix+'.link', omics_phe, 1, './output'))
111
    if gwas_model == 'gemma_MLM':
112
        s = mp.run(related_matrix_cmd)
113
        if s != 0:
114
            return None
115
    if software == 'gemma':
116
        s = mp.parallel(mp.run, values, num_threads)
117
    if software == 'rMVP':
118
        s1 = mp.parallel(gc.rmvp, (values[0],), 1)
119
        s = mp.parallel(gc.rmvp, values[1:], num_threads)
120
        s = s1 + s
121
    if software == 'GAPIT':
122
        if not os.path.exists('./output'):
123
            os.mkdir('./output')
124
        phe.columns = ['_'.join([phe_fn, gwas_model, p]) for p in phe.columns]
125
        phe = phe.reset_index()
126
        phe.columns = ['Taxa'] + list(phe.columns[1:])
127
        geno = os.path.abspath(geno)
128
        os.chdir('./output')
129
        s = gc.gapit(model, geno, phe, utils_path)
130
        os.chdir('../')
131
    if gwas_model == 'gemma_MLM' or software == 'rMVP':
132
        os.remove(geno_prefix+'.link.bed')
133
        os.remove(geno_prefix+'.link.bim')
134
        os.remove(geno_prefix+'.link.fam')
135
    if gwas_model == 'gemma_LM':
136
        shutil.rmtree('./gemma_lm_geno')
137
    return s
138
139
140
def gwas_plot(res, p, prefix, t, software):
141
    try:
142
        base.sink('/dev/null')
143
        w = data_table.fread(res, data_table=base.getOption("datatable.fread.datatable", False))
144
        if software == 'gemma':
145
            w_subset = w.loc[w.p_wald <= float(p), :]
146
            m = w_subset[['rs', 'chr', 'ps', 'p_wald']]
147
            q = w[['rs', 'chr', 'ps', 'p_wald']]
148
        if software == 'rMVP':
149
            w_subset = w.loc[w[w.columns[-1]] <= float(p), :]
150
            m = w_subset.iloc[:, [0, 1, 2, -1]]
151
            q = w.iloc[:, [0, 1, 2, -1]]
152
        if software == 'GAPIT':
153
            w_subset = w.loc[w['P.value'] <= float(p), :]
154
            m = w_subset[['SNP', 'Chromosome', 'Position', 'P.value']]
155
            q = w[['SNP', 'Chromosome', 'Position', 'P.value']]
156
        m.columns = ['SNP', 'Chromosome', 'Position', prefix]
157
        q.columns = ['SNP', 'Chromosome', 'Position', prefix]
158
        thresholdi = robjects.FloatVector([1.0 / w.shape[0], 1e-6, 1e-5])
159
        lim = -np.log10(min(m[prefix])) + 2
160
        #w_subset = base.subset(w, np.array(w.rx2('p_wald')) <= float(p))
161
        #m = w_subset.rx(robjects.StrVector(['rs', 'chr', 'ps', 'p_wald']))
162
        #q = w.rx(robjects.StrVector(['rs', 'chr', 'ps', 'p_wald']))
163
        # m.names = ['SNP', 'Chromosome', 'Position', prefix]
164
        # q.names = ['SNP', 'Chromosome', 'Position', prefix]
165
        #thresholdi = robjects.FloatVector([1.0/w.nrow, 1e-6, 1e-5])
166
        #lim = -np.log10(min(np.array(w_subset.rx2('p_wald'))))+2
167
        #base.sink('/dev/null')
168
        robjects.r('source("'+utils_path+'/CMplot.r")')
169
        CMplot = robjects.r['CMplot']
170
        CMplot(m, plot_type='m', col=robjects.StrVector(["grey30", "grey60"]), ylim=robjects.FloatVector([2, lim]), threshold=thresholdi,
171
                cex=robjects.FloatVector([0.5, 0.5, 0.5]), signal_cex=robjects.FloatVector([0.5, 0.5, 0.5]),
172
                threshold_col=robjects.StrVector(['red', 'green', 'blue']), chr_den_col=robjects.rinterface.NULL, amplify=True,
173
                signal_pch = robjects.IntVector([19, 19, 19]), dpi=300,
174
                signal_col=robjects.StrVector(['red', 'green', 'blue']), multracks=False, LOG10=True, file=t)
175
        CMplot(q, plot_type='q', col='grey30', threshold=thresholdi[0],
176
               signal_cex=robjects.FloatVector([0.5, 0.5, 0.5]), signal_pch=robjects.IntVector([19, 19, 19]),
177
               conf_int_col='gray', signal_col='red', multracks=False, LOG10=True, file=t, dpi=300)
178
        base.sink()
179
    except RRuntimeError:
180
        return 0
181
    except ValueError:
182
        return 0
183
    else:
184
        return 1
185
186
187
def gwas_plot_parallel(phe, p, threads, t, phe_fn, gwas_model):
188
    software, model = gwas_model.split('_')
189
    values = list()
190
    for i in phe.columns:
191
        i = str(i).replace('m/z', 'm.z')
192
        if software == 'gemma':
193
            gwas_fn = 'output/' + '_'.join(['.'.join(phe_fn.split('/')[-1].split('.')[:-1]), gwas_model, str(i)]) + '.assoc.txt'
194
        if software == 'rMVP':
195
            gwas_fn = 'output/' + '_'.join(['.'.join(phe_fn.split('/')[-1].split('.')[:-1]), gwas_model, str(i)]) + '.'.join(['.'+model, 'csv'])
196
        if software == 'GAPIT':
197
            gwas_fn = 'output/' + '.'.join(['GAPIT', model, '_'.join(['.'.join(phe_fn.split('/')[-1].split('.')[:-1]), gwas_model, str(i)]), 'GWAS', 'Results', 'csv'])
198
        values.append((gwas_fn, p, '.'.join(phe_fn.split('/')[-1].split('.')[:-1]) + '_' + str(i), t, software))
199
    s = mp.parallel(gwas_plot, values, threads)
200
    return s
201
202
203
def boxplot(phe, g, qtl):
204
    robjects.r('''box_plot <- function(d, phe, rs, level){
205
    library(ggplot2)
206
    library(ggsignif)
207
    d <- d[d$haplotype!=1,]
208
    d[d$haplotype==0,'haplotype'] <- level[1]
209
    d[d$haplotype==2,'haplotype'] <- level[2]
210
    d$haplotype <- factor(d$haplotype,levels = level)
211
    b <- as.numeric(formatC(max(d[,1],na.rm=T)*1.2/4,format = 'e',digits = 1))
212
    p <- ggplot(data = d,aes_string(x='haplotype',y=names(d)[1],fill='haplotype'))+
213
    theme_bw()+
214
    theme(legend.title = element_blank(),
215
          legend.background = element_blank(),
216
          legend.key = element_rect(colour = NA, fill = NA),
217
          legend.text = element_text(size = 4),
218
          legend.key.size = unit(3,'mm'),
219
          legend.position = 'none',
220
          plot.title = element_text(hjust=0.5,size = 6),
221
          plot.margin=unit(c(0.3,0.3,0,0),'cm'),
222
          panel.grid = element_blank(),
223
          axis.line = element_line(colour = 'black',size=0.4),
224
          axis.text = element_text(size = 6,color = 'black'),
225
          axis.ticks.length=unit(.1, 'cm'))+
226
    stat_boxplot(geom = 'errorbar', width = 0.2,size=0.1)+
227
    geom_boxplot(lwd=0.2,width=0.5,outlier.size = 0.2)+
228
    geom_signif(comparisons = list(level),map_signif_level = F,
229
                test= t.test, size=0.2 ,textsize=2, y_position = max(d[,1],na.rm=T)*1.1)+
230
    #xlab('')+ylab('')+ggtitle(paste(phe,rs,sep='_'))+
231
    xlab('')+ylab('')+ggtitle('')+
232
    scale_y_continuous(breaks=seq(0,4*b,by=b),labels = function(x) formatC(x, format = 'e',digits = 1), limits = c(0, max(d[,1], na.rm=T)*1.2))+
233
    scale_fill_manual(values=c('#E3FFE2', 'forest green'))
234
    ggsave(paste(phe,'_',rs,'_','boxplot','.jpg',sep=''),plot=p,device='jpg',width=3.5,height=4.3,units = 'cm')
235
}''')
236
    robjects.r['options'](warn=-1)
237
    base.sink('/dev/null')
238
    box_plot = robjects.r['box_plot']
239
    g = g.where(g.snp.isin(qtl.SNP), drop=True)
240
    allele = pd.DataFrame([g.a0, g.a1], index=['a0', 'a1'], columns=g.snp.values)
241
    g = pd.DataFrame(g.values, index=g.sample, columns=g.snp.values)
242
    ril = g.index.intersection(phe.index)
243
    g = g.reindex(ril)
244
    phe = phe.reindex(ril)
245
    for index, row in qtl.iterrows():
246
        if row['phe_name'] not in phe.columns:
247
            continue
248
        d = pd.concat([phe[row['phe_name']], g[row['SNP']]], axis=1)
249
        d.columns = ['trait.' + d.columns[0].replace('-', '.'), 'haplotype']
250
        level = robjects.StrVector([allele[row['SNP']]['a1'].values*2, allele[row['SNP']]['a0'].values*2])
251
        box_plot(d, row['phe_name'], row['SNP'], level)
252
    base.sink()
253
254
255
def multi_trait_plot(phe, gwas_dir, qtl, phe_fn, prefix, t, gwas_model):
256
    software, model = gwas_model.split('_')
257
    bk = pd.DataFrame()
258
    for i in phe.columns:
259
        i = i.replace('/', '.')
260
        # fn = gwas_dir + '/' + '.'.join(phe_fn.split('/')[-1].split('.')[:-1]) + '_' + str(i)+'.assoc.txt'
261
        if software == 'gemma':
262
            gwas_fn = gwas_dir + '_'.join(['.'.join(phe_fn.split('/')[-1].split('.')[:-1]), gwas_model, str(i)]) + '.assoc.txt'
263
            d = pd.read_csv(gwas_fn, sep='\t')
264
            d = d[['rs', 'chr', 'ps', 'p_wald']]
265
        if software == 'rMVP':
266
            gwas_fn = gwas_dir + '_'.join(['.'.join(phe_fn.split('/')[-1].split('.')[:-1]), gwas_model, str(i)]) + '.'.join(['.' + model, 'csv'])
267
            d = pd.read_csv(gwas_fn)
268
            d = d.iloc[:, [0, 1, 2, -1]]
269
            d.columns = ['rs', 'chr', 'ps', 'p_wald']
270
        if software == 'GAPIT':
271
            gwas_fn = gwas_dir + '.'.join(['GAPIT', model, '_'.join(['.'.join(phe_fn.split('/')[-1].split('.')[:-1]), gwas_model, str(i)]), 'GWAS', 'Results', 'csv'])
272
            d = pd.read_csv(gwas_fn)
273
            d = d[['SNP', 'Chromosome', 'Position ', 'P.value']]
274
            d.columns = ['rs', 'chr', 'ps', 'p_wald']
275
276
        # d = pd.read_csv(gwas_fn, sep='\t')
277
        if bk.empty:
278
            bk = d.copy()
279
            bk.loc[bk.p_wald <= 1e-5, 'p_wald'] = 1e-5
280
        for index, row in qtl.loc[qtl.phe_name == i, :].iterrows():
281
            peak_pos = int(row['SNP'].split('_')[-1])
282
            chrom = row['CHR']
283
            sig_tmp = pd.concat([bk.loc[(bk.chr.astype(str) == str(chrom)) & (bk.ps >= peak_pos-1000000) & (bk.ps <= peak_pos+1000000), 'p_wald'],
284
                                 d.loc[(bk.chr.astype(str) == str(chrom)) & (bk.ps >= peak_pos-1000000) & (bk.ps <= peak_pos+1000000), 'p_wald']], axis=1)
285
            sig_tmp.columns = ['bk', 'phe']
286
            bk.loc[(bk.chr.astype(str) == str(chrom)) & (bk.ps >= peak_pos-1000000) & (bk.ps <= peak_pos+1000000), 'p_wald'] = sig_tmp.apply(
287
                lambda x: x['bk'] if x['bk'] < x['phe'] else x['phe'], axis=1)
288
    bk = bk.loc[bk.p_wald <= 1e-2, :]
289
    bk.loc[bk.p_wald <= 1e-20, 'p_wald'] = 1e-20
290
    bk = bk[['rs', 'chr', 'ps', 'p_wald']]
291
    thresholdi = robjects.FloatVector([1.0 / d.shape[0], 1e-6, 1e-5])
292
    lim = -np.log10(min(bk['p_wald'])) + 2
293
    bk.columns = ['SNP', 'Chromosome', 'Position', prefix]
294
    base.sink('/dev/null')
295
    robjects.r('source("'+utils_path+'/CMplot.r")')
296
    #robjects.r('source("/home/debian/文档/MGWAP/compound_extract/plot/CMplot.r")')
297
    CMplot = robjects.r['CMplot']
298
    CMplot(bk, plot_type='m', col=robjects.StrVector(["grey30", "grey60"]), ylim=robjects.FloatVector([2, lim]),
299
           threshold=thresholdi,
300
           cex=robjects.FloatVector([0.5, 0.5, 0.5]), signal_cex=robjects.FloatVector([0.5, 0.5, 0.5]),
301
           threshold_col=robjects.StrVector(['red', 'green', 'blue']), chr_den_col=robjects.rinterface.NULL,
302
           amplify=True,
303
           signal_pch=robjects.IntVector([19, 19, 19]), dpi=300,
304
           signal_col=robjects.StrVector(['red', 'green', 'blue']), multracks=False, LOG10=True, file=t)
305
    base.sink()
306
307
308
def qtl_anno(qtl, anno):
309
    anno = anno[['geneid', 'position']]
310
    anno.loc[:, 'chr'] = anno['position'].apply(lambda x: x.split(':')[0])
311
    anno.loc[:, 'start'] = anno['position'].apply(lambda x: x.split(':')[1].split('-')[0])
312
    anno.loc[:, 'end'] = anno['position'].apply(lambda x: x.split(':')[1].split('-')[1])
313
    anno = anno[['chr', 'start', 'end', 'geneid']]
314
    anno.columns = ['Chromosome', 'Start', 'End', 'geneid']
315
    qtl_range = qtl[['CHR', 'qtl_start', 'qtl_end', 'phe_name']]
316
    qtl_range.columns = ['Chromosome', 'Start', 'End', 'phe_name']
317
    qtl_range = pr.PyRanges(qtl_range)
318
    anno = pr.PyRanges(anno)
319
    qtl_anno_intersect = qtl_range.join(anno, how='left')
320
    qtl_anno = pd.DataFrame()
321
    for k in sorted(qtl_anno_intersect.dfs.keys()):
322
        qtl_anno = pd.concat([qtl_anno, qtl_anno_intersect.dfs[k]])
323
    qtl_anno = qtl_anno[['Chromosome', 'Start', 'End', 'phe_name', 'geneid']]
324
    qtl_anno.loc[:, 'Chromosome'] = qtl_anno.Chromosome.astype(str)
325
    qtl_anno.loc[:, 'Start'] = qtl_anno.Start.astype(int)
326
    qtl_anno.loc[:, 'End'] = qtl_anno.End.astype(int)
327
    qtl.loc[:, 'CHR'] = qtl.CHR.astype(str)
328
    qtl = pd.merge(qtl, qtl_anno, left_on=['CHR', 'qtl_start', 'qtl_end', 'phe_name'],
329
                   right_on=['Chromosome', 'Start', 'End', 'phe_name'])
330
    qtl = qtl.drop(['Chromosome', 'Start', 'End'], axis=1)
331
    qtl = qtl.groupby(['CHR', 'qtl_start', 'qtl_end', 'SNP', 'P', 'SP2_num', 'qtl_length', 'phe_name'])['geneid'].apply(';'.join).reset_index()
332
    qtl.columns = ['CHR', 'qtl_start', 'qtl_end', 'SNP', 'P', 'SP2_num', 'qtl_length', 'phe_name', 'qtl_all_gene']
333
    qtl.loc[qtl.qtl_all_gene == '-1', 'qtl_all_gene'] = 'nan'
334
    return qtl
335
336
337
'''
338
Generate html report for SingleTrait
339
Author: CrazyHsu @ crazyhsu9527@gmail.com
340
Created on: 2020-08-26 20:33:52
341
Last modified: 2021-03-24 17:20:33
342
'''
343
344
345
################# Classes #################
346
class AllQtlStatistics():
347
    def __init__(self):
348
        self.totalSNPs = 0
349
        self.qtlDetected = 0
350
        self.medianQtlLen = 0
351
        self.longestQtlLen = 0
352
        self.shortestQtlLen = 0
353
        self.totalAnnoGenes = 0
354
        # self.totalTargetGenes = 0
355
        # self.totalEnrichTargetGenes = 0
356
        # self.aveEnrichGenes = 0
357
358
    def getQtlLengthInfo(self, qtlLenSeries):
359
        self.medianQtlLen = np.median(qtlLenSeries)
360
        self.longestQtlLen = np.max(qtlLenSeries)
361
        self.shortestQtlLen = np.min(qtlLenSeries)
362
363
    def getTotalAnnoGenes(self, qtlAllGenesSeries):
364
        self.totalAnnoGenes = len(self.mergeSeries2List(qtlAllGenesSeries))
365
366
    # def getTotalTargetGenes(self, qtlAllMetaGenesSeries):
367
    #     self.totalTargetGenes = len(self.mergeSeries2List(qtlAllMetaGenesSeries))
368
369
    # def getEnrichTargetGenes(self, myDataFrame):
370
    #     filteredSeries = myDataFrame.loc[myDataFrame.qtl_p_value <= 0.05, "qtl_meta_gene"]
371
    #     self.totalEnrichTargetGenes = len(self.mergeSeries2List(filteredSeries))
372
373
    def getInit(self, tableData):
374
        self.qtlDetected = len(tableData)
375
        self.getQtlLengthInfo(tableData.qtl_length)
376
        self.getTotalAnnoGenes(tableData.qtl_all_gene)
377
        # self.getTotalTargetGenes(tableData.qtl_meta_gene)
378
        # self.getEnrichTargetGenes(tableData)
379
380
    def mergeSeries2List(self, mySeries, sep=";"):
381
        return set().union(*mySeries.apply(lambda x: str(x).split(sep)).to_list())
382
383
384
class AllTraitStatistics():
385
    def __init__(self):
386
        self.totalTraits = 0
387
        self.filteredTraits = 0
388
        self.clusteredTraits = 0
389
        self.unclusteredTraits = 0
390
391
    def getFilteredTraits(self, traitSeries):
392
        self.filteredTraits = len(self.mergeSeries2List(traitSeries, sep=","))
393
        self.totalTraits = len(self.mergeSeries2List(traitSeries, sep=","))
394
395
    def getClusteredTraits(self, labelSeries):
396
        counter = Counter(labelSeries.to_list())
397
        clustered = [i for i in counter if counter[i] != 1]
398
        unclustered = [i for i in counter if counter[i] == 1]
399
        self.clusteredTraits = len(clustered)
400
        self.unclusteredTraits = len(unclustered)
401
402
    # def getUnclusterTraits(self, labelSeries):
403
    #     self.clusteredTraits = len(set(labelSeries.to_list()))
404
405
    def getInit(self, tableData):
406
        self.getFilteredTraits(tableData.phe_name)
407
        self.getClusteredTraits(tableData.phe_name)
408
        # self.getUnclusterTraits(tableData.phe_name)
409
410
    def mergeSeries2List(self, mySeries, sep=";"):
411
        return set().union(*mySeries.apply(lambda x: str(x).split(sep)).to_list())
412
413
414
class SingleQtlStatistics():
415
    def __init__(self, myRow, rowIndex):
416
        self.qtlName = "QTL{}_chr{}-{}-{}".format(rowIndex, myRow.CHR, myRow.qtl_start, myRow.qtl_end)
417
        self.qtlPosition = "{}:{}-{}".format(myRow.CHR, myRow.qtl_start, myRow.qtl_end)
418
        self.peakSNP = "{}, {}".format(myRow.SNP, myRow.SNP)
419
        self.traitNames = myRow.phe_name.split(",")
420
        self.totalGenesInQtl = str(myRow.qtl_all_gene).split(";")
421
        # self.targetGenesInQtl = str(myRow.qtl_meta_gene).split(";")
422
        # self.enrichTargetGenes = 0
423
        self.pvalue = myRow.P
424
425
426
################# Functions ##################
427
def resolveDir(dirName):
428
    if not os.path.exists(dirName):
429
        os.makedirs(dirName)
430
    os.chdir(dirName)
431
432
433
def getGeneFunc(myGeneFuncFile, sep=","):
434
    gene2func = pd.read_csv(myGeneFuncFile, sep=sep)
435
    tmpDict = gene2func.to_dict("index")
436
    geneFuncDict = {}
437
    for i in tmpDict:
438
        geneFuncDict[tmpDict[i]["geneid"]] = tmpDict[i]
439
    return geneFuncDict
440
441
442
def getAllQtlSummary(allQtlStatistics, doc=None, tag=None, text=None, line=None):
443
    with tag("div", id="qtlSummary"):
444
        line("h1", "Summary of QTLs detected by local GWAS", style="text-align: center;")
445
        multi_trait_File = "./Manhattan.multi_trait.jpg"
446
        with tag("div", style="text-align: center;margin-top: 50px;"):
447
            doc.stag("img", klass="img-fluid", src=multi_trait_File, alt="multi_trait")
448
        with tag("div", style="margin-top: 100px;"):
449
            with tag("div"):
450
                line("h2", "QTL summarization criteria")
451
                line("p", "The cutoffs in generating and filtering QTLs")
452
            with tag("div"):
453
                line("h2", "QTL statistics table")
454
                with tag("div", klass="table-responsive"):
455
                    with tag("table", klass="table"):
456
                        with tag("thead"):
457
                            with tag("tr", klass="table-success"):
458
                                line("th", "Categories", style="width: 50%")
459
                                line("th", "Statistics value", style="width: 50%")
460
                        with tag("tbody"):
461
                            with tag("tr"):
462
                                line("td", "Genotype file")
463
                                line("td", "XXXXXX")
464
                            with tag("tr"):
465
                                line("td", "Number of SNPs for local GWAS")
466
                                line("td", str(allQtlStatistics.totalSNPs))
467
                            with tag("tr"):
468
                                line("td", "Number of detected QTLs")
469
                                line("td", str(allQtlStatistics.qtlDetected))
470
                            with tag("tr"):
471
                                line("td", "Median QTL length")
472
                                line("td", str(allQtlStatistics.medianQtlLen))
473
                            with tag("tr"):
474
                                line("td", "Longest QTL length")
475
                                line("td", str(allQtlStatistics.longestQtlLen))
476
                            with tag("tr"):
477
                                line("td", "Shortest QTL length")
478
                                line("td", str(allQtlStatistics.shortestQtlLen))
479
                            with tag("tr"):
480
                                line("td", "Total genes in the QTLs")
481
                                line("td", str(allQtlStatistics.totalAnnoGenes))
482
                            # with tag("tr"):
483
                            #     line("td", "Total target genes in the QTLs")
484
                            #     line("td", str(allQtlStatistics.totalTargetGenes))
485
                            # with tag("tr"):
486
                            #     line("td", "Total enrichment of target genes")
487
                            #     line("td", str(allQtlStatistics.totalEnrichTargetGenes))
488
                            # with tag("tr"):
489
                            #     line("td", "Average enrichment of target genes")
490
                            #     line("td", str(allQtlStatistics.aveEnrichGenes))
491
492
493
def getAllTraitSummary(allTraitStatistics, doc=None, tag=None, text=None, line=None):
494
    # doc, tag, text, line = Doc().ttl()
495
    with tag("div", id="traitSummary"):
496
        line("h1", "Summary of omics traits detected by local GWAS", style="text-align: center;")
497
        # heatmapFile = "/home/xufeng/xufeng/Projects/MODAS/xufeng1/assets/img/heatmap.png"
498
        # with tag("div", style="text-align: center;margin-top: 50px;"):
499
        #     doc.stag("img", klass="img-fluid", src=heatmapFile, alt="heatmap")
500
        with tag("div", style="margin-top: 100px;"):
501
            with tag("div"):
502
                line("h2", "Omics trait filtration criteria")
503
                line("p", "Parameters and pipelines in filtering traits")
504
            with tag("div"):
505
                line("h2", "Trait statistics table")
506
                with tag("div", klass="table-responsive"):
507
                    with tag("table", klass="table"):
508
                        with tag("thead"):
509
                            with tag("tr", klass="table-success"):
510
                                line("th", "Categories", style="width: 50%")
511
                                line("th", "Statistics value", style="width: 50%")
512
                        with tag("tbody"):
513
                            with tag("tr"):
514
                                line("td", "Omics trait type")
515
                                line("td", "Metabolome")
516
                            with tag("tr"):
517
                                line("td", "Total number of traits")
518
                                line("td", str(allTraitStatistics.totalTraits))
519
                            with tag("tr"):
520
                                line("td", "Filtered number of traits")
521
                                line("td", str(allTraitStatistics.filteredTraits))
522
                            with tag("tr"):
523
                                line("td", "Number of clustered traits")
524
                                line("td", str(allTraitStatistics.clusteredTraits))
525
                            # with tag("tr"):
526
                            #     line("td", "Number of modules of clustered traits")
527
                            #     line("td", "Cell 2")
528
                            with tag("tr"):
529
                                line("td", "Number of unclustered traits")
530
                                line("td", str(allTraitStatistics.unclusteredTraits))
531
    # return doc.getvalue()
532
533
534
def getSingleQtlInfo(singleQtl, index, geneFuncDict, doc=None, tag=None, text=None, line=None):
535
    with tag("div", klass="container"):
536
        with tag("div", klass="row"):
537
            with tag("div", klass="col"):
538
                line("h1", "Summary in " + singleQtl.qtlName, style="text-align: center;")
539
                with tag("div", style="margin-top: 100px;"):
540
                    with tag("div"):
541
                        line("h2", "QTL summarization criteria for whole -genome GWAS")
542
                        line("p", "Parameters and pipeline for filtering traits")
543
                    with tag("div"):
544
                        line("h2", "QTL statistics table")
545
                        with tag("div", klass="table-responsive"):
546
                            with tag("table", klass="table"):
547
                                with tag("thead"):
548
                                    with tag("tr", klass="table-success"):
549
                                        line("th", "QTL", style="width: 50%")
550
                                        line("th", "Statistics", style="width: 50%")
551
                                with tag("tbody"):
552
                                    with tag("tr"):
553
                                        line("td", "QTL position")
554
                                        line("td", str(singleQtl.qtlPosition))
555
                                    with tag("tr"):
556
                                        line("td", "Peak SNP ID and position")
557
                                        line("td", str(singleQtl.peakSNP))
558
                                    with tag("tr"):
559
                                        line("td", "Number of total genes in the QTL")
560
                                        line("td", str(len(singleQtl.totalGenesInQtl)))
561
                                    # with tag("tr"):
562
                                    #     line("td", "Number of target genes in the QTL")
563
                                    #     line("td", str(len(singleQtl.targetGenesInQtl)))
564
                                    # with tag("tr"):
565
                                    #     line("td", "Enrichment of target genes")
566
                                    #     line("td", str(singleQtl.enrichTargetGenes))
567
                                    with tag("tr"):
568
                                        line("td", "Enrichment significance vs background")
569
                                        line("td", str(singleQtl.pvalue))
570
                    # with tag("div"):
571
                    #     line("h2", "List of target genes in the QTL")
572
                    #     with tag("div", klass="table-responsive"):
573
                    #         with tag("table", klass="table"):
574
                    #             with tag("thead"):
575
                    #                 with tag("tr", klass="table-success"):
576
                    #                     line("th", "Gene ID", style="width: 25%")
577
                    #                     line("th", "Alias ID", style="width: 25%")
578
                    #                     line("th", "Position", style="width: 25%")
579
                    #                     line("th", "Function", style="width: 25%")
580
                    #             with tag("tbody"):
581
                    #                 for qtl in singleQtl.targetGenesInQtl:
582
                    #                     if qtl == "nan":
583
                    #                         continue
584
                    #                     with tag("tr"):
585
                    #                         line("td", str(geneFuncDict[qtl]["geneId"].strip()))
586
                    #                         line("td", str(geneFuncDict[qtl]["aliasId"].strip()))
587
                    #                         line("td", str(geneFuncDict[qtl]["position"].strip()))
588
                    #                         line("td", str(geneFuncDict[qtl]["function"].strip()))
589
                    with tag("div"):
590
                        line("h2", "List of total genes in the QTL")
591
                        with tag("div", klass="table-responsive"):
592
                            with tag("table", klass="table"):
593
                                with tag("thead"):
594
                                    with tag("tr", klass="table-success"):
595
                                        line("th", "Gene ID", style="width: 25%")
596
                                        line("th", "Alias ID", style="width: 25%")
597
                                        line("th", "Position", style="width: 25%")
598
                                        line("th", "Function", style="width: 25%")
599
                                with tag("tbody"):
600
                                    for qtl in singleQtl.totalGenesInQtl:
601
                                        if qtl == "nan":
602
                                            continue
603
                                        with tag("tr"):
604
                                            line("td", str(geneFuncDict[qtl]["geneid"].strip()))
605
                                            line("td", str(geneFuncDict[qtl]["aliasid"].strip()))
606
                                            line("td", str(geneFuncDict[qtl]["position"].strip()))
607
                                            line("td", str(geneFuncDict[qtl]["function"].strip()))
608
609
610
# def getSingleTrait(traitName, doc=None, tag=None, text=None, line=None):
611
#     with tag("div", klass="container"):
612
#         with tag("div", klass="row"):
613
#             with tag("div", klass="col"):
614
#                 line("h1", "Details in " + traitName, style="text-align: center;")
615
#                 doc.stag("img", klass="img-fluid", src="../../assets/img/manhattan.jpg")
616
#                 with tag("div", klass="row"):
617
#                     with tag("div", klass="col-6"):
618
#                         doc.stag("img", klass="img-fluid", src="../../assets/img/qqplot.png")
619
#                     with tag("div", klass="col-6"):
620
#                         doc.stag("img", klass="img-fluid", src="../../assets/img/boxplot.png")
621
622
def getListItem(data, qtlName=None, traitName=None, doc=None, tag=None, text=None, line=None, mainPage=False):
623
    for index, row in data.iterrows():
624
        qtlItem = SingleQtlStatistics(row, index)
625
        if qtlName and qtlName == qtlItem.qtlName:
626
            expand = "true"
627
            faPlusOrMinus = "fa-minus"
628
            myClass = "list-unstyled collapse nav nav-pills show"
629
            active = " active"
630
        else:
631
            expand = "false"
632
            faPlusOrMinus = "fa-plus"
633
            myClass = "list-unstyled collapse nav nav-pills"
634
            active = ""
635
        if mainPage:
636
            relativeDir = os.path.join("", qtlItem.qtlName)
637
        else:
638
            if qtlName == qtlItem.qtlName:
639
                relativeDir = ""
640
            else:
641
                relativeDir = os.path.join("../", qtlItem.qtlName)
642
        with tag("li"):
643
            with tag("div", klass="qtlItem" + active):
644
                with tag("a", ("href", os.path.join(relativeDir, qtlItem.qtlName + ".html")), klass="qtlLink"):
645
                    text(qtlItem.qtlName)
646
                with tag("a", ("href", "#" + qtlItem.qtlName), ("data-toggle", "collapse"), ("aria-expanded", expand)):
647
                    line("i", "", klass="fa " + faPlusOrMinus)
648
            with tag("ul", ("class", myClass), ("id", qtlItem.qtlName), ("aria-expanded", expand)):
649
                for i in qtlItem.traitNames:
650
                    with tag("li"):
651
                        href = os.path.join(relativeDir, i + ".html")
652
                        if traitName and traitName == i:
653
                            with tag("a", ("href", href), ("class", "active"), ("aria-selected", "true")):
654
                                line("i", "", klass="fa fa-link")
655
                                text(" " + i)
656
                        else:
657
                            with tag("a", ("href", href), ("aria-selected", "false")):
658
                                line("i", "", klass="fa fa-link")
659
                                text(" " + i)
660
661
662
def generateMainPage(data, allQtlStatistics, allTraitStatistics):
663
    doc, tag, text, line = Doc().ttl()
664
    doc.asis('<!DOCTYPE html>')
665
    with tag('html'):
666
        with tag('head'):
667
            doc.stag('meta', charset='utf-8')
668
            doc.stag('meta', name='viewport', content='width=device-width, initial-scale=1.0, shrink-to-fit=no')
669
            line('title', 'MODAS main page')
670
            doc.stag('link', rel='stylesheet', href='assets/bootstrap/css/bootstrap.min.css')
671
            doc.stag('link', rel='stylesheet', href='assets/fonts/font-awesome.min.css')
672
            doc.stag('link', rel='stylesheet', href="assets/css/modas.css")
673
            # doc.stag('link', rel='stylesheet', href="assets/css/styles.css")
674
675
        with tag("body"):
676
            with tag("div", id="sidebar-test"):
677
                with tag("div", klass="sidebar-header"):
678
                    with tag("h2"):
679
                        line("a", "MODAS", href="mainPage.html", klass="modas")
680
                with tag("ul"):
681
                    getListItem(data, doc=doc, tag=tag, text=text, line=line, mainPage=True)
682
683
            with tag("div", klass="content"):
684
                with tag("div", klass="container"):
685
                    with tag("div", klass="row"):
686
                        with tag("div", klass="col"):
687
                            getAllQtlSummary(allQtlStatistics, doc=doc, tag=tag, text=text, line=line)
688
                            doc.stag("hr", style="margin-bottom: 50px;margin-top: 50px;")
689
                            getAllTraitSummary(allTraitStatistics, doc=doc, tag=tag, text=text, line=line)
690
            line("script", "", src="assets/js/jquery.min.js")
691
            line("script", "", src="assets/bootstrap/js/bootstrap.min.js")
692
            line("script", "", src="assets/js/modas.js")
693
694
    mainPageOut = open("mainPage.html", "w")
695
    res = indent(doc.getvalue(), indentation="    ")
696
    print(res, file=mainPageOut)
697
    mainPageOut.close()
698
699
700
def generateSingleQtlPage(data, geneFuncDict):
701
    for index, row in data.iterrows():
702
        qtlItem = SingleQtlStatistics(row, index)
703
        if not os.path.exists(qtlItem.qtlName):
704
            os.makedirs(qtlItem.qtlName)
705
        out = open(os.path.join(qtlItem.qtlName, qtlItem.qtlName + ".html"), "w")
706
707
        doc, tag, text, line = Doc().ttl()
708
        doc.asis('<!DOCTYPE html>')
709
        with tag('html'):
710
            with tag('head'):
711
                doc.stag('meta', charset='utf-8')
712
                doc.stag('meta', name='viewport', content='width=device-width, initial-scale=1.0, shrink-to-fit=no')
713
                line('title', 'Summary information in QTL ' + qtlItem.qtlName)
714
                doc.stag('link', rel='stylesheet', href='../assets/bootstrap/css/bootstrap.min.css')
715
                doc.stag('link', rel='stylesheet', href='../assets/fonts/font-awesome.min.css')
716
                doc.stag('link', rel='stylesheet', href="../assets/css/modas.css")
717
                # doc.stag('link', rel='stylesheet', href="assets/css/styles.css")
718
            with tag("body"):
719
                with tag("div", id="sidebar-test"):
720
                    with tag("div", klass="sidebar-header"):
721
                        with tag("h2"):
722
                            line("a", "MODAS", href="../mainPage.html", klass="modas")
723
                    with tag("ul"):
724
                        getListItem(data, qtlName=qtlItem.qtlName, doc=doc, tag=tag, text=text, line=line)
725
726
                with tag("div", klass="content"):
727
                    getSingleQtlInfo(qtlItem, index, geneFuncDict, doc=doc, tag=tag, text=text, line=line)
728
729
                line("script", "", src="../assets/js/jquery.min.js")
730
                line("script", "", src="../assets/bootstrap/js/bootstrap.min.js")
731
                line("script", "", src="../assets/js/modas.js")
732
733
                customJs = '''
734
                    <script>
735
                        var offestFromTop = %d * 45 + 68;
736
                        $('#sidebar-test').scrollTop(offestFromTop);
737
738
                        function clickItem(event) {
739
                            var target = event.currentTarget;
740
                            $(target).parent().removeClass(".active").addClass(".active");
741
742
                            var index = $("div.qtlItem").index($(this).parent());
743
                            var offestFromTop = index * 45 + 68;
744
                            $('#sidebar-test').scrollTop(offestFromTop);
745
                        }
746
                        if ($("div.qtlItem .qtlLink")) {
747
                            var qtlLink = $("div.qtlItem .qtlLink");
748
                            for (var i = 0; i < qtlLink.length; i++) {
749
                                var item = qtlLink[i];
750
                                item.onclick = clickItem;
751
                            }
752
                        }
753
                    </script>
754
                ''' % (index)
755
                doc.asis(customJs)
756
757
        res = indent(doc.getvalue(), indentation="    ")
758
        print(res, file=out)
759
        out.close()
760
761
762
def generateSingleTraitPage(data, manhattanDir, qqDir, boxplotDir):
763
    for index, row in data.iterrows():
764
        qtlItem = SingleQtlStatistics(row, index)
765
        for traitName in qtlItem.traitNames:
766
            # getListItem(qtlItem)
767
            out = open(os.path.join(qtlItem.qtlName, traitName + ".html"), "w")
768
            doc, tag, text, line = Doc().ttl()
769
            doc.asis('<!DOCTYPE html>')
770
            with tag('html'):
771
                with tag('head'):
772
                    doc.stag('meta', charset='utf-8')
773
                    doc.stag('meta', name='viewport', content='width=device-width, initial-scale=1.0, shrink-to-fit=no')
774
                    line('title', 'Detailed information in trait ' + traitName)
775
                    doc.stag('link', rel='stylesheet', href='../assets/bootstrap/css/bootstrap.min.css')
776
                    doc.stag('link', rel='stylesheet', href='../assets/fonts/font-awesome.min.css')
777
                    doc.stag('link', rel='stylesheet', href="../assets/css/modas.css")
778
                    # doc.stag('link', rel='stylesheet', href="assets/css/styles.css")
779
                with tag("body"):
780
                    with tag("div", id="sidebar-test"):
781
                        with tag("div", klass="sidebar-header"):
782
                            with tag("h2"):
783
                                line("a", "MODAS", href="../mainPage.html", klass="modas")
784
                        with tag("ul"):
785
                            getListItem(data, qtlItem.qtlName, traitName, doc=doc, tag=tag, text=text, line=line)
786
787
                    manhattanFile = glob.glob(os.path.join(manhattanDir, "Manhattan.*_{}.jpg".format(traitName)))[0]
788
                    boxplotFile = glob.glob(os.path.join(boxplotDir, "{}_*.jpg".format(traitName)))[0]
789
                    qqplotFile = glob.glob(os.path.join(qqDir, "QQplot.*_{}.jpg".format(traitName)))[0]
790
                    with tag("div", klass="content"):
791
                        with tag("div", klass="container"):
792
                            with tag("div", klass="row"):
793
                                with tag("div", klass="col"):
794
                                    line("h1", "Details in " + traitName, style="text-align: center;")
795
                                    doc.stag("img", klass="img-fluid", src='../'+'/'.join(manhattanFile.split('/')[-2:]))
796
                                    with tag("div", klass="row"):
797
                                        with tag("div", klass="col-6"):
798
                                            doc.stag("img", klass="img-fluid", src='../'+'/'.join(qqplotFile.split('/')[-2:]))
799
                                        with tag("div", klass="col-6"):
800
                                            doc.stag("img", klass="img-fluid", src='../'+'/'.join(boxplotFile.split('/')[-2:]))
801
802
                    line("script", "", src="../assets/js/jquery.min.js")
803
                    line("script", "", src="../assets/bootstrap/js/bootstrap.min.js")
804
                    line("script", "", src="../assets/js/modas.js")
805
806
                    customJs = '''
807
                        <script>
808
                            var offestFromTop = %d * 45 + 68;
809
                            $('#sidebar-test').scrollTop(offestFromTop);
810
811
                            function clickItem(event) {
812
                                var target = event.currentTarget;
813
                                $(target).parent().removeClass(".active").addClass(".active");
814
815
                                var index = $("div.qtlItem").index($(this).parent());
816
                                var offestFromTop = index * 45 + 68;
817
                                $('#sidebar-test').scrollTop(offestFromTop);
818
                            }
819
                            if ($("div.qtlItem .qtlLink")) {
820
                                var qtlLink = $("div.qtlItem .qtlLink");
821
                                for (var i = 0; i < qtlLink.length; i++) {
822
                                    var item = qtlLink[i];
823
                                    item.onclick = clickItem;
824
                                }
825
                            }
826
                        </script>
827
                    ''' % (index)
828
                    doc.asis(customJs)
829
830
            res = indent(doc.getvalue(), indentation="    ")
831
            print(res, file=out)
832
            out.close()
833
834
835
def generateHtml(qtl_anno, myFuncFile, out_dir, totalSNPs):
836
    manhattanDir = os.path.abspath(out_dir+'/manhattan_plot')
837
    qqplotDir = os.path.abspath(out_dir+'/qqplot')
838
    boxplotDir = os.path.abspath(out_dir+'/boxplot')
839
840
    allQtlStatistics = AllQtlStatistics()
841
    allQtlStatistics.getInit(qtl_anno)
842
    allQtlStatistics.totalSNPs = totalSNPs
843
    allTraitStatistics = AllTraitStatistics()
844
    allTraitStatistics.getInit(qtl_anno)
845
846
    geneFuncDict = getGeneFunc(myFuncFile, "\t")
847
848
    resolveDir(out_dir)
849
    generateMainPage(qtl_anno, allQtlStatistics, allTraitStatistics)
850
    generateSingleQtlPage(qtl_anno, geneFuncDict)
851
852
    generateSingleTraitPage(qtl_anno, manhattanDir, qqplotDir, boxplotDir)
853