--- a +++ b/modas/prescreen.py @@ -0,0 +1,199 @@ +import pandas as pd +import numpy as np +import modas.multiprocess as mp +from sklearn.preprocessing import MinMaxScaler +import os, glob +import logging, re +from rpy2.robjects import pandas2ri +from rpy2.rinterface_lib.embedded import RRuntimeError +import rpy2.robjects as robjects +from rpy2.robjects.packages import importr +from rpy2.rinterface_lib.callbacks import logger as rpy2_logger +import subprocess + + +pandas2ri.activate() +rpy2_logger.setLevel(logging.ERROR) +base = importr('base') +utils = importr('utils') + +utils_path = subprocess.check_output('locate modas/utils', shell=True, text=True, encoding='utf-8') +#utils_path = '/'.join(re.search('\n(.*site-packages.*)\n', utils_path).group(1).split('/')[:-1]) +utils_path = re.search('\n(.*site-packages.*)\n', utils_path).group(1) +if not utils_path.endswith('utils'): + utils_path = '/'.join(utils_path.split('/')[:-1]) + +if not base.require('rMVP')[0]: + utils.install_packages(np.array(['data.table', 'ggplot2', 'ggsignif', 'bigmemory', 'RcppProgress', 'BH']), repos='https://cloud.r-project.org', quiet=True) + utils.install_packages(utils_path + '/rMVP_1.0.6_modify.tar.gz', repos=robjects.rinterface.NULL, type='source', quiet=True) + # utils.install_packages('bigsnpr', dependence=True, repos='https://cloud.r-project.org', quiet=True) +rMVP = importr('rMVP') +bigmemory = importr('bigmemory') + + +def qtl_pc2bimbam(qtl_pc): + #qtl_pc.loc[:,:] = np.around(MinMaxScaler(feature_range=(0, 2)).fit_transform(qtl_pc.values),decimals=3) + g = qtl_pc.T.reset_index() + g.insert(1,'minor',['A']*g.shape[0]) + g.insert(2,'major',['T']*g.shape[0]) + a = g.iloc[:,0].to_frame() + a['pos'] = a.iloc[:,0].apply(lambda x: str((int(x.split('_')[2])+int(x.split('_')[3]))/2)) + a['chr'] = a.iloc[:,0].apply(lambda x: x.split('_')[1]) + return a,g + + +def qtl_pc_gwas_parallel(omics_phe, bimbam_dir, threads, geno, geno_prefix, gwas_model): + qtl_pc_gwas_args = list() + if gwas_model == 'MLM' or gwas_model == 'GLM': + fam = pd.read_csv(geno + '.fam', sep=r'\s+', header=None) + fam[5] = 1 + fam.to_csv(geno_prefix + '.link.fam', sep='\t', na_rep='NA', header=None, index=False) + #omics_phe = omics_phe.reindex(fam[0].values) + if os.path.exists(geno_prefix + '.link.bed'): + os.remove(geno_prefix + '.link.bed') + if os.path.exists(geno_prefix + '.link.bim'): + os.remove(geno_prefix + '.link.bim') + os.symlink(geno + '.bed', geno_prefix + '.link.bed') + os.symlink(geno + '.bim', geno_prefix + '.link.bim') + if gwas_model == 'MLM': + related_matrix_cmd = utils_path + '/gemma -bfile {0}.link -gk 1 -o {1}'.format(geno_prefix,geno_prefix) + s = mp.run(related_matrix_cmd) + if s!=0: + return None + if gwas_model == 'MLM': + gemma_cmd = utils_path + '/gemma -g {0} -a {1} -p {2} -k ./output/{3}.cXX.txt -lmm -n 1 -o {4}' + elif gwas_model == 'LM': + gemma_cmd = utils_path + '/gemma -g {0} -a {1} -p {2} -lm -o {3}' + else: + g = pd.read_csv(bimbam_dir.strip('/')+'/'+geno_prefix+'_qtl_pc.geno.txt',header=None) + a = pd.read_csv(bimbam_dir.strip('/')+'/'+geno_prefix+'_qtl_pc.anno.txt',header=None) + g.iloc[:,3:].to_csv(geno_prefix+'.numeric.txt',index=False, header=None, sep='\t') + a.columns = ['SNP', 'Pos', 'Chr'] + a = a[['SNP', 'Chr', 'Pos']] + a.to_csv(geno_prefix+'.map.txt', index=False, sep='\t') + + for m in omics_phe.columns: + phe = omics_phe[m].to_frame() + m = m.replace('m/z', 'm.z') + phe.to_csv(bimbam_dir.strip('/') + '/' + m + '_phe.txt', index=False, header=None, na_rep='NA') + if gwas_model == 'MLM': + qtl_pc_gwas_args.append((gemma_cmd.format(bimbam_dir.strip('/') + '/'+geno_prefix+'_qtl_pc.geno.txt', bimbam_dir.strip('/') + '/'+geno_prefix+'_qtl_pc.anno.txt', bimbam_dir.strip('/') + '/' + m + '_phe.txt',geno_prefix, m + '_prescreen'),)) + elif gwas_model == 'LM': + qtl_pc_gwas_args.append((gemma_cmd.format(bimbam_dir.strip('/')+'/'+geno_prefix+'_qtl_pc.geno.txt',bimbam_dir.strip('/')+'/'+geno_prefix+'_qtl_pc.anno.txt',bimbam_dir.strip('/')+'/'+m+'_phe.txt',m+'_prescreen'),)) + #else: + # qtl_pc_gwas_args.append((phe.reset_index(), geno_prefix+'.link', geno_prefix+'.numeric.txt', geno_prefix+'.map.txt', 1, './output')) + if gwas_model == 'LM' or gwas_model == 'MLM': + s = mp.parallel(mp.run, qtl_pc_gwas_args, threads) + else: + if not os.path.exists('./output'): + os.mkdir('./output') + #s = mp.parallel(glm_gwas, (qtl_pc_gwas_args[0],), 1) + #s = mp.parallel(glm_gwas, qtl_pc_gwas_args[1:], threads) + omics_phe.columns = [i.replace('m/z', 'm.z') for i in omics_phe.columns] + s = glm_gwas(omics_phe, geno_prefix+'.link', geno_prefix+'.numeric.txt', geno_prefix+'.map.txt', 1, './output') + if gwas_model == 'MLM' or gwas_model == 'GLM': + os.remove(geno_prefix+'.link.bed') + os.remove(geno_prefix+'.link.bim') + os.remove(geno_prefix+'.link.fam') + return s + + +def glm_gwas(omics_phe, pc_geno_prefix, genofile, mapfile, threads, out_path): + try: + geno_prefix = '.'.join(genofile.split('/')[-1].split('.')[:-2]) + base.sink('/dev/null') + robjects.r(''' + gwas <- function(omics_phe, pc_geno_prefix, geno_prefix, genofile, mapfile, threads, out_path){ + library(rMVP) + if(!file.exists(paste(pc_geno_prefix,'.pc.desc',sep=''))){ + MVP.Data(fileBed=pc_geno_prefix, fileKin=F, filePC=F, out=pc_geno_prefix, verbose=F) + MVP.Data.PC(T,mvp_prefix=pc_geno_prefix, pcs.keep=5, verbose=F) + } + MVP.Data(fileNum=genofile, fileMap=mapfile, fileKin=F, filePC=F, sep_num='\t', type.geno='double', out=geno_prefix) + geno = attach.big.matrix(paste(geno_prefix, '.geno.desc',sep='')) + map_file = read.table(paste(geno_prefix, '.geno.map',sep=''),sep='\t',header=T) + Covariates_PC = bigmemory::as.matrix(attach.big.matrix(paste(pc_geno_prefix,'.pc.desc',sep=''))) + phe_name = names(omics_phe) + for(i in 2:ncol(omics_phe)){ + mvp = MVP(phe=omics_phe[,c(1,i)], geno=geno, map=map_file, CV.GLM=Covariates_PC, priority='speed', nPC.GLM=5, + ncpus=threads, maxLoop=10, threshold=0.05, method=c('GLM'), file.output=F, verbose=F) + res = cbind(mvp$map, mvp$glm.results) + names(res) <- c('rs', 'chr', 'ps', 'effect', 'se', 'p_wald') + print(head(res)) + write.table(res,file=paste(out_path,'/',as.character(phe_name[i]),'_prescreen.assoc.txt',sep=''),sep='\t', quote=F, row.names=F) + } + } + ''') + gwas = robjects.r('gwas') + gwas(omics_phe, pc_geno_prefix, geno_prefix, genofile, mapfile, threads, out_path) + base.sink() + except RRuntimeError: + return 0 + except ValueError: + return 0 + else: + return 1 + +# def glm_gwas(omics_phe, pc_geno_prefix, genofile, mapfile, threads, out_path): +# try: +# geno_prefix = '.'.join(genofile.split('/')[-1].split('.')[:-2]) +# base.sink('/dev/null') +# if not os.path.exists(pc_geno_prefix + '.pc.desc'): +# rMVP.MVP_Data(fileBed=pc_geno_prefix, fileKin=False, filePC=False, out=pc_geno_prefix, +# verbose=False) +# rMVP.MVP_Data_PC(True, mvp_prefix=pc_geno_prefix, pcs_keep=5, verbose=False) +# rMVP.MVP_Data(fileNum=genofile, fileMap=mapfile, fileKin=False, filePC=False, sep_num = '\t', type_geno='double',out=geno_prefix, verbose=False) +# geno = bigmemory.attach_big_matrix(geno_prefix +'.geno.desc') +# map_file = pd.read_csv(geno_prefix +'.geno.map', sep='\t') +# Covariates_PC = bigmemory.as_matrix(bigmemory.attach_big_matrix(pc_geno_prefix + '.pc.desc')) +# # base.setwd('./output') +# mvp = rMVP.MVP(phe=omics_phe, geno=geno, map=map_file, CV_GLM=Covariates_PC, priority="speed", nPC_GLM=5, +# ncpus=threads, maxLoop=10, threshold=0.05, method=['GLM'], file_output=False, verbose=False) +# gwas_res = pd.DataFrame(mvp.rx2('glm.results'), columns=['effect', 'se', 'p_wald']) +# pos = pd.DataFrame(mvp.rx2('map')) +# pos.columns = ['rs','chr', 'ps'] +# pos.index = gwas_res.index +# res = pd.concat([pos, gwas_res], axis=1) +# res.to_csv(out_path.rstrip('/') + '/' + str(omics_phe.columns[1]) + '_prescreen.assoc.txt', index=False,sep='\t') +# base.sink() +# except RRuntimeError: +# return 0 +# except ValueError: +# return 0 +# else: +# return 1 + + +def prescreen(omics_phe,suggest_pvalue): + phe_sig_qtl = list() + sig_phe_names = list() + for fn in glob.glob('output/*_prescreen.assoc.txt'): + gwas = pd.read_csv(fn, sep='\t') + if gwas['p_wald'].min() > suggest_pvalue: + continue + phe_name = fn.split('/')[-1].replace('_prescreen.assoc.txt','') + pos = list() + for rs in gwas.loc[gwas.p_wald <= suggest_pvalue, 'rs'].values: + chrom,start,end = rs.split('_')[1:4] + start,end = int(start),int(end) + if not pos: + pos.append([chrom,start,end,phe_name]) + else: + if pos[-1][0] != chrom: + pos.append([chrom,start,end,phe_name]) + else: + if start < pos[-1][2]: + start = start if start < pos[-1][1] else pos[-1][1] + end = end if end > pos[-1][2] else pos[-1][2] + pos[-1] = [chrom,start,end,phe_name] + else: + pos.append([chrom,start,end,phe_name]) + phe_sig_qtl.extend(pos) + if not gwas.loc[gwas.p_wald <= suggest_pvalue,:].empty: + sig_phe_names.append(phe_name.replace('m.z','m/z')) + sig_omics_phe = omics_phe.loc[:, sig_phe_names] + phe_sig_qtl = pd.DataFrame(phe_sig_qtl,columns=['chr','start','end','phe_name']) + return sig_omics_phe, phe_sig_qtl + + +