--- a +++ b/common_scripts/featurematrix/compute.pairwise.R @@ -0,0 +1,558 @@ +regulon.feats=function(fm, genelist, cnv_annot=NULL, filter.val=3, filtertypes=NULL){ + fmname=rownames(fm) + + if(!is.null(filtertypes)){ + fmname=fmname[!substr(fmname, 1,6)%in%filtertypes] + } + feature.gene=suppressWarnings(do.call(rbind, strsplit(fmname, ":"))[,3]) + + if(any(grepl(":", genelist)))genelist[grepl(":", genelist)]=do.call(rbind, strsplit(genelist[grepl(":", genelist)], ":"))[,3] + + if(!is.null(cnv_annot)){ + l.genes=strsplit(cnv_annot[,2], ",") + names(l.genes)=cnv_annot[,1] + st=stack(l.genes) + st=st[st[,1]%in%genelist,] + st[,2]=as.character(st[,2]) + } + + nested.features=function(gene, fmname, feature.gene){ + direct=fmname[feature.gene%in%gene] + if(!is.null(cnv_annot))direct=c(direct, st[st[,1]%in%gene,2]) + + # order this to numeric, then to gexp-cnv-meth-gnab + n=unique(direct) + types=substr(n, 1, 6) + ord=c("N:GEXP","B:GEXP","B:GNAB", "N:CNVR", "B:CNVR","N:METH", "N:MIRN", "N:SAMP", "B:SAMP", "N:CLIN", "B:CLIN", "N:DRUG") + ord=ord[ord%in%types] + + r=unlist(lapply(ord, function(o) n[types%in%o])) + + return(r) + } + feats=lapply(genelist, nested.features, fmname, feature.gene) + + # Filter if not enough data: + m=fm[rownames(fm)%in%unlist(feats),] + s=is.na(m) + m=m[!rowSums(s)==dim(m)[1]&rowSums(!s)>=filter.val, !colSums(s)==dim(m)[2]] + + feats=lapply(feats, function(n)n[n%in%rownames(m)]) + names(feats)=genelist + + feats=feats[!unlist(lapply(feats, length))<1] + + return(feats) +} + +get.feat.within.region=function(INPUT, FEAT){ + + temp=tempfile() + temp2=tempfile() + + # write out bed format temp file: + write.table(INPUT, temp, quote = F, row.names = F, col.names = F, sep="\t") + + # intersect with features + system(paste0("bedtools intersect -a ",FEAT, " -b ",temp, " -wa -wb > ", temp2)) # genes in region + + # read in R + d = read.delim(temp2, header=F, stringsAsFactors = F) + + d[,4]=gsub(":.*.", "", d[,4]) + + d2=d[,c(10, 4)] + + unlink(temp) + unlink(temp2) + + return(d2) +} + +fun.correlation.regulon=function(i, l.regulon.gene, fm, filter.val=3, method.cor="spearman"){ + feats=l.regulon.gene[[i]] + genename=names(l.regulon.gene)[i] + x=data.matrix(t(as.matrix(fm[match(feats,rownames(fm)),,drop=F]))) + class(x) <- "numeric" + + if(dim(x)[2]<2)return(NULL) # this one did not have pairs to test + + # filter if binary has less than n samples + if(any(grepl("^B:", colnames(x)))){ + check=colSums(x[,grepl("^B:", colnames(x)),drop=F], na.rm = T) + x=x[,!colnames(x)%in%names(check[check<filter.val]),drop=F] + } + + # filter if numeric has less than n non-zero samples + if(any(grepl("^N:", colnames(x)))){ + check=colSums(x[,grepl("^N:", colnames(x)),drop=F]!=0, na.rm = T) + x=x[,!colnames(x)%in%names(check[check<filter.val]),drop=F] + } + + if(dim(x)[2]<2)return(NULL) # this one did not have pairs to test + + cor_rho=cor(x,use = "pairwise.complete.obs", method = method.cor) + cor_pval=cor.test.p.m(x, method = method.cor, use = "pairwise.complete.obs") + results=flat_cor_mat(cor_rho, cor_pval) + + # respect original order: + results=do.call(rbind, lapply(colnames(x), function(n)results[results[,1]%in%n,])) + + rownames(results)=NULL + + # filter pairwise result: + results=results[!duplicated(t(apply(results[,1:2], 1, sort))),] + + # add gene to cnvr: + # results[grepl(":CNVR:", results[,1]),1]=gsub(":CNVR:", paste0(":CNVR:",genename, "@"), results[grepl(":CNVR:", results[,1]),1]) + # results[grepl(":CNVR:", results[,2]),2]=gsub(":CNVR:", paste0(":CNVR:",genename, "@"), results[grepl(":CNVR:", results[,2]),2]) + + return(results) +} + +fun.correlation.extrafeatures=function(i, l.regulon.gene, fm, extrafeatures, filter.val=3, method.cor="spearman"){ + feats=l.regulon.gene[[i]] + genename=names(l.regulon.gene)[i] + x=data.matrix(t(as.matrix(fm[match(feats,rownames(fm)),,drop=F]))) + class(x) <- "numeric" + + # filter if binary has less than n samples + if(any(grepl("^B:", colnames(x)))){ + check=colSums(x[,grepl("^B:", colnames(x)),drop=F], na.rm = T) + x=x[,!colnames(x)%in%names(check[check<filter.val]),drop=F] + } + + # filter if numeric has less than n non-zero samples + if(any(grepl("^N:", colnames(x)))){ + check=colSums(x[,grepl("^N:", colnames(x)),drop=F]!=0, na.rm = T) + x=x[,!colnames(x)%in%names(check[check<filter.val]),drop=F] + } + + # remove if in both lists + extrafeatures=extrafeatures[!extrafeatures%in%feats] + + # Filter if not enough data (less than 3 not NA): + m2=fm[rownames(fm)%in%extrafeatures,] + s=is.na(m2) + m2=m2[!rowSums(s)==dim(m2)[2]&rowSums(!s)>=filter.val, ] + y=data.matrix(t(as.matrix(m2))) + class(y) <- "numeric" + + y=cbind(x,y) + + # filter if binary has less than n 3 samples + if(any(grepl("^B:", colnames(y)))){ + check=colSums(y[,grepl("^B:", colnames(y)),drop=F], na.rm = T) + y=y[,!colnames(y)%in%names(check[check<filter.val])] + } + + # filter if numeric has less than n non-zero samples + if(any(grepl("^N:", colnames(y)))){ + check=colSums(y[,grepl("^N:", colnames(y)),drop=F]!=0, na.rm = T) + y=y[,!colnames(y)%in%names(check[check<filter.val]),drop=F] + } + + if(dim(x)[2]==0|dim(y)[2]==0)return(NULL) # no pairs to test + + cor_rho=cor(x,y,use = "pairwise.complete.obs", method = method.cor) + + # compute cor pval + cor_pval=t(apply(x, 2, function(v1){ + apply(y, 2, function(v2){ + if(sum(complete.cases(cbind(v1, v2)))<filter.val)return(NA) + cor.test.p(v1, v2, method = method.cor, use = "pairwise.complete.obs") + }) + })) + + results=flat_cor_mat(data.matrix(cor_rho), data.matrix(cor_pval)) + + results=do.call(rbind, lapply(colnames(x), function(n)results[results[,1]%in%n,])) + + rownames(results)=NULL + + # remove regulonfeatures, they are coming from another test: + results=results[!(results[,1]%in%colnames(x)&results[,2]%in%colnames(x)),] + + # filter pairwise result: + results=results[!duplicated(t(apply(results[,1:2], 1, sort))),] + + # add gene to cnvr: BUG for wilcox + # results[grepl(":CNVR:", results[,1]),1]=gsub(":CNVR:", paste0(":CNVR:",genename, "@"), results[grepl(":CNVR:", results[,1]),1]) + + return(results) +} + + +p.adjust.datatype=function(results, adjust.method="BH", log10=F, prettyNumbers=T, orderdata=T){ + + # number of features tested: + datatypes=suppressWarnings(do.call(rbind, strsplit(results[,2], ":"))[,2]) # sometimes more than 3 columns + datatypes2=suppressWarnings(do.call(rbind, strsplit(results[,1], ":"))[,2]) + datatypes3=apply(cbind(datatypes2,datatypes),1,paste, collapse=":") + datatypes=datatypes3 + + results=results[order(datatypes),] + datatypes=datatypes[order(datatypes)] + + vals=seq(datatypes) + + vals=unlist(lapply(unique(datatypes), function(d)p.adjust(results$p[datatypes%in%d], method = adjust.method))) + + if(log10){ + vals=abs(log10(vals)) + } + if(prettyNumbers){ + vals=prettyNum(signif(vals,2)) + results[,3]=prettyNum(signif(results[,3],2)) + results[,4]=prettyNum(signif(results[,4],2)) + }else{ + vals=as.numeric(vals) + results[,3]=as.numeric(results[,3]) + results[,4]=as.numeric(results[,4]) + } + + results$adj.p=vals + results$signifCode="" + results$signifCode[as.numeric(vals)<0.1]="." + results$signifCode[as.numeric(vals)<0.05]="*" + results$signifCode[as.numeric(vals)<0.01]="**" + results$signifCode[as.numeric(vals)<0.001]="***" + results$signifCode[as.numeric(vals)<0.0001]="****" + + results$datapairs=datatypes + + if(orderdata)results=results[order(results[,1], datatypes, -as.numeric(results$cor)>0, as.numeric(results$adj.p), decreasing = F),] + + return(results) +} + + +ft.alt <- function(a, b, c, d, alternative="greater") { + as.numeric(fisher.test(matrix(c(a, b, c, d), 2), alternative = alternative)$p.value) +} + +fisher.test.pwpw=function(results, find, fm, fisher.alternative, cores){ + results.o=results + # remove gene anno if CNV type, to match with FM + results.o[,1]=gsub("CNVR:.*.@", "CNVR:", results.o[,1]) + results.o[,2]=gsub("CNVR:.*.@", "CNVR:", results.o[,2]) + + featsA=as.list(as.data.frame(t(fm[results.o[find,1],]))) + featsB=as.list(as.data.frame(t(fm[results.o[find,2],]))) + + grid=data.frame(do.call(rbind, mclapply(seq(find), function(i)c(table(featsA[[i]], featsB[[i]])), mc.cores=cores))) + colnames(grid)=c("a", "b", "c", "d") + + if(fisher.alternative=="two.sided"){ + p.ft <- with(grid, HighSpeedStats::ultrafastfet(a, b, c, d)) + results$p[find]=p.ft + + }else{ + p.ft <- with(grid, mapply(ft.alt, a, b, c, d, fisher.alternative)) + results$p[find]=p.ft + } + results$test.method[find]=paste0("Fisher´s exact (", fisher.alternative, ")") + return(results) +} + +# pairwise.correlation=function(l.regulon.gene, fm, extrafeatures=NULL, filter.val=3, cores=5, adjust.method="BH", method.cor="spearman", prettyNumbers=T, use.fisher=T, fisher.alternative="two.sided"){ +# +# #****************************** Test within regulon ****************************** +# r=parallel::mclapply(seq(l.regulon.gene),fun.correlation.regulon,l.regulon.gene, fm, filter.val=filter.val,method.cor=method.cor, mc.cores=cores) +# results=do.call(rbind, r) +# results=results[!is.na(results$p),,drop=F] +# +# if(!is.null(results)){ +# results$test.method=paste(method.cor, "correlation") +# results$test.group="gene.features" +# +# # if any pairs are binary-binary, add fisher test p-value +# fa=substr(results[,1], 0, 1)=="B" +# fb=substr(results[,2], 0, 1)=="B" +# find=which(fa&fb) +# findl=fa&fb +# +# if(use.fisher&any(findl)){ +# results=fisher.test.pwpw(results, find, fm, fisher.alternative, cores) +# results1=p.adjust.datatype(results[findl,], orderdata=T, adjust.method=adjust.method, prettyNumbers=prettyNumbers) +# results2=p.adjust.datatype(results[!findl,], orderdata=T, adjust.method=adjust.method, prettyNumbers=prettyNumbers) +# results=rbind(results1, results2) +# }else{ +# results=p.adjust.datatype(results, orderdata=T, adjust.method=adjust.method, prettyNumbers=prettyNumbers) +# } +# } +# #********************************************************************************* +# +# # Test against extrafeatures if they exist: +# if(!is.null(extrafeatures)){ +# r=parallel::mclapply(seq(l.regulon.gene),fun.correlation.extrafeatures, l.regulon.gene, fm, extrafeatures=extrafeatures, filter.val=filter.val,method.cor=method.cor, mc.cores=cores) +# results.e=do.call(rbind, r) +# results.e=results.e[!is.na(results.e$p),] +# results.e$test.method=paste(method.cor, "correlation") +# results.e$test.group="extra.features" +# +# # if any pairs are binary-binary, add fisher test p-value +# fa=substr(results.e[,1], 0, 1)=="B" +# fb=substr(results.e[,2], 0, 1)=="B" +# find=which(fa&fb) +# findl=fa&fb +# +# if(use.fisher&any(findl)){ +# results.e=fisher.test.pwpw(results.e, find,fm, fisher.alternative, cores) +# results1=p.adjust.datatype(results.e[findl,], orderdata=T, adjust.method=adjust.method, prettyNumbers=prettyNumbers) +# if(sum(!findl)>0){ +# results2=p.adjust.datatype(results.e[!findl,], orderdata=T, adjust.method=adjust.method, prettyNumbers=prettyNumbers) +# results.e=rbind(results1, results2) +# }else{ +# results.e=results1 +# } +# }else{ +# results.e=p.adjust.datatype(results.e, orderdata=T, adjust.method=adjust.method, prettyNumbers=prettyNumbers) +# } +# results=rbind(results, results.e) +# } +# +# if(adjust.method=="BH")adj="FDR" +# if(adjust.method=="bonferroni")adj="FWER" +# colnames(results)[7]=adj +# results[grepl("Fisher", results$test.method),3]="" +# +# # order columnts: +# cols=c("featureA", "featureB", "cor", "p", adj, "signifCode", "test.method", "test.group", "datapairs") +# results=results[,cols] +# return(results) +# } + +filter.pairwise.res=function(results, filter.table=NULL, FDR=0.1){ + if(is.null(filter.table)){ + filter.table=data.frame("pair"=unique(results$datapairs), "FDR"=rep(FDR, length(unique(results$datapairs))), stringsAsFactors = F) + type1=gsub(":.*.", "", filter.table[,1]) + type2=gsub(".*.:", "", filter.table[,1]) + + # usually fewer features + filter.table$FDR[type1=="GNAB"|type2=="GNAB"]=0.25 # mutations can be interesting, high P-value kept + filter.table$FDR[type1=="SAMP"&type2=="SAMP"]=0.1 # these can be very different + filter.table$FDR[type1=="CLIN"&type2=="CLIN"]=0.1 # these can be very different + + # exclude, not usually meaningful + filter.table$FDR[type1=="CNVR"&type2=="CNVR"]=0 # remove, highly correlated + filter.table$FDR[type1=="METH"&type2=="METH"]=0 # remove, highly correlated + + # SAMP special rules: + filter.table$FDR[type1=="SAMP"&type2=="GEXP"]=1e-30 + filter.table$FDR[type1=="SAMP"&type2=="MIRN"]=1e-30 + filter.table$FDR[type1=="SAMP"&type2=="METH"]=1e-8 + + print("default table used: ") + print(filter.table) + } + + pairs=results$datapairs + p=unique(pairs) + + # make sure all pairs are found in filter.table + if(!all(p%in%filter.table[,1]))stop(paste("Not all pairs were found: ", p[!p%in%filter.table[,1],])) + + filter.table=filter.table[match(p, filter.table[,1]),] + + # p-value per pairs: + vals=do.call(rbind, lapply(seq(p), function(i){ + + r=results[pairs%in%p[i],] + + filt=as.numeric(r[,5])<as.numeric(filter.table[i,2]) + + r=r[filt,] + })) + + return(vals) +} + +pairwise.correlation=function(l.regulon.gene, fm, extrafeatures=NULL, filter.val=3, cores=5, adjust.method="BH", method.cor="spearman", prettyNumbers=T, use.fisher=T, use.wilcox=T, fisher.alternative="two.sided", wilcox.alternative="two.sided"){ + + #****************************** Test within regulon ****************************** + r=parallel::mclapply(seq(l.regulon.gene),fun.correlation.regulon,l.regulon.gene, fm, filter.val=filter.val,method.cor=method.cor, mc.cores=cores) + results=do.call(rbind, r) + results=results[!is.na(results$p),,drop=F] + + if(!is.null(results)){ + results$test.method=paste(method.cor, "correlation") + results$test.group="gene.features" + + # if any pairs are binary-binary, add fisher test p-value + fa=substr(results[,1], 0, 1)=="B" + fb=substr(results[,2], 0, 1)=="B" + find=which(fa&fb) + findl=fa&fb + + # if any pairs are binary-numeric, add wilcoxon test p-value + fc=substr(results[,1], 0, 1)=="N" + fd=substr(results[,2], 0, 1)=="B" + fe=substr(results[,1], 0, 1)=="B" + ff=substr(results[,2], 0, 1)=="N" + + findw=fc&fd|fe&ff + + # compute p-value depending on test type: + if(use.fisher&any(findl)){ + # update B vs B tests: + results=fisher.test.pwpw(results, find, fm, fisher.alternative, cores) + } + + # compute p-value depending on test type: + if(use.wilcox&any(findw)) { + # update B vs N tests: + results=test.feats.wilcox(features = results, rows = findw, data = fm, wilcox.alternative = wilcox.alternative, cores = cores) + } + + # go through each test and adjust p-value per data type: + results=do.call(rbind, lapply(unique(results$test.method), function(type){ + p.adjust.datatype(results[results$test.method%in%type,], orderdata=T, adjust.method=adjust.method, prettyNumbers=prettyNumbers) + })) + } + #********************************************************************************* + + # Test against extrafeatures if they exist: + if(!is.null(extrafeatures)){ + r=parallel::mclapply(seq(l.regulon.gene),fun.correlation.extrafeatures, l.regulon.gene, fm, extrafeatures=extrafeatures, filter.val=filter.val,method.cor=method.cor, mc.cores=cores) + results.e=do.call(rbind, r) + results.e=results.e[!is.na(results.e$p),] + results.e$test.method=paste(method.cor, "correlation") + results.e$test.group="extra.features" + + # if any pairs are binary-binary, add fisher test p-value + fa=substr(results.e[,1], 0, 1)=="B" + fb=substr(results.e[,2], 0, 1)=="B" + find=which(fa&fb) + findl=fa&fb + + # if any pairs are binary-numeric, add wilcoxon test p-value + fc=substr(results.e[,1], 0, 1)=="N" + fd=substr(results.e[,2], 0, 1)=="B" + fe=substr(results.e[,1], 0, 1)=="B" + ff=substr(results.e[,2], 0, 1)=="N" + findw=fc&fd|fe&ff + + # compute p-value depending on test type: + if(use.fisher&any(findl)){ + # update B vs B tests: + results.e=fisher.test.pwpw(results.e, find, fm, fisher.alternative, cores) + } + + # compute p-value depending on test type: + if(use.wilcox&any(findw)) { + # update B vs N tests: + results.e=test.feats.wilcox(features = results.e, rows = findw, data = fm, wilcox.alternative = wilcox.alternative, cores = cores) + } + + # go through each test and adjust p-value per data type + results.e=do.call(rbind, lapply(unique(results.e$test.method), function(type){ + p.adjust.datatype(results.e[results.e$test.method%in%type,], orderdata=T, adjust.method=adjust.method, prettyNumbers=prettyNumbers) + })) + + results=rbind(results, results.e) + } + + if(adjust.method=="BH")adj="FDR" + if(adjust.method=="bonferroni")adj="FWER" + colnames(results)[7]=adj + results[grepl("Fisher", results$test.method),3]="" + + # order columnts: + cols=c("featureA", "featureB", "cor", "p", adj, "signifCode", "test.method", "test.group", "datapairs") + results=results[,cols] + return(results) +} + +filter.pairwise.res=function(results, filter.table=NULL, FDR=0.1){ + if(is.null(filter.table)){ + filter.table=data.frame("pair"=unique(results$datapairs), "FDR"=rep(FDR, length(unique(results$datapairs))), stringsAsFactors = F) + type1=gsub(":.*.", "", filter.table[,1]) + type2=gsub(".*.:", "", filter.table[,1]) + + # usually fewer features + filter.table$FDR[type1=="GNAB"|type2=="GNAB"]=0.25 # mutations can be interesting, high P-value kept + filter.table$FDR[type1=="SAMP"&type2=="SAMP"]=0.1 # these can be very different + filter.table$FDR[type1=="CLIN"&type2=="CLIN"]=0.1 # these can be very different + + # exclude, not usually meaningful + filter.table$FDR[type1=="CNVR"&type2=="CNVR"]=0 # remove, highly correlated + filter.table$FDR[type1=="METH"&type2=="METH"]=0 # remove, highly correlated + + # SAMP special rules: + filter.table$FDR[type1=="SAMP"&type2=="GEXP"]=1e-30 + filter.table$FDR[type1=="SAMP"&type2=="MIRN"]=1e-30 + filter.table$FDR[type1=="SAMP"&type2=="METH"]=1e-8 + + print("default table used: ") + print(filter.table) + } + + pairs=results$datapairs + p=unique(pairs) + + # make sure all pairs are found in filter.table + if(!all(p%in%filter.table[,1]))stop(paste("Not all pairs were found: ", p[!p%in%filter.table[,1],])) + + filter.table=filter.table[match(p, filter.table[,1]),] + + # p-value per pairs: + vals=do.call(rbind, lapply(seq(p), function(i){ + + r=results[pairs%in%p[i],] + + filt=as.numeric(r[,5])<as.numeric(filter.table[i,2])&!is.na(r[,5]) + + r=r[filt,] + })) + + return(vals) +} + +test.feats.wilcox <- function(features, rows=NA, data, wilcox.alternative="two.sided", cores=1) { + if(length(rows)>=1) { + pairs <- features[rows,] + } else pairs <- features + + p=unlist(parallel::mclapply(1:nrow(pairs), function(i){ + binary=substr(pairs[i,2], 0, 1)=="B" + + if(binary) { + + x <- as.numeric(data[pairs[i,1],]) + y <- as.logical(as.numeric(data[pairs[i,2],])) + + }else { + + x <- as.numeric(data[pairs[i,2],]) + y <- as.logical(as.numeric(data[pairs[i,1],])) + + } + + a <- vector() + b <- vector() + + for(j in 1:length(y)) { + + if(is.na(y[j])) next + if(y[j]) a <- c(a, x[j]) + if(!y[j]) b <- c(b, x[j]) + + } + + if((is.numeric(a) && is.numeric(b))&&(length(a)>0&&length(b>0))&&sum(is.na(a))!=length(a)&&sum(is.na(b))!=length(b)) { + p <- wilcox.test(a,b, alternative = wilcox.alternative)$p.value + } else { + p=NA # not enough data + } + return(p) + }, mc.cores=cores)) + + pairs$test.method=paste0("Wilcoxon test (", wilcox.alternative, ")") + pairs$p<- as.numeric(as.character(p)) + + # return original table but with updated p and test.method + features[rows,]=pairs + + return(features) +}