ImmunogenomicLandscape-BC / Git / [8e0848] /common_scripts/featurematrix/compute.pairwise.R

Models:
WandaB/
ImmunogenomicLandscape-BC
Downloads: 1
[8e0848]: / common_scripts / featurematrix / compute.pairwise.R
History
Download this file
559 lines (431 with data), 20.1 kB

regulon.feats=function(fm, genelist, cnv_annot=NULL, filter.val=3, filtertypes=NULL){
  fmname=rownames(fm)
  
  if(!is.null(filtertypes)){
    fmname=fmname[!substr(fmname, 1,6)%in%filtertypes]
  }
  feature.gene=suppressWarnings(do.call(rbind, strsplit(fmname, ":"))[,3])
  
  if(any(grepl(":", genelist)))genelist[grepl(":", genelist)]=do.call(rbind, strsplit(genelist[grepl(":", genelist)], ":"))[,3]
  
  if(!is.null(cnv_annot)){
    l.genes=strsplit(cnv_annot[,2], ",")
    names(l.genes)=cnv_annot[,1]
    st=stack(l.genes)
    st=st[st[,1]%in%genelist,]
    st[,2]=as.character(st[,2])
  }
  
  nested.features=function(gene, fmname, feature.gene){
    direct=fmname[feature.gene%in%gene]
    if(!is.null(cnv_annot))direct=c(direct, st[st[,1]%in%gene,2])
    
    # order this to numeric, then to gexp-cnv-meth-gnab
    n=unique(direct)
    types=substr(n, 1, 6)
    ord=c("N:GEXP","B:GEXP","B:GNAB", "N:CNVR", "B:CNVR","N:METH", "N:MIRN", "N:SAMP", "B:SAMP", "N:CLIN", "B:CLIN", "N:DRUG")
    ord=ord[ord%in%types]
    
    r=unlist(lapply(ord, function(o) n[types%in%o]))
    
    return(r)
  }
  feats=lapply(genelist, nested.features, fmname, feature.gene)
  
  # Filter if not enough data:
  m=fm[rownames(fm)%in%unlist(feats),]
  s=is.na(m)
  m=m[!rowSums(s)==dim(m)[1]&rowSums(!s)>=filter.val, !colSums(s)==dim(m)[2]]
  
  feats=lapply(feats, function(n)n[n%in%rownames(m)])
  names(feats)=genelist
  
  feats=feats[!unlist(lapply(feats, length))<1]
  
  return(feats)
}

get.feat.within.region=function(INPUT, FEAT){
  
  temp=tempfile()
  temp2=tempfile()
  
  # write out bed format temp file:
  write.table(INPUT, temp, quote = F, row.names = F, col.names = F, sep="\t")
  
  # intersect with features
  system(paste0("bedtools intersect -a ",FEAT, " -b ",temp, " -wa -wb > ", temp2))  # genes in region
  
  # read in R
  d = read.delim(temp2, header=F, stringsAsFactors = F)
  
  d[,4]=gsub(":.*.", "", d[,4])
  
  d2=d[,c(10, 4)]
  
  unlink(temp)
  unlink(temp2)
  
  return(d2)
}

fun.correlation.regulon=function(i, l.regulon.gene, fm, filter.val=3, method.cor="spearman"){
  feats=l.regulon.gene[[i]]
  genename=names(l.regulon.gene)[i]
  x=data.matrix(t(as.matrix(fm[match(feats,rownames(fm)),,drop=F])))
  class(x) <- "numeric"
  
  if(dim(x)[2]<2)return(NULL) # this one did not have pairs to test
  
  # filter if binary has less than n samples
  if(any(grepl("^B:", colnames(x)))){
    check=colSums(x[,grepl("^B:", colnames(x)),drop=F], na.rm = T)
    x=x[,!colnames(x)%in%names(check[check<filter.val]),drop=F]
  }
  
  # filter if numeric has less than n non-zero samples
  if(any(grepl("^N:", colnames(x)))){
    check=colSums(x[,grepl("^N:", colnames(x)),drop=F]!=0, na.rm = T)
    x=x[,!colnames(x)%in%names(check[check<filter.val]),drop=F]
  }
  
  if(dim(x)[2]<2)return(NULL) # this one did not have pairs to test
  
  cor_rho=cor(x,use = "pairwise.complete.obs", method = method.cor)
  cor_pval=cor.test.p.m(x, method = method.cor, use = "pairwise.complete.obs")
  results=flat_cor_mat(cor_rho, cor_pval)
  
  # respect original order:
  results=do.call(rbind, lapply(colnames(x), function(n)results[results[,1]%in%n,]))
  
  rownames(results)=NULL
  
  # filter pairwise result:
  results=results[!duplicated(t(apply(results[,1:2], 1, sort))),]
  
  # add gene to cnvr:
  # results[grepl(":CNVR:", results[,1]),1]=gsub(":CNVR:", paste0(":CNVR:",genename, "@"), results[grepl(":CNVR:", results[,1]),1])
  # results[grepl(":CNVR:", results[,2]),2]=gsub(":CNVR:", paste0(":CNVR:",genename, "@"), results[grepl(":CNVR:", results[,2]),2])
  
  return(results)
}

fun.correlation.extrafeatures=function(i, l.regulon.gene, fm, extrafeatures, filter.val=3, method.cor="spearman"){
  feats=l.regulon.gene[[i]]
  genename=names(l.regulon.gene)[i]
  x=data.matrix(t(as.matrix(fm[match(feats,rownames(fm)),,drop=F])))
  class(x) <- "numeric"
  
  # filter if binary has less than n samples
  if(any(grepl("^B:", colnames(x)))){
    check=colSums(x[,grepl("^B:", colnames(x)),drop=F], na.rm = T)
    x=x[,!colnames(x)%in%names(check[check<filter.val]),drop=F]
  }
  
  # filter if numeric has less than n non-zero samples
  if(any(grepl("^N:", colnames(x)))){
    check=colSums(x[,grepl("^N:", colnames(x)),drop=F]!=0, na.rm = T)
    x=x[,!colnames(x)%in%names(check[check<filter.val]),drop=F]
  }
  
  # remove if in both lists
  extrafeatures=extrafeatures[!extrafeatures%in%feats]
  
  # Filter if not enough data (less than 3 not NA):
  m2=fm[rownames(fm)%in%extrafeatures,]
  s=is.na(m2)
  m2=m2[!rowSums(s)==dim(m2)[2]&rowSums(!s)>=filter.val, ]
  y=data.matrix(t(as.matrix(m2)))
  class(y) <- "numeric"
  
  y=cbind(x,y)
  
  # filter if binary has less than n 3 samples
  if(any(grepl("^B:", colnames(y)))){
    check=colSums(y[,grepl("^B:", colnames(y)),drop=F], na.rm = T)
    y=y[,!colnames(y)%in%names(check[check<filter.val])]
  }
  
  # filter if numeric has less than n non-zero samples
  if(any(grepl("^N:", colnames(y)))){
    check=colSums(y[,grepl("^N:", colnames(y)),drop=F]!=0, na.rm = T)
    y=y[,!colnames(y)%in%names(check[check<filter.val]),drop=F]
  }
  
  if(dim(x)[2]==0|dim(y)[2]==0)return(NULL) # no pairs to test
  
  cor_rho=cor(x,y,use = "pairwise.complete.obs", method = method.cor)
  
  # compute cor pval
  cor_pval=t(apply(x, 2, function(v1){
    apply(y, 2, function(v2){
      if(sum(complete.cases(cbind(v1, v2)))<filter.val)return(NA)
      cor.test.p(v1, v2, method = method.cor, use = "pairwise.complete.obs")
    })
  }))
  
  results=flat_cor_mat(data.matrix(cor_rho), data.matrix(cor_pval))
  
  results=do.call(rbind, lapply(colnames(x), function(n)results[results[,1]%in%n,]))
  
  rownames(results)=NULL
  
  # remove regulonfeatures, they are coming from another test:
  results=results[!(results[,1]%in%colnames(x)&results[,2]%in%colnames(x)),]
  
  # filter pairwise result:
  results=results[!duplicated(t(apply(results[,1:2], 1, sort))),]
  
  # add gene to cnvr: BUG for wilcox
  # results[grepl(":CNVR:", results[,1]),1]=gsub(":CNVR:", paste0(":CNVR:",genename, "@"), results[grepl(":CNVR:", results[,1]),1])
  
  return(results)
}


p.adjust.datatype=function(results, adjust.method="BH", log10=F, prettyNumbers=T, orderdata=T){
  
  # number of features tested:
  datatypes=suppressWarnings(do.call(rbind, strsplit(results[,2], ":"))[,2]) # sometimes more than 3 columns
  datatypes2=suppressWarnings(do.call(rbind, strsplit(results[,1], ":"))[,2])
  datatypes3=apply(cbind(datatypes2,datatypes),1,paste, collapse=":")
  datatypes=datatypes3
  
  results=results[order(datatypes),]
  datatypes=datatypes[order(datatypes)]
  
  vals=seq(datatypes)
  
  vals=unlist(lapply(unique(datatypes), function(d)p.adjust(results$p[datatypes%in%d], method = adjust.method)))
  
  if(log10){
    vals=abs(log10(vals))
  }
  if(prettyNumbers){
    vals=prettyNum(signif(vals,2))
    results[,3]=prettyNum(signif(results[,3],2))
    results[,4]=prettyNum(signif(results[,4],2))
  }else{
    vals=as.numeric(vals)
    results[,3]=as.numeric(results[,3])
    results[,4]=as.numeric(results[,4])
  }
  
  results$adj.p=vals
  results$signifCode=""
  results$signifCode[as.numeric(vals)<0.1]="."
  results$signifCode[as.numeric(vals)<0.05]="*"
  results$signifCode[as.numeric(vals)<0.01]="**"
  results$signifCode[as.numeric(vals)<0.001]="***"
  results$signifCode[as.numeric(vals)<0.0001]="****"
  
  results$datapairs=datatypes
  
  if(orderdata)results=results[order(results[,1], datatypes, -as.numeric(results$cor)>0, as.numeric(results$adj.p), decreasing = F),]
  
  return(results)
}


ft.alt <- function(a, b, c, d, alternative="greater") {
  as.numeric(fisher.test(matrix(c(a, b, c, d), 2), alternative = alternative)$p.value)
}

fisher.test.pwpw=function(results, find, fm, fisher.alternative, cores){
  results.o=results
  # remove gene anno if CNV type, to match with FM
  results.o[,1]=gsub("CNVR:.*.@", "CNVR:", results.o[,1])
  results.o[,2]=gsub("CNVR:.*.@", "CNVR:", results.o[,2])
  
  featsA=as.list(as.data.frame(t(fm[results.o[find,1],])))
  featsB=as.list(as.data.frame(t(fm[results.o[find,2],])))
  
  grid=data.frame(do.call(rbind, mclapply(seq(find), function(i)c(table(featsA[[i]], featsB[[i]])), mc.cores=cores)))
  colnames(grid)=c("a", "b", "c", "d")
  
  if(fisher.alternative=="two.sided"){
    p.ft <- with(grid, HighSpeedStats::ultrafastfet(a, b, c, d))
    results$p[find]=p.ft

  }else{
    p.ft <- with(grid, mapply(ft.alt, a, b, c, d, fisher.alternative))
    results$p[find]=p.ft
  }
  results$test.method[find]=paste0("Fisher´s exact (", fisher.alternative, ")")
  return(results)
}

# pairwise.correlation=function(l.regulon.gene, fm, extrafeatures=NULL, filter.val=3, cores=5, adjust.method="BH", method.cor="spearman", prettyNumbers=T, use.fisher=T, fisher.alternative="two.sided"){
#   
#   #****************************** Test within regulon ******************************
#   r=parallel::mclapply(seq(l.regulon.gene),fun.correlation.regulon,l.regulon.gene, fm, filter.val=filter.val,method.cor=method.cor, mc.cores=cores)
#   results=do.call(rbind, r)
#   results=results[!is.na(results$p),,drop=F]
#   
#   if(!is.null(results)){
#     results$test.method=paste(method.cor, "correlation")
#     results$test.group="gene.features"
#     
#     # if any pairs are binary-binary, add fisher test p-value
#     fa=substr(results[,1], 0, 1)=="B"
#     fb=substr(results[,2], 0, 1)=="B"
#     find=which(fa&fb)
#     findl=fa&fb
#     
#     if(use.fisher&any(findl)){
#       results=fisher.test.pwpw(results, find, fm, fisher.alternative, cores)
#       results1=p.adjust.datatype(results[findl,], orderdata=T, adjust.method=adjust.method, prettyNumbers=prettyNumbers)
#       results2=p.adjust.datatype(results[!findl,], orderdata=T, adjust.method=adjust.method, prettyNumbers=prettyNumbers)
#       results=rbind(results1, results2)
#     }else{
#       results=p.adjust.datatype(results, orderdata=T, adjust.method=adjust.method, prettyNumbers=prettyNumbers)
#     }
#   }
#   #*********************************************************************************
#   
#   # Test against extrafeatures if they exist:
#   if(!is.null(extrafeatures)){
#     r=parallel::mclapply(seq(l.regulon.gene),fun.correlation.extrafeatures, l.regulon.gene, fm, extrafeatures=extrafeatures, filter.val=filter.val,method.cor=method.cor, mc.cores=cores)
#     results.e=do.call(rbind, r)
#     results.e=results.e[!is.na(results.e$p),]
#     results.e$test.method=paste(method.cor, "correlation")
#     results.e$test.group="extra.features"
#     
#     # if any pairs are binary-binary, add fisher test p-value
#     fa=substr(results.e[,1], 0, 1)=="B"
#     fb=substr(results.e[,2], 0, 1)=="B"
#     find=which(fa&fb)
#     findl=fa&fb
#     
#     if(use.fisher&any(findl)){
#       results.e=fisher.test.pwpw(results.e, find,fm, fisher.alternative, cores)
#       results1=p.adjust.datatype(results.e[findl,], orderdata=T, adjust.method=adjust.method, prettyNumbers=prettyNumbers)
#       if(sum(!findl)>0){
#         results2=p.adjust.datatype(results.e[!findl,], orderdata=T, adjust.method=adjust.method, prettyNumbers=prettyNumbers)
#         results.e=rbind(results1, results2)
#       }else{
#         results.e=results1
#       }
#     }else{
#       results.e=p.adjust.datatype(results.e, orderdata=T, adjust.method=adjust.method, prettyNumbers=prettyNumbers)
#     }
#     results=rbind(results, results.e)
#   }
#   
#   if(adjust.method=="BH")adj="FDR"
#   if(adjust.method=="bonferroni")adj="FWER"
#   colnames(results)[7]=adj
#   results[grepl("Fisher", results$test.method),3]=""
#   
#   # order columnts:
#   cols=c("featureA", "featureB", "cor", "p", adj, "signifCode", "test.method", "test.group", "datapairs")
#   results=results[,cols]
#   return(results)
# }

filter.pairwise.res=function(results, filter.table=NULL, FDR=0.1){
  if(is.null(filter.table)){
    filter.table=data.frame("pair"=unique(results$datapairs), "FDR"=rep(FDR, length(unique(results$datapairs))), stringsAsFactors = F)
    type1=gsub(":.*.", "", filter.table[,1])
    type2=gsub(".*.:", "", filter.table[,1])
    
    # usually fewer features
    filter.table$FDR[type1=="GNAB"|type2=="GNAB"]=0.25 # mutations can be interesting, high P-value kept
    filter.table$FDR[type1=="SAMP"&type2=="SAMP"]=0.1 # these can be very different
    filter.table$FDR[type1=="CLIN"&type2=="CLIN"]=0.1 # these can be very different
    
    # exclude, not usually meaningful
    filter.table$FDR[type1=="CNVR"&type2=="CNVR"]=0 # remove, highly correlated
    filter.table$FDR[type1=="METH"&type2=="METH"]=0 # remove, highly correlated
    
    # SAMP special rules:
    filter.table$FDR[type1=="SAMP"&type2=="GEXP"]=1e-30 
    filter.table$FDR[type1=="SAMP"&type2=="MIRN"]=1e-30 
    filter.table$FDR[type1=="SAMP"&type2=="METH"]=1e-8 
    
    print("default table used: ")
    print(filter.table)
  }
  
  pairs=results$datapairs
  p=unique(pairs)
  
  # make sure all pairs are found in filter.table
  if(!all(p%in%filter.table[,1]))stop(paste("Not all pairs were found: ", p[!p%in%filter.table[,1],]))
  
  filter.table=filter.table[match(p, filter.table[,1]),]
  
  # p-value per pairs:
  vals=do.call(rbind, lapply(seq(p), function(i){
    
    r=results[pairs%in%p[i],]
    
    filt=as.numeric(r[,5])<as.numeric(filter.table[i,2])
    
    r=r[filt,]
  }))
  
  return(vals)
}

pairwise.correlation=function(l.regulon.gene, fm, extrafeatures=NULL, filter.val=3, cores=5, adjust.method="BH", method.cor="spearman", prettyNumbers=T, use.fisher=T, use.wilcox=T, fisher.alternative="two.sided", wilcox.alternative="two.sided"){
  
  #****************************** Test within regulon ******************************
  r=parallel::mclapply(seq(l.regulon.gene),fun.correlation.regulon,l.regulon.gene, fm, filter.val=filter.val,method.cor=method.cor, mc.cores=cores)
  results=do.call(rbind, r)
  results=results[!is.na(results$p),,drop=F]
  
  if(!is.null(results)){
    results$test.method=paste(method.cor, "correlation")
    results$test.group="gene.features"
    
    # if any pairs are binary-binary, add fisher test p-value
    fa=substr(results[,1], 0, 1)=="B"
    fb=substr(results[,2], 0, 1)=="B"
    find=which(fa&fb)
    findl=fa&fb
    
    # if any pairs are binary-numeric, add wilcoxon test p-value
    fc=substr(results[,1], 0, 1)=="N"
    fd=substr(results[,2], 0, 1)=="B"
    fe=substr(results[,1], 0, 1)=="B"
    ff=substr(results[,2], 0, 1)=="N"
    
    findw=fc&fd|fe&ff
    
    # compute p-value depending on test type:
    if(use.fisher&any(findl)){
      # update B vs B tests:
      results=fisher.test.pwpw(results, find, fm, fisher.alternative, cores)
    }
    
    # compute p-value depending on test type:
    if(use.wilcox&any(findw)) {
      # update B vs N tests:
      results=test.feats.wilcox(features = results, rows = findw, data = fm, wilcox.alternative = wilcox.alternative, cores = cores)
    }
    
    # go through each test and adjust p-value per data type:
    results=do.call(rbind, lapply(unique(results$test.method), function(type){
      p.adjust.datatype(results[results$test.method%in%type,], orderdata=T, adjust.method=adjust.method, prettyNumbers=prettyNumbers)
    }))
  }
  #*********************************************************************************
  
  # Test against extrafeatures if they exist:
  if(!is.null(extrafeatures)){
    r=parallel::mclapply(seq(l.regulon.gene),fun.correlation.extrafeatures, l.regulon.gene, fm, extrafeatures=extrafeatures, filter.val=filter.val,method.cor=method.cor, mc.cores=cores)
    results.e=do.call(rbind, r)
    results.e=results.e[!is.na(results.e$p),]
    results.e$test.method=paste(method.cor, "correlation")
    results.e$test.group="extra.features"
    
    # if any pairs are binary-binary, add fisher test p-value
    fa=substr(results.e[,1], 0, 1)=="B"
    fb=substr(results.e[,2], 0, 1)=="B"
    find=which(fa&fb)
    findl=fa&fb
    
    # if any pairs are binary-numeric, add wilcoxon test p-value
    fc=substr(results.e[,1], 0, 1)=="N"
    fd=substr(results.e[,2], 0, 1)=="B"
    fe=substr(results.e[,1], 0, 1)=="B"
    ff=substr(results.e[,2], 0, 1)=="N"
    findw=fc&fd|fe&ff
  
    # compute p-value depending on test type:
    if(use.fisher&any(findl)){
    # update B vs B tests:
    results.e=fisher.test.pwpw(results.e, find, fm, fisher.alternative, cores)
    }
    
    # compute p-value depending on test type:
    if(use.wilcox&any(findw)) {
    # update B vs N tests:
    results.e=test.feats.wilcox(features = results.e, rows = findw, data = fm, wilcox.alternative = wilcox.alternative, cores = cores)
    }
    
    # go through each test and adjust p-value per data type
    results.e=do.call(rbind, lapply(unique(results.e$test.method), function(type){
      p.adjust.datatype(results.e[results.e$test.method%in%type,], orderdata=T, adjust.method=adjust.method, prettyNumbers=prettyNumbers)
    }))
    
    results=rbind(results, results.e)
  }
  
  if(adjust.method=="BH")adj="FDR"
  if(adjust.method=="bonferroni")adj="FWER"
  colnames(results)[7]=adj
  results[grepl("Fisher", results$test.method),3]=""
  
  # order columnts:
  cols=c("featureA", "featureB", "cor", "p", adj, "signifCode", "test.method", "test.group", "datapairs")
  results=results[,cols]
  return(results)
}

filter.pairwise.res=function(results, filter.table=NULL, FDR=0.1){
  if(is.null(filter.table)){
    filter.table=data.frame("pair"=unique(results$datapairs), "FDR"=rep(FDR, length(unique(results$datapairs))), stringsAsFactors = F)
    type1=gsub(":.*.", "", filter.table[,1])
    type2=gsub(".*.:", "", filter.table[,1])
    
    # usually fewer features
    filter.table$FDR[type1=="GNAB"|type2=="GNAB"]=0.25 # mutations can be interesting, high P-value kept
    filter.table$FDR[type1=="SAMP"&type2=="SAMP"]=0.1 # these can be very different
    filter.table$FDR[type1=="CLIN"&type2=="CLIN"]=0.1 # these can be very different
    
    # exclude, not usually meaningful
    filter.table$FDR[type1=="CNVR"&type2=="CNVR"]=0 # remove, highly correlated
    filter.table$FDR[type1=="METH"&type2=="METH"]=0 # remove, highly correlated
    
    # SAMP special rules:
    filter.table$FDR[type1=="SAMP"&type2=="GEXP"]=1e-30
    filter.table$FDR[type1=="SAMP"&type2=="MIRN"]=1e-30
    filter.table$FDR[type1=="SAMP"&type2=="METH"]=1e-8
    
    print("default table used: ")
    print(filter.table)
  }
  
  pairs=results$datapairs
  p=unique(pairs)
  
  # make sure all pairs are found in filter.table
  if(!all(p%in%filter.table[,1]))stop(paste("Not all pairs were found: ", p[!p%in%filter.table[,1],]))
  
  filter.table=filter.table[match(p, filter.table[,1]),]
  
  # p-value per pairs:
  vals=do.call(rbind, lapply(seq(p), function(i){
    
    r=results[pairs%in%p[i],]
    
    filt=as.numeric(r[,5])<as.numeric(filter.table[i,2])&!is.na(r[,5])
    
    r=r[filt,]
  }))
  
  return(vals)
}

test.feats.wilcox <- function(features, rows=NA, data, wilcox.alternative="two.sided", cores=1) {
  if(length(rows)>=1) {
    pairs <- features[rows,]
  } else pairs <- features
  
  p=unlist(parallel::mclapply(1:nrow(pairs), function(i){
    binary=substr(pairs[i,2], 0, 1)=="B"
    
    if(binary) {
      
      x <- as.numeric(data[pairs[i,1],])
      y <- as.logical(as.numeric(data[pairs[i,2],]))
      
    }else {
      
      x <- as.numeric(data[pairs[i,2],])
      y <- as.logical(as.numeric(data[pairs[i,1],]))
      
    }
    
    a <- vector()
    b <- vector()
    
    for(j in 1:length(y)) {
      
      if(is.na(y[j])) next
      if(y[j]) a <- c(a, x[j])
      if(!y[j]) b <- c(b, x[j])
      
    }
    
    if((is.numeric(a) && is.numeric(b))&&(length(a)>0&&length(b>0))&&sum(is.na(a))!=length(a)&&sum(is.na(b))!=length(b)) {
      p <- wilcox.test(a,b, alternative = wilcox.alternative)$p.value
    } else {
      p=NA # not enough data
    }
    return(p)
  }, mc.cores=cores))
  
  pairs$test.method=paste0("Wilcoxon test (", wilcox.alternative, ")")
  pairs$p<- as.numeric(as.character(p))
  
  # return original table but with updated p and test.method
  features[rows,]=pairs
  
  return(features)
}