ImmunogenomicLandscape-BC / Git / [8e0848] /common_scripts/statistics/statistics

Models:
WandaB/
ImmunogenomicLandscape-BC
Downloads: 1
[8e0848]: / common_scripts / statistics / statistics_wrappers.R
History
Download this file
326 lines (248 with data), 12.0 kB

wrapper.wilcoxtest=function(genelist, data, logicalVectors, logicalVector_normals=NULL, ALTERNATIVE="greater", adj.method="BH", CORES=1, prettynum=T){
  library(parallel)
  data=do.call(rbind, mclapply(genelist, TestGeneWilcox, data, logicalVectors, logicalVector_normals=logicalVector_normals, ALTERNATIVE, prettynum, mc.cores = CORES))
  data$adj.p=p.adjust(data$p, method = adj.method)
  
  data$adj.method=adj.method
  
  # signifcode
  data$signifCode=""
  data$signifCode[as.numeric(data$adj.p)<0.1]="."
  data$signifCode[as.numeric(data$adj.p)<0.05]="*"
  data$signifCode[as.numeric(data$adj.p)<0.01]="**"
  data$signifCode[as.numeric(data$adj.p)<0.001]="***"
  data$signifCode[as.numeric(data$adj.p)<0.0001]="****"
  
  if(prettynum){
    data$adj.p=prettyNum(signif(data$adj.p,2))
  }
  
  if(adj.method=="BH"){
    colnames(data)[colnames(data)%in%"adj.p"]="FDR"
  }
  
  data=data[,c(1:5, 7,9, 6,8)]
  
  return(data)
}

FUN_GOTHROUGH=function(i, genelist, list_cancers, logicalVector_normals, TEST_FAILS=F, HG_PVAL=0.001, MW_PVAL=0.001, ALTERNATIVE="greater"){
  name=names(list_cancers)[i]
  logicalVector=as.logical(unlist(list_cancers[[i]]))
  logicalVector_normals=logicalVector_normals[!names(logicalVector_normals)%in%name]
  
  result=FUN_HGT_MW_TEST(genelist, logicalVector,logicalVector_normals, name, TEST_FAILS=F, data = data, profile = profile,HG_PVAL = HG_PVAL, MW_PVAL=MW_PVAL, ALTERNATIVE = ALTERNATIVE, CORES = 10)
}

fisher.wrapper=function(lv.list1, lv.list2, alternative="two.sided", method="bonferroni",log10=F, prettyNumbers=F,orderdata=F, cores=1){
  
  n1=names(lv.list1)
  n2=names(lv.list2)
  
  res=do.call(rbind,mclapply(seq(lv.list1), function(i){
    do.call(rbind,lapply(seq(lv.list2), function(j){
      data.frame("featureA"=n1[i], "featureB"=n2[j],fisher.2x2(lv.list1[[i]], lv.list2[[j]]))
    }))
  }, mc.cores=cores))
  
  res$adj.p=p.adjust(res$p, method=method)
  res$signifCode=""
  res$signifCode[as.numeric(res$adj.p)<0.1]="."
  res$signifCode[as.numeric(res$adj.p)<0.05]="*"
  res$signifCode[as.numeric(res$adj.p)<0.01]="**"
  res$signifCode[as.numeric(res$adj.p)<0.001]="***"
  res$signifCode[as.numeric(res$adj.p)<0.0001]="****"
  
  if(log10){
    res$p=abs(log10(res$p))
    res$adj.p=abs(log10(res$p))
  }
  
  if(prettyNumbers){
    res$log.odds=prettyNum(signif(res$log.odds,2))
    res$p=prettyNum(signif(res$p,2))
    res$adj.p=prettyNum(signif(res$adj.p,2))
  }
  
  if(orderdata)res=res[order(res$featureA, as.numeric(res$adj.p), decreasing = F),]
  return(res)
}

fisher.matrix.pairwise=function(E, plot=F, alternative="two.sided"){
  # perform a fisher exact test for each pairs of gene and saves the oddsRatio and p-value
  res <- NULL
  for(i in seq(dim(E)[2])){
    for(j in seq(dim(E)[2])){
      if(i!=j){
        f <- fisher.test(E[,i], E[,j], alternative = alternative) 
        res <- rbind(res,cbind(geneA=colnames(E)[i],geneB=colnames(E)[j],oddsRatio=f$estimate,pvalue=f$p.value))
      }   
    }
  }
  # some formatting
  res <- as.data.frame(res)
  res$geneA <- factor(res$geneA,levels=unique(res$geneA))
  res$geneB <- factor(res$geneB,levels=unique(res$geneB))
  res$oddsRatio <- as.numeric(as.character(res$oddsRatio))
  
  # avoid inf values
  res$oddsRatio[res$oddsRatio>10]=10
  res$oddsRatio[res$oddsRatio<=0]=0.01
  
  res$pvalue <- as.numeric(as.character(res$pvalue))
  # use p.adjust to correct for multi testing using a FDR
  res <- cbind(res,fdr=p.adjust(res$pvalue,"fdr"))
  # change the FDR in labels for plotting
  res$stars <- cut(res$fdr, breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", ""))
 
  if(plot){
    # plot with ggplot 2
    require(ggplot2)
    require(cowplot) # not necessary but the plot is nicer
    p <- ggplot(res, aes(geneA, geneB)) + geom_tile(aes(fill = log2(oddsRatio+0.01)),colour = "white") + scale_fill_gradient2(midpoint=1) + geom_text(aes(label=stars), color="black", size=5)
    p
    return(p)
  }
 
  return(res)
}

fun.get.cox=function(i, logicalv, DATA, univariate=T, pretty=T, TIME, STATUS){
  logicalvector=logicalv[[i]]
  n=names(logicalv)[i]

  # remove if all are NA
  DATA=DATA[,!apply(DATA[logicalvector,,drop=F], 2, function(v)all(is.na(v)))]
  
  if(univariate){
    univ=do.call(rbind, lapply(seq(dim(DATA)[2]), function(j){
      y=Surv(TIME[logicalvector], STATUS[logicalvector])
      fit=coxph(formula = as.formula(paste("y ~ ", paste(colnames(DATA)[j], collapse="+"))), DATA[logicalvector,,drop=F])

      b=cox.zph(fit)[[1]][3] # p.value

      a=summary(fit)

      pval=a$coefficients[1,5]
      coef=a$conf.int[c(1,3,4)]
      v=data.frame(colnames(DATA)[j], t(coef), pval,"","", a$concordance[1], b<0.05, stringsAsFactors = F)
      rownames(v)=NULL
      return(v)
    }))

    # adjust P-value depending on number of genes tested:
    univ[,6]=p.adjust(univ[,5], method="BH")

  }else{
    y=Surv(TIME[logicalvector], STATUS[logicalvector])
    fit=coxph(formula = as.formula(paste("y ~ ", paste(colnames(DATA), collapse="+"))), DATA[logicalvector,])

    b=cox.zph(fit)$table[,3] # p.value
    b=b[!names(b)%in%"GLOBAL"]

    a=summary(fit)

    pval=a$coefficients[,5]
    coef=a$conf.int[,c(1,3,4)]
    univ=data.frame(rownames(a$coefficients), coef, pval, "", "", a$concordance[1], b<0.05, stringsAsFactors = F)
    rownames(univ)=NULL

    # no need to adjust P-value here:
    univ[,6]=univ[,5]
  }

  univ=univ[order(univ[,5]),]

  univ[,7][as.numeric(univ[,6 ])<0.1]="."
  univ[,7][as.numeric(univ[,6 ])<0.05]="*"
  univ[,7][as.numeric(univ[,6 ])<0.01]="**"
  univ[,7][as.numeric(univ[,6 ])<0.001]="***"
  univ[,7][as.numeric(univ[,6 ])<0.0001]="****"
  univ[,7][is.na(univ[,7])]=""

  val=data.frame(univ, n, stringsAsFactors = F)
  colnames(val)=c("Feature", "exp(coef)", "lower .95", "upper .95", "P", "Adj.P", "Signif", "concordance", "zph P < 0.05", "Name")

  if(pretty){
    for(i in c(2:6,8)){
      val[,i]=prettyNum(signif(val[,i],2))
    }
  }

  return(val)
}

fun.cox.elasticnet=function(DATA_ORG, time, status, cores=8, nfold=10,min.elnet=0.01,max.elnet=0.99, summary.km="85th_15th_percentile", percentage=0.25, REPEATS=100){
  library(glmnet)

  # run regularized regression
  a <- seq(min.elnet, max.elnet, 0.05)

  if(percentage==0){
    y=Surv(time, status)

    formdf1 <- as.formula(paste(" ~ ", paste(colnames(DATA_ORG),collapse="+")))
    x=model.matrix(formdf1, data=DATA_ORG)

    # go through alpha sequence with nfolds and repeat (100 times etc) cross validation on different sets of samples
    enet.rep=do.call(rbind, parallel::mclapply(seq(REPEATS), function(r){

      s <- do.call(rbind, lapply(a, function(i){
        cv <- cv.glmnet(x=x,y=y, family = "cox", nfold = nfold, type.measure = "deviance", alpha = i, standardize=F)
        data.frame(cvm = cv$cvm[cv$lambda == cv$lambda.min], lambda.min = cv$lambda.min, alpha = i)
      }))

      return(s)
    }, mc.cores=cores))

    # find mean alpha and lambda using all repeats
    search_m=do.call(rbind, lapply(a, function(alpha){
      cvm=mean(enet.rep$cvm[enet.rep$alpha==alpha])
      lambda=mean(enet.rep$lambda.min[enet.rep$alpha==alpha])
      data.frame(alpha, cvm, lambda)
    }))

    # minimum cvm error:
    cv3 <- search_m[search_m$cvm == min(search_m$cvm), ]

    print(cv3$alpha)
    print(cv3$lambda)
    print(cv3$cvm)

    # fit model with tuned alpha, use lambda sequence here, not single value
    md3 <- glmnet(x=x,y=y, family = "cox", alpha = cv3$alpha, standardize=F)

    coefficients <- coef(md3, s = cv3$lambda)[-1,,drop=F]
    coefficients=coefficients[order(abs(coefficients[,1]), decreasing = T),,drop=F]
    active_coefficients <- coefficients[,1] != 0

    coefficients_all=coefficients[active_coefficients,,drop=F]

    comprisk=DATA_ORG[,match(rownames(coefficients_all), colnames(DATA_ORG))]
    risk_patient=as.numeric(as.numeric(coefficients_all) %*% data.matrix(t(comprisk)))

    df=data.frame(x=risk_patient)
    ggplot(df, aes(x=x))+
      geom_density(color="lightblue", fill="lightblue")

    fun.kapplanMeier(time, status, CONTINUOUS=risk_patient, CONTINUOUS_SUMMARY = summary.km, MONTHS=F, PVAL=1, INDIVIDUAL_GROUPS=F, NAME = "predicted risk")

    a=list(summary(coxph(y ~ PI.train, data.frame("PI.train"=risk_patient))))
    c=list(coefficients_all)

    out=c(a,c)
    names(out)=c("PI.test", "coefficients")
    return(out)
  }else{
    print("Splitting to test and train data")
    # divide data so that there are equal proportions of outcomes with training and test set!
    nr.samples=floor(percentage*dim(DATA_ORG)[1])
    nr.events=floor(table(status)*percentage)

    set.seed(0)
    x <- which(status==1)
    vf <- logical(length(status==1))
    vf[x[sample.int(length(x), nr.events[2])]] <- TRUE

    set.seed(0)
    x <- which(status==0)
    vf2 <- logical(length(status==0))
    vf2[x[sample.int(length(x), nr.events[1])]] <- TRUE

    find=vf|vf2

    # define training and testing datasets
    TRAIN=DATA_ORG[!find,]
    TEST=DATA_ORG[find,]

    ytrain=Surv(time[!find], status[!find])
    ytest=Surv(time[find], status[find])

    # this must look the same:
    log=fun.kapplanMeier(time, status, GROUPS=(rownames(DATA_ORG)%in%rownames(TRAIN)*1), MONTHS=F, PVAL=1, INDIVIDUAL_GROUPS=T, NAME = "These must be similar\nand high pval: 1 TRAIN, 0 TEST")

    formdf1 <- as.formula(paste(" ~ ", paste(colnames(TRAIN),collapse="+")))
    x=model.matrix(formdf1, data=TRAIN)

    # go through alpha sequence with nfolds and repeat (100 times etc) cross validation on different sets of samples
    enet.rep=do.call(rbind, parallel::mclapply(seq(REPEATS), function(r){

      s <- do.call(rbind, lapply(a, function(i){
        cv <- cv.glmnet(x=x,y=ytrain, family = "cox", nfold = floor((dim(DATA_ORG)[1]/10)), type.measure = "deviance", alpha = i, standardize=F)
        data.frame(cvm = cv$cvm[cv$lambda == cv$lambda.min], lambda.min = cv$lambda.min, alpha = i)
      }))

      return(s)
    }, mc.cores=cores))

    # find mean alpha and lambda using all repeats
    search_m=do.call(rbind, lapply(a, function(alpha){
      cvm=mean(enet.rep$cvm[enet.rep$alpha==alpha])
      lambda=mean(enet.rep$lambda.min[enet.rep$alpha==alpha])
      data.frame(alpha, cvm, lambda)
    }))

    # minimum cvm error:
    cv3 <- search_m[search_m$cvm == min(search_m$cvm), ]

    print(cv3$alpha)
    print(cv3$lambda)
    print(cv3$cvm)

    # fit model with tuned alpha, use lambda sequence here, not single value
    md3 <- glmnet(x=x,y=ytrain, family = "cox", alpha = cv3$alpha, standardize=F)

    coefficients <- coef(md3, s = cv3$lambda)[-1,,drop=F]
    coefficients=coefficients[order(abs(coefficients[,1]), decreasing = T),,drop=F]
    active_coefficients <- coefficients[,1] != 0

    coefficients_all=coefficients[active_coefficients,,drop=F]
    comprisk=TRAIN[,match(rownames(coefficients_all), colnames(TRAIN))]
    risk_patient=as.numeric(as.numeric(coefficients_all) %*% data.matrix(t(comprisk)))

    comprisk=TEST[,match(rownames(coefficients_all), colnames(TEST))]
    risk_patient_test=as.numeric(as.numeric(coefficients_all) %*% data.matrix(t(comprisk)))

    df=data.frame(x=risk_patient)
    ggplot(df, aes(x=x))+
      geom_density(color="lightblue", fill="lightblue")

    df=data.frame(x=risk_patient_test)
    ggplot(df, aes(x=x))+
      geom_density(color="indianred", fill="indianred")

    fun.kapplanMeier(time[!find], status[!find], CONTINUOUS=risk_patient, CONTINUOUS_SUMMARY = summary.km, MONTHS=F, PVAL=1, INDIVIDUAL_GROUPS=F, NAME = "predicted risk")
    fun.kapplanMeier(time[find], status[find], CONTINUOUS=risk_patient_test, CONTINUOUS_SUMMARY = summary.km, MONTHS=F, PVAL=1, INDIVIDUAL_GROUPS=F, NAME = "validated risk")

    a=list(summary(coxph(ytrain ~ PI.train, data.frame("PI.train"=risk_patient))))
    b=list(summary(coxph(ytest ~ PI.test, data.frame("PI.test"=risk_patient_test))))
    c=list(coefficients_all)

    out=c(a,b,c)
    names(out)=c("PI.train", "PI.test", "coefficients")
    return(out)
  }
}