# Run a Wilcoxon rank-sum test (via TestGeneWilcox) for every gene in `genelist`
# across the sample groups in `logicalVectors`, adjust the p-values for multiple
# testing and attach significance codes.
#
# genelist:              character vector of genes, tested one by one.
# data:                  expression data handed through to TestGeneWilcox().
# logicalVectors:        list of logical vectors selecting the test groups.
# logicalVector_normals: optional list of logical vectors selecting reference samples.
# ALTERNATIVE:           alternative hypothesis forwarded to the test ("greater", ...).
# adj.method:            p.adjust() method; when "BH" the column is renamed to FDR.
# CORES:                 number of parallel workers for mclapply().
# prettynum:             if TRUE, adjusted p-values are formatted to 2 significant digits.
#
# Returns a data.frame of per-gene results with adjusted p-values and codes.
wrapper.wilcoxtest=function(genelist, data, logicalVectors, logicalVector_normals=NULL, ALTERNATIVE="greater", adj.method="BH", CORES=1, prettynum=TRUE){
  library(parallel)
  data=do.call(rbind, mclapply(genelist, TestGeneWilcox, data, logicalVectors, logicalVector_normals=logicalVector_normals, ALTERNATIVE, prettynum, mc.cores = CORES))
  data$adj.p=p.adjust(data$p, method = adj.method)

  data$adj.method=adj.method

  # significance codes derived from the adjusted p-value
  data$signifCode=""
  data$signifCode[as.numeric(data$adj.p)<0.1]="."
  data$signifCode[as.numeric(data$adj.p)<0.05]="*"
  data$signifCode[as.numeric(data$adj.p)<0.01]="**"
  data$signifCode[as.numeric(data$adj.p)<0.001]="***"
  data$signifCode[as.numeric(data$adj.p)<0.0001]="****"

  if(prettynum){
    data$adj.p=prettyNum(signif(data$adj.p,2))
  }

  if(adj.method=="BH"){
    colnames(data)[colnames(data)%in%"adj.p"]="FDR"
  }

  # reorder columns; assumes TestGeneWilcox() returns 5 columns ahead of the
  # 4 columns added above -- TODO confirm against TestGeneWilcox()
  data=data[,c(1:5, 7,9, 6,8)]

  return(data)
}

# Run FUN_HGT_MW_TEST for the i'th cancer in `list_cancers`, excluding that
# cancer's own entry from the normal reference vectors.
#
# BUGFIX: TEST_FAILS is now forwarded instead of being hard-coded to FALSE,
# and the worker count is exposed as a parameter (default keeps the previous
# hard-coded value of 10).
# NOTE(review): `data` and `profile` are resolved from the calling environment,
# not passed as arguments -- confirm they are defined where this is used.
FUN_GOTHROUGH=function(i, genelist, list_cancers, logicalVector_normals, TEST_FAILS=F, HG_PVAL=0.001, MW_PVAL=0.001, ALTERNATIVE="greater", CORES=10){
  name=names(list_cancers)[i]
  logicalVector=as.logical(unlist(list_cancers[[i]]))
  # do not compare a cancer against its own matched normals
  logicalVector_normals=logicalVector_normals[!names(logicalVector_normals)%in%name]

  result=FUN_HGT_MW_TEST(genelist, logicalVector, logicalVector_normals, name, TEST_FAILS=TEST_FAILS, data = data, profile = profile, HG_PVAL = HG_PVAL, MW_PVAL = MW_PVAL, ALTERNATIVE = ALTERNATIVE, CORES = CORES)
  return(result)
}

# All pairwise 2x2 Fisher's exact tests (via fisher.2x2) between two lists of
# logical vectors, with multiple-testing correction and significance codes.
#
# lv.list1, lv.list2: named lists of logical vectors; every pair (i, j) is tested.
# alternative:        NOTE(review) -- accepted but not forwarded to fisher.2x2; confirm.
# method:             p.adjust() method for the adjusted p-value column.
# log10:              if TRUE, replace p and adj.p by |log10(value)|.
# prettyNumbers:      if TRUE, format log.odds / p / adj.p to 2 significant digits.
# orderdata:          if TRUE, order rows by featureA then adjusted p-value.
# cores:              parallel workers over lv.list1.
#
# Returns a data.frame: featureA, featureB, fisher.2x2 output, adj.p, signifCode.
fisher.wrapper=function(lv.list1, lv.list2, alternative="two.sided", method="bonferroni",log10=F, prettyNumbers=F,orderdata=F, cores=1){
  library(parallel)  # mclapply is used below; sibling wrappers load it too

  n1=names(lv.list1)
  n2=names(lv.list2)

  res=do.call(rbind,mclapply(seq_along(lv.list1), function(i){
    do.call(rbind,lapply(seq_along(lv.list2), function(j){
      data.frame("featureA"=n1[i], "featureB"=n2[j],fisher.2x2(lv.list1[[i]], lv.list2[[j]]))
    }))
  }, mc.cores=cores))

  res$adj.p=p.adjust(res$p, method=method)
  res$signifCode=""
  res$signifCode[as.numeric(res$adj.p)<0.1]="."
  res$signifCode[as.numeric(res$adj.p)<0.05]="*"
  res$signifCode[as.numeric(res$adj.p)<0.01]="**"
  res$signifCode[as.numeric(res$adj.p)<0.001]="***"
  res$signifCode[as.numeric(res$adj.p)<0.0001]="****"

  if(log10){
    # BUGFIX: adj.p was previously computed as abs(log10(res$p)) AFTER res$p
    # had already been transformed, i.e. a double transform of the wrong
    # column. Transform each column from its own raw value.
    res$adj.p=abs(log10(res$adj.p))
    res$p=abs(log10(res$p))
  }

  if(prettyNumbers){
    res$log.odds=prettyNum(signif(res$log.odds,2))
    res$p=prettyNum(signif(res$p,2))
    res$adj.p=prettyNum(signif(res$adj.p,2))
  }

  if(orderdata)res=res[order(res$featureA, as.numeric(res$adj.p), decreasing = F),]
  return(res)
}

# Pairwise Fisher's exact tests between every ordered pair of columns of E,
# with FDR correction; optionally draws a tile plot of log2 odds ratios.
#
# E:           data.frame/matrix whose columns are categorical (factor) variables.
# plot:        if TRUE, returns a ggplot2 tile plot instead of the result table
#              (requires ggplot2/cowplot at call time).
# alternative: alternative hypothesis for fisher.test().
#
# Returns a data.frame: geneA, geneB, oddsRatio (clamped to [0.01, 10] to avoid
# infinities), pvalue, fdr, stars.
fisher.matrix.pairwise=function(E, plot=F, alternative="two.sided"){
  # perform a fisher exact test for each pair of columns and keep the
  # odds ratio and p-value
  res <- NULL
  for(i in seq_len(dim(E)[2])){
    for(j in seq_len(dim(E)[2])){
      if(i!=j){
        f <- fisher.test(E[,i], E[,j], alternative = alternative)
        res <- rbind(res,cbind(geneA=colnames(E)[i],geneB=colnames(E)[j],oddsRatio=f$estimate,pvalue=f$p.value))
      }
    }
  }
  # some formatting: rbind of character matrices needs re-typing
  res <- as.data.frame(res)
  res$geneA <- factor(res$geneA,levels=unique(res$geneA))
  res$geneB <- factor(res$geneB,levels=unique(res$geneB))
  res$oddsRatio <- as.numeric(as.character(res$oddsRatio))

  # avoid inf values
  res$oddsRatio[res$oddsRatio>10]=10
  res$oddsRatio[res$oddsRatio<=0]=0.01

  res$pvalue <- as.numeric(as.character(res$pvalue))
  # use p.adjust to correct for multiple testing using FDR
  res <- cbind(res,fdr=p.adjust(res$pvalue,"fdr"))
  # translate the FDR into star labels for plotting
  res$stars <- cut(res$fdr, breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", ""))

  if(plot){
    # plot with ggplot2
    require(ggplot2)
    require(cowplot) # not necessary but the plot is nicer
    p <- ggplot(res, aes(geneA, geneB)) + geom_tile(aes(fill = log2(oddsRatio+0.01)),colour = "white") + scale_fill_gradient2(midpoint=1) + geom_text(aes(label=stars), color="black", size=5)
    p
    return(p)
  }

  return(res)
}
# Fit Cox proportional-hazards models for the i'th sample subset in `logicalv`.
#
# i:            index into `logicalv`; names(logicalv)[i] is attached as the Name column.
# logicalv:     named list of logical vectors selecting samples.
# DATA:         data.frame of covariates (samples in rows, features in columns).
# univariate:   if TRUE, fit one Cox model per feature and BH-adjust the p-values;
#               if FALSE, fit a single multivariate model over all features
#               (Adj.P is then just a copy of P).
# pretty:       if TRUE, format numeric columns to 2 significant digits.
# TIME, STATUS: survival time and event indicator vectors, subset by the selector.
#
# NOTE(review): Surv/coxph/cox.zph come from the `survival` package, which is
# assumed to be loaded by the caller -- confirm.
# Returns a data.frame with columns: Feature, exp(coef), lower .95, upper .95,
# P, Adj.P, Signif, concordance, zph P < 0.05, Name.
fun.get.cox=function(i, logicalv, DATA, univariate=T, pretty=T, TIME, STATUS){
  logicalvector=logicalv[[i]]
  n=names(logicalv)[i]

  # remove features that are all NA within this subset
  DATA=DATA[,!apply(DATA[logicalvector,,drop=F], 2, function(v)all(is.na(v)))]

  if(univariate){
    # one Cox model per feature
    univ=do.call(rbind, lapply(seq(dim(DATA)[2]), function(j){
      y=Surv(TIME[logicalvector], STATUS[logicalvector])
      fit=coxph(formula = as.formula(paste("y ~ ", paste(colnames(DATA)[j], collapse="+"))), DATA[logicalvector,,drop=F])

      # proportional-hazards test p-value; < 0.05 flags a violated assumption
      b=cox.zph(fit)[[1]][3] # p.value

      a=summary(fit)

      pval=a$coefficients[1,5]
      coef=a$conf.int[c(1,3,4)]  # exp(coef) and its 95% CI bounds
      v=data.frame(colnames(DATA)[j], t(coef), pval,"","", a$concordance[1], b<0.05, stringsAsFactors = F)
      rownames(v)=NULL
      return(v)
    }))

    # adjust P-value depending on number of genes tested:
    univ[,6]=p.adjust(univ[,5], method="BH")

  }else{
    # single multivariate model over all features
    y=Surv(TIME[logicalvector], STATUS[logicalvector])
    fit=coxph(formula = as.formula(paste("y ~ ", paste(colnames(DATA), collapse="+"))), DATA[logicalvector,])

    # per-term proportional-hazards p-values; GLOBAL row dropped below
    b=cox.zph(fit)$table[,3] # p.value
    b=b[!names(b)%in%"GLOBAL"]

    a=summary(fit)

    pval=a$coefficients[,5]
    coef=a$conf.int[,c(1,3,4)]
    univ=data.frame(rownames(a$coefficients), coef, pval, "", "", a$concordance[1], b<0.05, stringsAsFactors = F)
    rownames(univ)=NULL

    # no need to adjust P-value here:
    univ[,6]=univ[,5]
  }

  # order by raw p-value, then translate the adjusted p-value (column 6)
  # into significance codes (column 7)
  univ=univ[order(univ[,5]),]

  univ[,7][as.numeric(univ[,6])<0.1]="."
  univ[,7][as.numeric(univ[,6])<0.05]="*"
  univ[,7][as.numeric(univ[,6])<0.01]="**"
  univ[,7][as.numeric(univ[,6])<0.001]="***"
  univ[,7][as.numeric(univ[,6])<0.0001]="****"
  univ[,7][is.na(univ[,7])]=""

  val=data.frame(univ, n, stringsAsFactors = F)
  colnames(val)=c("Feature", "exp(coef)", "lower .95", "upper .95", "P", "Adj.P", "Signif", "concordance", "zph P < 0.05", "Name")

  if(pretty){
    # format effect size, CI bounds, p-values and concordance
    for(i in c(2:6,8)){
      val[,i]=prettyNum(signif(val[,i],2))
    }
  }

  return(val)
}

# Elastic-net penalized Cox regression with repeated cross-validation to tune
# alpha and lambda, followed by risk-score computation and Kaplan-Meier output.
#
# DATA_ORG:     data.frame of covariates (samples in rows, features in columns).
# time, status: survival time and event indicator.
# cores:        workers for parallel::mclapply over the CV repeats.
# nfold:        CV folds (used only when percentage==0; the split branch derives
#               its fold count from the sample size).
# min.elnet, max.elnet: bounds of the alpha grid (step 0.05).
# summary.km:   grouping rule forwarded to fun.kapplanMeier for the risk score.
# percentage:   0 = fit on all samples; otherwise the fraction of samples (and
#               of each outcome class) held out as a test set.
# REPEATS:      number of cross-validation repetitions averaged per alpha.
#
# Returns a named list: "PI.test" + "coefficients" when percentage==0, or
# "PI.train" + "PI.test" + "coefficients" for the train/test split.
#
# NOTE(review): in the percentage==0 branch the coxph summary is computed on
# the same data the model was fit on, yet the element is named "PI.test" --
# confirm the intended naming.
# NOTE(review): the ggplot objects built below are neither printed nor
# returned, so those density plots never render from inside this function.
# NOTE(review): `nr.samples` and `log` are assigned but never used.
fun.cox.elasticnet=function(DATA_ORG, time, status, cores=8, nfold=10,min.elnet=0.01,max.elnet=0.99, summary.km="85th_15th_percentile", percentage=0.25, REPEATS=100){
  library(glmnet)

  # run regularized regression: alpha grid from pure-ridge-ish to pure-lasso-ish
  a <- seq(min.elnet, max.elnet, 0.05)

  if(percentage==0){
    y=Surv(time, status)

    formdf1 <- as.formula(paste(" ~ ", paste(colnames(DATA_ORG),collapse="+")))
    x=model.matrix(formdf1, data=DATA_ORG)

    # go through alpha sequence with nfolds and repeat (100 times etc) cross validation on different sets of samples
    enet.rep=do.call(rbind, parallel::mclapply(seq(REPEATS), function(r){

      s <- do.call(rbind, lapply(a, function(i){
        cv <- cv.glmnet(x=x,y=y, family = "cox", nfold = nfold, type.measure = "deviance", alpha = i, standardize=F)
        data.frame(cvm = cv$cvm[cv$lambda == cv$lambda.min], lambda.min = cv$lambda.min, alpha = i)
      }))

      return(s)
    }, mc.cores=cores))

    # find mean alpha and lambda using all repeats
    search_m=do.call(rbind, lapply(a, function(alpha){
      cvm=mean(enet.rep$cvm[enet.rep$alpha==alpha])
      lambda=mean(enet.rep$lambda.min[enet.rep$alpha==alpha])
      data.frame(alpha, cvm, lambda)
    }))

    # minimum cvm error:
    cv3 <- search_m[search_m$cvm == min(search_m$cvm), ]

    print(cv3$alpha)
    print(cv3$lambda)
    print(cv3$cvm)

    # fit model with tuned alpha, use lambda sequence here, not single value
    md3 <- glmnet(x=x,y=y, family = "cox", alpha = cv3$alpha, standardize=F)

    # drop the intercept row, order by effect size, keep non-zero coefficients
    coefficients <- coef(md3, s = cv3$lambda)[-1,,drop=F]
    coefficients=coefficients[order(abs(coefficients[,1]), decreasing = T),,drop=F]
    active_coefficients <- coefficients[,1] != 0

    coefficients_all=coefficients[active_coefficients,,drop=F]

    # linear predictor (risk score) per patient from the selected features
    comprisk=DATA_ORG[,match(rownames(coefficients_all), colnames(DATA_ORG))]
    risk_patient=as.numeric(as.numeric(coefficients_all) %*% data.matrix(t(comprisk)))

    df=data.frame(x=risk_patient)
    ggplot(df, aes(x=x))+
      geom_density(color="lightblue", fill="lightblue")

    fun.kapplanMeier(time, status, CONTINUOUS=risk_patient, CONTINUOUS_SUMMARY = summary.km, MONTHS=F, PVAL=1, INDIVIDUAL_GROUPS=F, NAME = "predicted risk")

    a=list(summary(coxph(y ~ PI.train, data.frame("PI.train"=risk_patient))))
    c=list(coefficients_all)

    out=c(a,c)
    names(out)=c("PI.test", "coefficients")
    return(out)
  }else{
    print("Splitting to test and train data")
    # divide data so that there are equal proportions of outcomes with training and test set!
    nr.samples=floor(percentage*dim(DATA_ORG)[1])
    nr.events=floor(table(status)*percentage)

    # sample the held-out events (status==1); set.seed(0) makes the split reproducible
    set.seed(0)
    x <- which(status==1)
    vf <- logical(length(status==1))
    vf[x[sample.int(length(x), nr.events[2])]] <- TRUE

    # sample the held-out non-events (status==0)
    set.seed(0)
    x <- which(status==0)
    vf2 <- logical(length(status==0))
    vf2[x[sample.int(length(x), nr.events[1])]] <- TRUE

    find=vf|vf2

    # define training and testing datasets
    TRAIN=DATA_ORG[!find,]
    TEST=DATA_ORG[find,]

    ytrain=Surv(time[!find], status[!find])
    ytest=Surv(time[find], status[find])

    # this must look the same:
    log=fun.kapplanMeier(time, status, GROUPS=(rownames(DATA_ORG)%in%rownames(TRAIN)*1), MONTHS=F, PVAL=1, INDIVIDUAL_GROUPS=T, NAME = "These must be similar\nand high pval: 1 TRAIN, 0 TEST")

    formdf1 <- as.formula(paste(" ~ ", paste(colnames(TRAIN),collapse="+")))
    x=model.matrix(formdf1, data=TRAIN)

    # go through alpha sequence with nfolds and repeat (100 times etc) cross validation on different sets of samples
    enet.rep=do.call(rbind, parallel::mclapply(seq(REPEATS), function(r){

      s <- do.call(rbind, lapply(a, function(i){
        cv <- cv.glmnet(x=x,y=ytrain, family = "cox", nfold = floor((dim(DATA_ORG)[1]/10)), type.measure = "deviance", alpha = i, standardize=F)
        data.frame(cvm = cv$cvm[cv$lambda == cv$lambda.min], lambda.min = cv$lambda.min, alpha = i)
      }))

      return(s)
    }, mc.cores=cores))

    # find mean alpha and lambda using all repeats
    search_m=do.call(rbind, lapply(a, function(alpha){
      cvm=mean(enet.rep$cvm[enet.rep$alpha==alpha])
      lambda=mean(enet.rep$lambda.min[enet.rep$alpha==alpha])
      data.frame(alpha, cvm, lambda)
    }))

    # minimum cvm error:
    cv3 <- search_m[search_m$cvm == min(search_m$cvm), ]

    print(cv3$alpha)
    print(cv3$lambda)
    print(cv3$cvm)

    # fit model with tuned alpha, use lambda sequence here, not single value
    md3 <- glmnet(x=x,y=ytrain, family = "cox", alpha = cv3$alpha, standardize=F)

    coefficients <- coef(md3, s = cv3$lambda)[-1,,drop=F]
    coefficients=coefficients[order(abs(coefficients[,1]), decreasing = T),,drop=F]
    active_coefficients <- coefficients[,1] != 0

    coefficients_all=coefficients[active_coefficients,,drop=F]
    # training risk scores from the selected features
    comprisk=TRAIN[,match(rownames(coefficients_all), colnames(TRAIN))]
    risk_patient=as.numeric(as.numeric(coefficients_all) %*% data.matrix(t(comprisk)))

    # held-out (test) risk scores with the same coefficients
    comprisk=TEST[,match(rownames(coefficients_all), colnames(TEST))]
    risk_patient_test=as.numeric(as.numeric(coefficients_all) %*% data.matrix(t(comprisk)))

    df=data.frame(x=risk_patient)
    ggplot(df, aes(x=x))+
      geom_density(color="lightblue", fill="lightblue")

    df=data.frame(x=risk_patient_test)
    ggplot(df, aes(x=x))+
      geom_density(color="indianred", fill="indianred")

    fun.kapplanMeier(time[!find], status[!find], CONTINUOUS=risk_patient, CONTINUOUS_SUMMARY = summary.km, MONTHS=F, PVAL=1, INDIVIDUAL_GROUPS=F, NAME = "predicted risk")
    fun.kapplanMeier(time[find], status[find], CONTINUOUS=risk_patient_test, CONTINUOUS_SUMMARY = summary.km, MONTHS=F, PVAL=1, INDIVIDUAL_GROUPS=F, NAME = "validated risk")

    # NOTE(review): `c` is shadowed by a list here; the call c(a,b,c) below still
    # resolves to base::c because R searches for a function object in call position.
    a=list(summary(coxph(ytrain ~ PI.train, data.frame("PI.train"=risk_patient))))
    b=list(summary(coxph(ytest ~ PI.test, data.frame("PI.test"=risk_patient_test))))
    c=list(coefficients_all)

    out=c(a,b,c)
    names(out)=c("PI.train", "PI.test", "coefficients")
    return(out)
  }
}