wrapper.wilcoxtest=function(genelist, data, logicalVectors, logicalVector_normals=NULL, ALTERNATIVE="greater", adj.method="BH", CORES=1, prettynum=T){
library(parallel)
data=do.call(rbind, mclapply(genelist, TestGeneWilcox, data, logicalVectors, logicalVector_normals=logicalVector_normals, ALTERNATIVE, prettynum, mc.cores = CORES))
data$adj.p=p.adjust(data$p, method = adj.method)
data$adj.method=adj.method
# signifcode
data$signifCode=""
data$signifCode[as.numeric(data$adj.p)<0.1]="."
data$signifCode[as.numeric(data$adj.p)<0.05]="*"
data$signifCode[as.numeric(data$adj.p)<0.01]="**"
data$signifCode[as.numeric(data$adj.p)<0.001]="***"
data$signifCode[as.numeric(data$adj.p)<0.0001]="****"
if(prettynum){
data$adj.p=prettyNum(signif(data$adj.p,2))
}
if(adj.method=="BH"){
colnames(data)[colnames(data)%in%"adj.p"]="FDR"
}
data=data[,c(1:5, 7,9, 6,8)]
return(data)
}
FUN_GOTHROUGH=function(i, genelist, list_cancers, logicalVector_normals, TEST_FAILS=F, HG_PVAL=0.001, MW_PVAL=0.001, ALTERNATIVE="greater"){
name=names(list_cancers)[i]
logicalVector=as.logical(unlist(list_cancers[[i]]))
logicalVector_normals=logicalVector_normals[!names(logicalVector_normals)%in%name]
result=FUN_HGT_MW_TEST(genelist, logicalVector,logicalVector_normals, name, TEST_FAILS=F, data = data, profile = profile,HG_PVAL = HG_PVAL, MW_PVAL=MW_PVAL, ALTERNATIVE = ALTERNATIVE, CORES = 10)
}
fisher.wrapper=function(lv.list1, lv.list2, alternative="two.sided", method="bonferroni",log10=F, prettyNumbers=F,orderdata=F, cores=1){
n1=names(lv.list1)
n2=names(lv.list2)
res=do.call(rbind,mclapply(seq(lv.list1), function(i){
do.call(rbind,lapply(seq(lv.list2), function(j){
data.frame("featureA"=n1[i], "featureB"=n2[j],fisher.2x2(lv.list1[[i]], lv.list2[[j]]))
}))
}, mc.cores=cores))
res$adj.p=p.adjust(res$p, method=method)
res$signifCode=""
res$signifCode[as.numeric(res$adj.p)<0.1]="."
res$signifCode[as.numeric(res$adj.p)<0.05]="*"
res$signifCode[as.numeric(res$adj.p)<0.01]="**"
res$signifCode[as.numeric(res$adj.p)<0.001]="***"
res$signifCode[as.numeric(res$adj.p)<0.0001]="****"
if(log10){
res$p=abs(log10(res$p))
res$adj.p=abs(log10(res$p))
}
if(prettyNumbers){
res$log.odds=prettyNum(signif(res$log.odds,2))
res$p=prettyNum(signif(res$p,2))
res$adj.p=prettyNum(signif(res$adj.p,2))
}
if(orderdata)res=res[order(res$featureA, as.numeric(res$adj.p), decreasing = F),]
return(res)
}
fisher.matrix.pairwise=function(E, plot=F, alternative="two.sided"){
# perform a fisher exact test for each pairs of gene and saves the oddsRatio and p-value
res <- NULL
for(i in seq(dim(E)[2])){
for(j in seq(dim(E)[2])){
if(i!=j){
f <- fisher.test(E[,i], E[,j], alternative = alternative)
res <- rbind(res,cbind(geneA=colnames(E)[i],geneB=colnames(E)[j],oddsRatio=f$estimate,pvalue=f$p.value))
}
}
}
# some formatting
res <- as.data.frame(res)
res$geneA <- factor(res$geneA,levels=unique(res$geneA))
res$geneB <- factor(res$geneB,levels=unique(res$geneB))
res$oddsRatio <- as.numeric(as.character(res$oddsRatio))
# avoid inf values
res$oddsRatio[res$oddsRatio>10]=10
res$oddsRatio[res$oddsRatio<=0]=0.01
res$pvalue <- as.numeric(as.character(res$pvalue))
# use p.adjust to correct for multi testing using a FDR
res <- cbind(res,fdr=p.adjust(res$pvalue,"fdr"))
# change the FDR in labels for plotting
res$stars <- cut(res$fdr, breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), label=c("***", "**", "*", ""))
if(plot){
# plot with ggplot 2
require(ggplot2)
require(cowplot) # not necessary but the plot is nicer
p <- ggplot(res, aes(geneA, geneB)) + geom_tile(aes(fill = log2(oddsRatio+0.01)),colour = "white") + scale_fill_gradient2(midpoint=1) + geom_text(aes(label=stars), color="black", size=5)
p
return(p)
}
return(res)
}
fun.get.cox=function(i, logicalv, DATA, univariate=T, pretty=T, TIME, STATUS){
logicalvector=logicalv[[i]]
n=names(logicalv)[i]
# remove if all are NA
DATA=DATA[,!apply(DATA[logicalvector,,drop=F], 2, function(v)all(is.na(v)))]
if(univariate){
univ=do.call(rbind, lapply(seq(dim(DATA)[2]), function(j){
y=Surv(TIME[logicalvector], STATUS[logicalvector])
fit=coxph(formula = as.formula(paste("y ~ ", paste(colnames(DATA)[j], collapse="+"))), DATA[logicalvector,,drop=F])
b=cox.zph(fit)[[1]][3] # p.value
a=summary(fit)
pval=a$coefficients[1,5]
coef=a$conf.int[c(1,3,4)]
v=data.frame(colnames(DATA)[j], t(coef), pval,"","", a$concordance[1], b<0.05, stringsAsFactors = F)
rownames(v)=NULL
return(v)
}))
# adjust P-value depending on number of genes tested:
univ[,6]=p.adjust(univ[,5], method="BH")
}else{
y=Surv(TIME[logicalvector], STATUS[logicalvector])
fit=coxph(formula = as.formula(paste("y ~ ", paste(colnames(DATA), collapse="+"))), DATA[logicalvector,])
b=cox.zph(fit)$table[,3] # p.value
b=b[!names(b)%in%"GLOBAL"]
a=summary(fit)
pval=a$coefficients[,5]
coef=a$conf.int[,c(1,3,4)]
univ=data.frame(rownames(a$coefficients), coef, pval, "", "", a$concordance[1], b<0.05, stringsAsFactors = F)
rownames(univ)=NULL
# no need to adjust P-value here:
univ[,6]=univ[,5]
}
univ=univ[order(univ[,5]),]
univ[,7][as.numeric(univ[,6 ])<0.1]="."
univ[,7][as.numeric(univ[,6 ])<0.05]="*"
univ[,7][as.numeric(univ[,6 ])<0.01]="**"
univ[,7][as.numeric(univ[,6 ])<0.001]="***"
univ[,7][as.numeric(univ[,6 ])<0.0001]="****"
univ[,7][is.na(univ[,7])]=""
val=data.frame(univ, n, stringsAsFactors = F)
colnames(val)=c("Feature", "exp(coef)", "lower .95", "upper .95", "P", "Adj.P", "Signif", "concordance", "zph P < 0.05", "Name")
if(pretty){
for(i in c(2:6,8)){
val[,i]=prettyNum(signif(val[,i],2))
}
}
return(val)
}
fun.cox.elasticnet=function(DATA_ORG, time, status, cores=8, nfold=10,min.elnet=0.01,max.elnet=0.99, summary.km="85th_15th_percentile", percentage=0.25, REPEATS=100){
library(glmnet)
# run regularized regression
a <- seq(min.elnet, max.elnet, 0.05)
if(percentage==0){
y=Surv(time, status)
formdf1 <- as.formula(paste(" ~ ", paste(colnames(DATA_ORG),collapse="+")))
x=model.matrix(formdf1, data=DATA_ORG)
# go through alpha sequence with nfolds and repeat (100 times etc) cross validation on different sets of samples
enet.rep=do.call(rbind, parallel::mclapply(seq(REPEATS), function(r){
s <- do.call(rbind, lapply(a, function(i){
cv <- cv.glmnet(x=x,y=y, family = "cox", nfold = nfold, type.measure = "deviance", alpha = i, standardize=F)
data.frame(cvm = cv$cvm[cv$lambda == cv$lambda.min], lambda.min = cv$lambda.min, alpha = i)
}))
return(s)
}, mc.cores=cores))
# find mean alpha and lambda using all repeats
search_m=do.call(rbind, lapply(a, function(alpha){
cvm=mean(enet.rep$cvm[enet.rep$alpha==alpha])
lambda=mean(enet.rep$lambda.min[enet.rep$alpha==alpha])
data.frame(alpha, cvm, lambda)
}))
# minimum cvm error:
cv3 <- search_m[search_m$cvm == min(search_m$cvm), ]
print(cv3$alpha)
print(cv3$lambda)
print(cv3$cvm)
# fit model with tuned alpha, use lambda sequence here, not single value
md3 <- glmnet(x=x,y=y, family = "cox", alpha = cv3$alpha, standardize=F)
coefficients <- coef(md3, s = cv3$lambda)[-1,,drop=F]
coefficients=coefficients[order(abs(coefficients[,1]), decreasing = T),,drop=F]
active_coefficients <- coefficients[,1] != 0
coefficients_all=coefficients[active_coefficients,,drop=F]
comprisk=DATA_ORG[,match(rownames(coefficients_all), colnames(DATA_ORG))]
risk_patient=as.numeric(as.numeric(coefficients_all) %*% data.matrix(t(comprisk)))
df=data.frame(x=risk_patient)
ggplot(df, aes(x=x))+
geom_density(color="lightblue", fill="lightblue")
fun.kapplanMeier(time, status, CONTINUOUS=risk_patient, CONTINUOUS_SUMMARY = summary.km, MONTHS=F, PVAL=1, INDIVIDUAL_GROUPS=F, NAME = "predicted risk")
a=list(summary(coxph(y ~ PI.train, data.frame("PI.train"=risk_patient))))
c=list(coefficients_all)
out=c(a,c)
names(out)=c("PI.test", "coefficients")
return(out)
}else{
print("Splitting to test and train data")
# divide data so that there are equal proportions of outcomes with training and test set!
nr.samples=floor(percentage*dim(DATA_ORG)[1])
nr.events=floor(table(status)*percentage)
set.seed(0)
x <- which(status==1)
vf <- logical(length(status==1))
vf[x[sample.int(length(x), nr.events[2])]] <- TRUE
set.seed(0)
x <- which(status==0)
vf2 <- logical(length(status==0))
vf2[x[sample.int(length(x), nr.events[1])]] <- TRUE
find=vf|vf2
# define training and testing datasets
TRAIN=DATA_ORG[!find,]
TEST=DATA_ORG[find,]
ytrain=Surv(time[!find], status[!find])
ytest=Surv(time[find], status[find])
# this must look the same:
log=fun.kapplanMeier(time, status, GROUPS=(rownames(DATA_ORG)%in%rownames(TRAIN)*1), MONTHS=F, PVAL=1, INDIVIDUAL_GROUPS=T, NAME = "These must be similar\nand high pval: 1 TRAIN, 0 TEST")
formdf1 <- as.formula(paste(" ~ ", paste(colnames(TRAIN),collapse="+")))
x=model.matrix(formdf1, data=TRAIN)
# go through alpha sequence with nfolds and repeat (100 times etc) cross validation on different sets of samples
enet.rep=do.call(rbind, parallel::mclapply(seq(REPEATS), function(r){
s <- do.call(rbind, lapply(a, function(i){
cv <- cv.glmnet(x=x,y=ytrain, family = "cox", nfold = floor((dim(DATA_ORG)[1]/10)), type.measure = "deviance", alpha = i, standardize=F)
data.frame(cvm = cv$cvm[cv$lambda == cv$lambda.min], lambda.min = cv$lambda.min, alpha = i)
}))
return(s)
}, mc.cores=cores))
# find mean alpha and lambda using all repeats
search_m=do.call(rbind, lapply(a, function(alpha){
cvm=mean(enet.rep$cvm[enet.rep$alpha==alpha])
lambda=mean(enet.rep$lambda.min[enet.rep$alpha==alpha])
data.frame(alpha, cvm, lambda)
}))
# minimum cvm error:
cv3 <- search_m[search_m$cvm == min(search_m$cvm), ]
print(cv3$alpha)
print(cv3$lambda)
print(cv3$cvm)
# fit model with tuned alpha, use lambda sequence here, not single value
md3 <- glmnet(x=x,y=ytrain, family = "cox", alpha = cv3$alpha, standardize=F)
coefficients <- coef(md3, s = cv3$lambda)[-1,,drop=F]
coefficients=coefficients[order(abs(coefficients[,1]), decreasing = T),,drop=F]
active_coefficients <- coefficients[,1] != 0
coefficients_all=coefficients[active_coefficients,,drop=F]
comprisk=TRAIN[,match(rownames(coefficients_all), colnames(TRAIN))]
risk_patient=as.numeric(as.numeric(coefficients_all) %*% data.matrix(t(comprisk)))
comprisk=TEST[,match(rownames(coefficients_all), colnames(TEST))]
risk_patient_test=as.numeric(as.numeric(coefficients_all) %*% data.matrix(t(comprisk)))
df=data.frame(x=risk_patient)
ggplot(df, aes(x=x))+
geom_density(color="lightblue", fill="lightblue")
df=data.frame(x=risk_patient_test)
ggplot(df, aes(x=x))+
geom_density(color="indianred", fill="indianred")
fun.kapplanMeier(time[!find], status[!find], CONTINUOUS=risk_patient, CONTINUOUS_SUMMARY = summary.km, MONTHS=F, PVAL=1, INDIVIDUAL_GROUPS=F, NAME = "predicted risk")
fun.kapplanMeier(time[find], status[find], CONTINUOUS=risk_patient_test, CONTINUOUS_SUMMARY = summary.km, MONTHS=F, PVAL=1, INDIVIDUAL_GROUPS=F, NAME = "validated risk")
a=list(summary(coxph(ytrain ~ PI.train, data.frame("PI.train"=risk_patient))))
b=list(summary(coxph(ytest ~ PI.test, data.frame("PI.test"=risk_patient_test))))
c=list(coefficients_all)
out=c(a,b,c)
names(out)=c("PI.train", "PI.test", "coefficients")
return(out)
}
}