# common_scripts/statistics/statistics_wrappers.R
# Wilcoxon rank-sum test for each gene in genelist (via TestGeneWilcox), comparing the
# sample groups in logicalVectors (optionally against logicalVector_normals);
# p-values are adjusted across genes with p.adjust.
wrapper.wilcoxtest=function(genelist, data, logicalVectors, logicalVector_normals=NULL, ALTERNATIVE="greater", adj.method="BH", CORES=1, prettynum=T){
  library(parallel)
  data=do.call(rbind, mclapply(genelist, TestGeneWilcox, data, logicalVectors, logicalVector_normals=logicalVector_normals, ALTERNATIVE, prettynum, mc.cores = CORES))
  data$adj.p=p.adjust(data$p, method = adj.method)

  data$adj.method=adj.method

  # significance code
  data$signifCode=""
  data$signifCode[as.numeric(data$adj.p)<0.1]="."
  data$signifCode[as.numeric(data$adj.p)<0.05]="*"
  data$signifCode[as.numeric(data$adj.p)<0.01]="**"
  data$signifCode[as.numeric(data$adj.p)<0.001]="***"
  data$signifCode[as.numeric(data$adj.p)<0.0001]="****"

  if(prettynum){
    data$adj.p=prettyNum(signif(data$adj.p,2))
  }

  if(adj.method=="BH"){
    colnames(data)[colnames(data)%in%"adj.p"]="FDR"
  }

  data=data[,c(1:5, 7,9, 6,8)]

  return(data)
}

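# Illustrative sketch only (not part of the original file): a hypothetical call to
# wrapper.wilcoxtest with made-up objects. It assumes TestGeneWilcox() from this
# repository is loaded; the exact expected structure of the group vectors is defined
# by TestGeneWilcox(), and all names and values below are placeholders.
# expr=matrix(rnorm(200), nrow=10, dimnames=list(paste0("GENE", 1:10), paste0("S", 1:20)))
# groups=list(AML=c(rep(TRUE,10), rep(FALSE,10)))
# normals=c(rep(FALSE,10), rep(TRUE,10))
# res=wrapper.wilcoxtest(genelist=rownames(expr), data=expr, logicalVectors=groups,
#                        logicalVector_normals=normals, ALTERNATIVE="greater", CORES=1)
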
# Run FUN_HGT_MW_TEST for the i:th cancer type in list_cancers, excluding the matching
# entry from the normal-sample vectors. Note: `data` and `profile` are expected to
# exist in the calling environment.
FUN_GOTHROUGH=function(i, genelist, list_cancers, logicalVector_normals, TEST_FAILS=F, HG_PVAL=0.001, MW_PVAL=0.001, ALTERNATIVE="greater"){
  name=names(list_cancers)[i]
  logicalVector=as.logical(unlist(list_cancers[[i]]))
  logicalVector_normals=logicalVector_normals[!names(logicalVector_normals)%in%name]

  result=FUN_HGT_MW_TEST(genelist, logicalVector, logicalVector_normals, name, TEST_FAILS=TEST_FAILS, data = data, profile = profile, HG_PVAL = HG_PVAL, MW_PVAL=MW_PVAL, ALTERNATIVE = ALTERNATIVE, CORES = 10)
  return(result)
}

# Pairwise Fisher's exact tests (via fisher.2x2) between every pair of logical vectors
# in lv.list1 and lv.list2, with multiple-testing correction.
fisher.wrapper=function(lv.list1, lv.list2, alternative="two.sided", method="bonferroni",log10=F, prettyNumbers=F,orderdata=F, cores=1){
  library(parallel)

  n1=names(lv.list1)
  n2=names(lv.list2)

  res=do.call(rbind,mclapply(seq(lv.list1), function(i){
    do.call(rbind,lapply(seq(lv.list2), function(j){
      data.frame("featureA"=n1[i], "featureB"=n2[j],fisher.2x2(lv.list1[[i]], lv.list2[[j]]))
    }))
  }, mc.cores=cores))

  res$adj.p=p.adjust(res$p, method=method)
  res$signifCode=""
  res$signifCode[as.numeric(res$adj.p)<0.1]="."
  res$signifCode[as.numeric(res$adj.p)<0.05]="*"
  res$signifCode[as.numeric(res$adj.p)<0.01]="**"
  res$signifCode[as.numeric(res$adj.p)<0.001]="***"
  res$signifCode[as.numeric(res$adj.p)<0.0001]="****"

  if(log10){
    # transform adj.p first so it is not computed from the already-transformed p
    res$adj.p=abs(log10(res$adj.p))
    res$p=abs(log10(res$p))
  }

  if(prettyNumbers){
    res$log.odds=prettyNum(signif(res$log.odds,2))
    res$p=prettyNum(signif(res$p,2))
    res$adj.p=prettyNum(signif(res$adj.p,2))
  }

  if(orderdata)res=res[order(res$featureA, as.numeric(res$adj.p), decreasing = F),]
  return(res)
}

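# Illustrative sketch only (not part of the original file): a hypothetical call to
# fisher.wrapper on two named lists of logical vectors (e.g. mutation status per sample).
# It assumes fisher.2x2() from this repository is loaded and returns a data.frame with
# at least `p` and `log.odds` columns; all names and values below are placeholders.
# mut=list(TP53=sample(c(TRUE,FALSE), 100, replace=TRUE),
#          NPM1=sample(c(TRUE,FALSE), 100, replace=TRUE))
# subtype=list(M4=sample(c(TRUE,FALSE), 100, replace=TRUE))
# fisher.res=fisher.wrapper(mut, subtype, method="BH", orderdata=TRUE, cores=1)
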
fisher.matrix.pairwise=function(E, plot=F, alternative="two.sided"){
  # perform a Fisher's exact test for each pair of genes (columns of E) and save the odds ratio and p-value
  res <- NULL
  for(i in seq(dim(E)[2])){
    for(j in seq(dim(E)[2])){
      if(i!=j){
        f <- fisher.test(E[,i], E[,j], alternative = alternative)
        res <- rbind(res,cbind(geneA=colnames(E)[i],geneB=colnames(E)[j],oddsRatio=f$estimate,pvalue=f$p.value))
      }
    }
  }
  # some formatting
  res <- as.data.frame(res)
  res$geneA <- factor(res$geneA,levels=unique(res$geneA))
  res$geneB <- factor(res$geneB,levels=unique(res$geneB))
  res$oddsRatio <- as.numeric(as.character(res$oddsRatio))

  # cap extreme odds ratios to avoid Inf/zero values in the log2 plot
  res$oddsRatio[res$oddsRatio>10]=10
  res$oddsRatio[res$oddsRatio<=0]=0.01

  res$pvalue <- as.numeric(as.character(res$pvalue))
  # correct for multiple testing using the FDR (Benjamini-Hochberg)
  res <- cbind(res,fdr=p.adjust(res$pvalue,"fdr"))
  # convert the FDR into star labels for plotting
  res$stars <- cut(res$fdr, breaks=c(-Inf, 0.001, 0.01, 0.05, Inf), labels=c("***", "**", "*", ""))

  if(plot){
    # plot with ggplot2
    require(ggplot2)
    require(cowplot) # not necessary, but the plot theme is nicer
    p <- ggplot(res, aes(geneA, geneB)) + geom_tile(aes(fill = log2(oddsRatio+0.01)),colour = "white") + scale_fill_gradient2(midpoint=1) + geom_text(aes(label=stars), color="black", size=5)
    return(p)
  }

  return(res)
}

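# Illustrative sketch only (not part of the original file): calling fisher.matrix.pairwise
# on a small simulated binary alteration matrix (samples x genes); names are placeholders.
# set.seed(1)
# E=data.frame(TP53=rbinom(50,1,0.3)==1, FLT3=rbinom(50,1,0.3)==1, NPM1=rbinom(50,1,0.3)==1)
# pairwise.res=fisher.matrix.pairwise(E)                 # data.frame of odds ratios, p-values, FDR
# pairwise.plot=fisher.matrix.pairwise(E, plot=TRUE)     # ggplot tile plot with significance stars
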
# Cox proportional hazards models for the i:th sample group in logicalv: either one
# univariate model per feature (column of DATA) or a single multivariate model.
fun.get.cox=function(i, logicalv, DATA, univariate=T, pretty=T, TIME, STATUS){
  library(survival)
  logicalvector=logicalv[[i]]
  n=names(logicalv)[i]

  # remove features that are all NA within this sample subset
  DATA=DATA[,!apply(DATA[logicalvector,,drop=F], 2, function(v)all(is.na(v)))]

  if(univariate){
    univ=do.call(rbind, lapply(seq(dim(DATA)[2]), function(j){
      y=Surv(TIME[logicalvector], STATUS[logicalvector])
      fit=coxph(formula = as.formula(paste("y ~ ", paste(colnames(DATA)[j], collapse="+"))), DATA[logicalvector,,drop=F])

      b=cox.zph(fit)$table[1,"p"] # proportional hazards assumption p-value

      a=summary(fit)

      pval=a$coefficients[1,5]
      coef=a$conf.int[c(1,3,4)]
      v=data.frame(colnames(DATA)[j], t(coef), pval,"","", a$concordance[1], b<0.05, stringsAsFactors = F)
      rownames(v)=NULL
      return(v)
    }))

    # adjust P-values for the number of features tested:
    univ[,6]=p.adjust(univ[,5], method="BH")

  }else{
    y=Surv(TIME[logicalvector], STATUS[logicalvector])
    fit=coxph(formula = as.formula(paste("y ~ ", paste(colnames(DATA), collapse="+"))), DATA[logicalvector,])

    b=cox.zph(fit)$table[,"p"] # proportional hazards assumption p-values
    b=b[!names(b)%in%"GLOBAL"]

    a=summary(fit)

    pval=a$coefficients[,5]
    coef=a$conf.int[,c(1,3,4)]
    univ=data.frame(rownames(a$coefficients), coef, pval, "", "", a$concordance[1], b<0.05, stringsAsFactors = F)
    rownames(univ)=NULL

    # no need to adjust P-values here, a single multivariate model was fitted:
    univ[,6]=univ[,5]
  }

  univ=univ[order(univ[,5]),]

  univ[,7][as.numeric(univ[,6])<0.1]="."
  univ[,7][as.numeric(univ[,6])<0.05]="*"
  univ[,7][as.numeric(univ[,6])<0.01]="**"
  univ[,7][as.numeric(univ[,6])<0.001]="***"
  univ[,7][as.numeric(univ[,6])<0.0001]="****"
  univ[,7][is.na(univ[,7])]=""

  val=data.frame(univ, n, stringsAsFactors = F)
  colnames(val)=c("Feature", "exp(coef)", "lower .95", "upper .95", "P", "Adj.P", "Signif", "concordance", "zph P < 0.05", "Name")

  if(pretty){
    for(k in c(2:6,8)){
      val[,k]=prettyNum(signif(val[,k],2))
    }
  }

  return(val)
}

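# Illustrative sketch only (not part of the original file): a hypothetical call to
# fun.get.cox on simulated survival data; all object names and values are placeholders.
# library(survival)
# set.seed(1)
# surv.time=rexp(80); surv.status=rbinom(80,1,0.6)
# feat=data.frame(GENE1=rnorm(80), GENE2=rnorm(80))
# groups=list(AML=rep(TRUE,80))
# cox.tab=fun.get.cox(1, logicalv=groups, DATA=feat, univariate=TRUE,
#                     TIME=surv.time, STATUS=surv.status)
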
# Elastic net Cox regression (glmnet) with repeated cross-validation over an alpha grid;
# either fits on the full data (percentage=0) or splits into training and test sets and
# validates the prognostic index (PI) on the held-out samples.
fun.cox.elasticnet=function(DATA_ORG, time, status, cores=8, nfold=10,min.elnet=0.01,max.elnet=0.99, summary.km="85th_15th_percentile", percentage=0.25, REPEATS=100){
  library(glmnet)
  library(survival)
  library(ggplot2)

  # alpha grid for the regularized regression
  a <- seq(min.elnet, max.elnet, 0.05)

  if(percentage==0){
    y=Surv(time, status)

    formdf1 <- as.formula(paste(" ~ ", paste(colnames(DATA_ORG),collapse="+")))
    x=model.matrix(formdf1, data=DATA_ORG)

    # for each alpha, run nfold cross-validation; repeat REPEATS times on different fold splits
    enet.rep=do.call(rbind, parallel::mclapply(seq(REPEATS), function(r){

      s <- do.call(rbind, lapply(a, function(i){
        cv <- cv.glmnet(x=x,y=y, family = "cox", nfold = nfold, type.measure = "deviance", alpha = i, standardize=F)
        data.frame(cvm = cv$cvm[cv$lambda == cv$lambda.min], lambda.min = cv$lambda.min, alpha = i)
      }))

      return(s)
    }, mc.cores=cores))

    # average the cross-validation error and lambda over all repeats, per alpha
    search_m=do.call(rbind, lapply(a, function(alpha){
      cvm=mean(enet.rep$cvm[enet.rep$alpha==alpha])
      lambda=mean(enet.rep$lambda.min[enet.rep$alpha==alpha])
      data.frame(alpha, cvm, lambda)
    }))

    # alpha/lambda with the minimum mean cross-validation error:
    cv3 <- search_m[search_m$cvm == min(search_m$cvm), ]

    print(cv3$alpha)
    print(cv3$lambda)
    print(cv3$cvm)

    # fit the model with the tuned alpha (glmnet computes its own lambda sequence)
    md3 <- glmnet(x=x,y=y, family = "cox", alpha = cv3$alpha, standardize=F)

    coefficients <- coef(md3, s = cv3$lambda)[-1,,drop=F]
    coefficients=coefficients[order(abs(coefficients[,1]), decreasing = T),,drop=F]
    active_coefficients <- coefficients[,1] != 0

    coefficients_all=coefficients[active_coefficients,,drop=F]

    # prognostic index (predicted risk) per patient from the active coefficients
    comprisk=DATA_ORG[,match(rownames(coefficients_all), colnames(DATA_ORG))]
    risk_patient=as.numeric(as.numeric(coefficients_all) %*% data.matrix(t(comprisk)))

    df=data.frame(x=risk_patient)
    print(ggplot(df, aes(x=x))+
      geom_density(color="lightblue", fill="lightblue"))

    fun.kapplanMeier(time, status, CONTINUOUS=risk_patient, CONTINUOUS_SUMMARY = summary.km, MONTHS=F, PVAL=1, INDIVIDUAL_GROUPS=F, NAME = "predicted risk")

    a=list(summary(coxph(y ~ PI.train, data.frame("PI.train"=risk_patient))))
    c=list(coefficients_all)

    out=c(a,c)
    names(out)=c("PI.test", "coefficients")
    return(out)
  }else{
    print("Splitting to test and train data")
    # split so that events and non-events are represented in the same proportions in the test set
    nr.samples=floor(percentage*dim(DATA_ORG)[1])
    nr.events=floor(table(status)*percentage)

    set.seed(0)
    x <- which(status==1)
    vf <- logical(length(status))
    vf[x[sample.int(length(x), nr.events[2])]] <- TRUE

    set.seed(0)
    x <- which(status==0)
    vf2 <- logical(length(status))
    vf2[x[sample.int(length(x), nr.events[1])]] <- TRUE

    find=vf|vf2

    # define training and test datasets
    TRAIN=DATA_ORG[!find,]
    TEST=DATA_ORG[find,]

    ytrain=Surv(time[!find], status[!find])
    ytest=Surv(time[find], status[find])

    # sanity check: survival of the training and test sets should look similar (high p-value)
    log=fun.kapplanMeier(time, status, GROUPS=(rownames(DATA_ORG)%in%rownames(TRAIN)*1), MONTHS=F, PVAL=1, INDIVIDUAL_GROUPS=T, NAME = "These must be similar\nand high pval: 1 TRAIN, 0 TEST")

    formdf1 <- as.formula(paste(" ~ ", paste(colnames(TRAIN),collapse="+")))
    x=model.matrix(formdf1, data=TRAIN)

    # for each alpha, run cross-validation on the training set; repeat REPEATS times on different fold splits
    enet.rep=do.call(rbind, parallel::mclapply(seq(REPEATS), function(r){

      s <- do.call(rbind, lapply(a, function(i){
        cv <- cv.glmnet(x=x,y=ytrain, family = "cox", nfold = floor((dim(DATA_ORG)[1]/10)), type.measure = "deviance", alpha = i, standardize=F)
        data.frame(cvm = cv$cvm[cv$lambda == cv$lambda.min], lambda.min = cv$lambda.min, alpha = i)
      }))

      return(s)
    }, mc.cores=cores))

    # average the cross-validation error and lambda over all repeats, per alpha
    search_m=do.call(rbind, lapply(a, function(alpha){
      cvm=mean(enet.rep$cvm[enet.rep$alpha==alpha])
      lambda=mean(enet.rep$lambda.min[enet.rep$alpha==alpha])
      data.frame(alpha, cvm, lambda)
    }))

    # alpha/lambda with the minimum mean cross-validation error:
    cv3 <- search_m[search_m$cvm == min(search_m$cvm), ]

    print(cv3$alpha)
    print(cv3$lambda)
    print(cv3$cvm)

    # fit the model with the tuned alpha (glmnet computes its own lambda sequence)
    md3 <- glmnet(x=x,y=ytrain, family = "cox", alpha = cv3$alpha, standardize=F)

    coefficients <- coef(md3, s = cv3$lambda)[-1,,drop=F]
    coefficients=coefficients[order(abs(coefficients[,1]), decreasing = T),,drop=F]
    active_coefficients <- coefficients[,1] != 0

    coefficients_all=coefficients[active_coefficients,,drop=F]

    # prognostic index per patient in the training and test sets
    comprisk=TRAIN[,match(rownames(coefficients_all), colnames(TRAIN))]
    risk_patient=as.numeric(as.numeric(coefficients_all) %*% data.matrix(t(comprisk)))

    comprisk=TEST[,match(rownames(coefficients_all), colnames(TEST))]
    risk_patient_test=as.numeric(as.numeric(coefficients_all) %*% data.matrix(t(comprisk)))

    df=data.frame(x=risk_patient)
    print(ggplot(df, aes(x=x))+
      geom_density(color="lightblue", fill="lightblue"))

    df=data.frame(x=risk_patient_test)
    print(ggplot(df, aes(x=x))+
      geom_density(color="indianred", fill="indianred"))

    fun.kapplanMeier(time[!find], status[!find], CONTINUOUS=risk_patient, CONTINUOUS_SUMMARY = summary.km, MONTHS=F, PVAL=1, INDIVIDUAL_GROUPS=F, NAME = "predicted risk")
    fun.kapplanMeier(time[find], status[find], CONTINUOUS=risk_patient_test, CONTINUOUS_SUMMARY = summary.km, MONTHS=F, PVAL=1, INDIVIDUAL_GROUPS=F, NAME = "validated risk")

    a=list(summary(coxph(ytrain ~ PI.train, data.frame("PI.train"=risk_patient))))
    b=list(summary(coxph(ytest ~ PI.test, data.frame("PI.test"=risk_patient_test))))
    c=list(coefficients_all)

    out=c(a,b,c)
    names(out)=c("PI.train", "PI.test", "coefficients")
    return(out)
  }
}
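
# Illustrative sketch only (not part of the original file): a hypothetical call to
# fun.cox.elasticnet. It assumes fun.kapplanMeier() from this repository is loaded;
# the data below is simulated and all names and values are placeholders.
# set.seed(1)
# expr=as.data.frame(matrix(rnorm(100*20), nrow=100, dimnames=list(paste0("P",1:100), paste0("GENE",1:20))))
# os.time=rexp(100); os.status=rbinom(100,1,0.6)
# enet.res=fun.cox.elasticnet(expr, os.time, os.status, cores=2, percentage=0.25, REPEATS=10)
# enet.res$coefficients   # non-zero elastic net coefficients
# enet.res$PI.test        # Cox model of the prognostic index in the held-out test set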