Switch to unified view

a b/common_scripts/statistics/functions_statistics.R
1
fisher.2x2=function(lv1, lv2, alternative="two.sided", usefast=T){
2
  cont=table(lv1, lv2)
3
  cont[is.na(cont)]=0
4
  
5
  if(usefast){
6
    p=HighSpeedStats::ultrafastfet(cont[1,1], cont[1,2], cont[2,1], cont[2,2])
7
    odds=NA
8
  }
9
  
10
  else{
11
    ft=fisher.test(cont, alternative=alternative)
12
    p=ft$p.value
13
    odds=ft$estimate
14
  }
15
  
16
  data.frame("p"=p, "log.odds"=odds, stringsAsFactors = F)
17
}
18
19
TestGeneWilcox=function(gene, datam, logicalVectors, logicalVector_normals=NULL, ALTERNATIVE="greater", prettynum=T){
20
  if(!gene%in%rownames(datam)) return(NULL)
21
  D = as.numeric(datam[rownames(datam)%in%gene,])
22
  
23
  d_list=lapply(logicalVectors, function(v){
24
    D[v][!is.na(D[v])]
25
  })
26
  
27
  # in this case, compute against rest of the samples for the logicalV
28
  if(is.null(logicalVector_normals)){
29
    logicalVector_normals=lapply(logicalVectors, '!') # negate
30
    names(logicalVector_normals)=rep("Rest", length(logicalVector_normals))
31
    
32
    d_list2=lapply(logicalVector_normals, function(v){
33
      D[v][!is.na(D[v])]
34
    })
35
    
36
    stats=do.call(rbind, lapply(seq(d_list), function(i, d_list, d_list2){
37
      l=d_list[[i]]
38
      name1=names(d_list)[i]
39
      name2=names(d_list2)[i]
40
      
41
      P.value=sapply(seq(d_list2)[i], function(j, l, d_list2){
42
        l2=d_list2[[j]]
43
        signif(wilcox.test(l, l2, ALTERNATIVE)$p.value,2)
44
      }, l, d_list2)
45
      
46
      FC=sapply(d_list2[i], function(l2, l1){
47
        2^(mean(l)-mean(l2))
48
      })
49
      
50
      df=data.frame("Gene"=gene, "Group1"=name1, "Group2"=name2, FC, "p"=P.value, "alternative"=ALTERNATIVE, stringsAsFactors=F)
51
      rownames(df)=NULL
52
      df=df[!df$Group1==df$Group2,]
53
      
54
      if(prettynum){
55
        df[,4]=prettyNum(signif(df[,4], 2))
56
        df[,5]=prettyNum(signif(df[,5], 2))
57
      }
58
      
59
      return(df)
60
    },d_list, d_list2))
61
    return(stats)
62
  }
63
  
64
  d_list2=lapply(logicalVector_normals, function(v){
65
    D[v][!is.na(D[v])]
66
  })
67
  
68
  stats=do.call(rbind, lapply(seq(d_list), function(i, d_list, d_list2){
69
    l=d_list[[i]]
70
    name1=names(d_list)[i]
71
    name2=names(d_list2)
72
    
73
    P.value=sapply(seq(d_list2), function(j, l, d_list2){
74
      l2=d_list2[[j]]
75
      signif(wilcox.test(l, l2, ALTERNATIVE)$p.value,2)
76
    }, l, d_list2)
77
    
78
    FC=sapply(d_list2, function(l2, l1){
79
      2^(mean(l)-mean(l2))
80
    })
81
    
82
    df=data.frame("Gene"=gene, "Group1"=name1, "Group2"=name2, FC, "p"=P.value, "alternative"=ALTERNATIVE, stringsAsFactors=F)
83
    rownames(df)=NULL
84
    df=df[!df$Group1==df$Group2,]
85
    
86
    if(prettynum){
87
      df[,4]=prettyNum(signif(df[,4], 2))
88
      df[,5]=prettyNum(signif(df[,5], 2))
89
    }
90
    
91
    return(df)
92
  },d_list, d_list2))
93
  
94
  return(stats)
95
}
96
97
hgt.enrichment=function(genes, logicalVector, profile = profile, adjust.method="bonferroni") {
98
  
99
  # Initialize stuff
100
  genes = genes[genes%in%colnames(profile)]
101
  P = profile[,colnames(profile)%in%genes]
102
103
  R = as.data.frame(matrix(NA, nrow = length(genes), ncol = 3))
104
  colnames(R) = c("gene", "pvalue", "adj.pvalue")
105
  R[,1] = genes
106
  
107
  
108
  #************ Hyperg test, disease association ***************
109
  FUN=function(j){
110
    r = genes[j]
111
    k = sum(logicalVector)
112
    x = P[,colnames(P)==r]
113
    m = sum(x)
114
    n = length(x) - m
115
    q = sum(x[logicalVector])
116
    phyper(q-1,m,n,k, lower.tail = F)
117
  }
118
  
119
  R[,2]=unlist(mclapply(1:nrow(R), FUN, mc.cores=CORES))
120
  
121
  R[,3] = p.adjust(R[,2], adjust.method)
122
  
123
  # log10 transformation and filtering
124
  R = R[order(R$adj.pvalue),]
125
  R$pvalue = round(abs(log10(R$pvalue)),1)
126
  R$adj.pvalue = round(abs(log10(R$adj.pvalue)),1)
127
  return(R)
128
}
129
130
fisher.matrix.lv=function(logicalVector.list, data, features=NULL, fisher.alternative="greater", adjust.method=NULL, to.log10=T, cores=5){
131
  if(!is.null(features))data=data[rownames(data)%in%features,]
132
  
133
  m=do.call(cbind, mclapply(logicalVector.list, function(lv){
134
    
135
    featsA=lv
136
    featsB=as.list(as.data.frame(t(data)))
137
    
138
    grid=data.frame(do.call(rbind, mclapply(seq(featsB), function(i)c(table(featsA, featsB[[i]])), mc.cores=1)))
139
    colnames(grid)=c("a", "b", "c", "d")
140
    
141
    if(fisher.alternative=="two.sided"){
142
      p.ft <- with(grid, HighSpeedStats::ultrafastfet(a, b, c, d))
143
144
    }else{
145
      p.ft <- with(grid, mapply(ft.alt, a, b, c, d, fisher.alternative))
146
    }  
147
  }, mc.cores=5))
148
  
149
  rownames(m)=rownames(data)
150
  colnames(m)=names(logicalVector.list)
151
  
152
  # order data;
153
  m=m[do.call(order, as.data.frame(m)),]
154
  
155
  if(to.log10)m=round(abs(log10(m)),1)
156
  
157
  return(m)
158
}
159
160
FUN_HGT_MW_TEST=function(genes, logicalVector,logicalVector_normals, name, TEST_FAILS, ALTERNATIVE="greater", data = data, profile = profile, CORES=3, HG_PVAL=0.001, MW_PVAL=0.001, P.ADJUST.METH="bonferroni") {
161
  
162
  # transpose if needed
163
  if(any(grepl("TP53", rownames(data))))data=t(data)
164
  if(any(grepl("TP53", rownames(profile))))profile=t(profile)
165
  
166
  # Check some prerequisites
167
  if(nrow(data)!=nrow(profile)) stop("data and profile dimension mismatch")
168
  if(length(logicalVector)!=nrow(data)) stop("logicalVector dimension mismatch.")
169
  
170
  # Initialize stuff
171
  genes = genes[genes%in%colnames(data)]
172
  P = profile[,colnames(profile)%in%genes]
173
  D = data[,colnames(data)%in%genes]
174
  
175
  R = as.data.frame(matrix(NA, nrow = length(genes), ncol = 3))
176
  colnames(R) = c("gene", "pvalue", "adj.pvalue")
177
  R[,1] = genes
178
  
179
  
180
  #************ Hyperg test, disease association ***************
181
  FUN=function(j){
182
    r = genes[j]
183
    k = sum(logicalVector)
184
    x = P[,colnames(P)==r]
185
    m = sum(x)
186
    n = length(x) - m
187
    q = sum(x[logicalVector])
188
    phyper(q-1,m,n,k, lower.tail = F)
189
  }
190
  
191
  R[,2]=unlist(mclapply(1:nrow(R), FUN, mc.cores=CORES))
192
  
193
  R[,3] = p.adjust(R[,2], P.ADJUST.METH)
194
  
195
  # log10 transformation and filtering
196
  R = R[order(R$adj.pvalue),]
197
  R = R[R$adj.pvalue < HG_PVAL,] # FILTER
198
  R$pvalue = round(abs(log10(R$pvalue)),1)
199
  R$adj.pvalue = round(abs(log10(R$adj.pvalue)),1)
200
  
201
  #******************************************************************
202
  
203
  if(dim(R)[1]>0){
204
    
205
    # Mann-whitney, higher expression in normal
206
    MW = do.call(rbind, mclapply(1:dim(R)[1],function(i) {
207
      x=R[i,]
208
      gene = as.character(x[1])
209
      S = as.numeric(D[,colnames(D)==gene])
210
      logicalVector2=as.numeric(P[,colnames(D)==gene])
211
      
212
      # choose
213
      S_D = S[logicalVector&logicalVector2]
214
      S_D = S[logicalVector]
215
      
216
      ttp=sapply(logicalVector_normals, function(lv){
217
        S_normal = D[lv,colnames(D)==gene]
218
        tt = wilcox.test(S_D, S_normal,alternative=ALTERNATIVE)
219
        return(tt$p.value)
220
      })
221
      
222
      ttp = p.adjust(ttp, P.ADJUST.METH)
223
    }))
224
    
225
    testsig=MW>MW_PVAL
226
    
227
    name_normals=names(logicalVector_normals)
228
    
229
    res=do.call(cbind, lapply(seq(name_normals), function(i){
230
      ifelse(testsig[,i], name_normals[i], "")
231
    }))
232
    
233
    fails=apply(res, 1, function(v)paste(v[!v==""], collapse=","))
234
    
235
    # also compute fold change against all normals
236
    options("scipen"=100, "digits"=3)
237
    FC = do.call(rbind, mclapply(1:dim(R)[1],function(i) {
238
      x=R[i,]
239
      gene = as.character(x[1])
240
      S = as.numeric(D[,colnames(D)==gene])
241
      logicalVector2=as.numeric(P[,colnames(D)==gene])
242
      
243
      # choose
244
      S_D = S[logicalVector&logicalVector2]
245
      S_D = S[logicalVector]
246
      
247
      fc=sapply(logicalVector_normals, function(lv){
248
        S_normal = D[lv,colnames(D)==gene]
249
        FC=mean(S_normal)-mean(S_D)
250
      })
251
    }, mc.cores=CORES))
252
    
253
    FoldChange=round(rowMeans(FC)*-1,1)
254
    
255
    FoldChange_min=round(apply(FC, 1, min)*-1,1)
256
    FoldChange_max=round(apply(FC, 1, max)*-1,1)
257
    
258
    # report all of these
259
    df=data.frame(R, FoldChange, fails, FoldChange_min, FoldChange_max)
260
    
261
  }else{
262
    return(NULL)
263
  }
264
  
265
  # output
266
  if(dim(df)[1]>0)  return(cbind(df, name, stringsAsFactors=F))
267
}
268
269
fun.kapplanMeier=function(TIME, STATUS, GROUPS=NULL, CONTINUOUS=NULL, CONTINUOUS_SUMMARY=c("maxrank", "5quantiles","4quantiles","3quantiles","85th_15th_percentile", "80th_20th_percentile", "75th_25th_percentile", "median"), NAME="", MONTHS=F, PVAL=1, INDIVIDUAL_GROUPS=T, LWD=3, labels=NULL, COLORV=NULL, conf.int=F,font.size=8, xlab="Time in months"){
270
  
271
  CONTINUOUS_SUMMARY <- match.arg(CONTINUOUS_SUMMARY)
272
  
273
  # cutpoints https://github.com/kassambara/survminer/issues/41
274
  if(MONTHS){
275
    TIME=as.numeric(TIME)*0.0328767
276
  }
277
  
278
  if(is.null(GROUPS)&!is.null(CONTINUOUS)){
279
    df=data.frame("time"=TIME, "status"=STATUS, "continuous"=CONTINUOUS, stringsAsFactors = F)
280
    
281
    NAME=paste(NAME, CONTINUOUS_SUMMARY)
282
    
283
    if(CONTINUOUS_SUMMARY=="maxrank"){
284
      library(survminer)
285
      cutoff <- surv_cutpoint(
286
        df,
287
        time = "time",
288
        event = "status",
289
        variables = c("continuous")
290
      )
291
      # print(plot(cutoff))
292
      GROUPS=rep(paste0("low (<", signif(cutoff$cutpoint$cutpoint,1), ")"), length(CONTINUOUS))
293
      GROUPS[CONTINUOUS>cutoff$cutpoint$cutpoint]=paste0("High (>=", signif(cutoff$cutpoint$cutpoint,1), ")")
294
    }
295
    
296
    if(CONTINUOUS_SUMMARY=="4quantiles"){
297
      
298
      library(dplyr)
299
      GROUPS <- paste0("Q",ntile(CONTINUOUS, 4))
300
      
301
    }
302
    
303
    if(CONTINUOUS_SUMMARY=="3quantiles"){
304
      
305
      library(dplyr)
306
      GROUPS <- paste0("Q",ntile(CONTINUOUS, 3))
307
      
308
    }
309
    
310
    if(CONTINUOUS_SUMMARY=="5quantiles"){
311
      
312
      library(dplyr)
313
      GROUPS <- paste0("Q",ntile(CONTINUOUS, 5))
314
      
315
    }
316
    
317
    
318
    if(CONTINUOUS_SUMMARY=="85th_15th_percentile"){
319
      
320
      q=quantile(CONTINUOUS, probs = c(0.15, 0.85)) # quartile
321
      GROUPS=rep("Q2", length(CONTINUOUS))
322
      GROUPS[CONTINUOUS>q[2]]="Q3"
323
      GROUPS[CONTINUOUS<q[1]]="Q1"
324
      
325
    }
326
    
327
    if(CONTINUOUS_SUMMARY=="80th_20th_percentile"){
328
      
329
      q=quantile(CONTINUOUS, probs = c(0.2, 0.8)) # quartile
330
      GROUPS=rep("Q2", length(CONTINUOUS))
331
      GROUPS[CONTINUOUS>q[2]]="Q3"
332
      GROUPS[CONTINUOUS<q[1]]="Q1"
333
      
334
    }
335
    
336
    if(CONTINUOUS_SUMMARY=="75th_25th_percentile"){
337
      
338
      q=quantile(CONTINUOUS, probs = c(0.25, 0.75)) # quartile
339
      GROUPS=rep("Q2", length(CONTINUOUS))
340
      GROUPS[CONTINUOUS>q[2]]="Q3"
341
      GROUPS[CONTINUOUS<q[1]]="Q1"
342
      
343
    }
344
    
345
    if(CONTINUOUS_SUMMARY=="median"){
346
      
347
      library(dplyr)
348
      GROUPS <- paste0("Q",ntile(CONTINUOUS, 2))
349
      
350
    }
351
  }
352
  NAME=gsub("_", " ", NAME)
353
  classes=as.character(GROUPS)
354
  size=c(length(unique(classes)))
355
  
356
  if(!is.null(COLORV)){
357
    col1 <- COLORV
358
  }else{
359
    size=c(length(unique(classes)))
360
    col1=c("red", "gray75")
361
    if(size>2){
362
      col1 <- colorRampPalette(c(brewer.pal(min(size, 9), "Set1"), brewer.pal(min(size, 8), "Set2"), brewer.pal(min(size, 9), "Set3")))(size)
363
      col1=col1[order(unique(classes))]
364
    }
365
    if(all(unique(classes)%in%c("Q1", "Q2", "Q3"))){
366
      unique(classes)
367
      
368
      col1=data.frame(c("Q1", "Q2", "Q3"), c("darkblue", "grey75", "red"), stringsAsFactors = F)
369
      
370
      col1=col1[match(unique(classes), col1[,1]),2]
371
    }
372
  }
373
  
374
  if(!INDIVIDUAL_GROUPS){
375
    DF.surv2=data.frame("time"=as.numeric(TIME), "status"=as.character(STATUS)=="DECEASED"|as.character(STATUS)=="1"|as.character(STATUS)=="Dead", "x"=factor(GROUPS, levels=unique(GROUPS)), stringsAsFactors=T)
376
    d.surv2 <- survfit(Surv(time,status) ~ x, data = DF.surv2, type="kaplan-meier", conf.type="log")
377
    
378
    
379
    # OLD
380
    # plot(d.surv2, col=col1, lwd=rep(as.numeric(LWD), length(col1)), mark.time=c(as.numeric(TIME)), mark=3, cex=LWD/2, fun="log", conf.int=F, ylim=c(0, 1), ylab="Percentage Surviving", xlab="Months")
381
    # legend("topright", paste(names(d.surv2$strata)," n=",d.surv2$n, sep=""), lwd=rep(LWD, length(col1)), col=col1)
382
    # title(paste("Kaplan-Meier Curves\n", NAME))
383
    
384
    theme0 <- function(...) theme( 
385
      # legend.position = "none",
386
      panel.background = element_blank(),
387
      panel.grid.major = element_blank(),
388
      panel.grid.minor = element_blank(),
389
      axis.line = element_line(),
390
      plot.margin=unit(c(1,1,1,1), "mm"),
391
      axis.text.y = element_text(colour="grey20",size=font.size,face="plain", family="Helvetica"),  
392
      axis.text.x = element_text(colour="grey20",size=font.size,face="plain", family="Helvetica"),
393
      text =   element_text(colour="grey20",size=font.size,face="plain", family="Helvetica"),
394
      plot.title = element_text(size=font.size, face = "plain", family="Helvetica")
395
      # panel.spacing = unit(0,"null"),
396
      # axis.ticks = element_blank(),
397
      # axis.text.x = element_blank(),
398
      # axis.text.y = element_blank(),
399
      # axis.title.x = element_blank(),
400
      # axis.title.y = element_blank(),
401
      # axis.ticks.length = unit(0,"null"),
402
      # panel.border=element_rect(color=NA)
403
    )
404
    
405
    ggsurv <- survminer::ggsurvplot(
406
      d.surv2,                     # survfit object with calculated statistics.
407
      data = DF.surv2,             # data used to fit survival curves.
408
      risk.table = F,       # show risk table.
409
      pval = TRUE,             # show p-value of log-rank test.
410
      conf.int = conf.int,         # show confidence intervals for 
411
      # point estimates of survival curves.
412
      palette = col1,
413
      pval.size=3,
414
      pval.method = T,
415
      size = LWD,
416
      censor.size = LWD*1.5,
417
      censor.shape="|",
418
      title=NAME,
419
      # xlim = c(0,500),         # present narrower X axis, but not affect
420
      # survival estimates.
421
      xlab = xlab,
422
      # break.time.by = 100,     # break X axis in time intervals by 500.
423
      ggtheme = theme0(), # customize plot and risk table with a theme.
424
      risk.table.y.text.col = F,# colour risk table text annotations.
425
      risk.table.height = 0.25, # the height of the risk table
426
      risk.table.y.text = FALSE,# show bars instead of names in text annotations
427
      # in legend of risk table.
428
      ncensor.plot = F,      # plot the number of censored subjects at time t
429
      ncensor.plot.height = 0.25,
430
      conf.int.style = "step",  # customize style of confidence intervals
431
      # surv.median.line = "hv",  # add the median survival pointer.
432
      legend.labs = paste0(unique(DF.surv2$x)," n=", d.surv2$n) # change legend labels.
433
    )
434
    return(ggsurv)
435
    
436
  }else{
437
    
438
    unfeat=unique(as.character(GROUPS))
439
    
440
    plots=lapply(unfeat, function(a){
441
      group=ifelse(GROUPS%in%a, a, "Rest")
442
      
443
      
444
      DF.surv2=data.frame("time"=as.numeric(TIME), "status"=as.character(STATUS)=="DECEASED"|as.character(STATUS)=="1"|as.character(STATUS)=="Dead", "x"=as.character(group), stringsAsFactors=F)
445
      d.surv2 <- survfit(Surv(time,status) ~ x, data = DF.surv2, type="kaplan-meier", conf.type="log")
446
      
447
      classes=as.character(GROUPS)
448
      size=c(length(unique(classes)))
449
      
450
      if(!is.null(COLORV)){
451
        col1 <- COLORV
452
      }else{
453
        size=c(length(unique(classes)))
454
        col1=c("red", "gray75")
455
      }
456
      
457
      # why is this here?
458
      # if(all(table(DF.surv2[,2], DF.surv2[,3])>=2)){
459
      Hazard.t=coxph(Surv(time,status) ~ x, data = DF.surv2)
460
      add=signif(summary(Hazard.t)$logtest["pvalue"],2)[1]
461
      # }else{
462
      #   add=NA
463
      # }
464
      
465
      if(!is.na(add)&add<=PVAL){
466
        
467
        # plot(d.surv2, col=col1, lwd=rep(as.numeric(LWD), length(col1)), mark.time=c(as.numeric(TIME)), mark=3,cex=LWD/2, fun="log", conf.int=F, ylim=c(0, 1), xlab="Months", ylab="Percentage Surviving")
468
        # legend("topright", paste(gsub("x=", "", names(d.surv2$strata))," n=",d.surv2$n, " ", c("", paste0("p.value=", add)), sep=""), lwd=rep(LWD, length(col1)), col=col1)
469
        # title(paste("Kaplan-Meier Curves\n", NAME))
470
        theme0 <- function(...) theme( 
471
          # legend.position = "none",
472
          panel.background = element_blank(),
473
          panel.grid.major = element_blank(),
474
          panel.grid.minor = element_blank(),
475
          axis.line = element_line(),
476
          plot.margin=unit(c(1,1,1,1), "mm"),
477
          axis.text.y = element_text(colour="grey20",size=font.size,face="plain", family="Helvetica"),  
478
          axis.text.x = element_text(colour="grey20",size=font.size,face="plain", family="Helvetica"),
479
          text =   element_text(colour="grey20",size=font.size,face="plain", family="Helvetica"),
480
          plot.title = element_text(size=font.size, face = "plain", family="Helvetica")
481
          # panel.spacing = unit(0,"null"),
482
          # axis.ticks = element_blank(),
483
          # axis.text.x = element_blank(),
484
          # axis.text.y = element_blank(),
485
          # axis.title.x = element_blank(),
486
          # axis.title.y = element_blank(),
487
          # axis.ticks.length = unit(0,"null"),
488
          # panel.border=element_rect(color=NA)
489
        )
490
        
491
        
492
        ggsurv <- survminer::ggsurvplot(
493
          d.surv2,                     # survfit object with calculated statistics.
494
          data = DF.surv2,             # data used to fit survival curves.
495
          risk.table = F,       # show risk table.
496
          pval = TRUE,             # show p-value of log-rank test.
497
          conf.int = conf.int,         # show confidence intervals for 
498
          # point estimates of survival curves.
499
          palette = col1,
500
          pval.size=3,
501
          pval.method = T,
502
          title=NAME,
503
          size = LWD,
504
          censor.size = LWD*1.5,
505
          censor.shape="|",
506
          # xlim = c(0,500),         # present narrower X axis, but not affect
507
          # survival estimates.
508
          xlab = xlab,
509
          # break.time.by = 100,     # break X axis in time intervals by 500.
510
          ggtheme = theme0(), # customize plot and risk table with a theme.
511
          risk.table.y.text.col = F,# colour risk table text annotations.
512
          risk.table.height = 0.25, # the height of the risk table
513
          risk.table.y.text = FALSE,# show bars instead of names in text annotations
514
          # in legend of risk table.
515
          ncensor.plot = F,      # plot the number of censored subjects at time t
516
          ncensor.plot.height = 0.25,
517
          conf.int.style = "step",  # customize style of confidence intervals
518
          # surv.median.line = "hv",  # add the median survival pointer.
519
          legend.labs = paste0(unique(unique(group))," n=", d.surv2$n) # change legend labels.
520
        )
521
        # print(ggsurv)
522
        return(ggsurv)
523
      }
524
    })
525
    plots.together=arrange_ggsurvplots(plots, print = TRUE,
526
                                       ncol = 2, nrow = ceiling(length(plots)/2))
527
    return(plots.together)
528
  }
529
}
530
531
ft.alt <- function(a, b, c, d, alternative="greater") {
532
  as.numeric(fisher.test(matrix(c(a, b, c, d), 2), alternative = alternative)$p.value)
533
}
534
535
cor.test.p=function(x, y, method="spearman", use="pairwise.complete.obs") cor.test(x, y, method=method, use=use)[["p.value"]]
536
537
cor.test.p.m <- function(x, method="spearman", use="pairwise.complete.obs"){
538
  
539
  z <- outer(
540
    colnames(x),
541
    colnames(x),
542
    Vectorize(function(i,j){
543
      no.pairs=sum(complete.cases(x[,j]==x[,i]))<3
544
      if(no.pairs)return(NA)
545
      cor.test.p(x[,i], x[,j], method=method, use=use)
546
    })
547
  )
548
  dimnames(z) <- list(colnames(x), colnames(x))
549
  z
550
}
551
552
flat_cor_mat <- function(cor_rho, cor_pval){
553
  # cor_3 <- rcorr(as.matrix(mtcars[, 1:7]))
554
  #This function provides a simple formatting of a correlation matrix
555
  #into a table with 4 columns containing :
556
  # Column 1 : row names (variable 1 for the correlation test)
557
  # Column 2 : column names (variable 2 for the correlation test)
558
  # Column 3 : the correlation coefficients
559
  # Column 4 : the p-values of the correlations
560
  library(tidyr)
561
  library(dplyr)
562
  library(tibble)
563
  options(scipen=4)
564
  cor_rho <- rownames_to_column(as.data.frame(cor_rho), var = "row")
565
  cor_rho <- gather(cor_rho, column, cor, -1)
566
  cor_pval <- rownames_to_column(as.data.frame(cor_pval), var = "row")
567
  cor_pval <- gather(cor_pval, column, p, -1)
568
  cor_pval_matrix <- left_join(cor_rho, cor_pval, by = c("row", "column"))
569
  colnames(cor_pval_matrix)[1:2]=c("featureA", "featureB")
570
  cor_pval_matrix=cor_pval_matrix[!cor_pval_matrix[,1]==cor_pval_matrix[,2],]
571
  cor_pval_matrix=cor_pval_matrix[order(cor_pval_matrix[,1], cor_pval_matrix[,3]),]
572
  rownames(cor_pval_matrix)=NULL
573
  cor_pval_matrix
574
}
575
576
chart.Correlation.custom=function (R,filterv=NULL, histogram = TRUE, method = c("pearson", "kendall","spearman"), ...){
577
  x = checkData(R, method = "matrix")
578
  if (missing(method))
579
    method = method[1]
580
  panel.cor <- function(x, y, digits = 2, prefix = "", use = "pairwise.complete.obs",
581
                        method, cex.cor, ...) {
582
    usr <- par("usr")
583
    on.exit(par(usr))
584
    par(usr = c(0, 1, 0, 1))
585
    r <- cor(x, y, use = use, method = method)
586
    txt <- format(c(r, 0.123456789), digits = digits)[1]
587
    txt <- paste(prefix, txt, sep = "")
588
    if (missing(cex.cor))
589
      cex <- 0.8/strwidth(txt)
590
    test <- cor.test(x, y, method = method)
591
    Signif <- symnum(test$p.value, corr = FALSE, na = FALSE,
592
                     cutpoints = c(0, 0.001, 0.01, 0.05, 0.1, 1), symbols = c("***",
593
                                                                              "**", "*", ".", " "))
594
    text(0.5, 0.5, txt, cex = cex * (abs(r) + 0.3)/1.3)
595
    text(0.8, 0.8, Signif, cex = cex, col = 2)
596
  }
597
  f <- function(t) {
598
    dnorm(t, mean = mean(x), sd = sd.xts(x))
599
  }
600
  hist.panel = function(x, ...) {
601
    par(new = TRUE)
602
    hist(x, col = "light gray", probability = TRUE, axes = FALSE,
603
         main = "", breaks = "FD")
604
    lines(density(x, na.rm = TRUE), col = "red", lwd = 1)
605
    rug(x)
606
  }
607
  if (histogram)
608
    pairs(x, gap = 0, lower.panel = panel.smooth, upper.panel = panel.cor,
609
          diag.panel = hist.panel, method = method, subset=colnames(x)%in%rownames(test)[1:5])
610
  else pairs(x, gap = 0, lower.panel = panel.smooth, upper.panel = panel.cor,
611
             method = method, ...)
612
}
613
614
# make sure  data is on a linear non-log scale
615
# check 0 values, if a lot between 0-1 better to use add.one. If counts, remove works too quite well
616
# zero fix methods: https://arxiv.org/pdf/1806.06403.pdf
617
gm_mean=function(x, na.rm=TRUE, zero.handle=c("remove", "add.one", "zero.propagate")){
618
  zero.handle=match.arg(zero.handle)
619
  if(any(x < 0, na.rm = TRUE)){
620
    warning("Negative values produced NaN - is the data on linear - non-log scale?")
621
    return(NaN)
622
  }
623
  
624
  if(zero.handle=="remove"){
625
    return(exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x)))
626
  }
627
  
628
  if(zero.handle=="add.one"){
629
    return(exp(mean(log(x+1), na.rm = na.rm))-1)
630
  }
631
  
632
  if(zero.handle=="zero.propagate"){
633
    if(any(x == 0, na.rm = TRUE)){
634
      return(0)
635
    }
636
    return(exp(mean(log(x), na.rm = na.rm)))
637
  }
638
  
639
}