Diff of /R/compTMB.R [000000] .. [494cbf]

Switch to unified view

a b/R/compTMB.R
1
#' @name compTMB
2
#' @title Comparsion of total mutation burden
3
#' @description This function calculates Total Mutation Burden (TMB) compares them among curent subtypes identified from multi-omics integrative clustering algorithms.
4
#'
5
#' @param moic.res An object returned by `getMOIC()` with one specified algorithm or `get\%algorithm_name\%` or `getConsensusMOIC()` with a list of multiple algorithms.
6
#' @param maf A data frame of MAF file that has been already read with at least 10 columns as following: c('Tumor_Sample_Barcode', 'Hugo_Symbol', 'Chromosome', 'Start_Position', 'End_Position', 'Variant_Classification', 'Variant_Type', 'Reference_Allele', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2')
7
#' @param rmDup A logical value to indicate if removing repeated variants in a particuar sample, mapped to multiple transcripts of same Gene. TRUE by default.
8
#' @param rmFLAGS A logical value to indicate if removing possible FLAGS. These FLAGS genes are often non-pathogenic and passengers, but are frequently mutated in most of the public exome studies, some of which are fishy. Examples of such genes include TTN, MUC16, etc. FALSE by default.
9
#' @param nonSyn A string vector to indicate a list of variant claccifications that should be considered as non-synonymous and the rest will be considered synonymous (silent) variants. Default value of NULL uses Variant Classifications with High/Moderate variant consequences, including c('Frame_Shift_Del', 'Frame_Shift_Ins', 'Splice_Site', 'Translation_Start_Site', 'Nonsense_Mutation', 'Nonstop_Mutation', 'In_Frame_Del', 'In_Frame_Ins', 'Missense_Mutation'). See details at \url{http://asia.ensembl.org/Help/Glossary?id=535}
10
#' @param clust.col A string vector storing colors for annotating each Subtype.
11
#' @param test.method A string value to indicate the method for statistical testing. Allowed values contain c('nonparametric', 'parametric'); nonparametric means two-sample wilcoxon rank sum test for two subtypes and Kruskal-Wallis rank sum test for multiple subtypes; parametric means two-sample t-test when only two subtypes are identified, and anova for multiple subtypes comparison; "nonparametric" by default.
12
#' @param show.size A logical value to indicate if showing the sample size within each subtype at the top of the figure. TRUE by default.
13
#' @param fig.name  A string value to indicate the name of the boxviolin plot.
14
#' @param fig.path A string value to indicate the output path for storing the boxviolin plot.
15
#' @param exome.size An integer value to indicate the estimation of exome size. 38 by default (see \url{https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-017-0424-2}).
16
#' @param width A numeric value to indicate the width of boxviolin plot.
17
#' @param height A numeric value to indicate the height of boxviolin plot.
18
#' @return A figure of TMB and TiTv distribution (.pdf) and a list with the following components:
19
#'
20
#'         \code{TMB.dat}       a data.frame storing the TMB per sample within each subtype.
21
#'
22
#'         \code{TMB.median}    a data.frame storing the median of TMB for each subtype.
23
#'
24
#'         \code{titv.dat}      a data.frame storing the fraction contributions of TiTv per sample within each subtype.
25
#'
26
#'         \code{maf.nonsilent} a data.frame storing the information for non-synonymous mutations.
27
#'
28
#'         \code{maf.silent}    a data.frame storing the information for synonymous mutations.
29
#'
30
#'         \code{maf.FLAGS}     a data.frame storing the information for FLAGS mutations if\code{rmFLAGS = TRUE}.
31
#'
32
#'         \code{FLAGS.count}   a data.frame storing the summarization per FLAGS if\code{rmFLAGS = TRUE}.
33
#' @export
34
#' @importFrom maftools read.maf titv
35
#' @importFrom dplyr %>% group_by summarise mutate n
36
#' @importFrom grDevices dev.copy2pdf
37
#' @examples # There is no example and please refer to vignette.
38
#' @references Mayakonda A, Lin D, Assenov Y, Plass C, Koeffler PH (2018). Maftools: efficient and comprehensive analysis of somatic variants in cancer. Genome Res, 28(11): 1747-1756.
39
#'
40
#' Shyr C, Tarailo-Graovac M, Gottlieb M, Lee JJ, van Karnebeek C, Wasserman WW. (2014). FLAGS, frequently mutated genes in public exomes. BMC Med Genomics, 7(1): 1-14.
41
#'
42
#' Chalmers Z R, Connelly C F, Fabrizio D, et al. (2017). Analysis of 100,000 human cancer genomes reveals the landscape of tumor mutational burden. Genome Med, 9(1):34.
43
compTMB <- function(moic.res    = NULL,
44
                    maf         = NULL,
45
                    rmDup       = TRUE,
46
                    rmFLAGS     = FALSE,
47
                    nonSyn      = NULL,
48
                    exome.size  = 38,
49
                    clust.col   = c("#2EC4B6","#E71D36","#FF9F1C","#BDD5EA","#FFA5AB","#011627","#023E8A","#9D4EDD"),
50
                    test.method = "nonparametric",
51
                    show.size   = TRUE,
52
                    fig.path    = getwd(),
53
                    fig.name    = NULL,
54
                    width       = 6,
55
                    height      = 6) {
56
57
  label <- c("Tumor_Sample_Barcode",
58
             "Hugo_Symbol",
59
             "Chromosome",
60
             "Start_Position",
61
             "End_Position",
62
             "Variant_Classification",
63
             "Variant_Type",
64
             "Reference_Allele",
65
             "Tumor_Seq_Allele1",
66
             "Tumor_Seq_Allele2")
67
68
  maf <- as.data.frame(maf)
69
70
  # check arguments
71
  if(!all(is.element(label, colnames(maf)))) {
72
    stop(paste0("maf data must have the following columns: \n ", paste(label, collapse = "\n "),"\n\nmissing required fields from maf: ", paste(setdiff(label, colnames(maf)), collapse = "\n")))
73
  }
74
75
  if(!is.element(test.method, c("nonparametric","parametric"))) {
76
    stop("test.method can be one of nonparametric or parametric.")
77
  }
78
79
  # check data
80
  comsam <- intersect(moic.res$clust.res$samID, unique(maf$Tumor_Sample_Barcode))
81
  if(length(comsam) == nrow(moic.res$clust.res)) {
82
    message("--all samples matched.")
83
  } else {
84
    message(paste0("--",(nrow(moic.res$clust.res)-length(comsam))," samples mismatched from current subtypes."))
85
  }
86
87
  maf <- maf[which(maf$Tumor_Sample_Barcode %in% comsam),]
88
  clust.res <- moic.res$clust.res[comsam, , drop = FALSE]
89
  n.moic <- length(unique(clust.res$clust))
90
  colvec <- clust.col[1:n.moic]
91
  names(colvec) <- paste0("CS",unique(clust.res$clust))
92
  col.titv <- c("#E64B35CC", "#4DBBD5CC", "#00A087CC", "#3C5488CC", "#F39B7FCC", "#8491B4CC")
93
  names(col.titv) <- c("C>T", "T>C", "C>A", "C>G", "T>A", "T>G")
94
95
  if(rmFLAGS) {
96
    FLAGS <- c("TTN",
97
               "MUC16",
98
               "OBSCN",
99
               "AHNAK2",
100
               "SYNE1",
101
               "FLG",
102
               "MUC5B",
103
               "DNAH17",
104
               "PLEC",
105
               "DST",
106
               "SYNE2",
107
               "NEB",
108
               "HSPG2",
109
               "LAMA5",
110
               "AHNAK",
111
               "HMCN1",
112
               "USH2A",
113
               "DNAH11",
114
               "MACF1",
115
               "MUC17",
116
               "DNAH5",
117
               "GPR98",
118
               "FAT1",
119
               "PKD1",
120
               "MDN1",
121
               "RNF213",
122
               "RYR1",
123
               "DNAH2",
124
               "DNAH3",
125
               "DNAH8",
126
               "DNAH1",
127
               "DNAH9",
128
               "ABCA13",
129
               "APOB",
130
               "SRRM2",
131
               "CUBN",
132
               "SPTBN5",
133
               "PKHD1",
134
               "LRP2",
135
               "FBN3",
136
               "CDH23",
137
               "DNAH10",
138
               "FAT4",
139
               "RYR3",
140
               "PKHD1L1",
141
               "FAT2",
142
               "CSMD1",
143
               "PCNT",
144
               "COL6A3",
145
               "FRAS1",
146
               "FCGBP",
147
               "DNAH7",
148
               "RP1L1",
149
               "PCLO",
150
               "ZFHX3",
151
               "COL7A1",
152
               "LRP1B",
153
               "FAT3",
154
               "EPPK1",
155
               "VPS13C",
156
               "HRNR",
157
               "MKI67",
158
               "MYO15A",
159
               "STAB1",
160
               "ZAN",
161
               "UBR4",
162
               "VPS13B",
163
               "LAMA1",
164
               "XIRP2",
165
               "BSN",
166
               "KMT2C",
167
               "ALMS1",
168
               "CELSR1",
169
               "TG",
170
               "LAMA3",
171
               "DYNC2H1",
172
               "KMT2D",
173
               "BRCA2",
174
               "CMYA5",
175
               "SACS",
176
               "STAB2",
177
               "AKAP13",
178
               "UTRN",
179
               "VWF",
180
               "VPS13D",
181
               "ANK3",
182
               "FREM2",
183
               "PKD1L1",
184
               "LAMA2",
185
               "ABCA7",
186
               "LRP1",
187
               "ASPM",
188
               "MYOM2",
189
               "PDE4DIP",
190
               "TACC2",
191
               "MUC2",
192
               "TEP1",
193
               "HELZ2",
194
               "HERC2",
195
               "ABCA4")
196
197
    if(sum(is.element(FLAGS, unique(maf$Hugo_Symbol))) > 0) {
198
      maf.FLAGS <- maf[which(maf$Hugo_Symbol %in% FLAGS),]
199
      maf.rmFLAGS <- maf[-which(maf$Hugo_Symbol %in% FLAGS),]
200
      count.flags <- maf.FLAGS %>% group_by(Hugo_Symbol) %>% summarise(count = dplyr::n())
201
      message("--remove possible FLAGS as below:")
202
      head(count.flags)
203
204
      maf.ob <- maftools::read.maf(maf                      = maf.rmFLAGS,
205
                                   removeDuplicatedVariants = rmDup,
206
                                   vc_nonSyn                = nonSyn)
207
    } else {
208
      maf.ob <- maftools::read.maf(maf                      = maf,
209
                                   removeDuplicatedVariants = rmDup,
210
                                   vc_nonSyn                = nonSyn)
211
    }
212
  }   else {
213
      maf.ob <- maftools::read.maf(maf                      = maf,
214
                                   removeDuplicatedVariants = rmDup,
215
                                   vc_nonSyn                = nonSyn)
216
  }
217
218
  # classifies Single Nucleotide Variants into Transitions and Transversions
219
  titv.dat <- maftools::titv(maf = maf.ob, plot = FALSE, useSyn = FALSE)$fraction.contribution %>%
220
    as.data.frame() %>%
221
    mutate(Subtype = paste0("CS",clust.res[as.character(.$Tumor_Sample_Barcode),"clust"]))
222
  titv.dat.backup <- titv.dat
223
  titv.dat <- split(titv.dat, f = titv.dat$Subtype)
224
225
  # extract silent and nonsilent mutations
226
  maf.silent    <- as.data.frame(maf.ob@maf.silent)
227
  maf.nonsilent <- as.data.frame(maf.ob@data)
228
229
  # extract total mutation burden
230
  TMB.dat <- as.data.frame(maf.ob@variants.per.sample)
231
  TMB.dat <- data.frame(samID            = as.character(TMB.dat$Tumor_Sample_Barcode),
232
                        variants         = as.character(TMB.dat$Variants),
233
                        TMB              = as.numeric(TMB.dat$Variants)/exome.size,
234
                        log10TMB         = log10(as.numeric(TMB.dat$Variants)/exome.size + 1),
235
                        Subtype          = paste0("CS", clust.res[as.character(TMB.dat$Tumor_Sample_Barcode), "clust"]),
236
                        stringsAsFactors = FALSE)
237
  TMB.dat <- TMB.dat[order(TMB.dat$Subtype), , drop = FALSE]
238
  TMB.med <- TMB.dat %>% group_by(Subtype) %>% summarize(median = median(TMB)) %>% as.data.frame()
239
  TMB.dat <- TMB.dat[order(TMB.dat$Subtype, TMB.dat$TMB, decreasing = FALSE), ]
240
241
  # sample order in bottom panel
242
  sampleorder <- TMB.dat %>% split(.$Subtype) %>% lapply("[[", 1) %>% lapply(., as.character)
243
  TMB.plot <- split(TMB.dat, as.factor(TMB.dat$Subtype))
244
  TMB.plot <- lapply(seq_len(length(TMB.plot)), function(i) {
245
    x = TMB.plot[[i]]
246
    x = data.frame(x = seq(i - 1, i, length.out = nrow(x)),
247
                   TMB = x[, "TMB"],
248
                   Subtype = x[, "Subtype"])
249
    return(x)
250
  })
251
  names(TMB.plot) <- levels(as.factor(TMB.dat$Subtype))
252
253
  # prepare titv data
254
  titv.dat2 <- lapply(TMB.med$Subtype, function(x){
255
    tmp <- titv.dat[[x]]
256
    tmp <- tmp[match(sampleorder[[x]], as.character(tmp$Tumor_Sample_Barcode)), ]
257
    return(tmp)
258
    if (!all(tmp$Tumor_Sample_Barcode == sampleorder[[x]])){
259
      stop("inconsistent sample order...")
260
    }
261
  })
262
263
  names(titv.dat2) <- TMB.med$Subtype
264
  titv.dat2 <- lapply(titv.dat2, function(x){
265
    x <- as.data.frame(x)
266
    rownames(x) <- x$Tumor_Sample_Barcode
267
    x <- x[, setdiff(colnames(x), c("Tumor_Sample_Barcode","Subtype"))]
268
    x <- t(x)
269
    #delete samples without mutation
270
    if (length(which(colSums(x) == 0)) > 0) {
271
      x = x[, -which(colSums(x) == 0), drop = FALSE]
272
    }
273
    return(x)
274
  })
275
276
  TMB.med$Median_Mutations_log10 <- log10(TMB.med$median + 1)
277
278
  # statistical testing
279
  if(n.moic == 2 & test.method == "nonparametric") {
280
    statistic <- "wilcox.test"
281
    TMB.test  <- wilcox.test(TMB.dat$log10TMB ~ TMB.dat$Subtype)$p.value
282
    cat(paste0("Wilcoxon rank sum test p value = ", formatC(TMB.test, format = "e", digits = 2)))
283
  }
284
285
  if(n.moic == 2 & test.method == "parametric") {
286
    statistic <- "t.test"
287
    TMB.test  <- t.test(TMB.dat$log10TMB ~ TMB.dat$Subtype)$p.value
288
    cat(paste0("Student's t test p value = ", formatC(TMB.test, format = "e", digits = 2)))
289
  }
290
291
  if(n.moic > 2 & test.method == "nonparametric") {
292
    statistic <- "kruskal.test"
293
    TMB.test  <- kruskal.test(TMB.dat$log10TMB ~ TMB.dat$Subtype)$p.value
294
    pairwise.TMB.test <- pairwise.wilcox.test(TMB.dat$log10TMB,TMB.dat$Subtype,p.adjust.method = "BH")
295
    # pairwise.TMB.test <- dunnTest(log10TMB ~ as.factor(Subtype),
296
    #                               data = TMB.dat,
297
    #                               method = "bh")
298
    cat(paste0("Kruskal-Wallis rank sum test p value = ", formatC(TMB.test, format = "e", digits = 2),"\npost-hoc pairwise wilcoxon rank sum test with Benjamini-Hochberg adjustment presents below:\n"))
299
    print(formatC(pairwise.TMB.test$p.value, format = "e", digits = 2))
300
  }
301
302
  if(n.moic > 2 & test.method == "parametric") {
303
    statistic <- "anova"
304
    TMB.test  <- summary(aov(TMB.dat$log10TMB ~ TMB.dat$Subtype))[[1]][["Pr(>F)"]][1]
305
    pairwise.TMB.test <- pairwise.t.test(TMB.dat$log10TMB,TMB.dat$Subtype,p.adjust.method = "BH")
306
    cat(paste0("One-way anova test p value = ", formatC(TMB.test, format = "e", digits = 2),"\npost-hoc pairwise Student's t test with Benjamini-Hochberg adjustment presents below:\n"))
307
    print(formatC(pairwise.TMB.test$p.value, format = "e", digits = 2))
308
  }
309
310
  # start illustration
311
  if(is.null(fig.name)) {
312
    outFig <- "distribution of TMB and titv.pdf"
313
  } else {
314
    outFig <- paste0(fig.name,".pdf")
315
  }
316
317
  # base layout
318
  n1 <- seq(from = 0.105, to = 0.97-(0.97-0.05)*0.04, length.out = n.moic + 1)
319
  n2 <- n1[2:length(n1)]
320
  n  <- data.frame(n1 = n1[1:n.moic], n2 = n2, n3 = 0.05, n4 = 0.2) %>%
321
    rbind(c(0.05, 0.97, 0.25, 1), ., c(0, 0.1, 0.05, 0.25)) %>% as.matrix()
322
323
  opar <- par(no.readonly = TRUE)
324
  invisible(suppressWarnings(split.screen(n, erase = TRUE)))
325
  screen(1, new = TRUE)
326
  par(xpd = TRUE, mar = c(3, 1, 2, 0), oma = c(0, 0, 0, 0),bty = "o", mgp = c(2, 0.5, 0), tcl=-.25)
327
  y_lims = range(log10(unlist(lapply(TMB.plot, function(x) max(x[, "TMB"], na.rm = TRUE))) + 1))
328
  y_lims[1] = 0
329
  y_lims[2] = ceiling(max(y_lims))
330
  y_at = y_lims[1]:y_lims[2]
331
  x_top_label <- as.numeric(unlist(lapply(TMB.plot, nrow)))
332
  plot(NA, NA, xlim = c(0, length(TMB.plot)), ylim = y_lims,
333
       xlab = NA, ylab = NA, xaxt="n", yaxt = "n", xaxs = "r", yaxs = "r")
334
  rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4],col = "#EAE9E9", border = FALSE)
335
  grid(col = "white", lty = 1, lwd = 1.5) # add grid
336
337
  par(new = TRUE, bty="o")
338
  plot(NA, NA,
339
       col = "white",
340
       xlim = c(0, length(TMB.plot)), ylim = y_lims,
341
       xlab = "", ylab = "",
342
       xaxt = "n", yaxt = "n")
343
344
  text(x = length(TMB.plot)/2, y = y_lims[2] - 0.1, cex = 1.2,
345
       label = paste0(statistic, " p = ", formatC(TMB.test, digits = 1, format = "e")))
346
347
  # add median TMB
348
  invisible(lapply(seq_len(nrow(TMB.med)), function(i) {
349
    segments(x0  = i - 1,
350
             x1  = i,
351
             y0  = TMB.med[i, "Median_Mutations_log10"],
352
             y1  = TMB.med[i, "Median_Mutations_log10"],
353
             col = "#2b2d42",
354
             lwd = 2)}))
355
356
  # add scatter TMB
357
  invisible(lapply(seq_len(length(TMB.plot)), function(i){
358
    tmp = TMB.plot[[i]]
359
    points(tmp$x, log10(tmp$TMB + 1), pch = 16, cex = 0.5, col = colvec[i])
360
  }))
361
362
  # modify axis
363
  axis(side = 1, at = seq(0.5, length(TMB.plot) - 0.5, 1), labels = names(TMB.plot),
364
       las = 1, tick = TRUE, line = 0)
365
  axis(side = 2, at = y_at, las = 2, line = 0, tick = TRUE, labels = y_at)
366
  if(show.size) {
367
    axis(side = 3, at = seq(0.5, length(TMB.plot) - 0.5, 1), labels = paste0("n = ",x_top_label),
368
         tick = TRUE, line = 0, cex.axis = 0.9)
369
  }
370
  mtext(text = bquote("log"[10]~"(TMB + 1)"), side = 2, line = 1.15, cex = 1.1)
371
372
  # add titv
373
  invisible(lapply(seq_len(length(titv.dat2)), function(i){
374
    screen(i + 1, new = TRUE)
375
    par(xpd = TRUE, mar = c(0, 0, 0, 0), oma = c(0, 0, 0, 0), bty = "o")
376
    tmp <- titv.dat2[[i]]
377
    barplot(tmp, col = col.titv[rownames(tmp)], names.arg = rep("", ncol(tmp)),
378
            xaxs = "i", yaxs = "i",
379
            axes = FALSE, space = 0, border = NA, lwd = 1.2)
380
    box()
381
  }))
382
383
  # add legend
384
  screen(n.moic + 2, new = TRUE)
385
  par(xpd = TRUE, mar = c(0, 0, 0, 0), oma = c(0, 0, 0, 0), bty = "n")
386
  plot(NA, NA, xlim = c(0, 1), ylim = c(0, 1), axes = FALSE, xlab = NA, ylab = NA)
387
  legend("center",
388
         fill   = col.titv,
389
         legend = names(col.titv),
390
         border = NA,
391
         bty    = "n",
392
         cex    = 0.8)
393
  close.screen(all.screens = TRUE)
394
  invisible(dev.copy2pdf(file = file.path(fig.path, outFig), width = width, height = height))
395
396
  if(rmFLAGS) {
397
    return(list(TMB.dat = TMB.dat, TMB.median = TMB.med, titv.dat = titv.dat.backup, maf.nonsilent = maf.nonsilent, maf.silent = maf.silent, maf.FLAGS = maf.FLAGS, FLAGS.count = as.data.frame(count.flags)))
398
  } else {
399
    return(list(TMB.dat = TMB.dat, TMB.median = TMB.med, titv.dat = titv.dat.backup, maf.nonsilent = maf.nonsilent, maf.silent = maf.silent))
400
  }
401
}