--- a +++ b/R/compTMB.R @@ -0,0 +1,401 @@ +#' @name compTMB +#' @title Comparsion of total mutation burden +#' @description This function calculates Total Mutation Burden (TMB) compares them among curent subtypes identified from multi-omics integrative clustering algorithms. +#' +#' @param moic.res An object returned by `getMOIC()` with one specified algorithm or `get\%algorithm_name\%` or `getConsensusMOIC()` with a list of multiple algorithms. +#' @param maf A data frame of MAF file that has been already read with at least 10 columns as following: c('Tumor_Sample_Barcode', 'Hugo_Symbol', 'Chromosome', 'Start_Position', 'End_Position', 'Variant_Classification', 'Variant_Type', 'Reference_Allele', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2') +#' @param rmDup A logical value to indicate if removing repeated variants in a particuar sample, mapped to multiple transcripts of same Gene. TRUE by default. +#' @param rmFLAGS A logical value to indicate if removing possible FLAGS. These FLAGS genes are often non-pathogenic and passengers, but are frequently mutated in most of the public exome studies, some of which are fishy. Examples of such genes include TTN, MUC16, etc. FALSE by default. +#' @param nonSyn A string vector to indicate a list of variant claccifications that should be considered as non-synonymous and the rest will be considered synonymous (silent) variants. Default value of NULL uses Variant Classifications with High/Moderate variant consequences, including c('Frame_Shift_Del', 'Frame_Shift_Ins', 'Splice_Site', 'Translation_Start_Site', 'Nonsense_Mutation', 'Nonstop_Mutation', 'In_Frame_Del', 'In_Frame_Ins', 'Missense_Mutation'). See details at \url{http://asia.ensembl.org/Help/Glossary?id=535} +#' @param clust.col A string vector storing colors for annotating each Subtype. +#' @param test.method A string value to indicate the method for statistical testing. Allowed values contain c('nonparametric', 'parametric'); nonparametric means two-sample wilcoxon rank sum test for two subtypes and Kruskal-Wallis rank sum test for multiple subtypes; parametric means two-sample t-test when only two subtypes are identified, and anova for multiple subtypes comparison; "nonparametric" by default. +#' @param show.size A logical value to indicate if showing the sample size within each subtype at the top of the figure. TRUE by default. +#' @param fig.name A string value to indicate the name of the boxviolin plot. +#' @param fig.path A string value to indicate the output path for storing the boxviolin plot. +#' @param exome.size An integer value to indicate the estimation of exome size. 38 by default (see \url{https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-017-0424-2}). +#' @param width A numeric value to indicate the width of boxviolin plot. +#' @param height A numeric value to indicate the height of boxviolin plot. +#' @return A figure of TMB and TiTv distribution (.pdf) and a list with the following components: +#' +#' \code{TMB.dat} a data.frame storing the TMB per sample within each subtype. +#' +#' \code{TMB.median} a data.frame storing the median of TMB for each subtype. +#' +#' \code{titv.dat} a data.frame storing the fraction contributions of TiTv per sample within each subtype. +#' +#' \code{maf.nonsilent} a data.frame storing the information for non-synonymous mutations. +#' +#' \code{maf.silent} a data.frame storing the information for synonymous mutations. +#' +#' \code{maf.FLAGS} a data.frame storing the information for FLAGS mutations if\code{rmFLAGS = TRUE}. +#' +#' \code{FLAGS.count} a data.frame storing the summarization per FLAGS if\code{rmFLAGS = TRUE}. +#' @export +#' @importFrom maftools read.maf titv +#' @importFrom dplyr %>% group_by summarise mutate n +#' @importFrom grDevices dev.copy2pdf +#' @examples # There is no example and please refer to vignette. +#' @references Mayakonda A, Lin D, Assenov Y, Plass C, Koeffler PH (2018). Maftools: efficient and comprehensive analysis of somatic variants in cancer. Genome Res, 28(11): 1747-1756. +#' +#' Shyr C, Tarailo-Graovac M, Gottlieb M, Lee JJ, van Karnebeek C, Wasserman WW. (2014). FLAGS, frequently mutated genes in public exomes. BMC Med Genomics, 7(1): 1-14. +#' +#' Chalmers Z R, Connelly C F, Fabrizio D, et al. (2017). Analysis of 100,000 human cancer genomes reveals the landscape of tumor mutational burden. Genome Med, 9(1):34. +compTMB <- function(moic.res = NULL, + maf = NULL, + rmDup = TRUE, + rmFLAGS = FALSE, + nonSyn = NULL, + exome.size = 38, + clust.col = c("#2EC4B6","#E71D36","#FF9F1C","#BDD5EA","#FFA5AB","#011627","#023E8A","#9D4EDD"), + test.method = "nonparametric", + show.size = TRUE, + fig.path = getwd(), + fig.name = NULL, + width = 6, + height = 6) { + + label <- c("Tumor_Sample_Barcode", + "Hugo_Symbol", + "Chromosome", + "Start_Position", + "End_Position", + "Variant_Classification", + "Variant_Type", + "Reference_Allele", + "Tumor_Seq_Allele1", + "Tumor_Seq_Allele2") + + maf <- as.data.frame(maf) + + # check arguments + if(!all(is.element(label, colnames(maf)))) { + stop(paste0("maf data must have the following columns: \n ", paste(label, collapse = "\n "),"\n\nmissing required fields from maf: ", paste(setdiff(label, colnames(maf)), collapse = "\n"))) + } + + if(!is.element(test.method, c("nonparametric","parametric"))) { + stop("test.method can be one of nonparametric or parametric.") + } + + # check data + comsam <- intersect(moic.res$clust.res$samID, unique(maf$Tumor_Sample_Barcode)) + if(length(comsam) == nrow(moic.res$clust.res)) { + message("--all samples matched.") + } else { + message(paste0("--",(nrow(moic.res$clust.res)-length(comsam))," samples mismatched from current subtypes.")) + } + + maf <- maf[which(maf$Tumor_Sample_Barcode %in% comsam),] + clust.res <- moic.res$clust.res[comsam, , drop = FALSE] + n.moic <- length(unique(clust.res$clust)) + colvec <- clust.col[1:n.moic] + names(colvec) <- paste0("CS",unique(clust.res$clust)) + col.titv <- c("#E64B35CC", "#4DBBD5CC", "#00A087CC", "#3C5488CC", "#F39B7FCC", "#8491B4CC") + names(col.titv) <- c("C>T", "T>C", "C>A", "C>G", "T>A", "T>G") + + if(rmFLAGS) { + FLAGS <- c("TTN", + "MUC16", + "OBSCN", + "AHNAK2", + "SYNE1", + "FLG", + "MUC5B", + "DNAH17", + "PLEC", + "DST", + "SYNE2", + "NEB", + "HSPG2", + "LAMA5", + "AHNAK", + "HMCN1", + "USH2A", + "DNAH11", + "MACF1", + "MUC17", + "DNAH5", + "GPR98", + "FAT1", + "PKD1", + "MDN1", + "RNF213", + "RYR1", + "DNAH2", + "DNAH3", + "DNAH8", + "DNAH1", + "DNAH9", + "ABCA13", + "APOB", + "SRRM2", + "CUBN", + "SPTBN5", + "PKHD1", + "LRP2", + "FBN3", + "CDH23", + "DNAH10", + "FAT4", + "RYR3", + "PKHD1L1", + "FAT2", + "CSMD1", + "PCNT", + "COL6A3", + "FRAS1", + "FCGBP", + "DNAH7", + "RP1L1", + "PCLO", + "ZFHX3", + "COL7A1", + "LRP1B", + "FAT3", + "EPPK1", + "VPS13C", + "HRNR", + "MKI67", + "MYO15A", + "STAB1", + "ZAN", + "UBR4", + "VPS13B", + "LAMA1", + "XIRP2", + "BSN", + "KMT2C", + "ALMS1", + "CELSR1", + "TG", + "LAMA3", + "DYNC2H1", + "KMT2D", + "BRCA2", + "CMYA5", + "SACS", + "STAB2", + "AKAP13", + "UTRN", + "VWF", + "VPS13D", + "ANK3", + "FREM2", + "PKD1L1", + "LAMA2", + "ABCA7", + "LRP1", + "ASPM", + "MYOM2", + "PDE4DIP", + "TACC2", + "MUC2", + "TEP1", + "HELZ2", + "HERC2", + "ABCA4") + + if(sum(is.element(FLAGS, unique(maf$Hugo_Symbol))) > 0) { + maf.FLAGS <- maf[which(maf$Hugo_Symbol %in% FLAGS),] + maf.rmFLAGS <- maf[-which(maf$Hugo_Symbol %in% FLAGS),] + count.flags <- maf.FLAGS %>% group_by(Hugo_Symbol) %>% summarise(count = dplyr::n()) + message("--remove possible FLAGS as below:") + head(count.flags) + + maf.ob <- maftools::read.maf(maf = maf.rmFLAGS, + removeDuplicatedVariants = rmDup, + vc_nonSyn = nonSyn) + } else { + maf.ob <- maftools::read.maf(maf = maf, + removeDuplicatedVariants = rmDup, + vc_nonSyn = nonSyn) + } + } else { + maf.ob <- maftools::read.maf(maf = maf, + removeDuplicatedVariants = rmDup, + vc_nonSyn = nonSyn) + } + + # classifies Single Nucleotide Variants into Transitions and Transversions + titv.dat <- maftools::titv(maf = maf.ob, plot = FALSE, useSyn = FALSE)$fraction.contribution %>% + as.data.frame() %>% + mutate(Subtype = paste0("CS",clust.res[as.character(.$Tumor_Sample_Barcode),"clust"])) + titv.dat.backup <- titv.dat + titv.dat <- split(titv.dat, f = titv.dat$Subtype) + + # extract silent and nonsilent mutations + maf.silent <- as.data.frame(maf.ob@maf.silent) + maf.nonsilent <- as.data.frame(maf.ob@data) + + # extract total mutation burden + TMB.dat <- as.data.frame(maf.ob@variants.per.sample) + TMB.dat <- data.frame(samID = as.character(TMB.dat$Tumor_Sample_Barcode), + variants = as.character(TMB.dat$Variants), + TMB = as.numeric(TMB.dat$Variants)/exome.size, + log10TMB = log10(as.numeric(TMB.dat$Variants)/exome.size + 1), + Subtype = paste0("CS", clust.res[as.character(TMB.dat$Tumor_Sample_Barcode), "clust"]), + stringsAsFactors = FALSE) + TMB.dat <- TMB.dat[order(TMB.dat$Subtype), , drop = FALSE] + TMB.med <- TMB.dat %>% group_by(Subtype) %>% summarize(median = median(TMB)) %>% as.data.frame() + TMB.dat <- TMB.dat[order(TMB.dat$Subtype, TMB.dat$TMB, decreasing = FALSE), ] + + # sample order in bottom panel + sampleorder <- TMB.dat %>% split(.$Subtype) %>% lapply("[[", 1) %>% lapply(., as.character) + TMB.plot <- split(TMB.dat, as.factor(TMB.dat$Subtype)) + TMB.plot <- lapply(seq_len(length(TMB.plot)), function(i) { + x = TMB.plot[[i]] + x = data.frame(x = seq(i - 1, i, length.out = nrow(x)), + TMB = x[, "TMB"], + Subtype = x[, "Subtype"]) + return(x) + }) + names(TMB.plot) <- levels(as.factor(TMB.dat$Subtype)) + + # prepare titv data + titv.dat2 <- lapply(TMB.med$Subtype, function(x){ + tmp <- titv.dat[[x]] + tmp <- tmp[match(sampleorder[[x]], as.character(tmp$Tumor_Sample_Barcode)), ] + return(tmp) + if (!all(tmp$Tumor_Sample_Barcode == sampleorder[[x]])){ + stop("inconsistent sample order...") + } + }) + + names(titv.dat2) <- TMB.med$Subtype + titv.dat2 <- lapply(titv.dat2, function(x){ + x <- as.data.frame(x) + rownames(x) <- x$Tumor_Sample_Barcode + x <- x[, setdiff(colnames(x), c("Tumor_Sample_Barcode","Subtype"))] + x <- t(x) + #delete samples without mutation + if (length(which(colSums(x) == 0)) > 0) { + x = x[, -which(colSums(x) == 0), drop = FALSE] + } + return(x) + }) + + TMB.med$Median_Mutations_log10 <- log10(TMB.med$median + 1) + + # statistical testing + if(n.moic == 2 & test.method == "nonparametric") { + statistic <- "wilcox.test" + TMB.test <- wilcox.test(TMB.dat$log10TMB ~ TMB.dat$Subtype)$p.value + cat(paste0("Wilcoxon rank sum test p value = ", formatC(TMB.test, format = "e", digits = 2))) + } + + if(n.moic == 2 & test.method == "parametric") { + statistic <- "t.test" + TMB.test <- t.test(TMB.dat$log10TMB ~ TMB.dat$Subtype)$p.value + cat(paste0("Student's t test p value = ", formatC(TMB.test, format = "e", digits = 2))) + } + + if(n.moic > 2 & test.method == "nonparametric") { + statistic <- "kruskal.test" + TMB.test <- kruskal.test(TMB.dat$log10TMB ~ TMB.dat$Subtype)$p.value + pairwise.TMB.test <- pairwise.wilcox.test(TMB.dat$log10TMB,TMB.dat$Subtype,p.adjust.method = "BH") + # pairwise.TMB.test <- dunnTest(log10TMB ~ as.factor(Subtype), + # data = TMB.dat, + # method = "bh") + cat(paste0("Kruskal-Wallis rank sum test p value = ", formatC(TMB.test, format = "e", digits = 2),"\npost-hoc pairwise wilcoxon rank sum test with Benjamini-Hochberg adjustment presents below:\n")) + print(formatC(pairwise.TMB.test$p.value, format = "e", digits = 2)) + } + + if(n.moic > 2 & test.method == "parametric") { + statistic <- "anova" + TMB.test <- summary(aov(TMB.dat$log10TMB ~ TMB.dat$Subtype))[[1]][["Pr(>F)"]][1] + pairwise.TMB.test <- pairwise.t.test(TMB.dat$log10TMB,TMB.dat$Subtype,p.adjust.method = "BH") + cat(paste0("One-way anova test p value = ", formatC(TMB.test, format = "e", digits = 2),"\npost-hoc pairwise Student's t test with Benjamini-Hochberg adjustment presents below:\n")) + print(formatC(pairwise.TMB.test$p.value, format = "e", digits = 2)) + } + + # start illustration + if(is.null(fig.name)) { + outFig <- "distribution of TMB and titv.pdf" + } else { + outFig <- paste0(fig.name,".pdf") + } + + # base layout + n1 <- seq(from = 0.105, to = 0.97-(0.97-0.05)*0.04, length.out = n.moic + 1) + n2 <- n1[2:length(n1)] + n <- data.frame(n1 = n1[1:n.moic], n2 = n2, n3 = 0.05, n4 = 0.2) %>% + rbind(c(0.05, 0.97, 0.25, 1), ., c(0, 0.1, 0.05, 0.25)) %>% as.matrix() + + opar <- par(no.readonly = TRUE) + invisible(suppressWarnings(split.screen(n, erase = TRUE))) + screen(1, new = TRUE) + par(xpd = TRUE, mar = c(3, 1, 2, 0), oma = c(0, 0, 0, 0),bty = "o", mgp = c(2, 0.5, 0), tcl=-.25) + y_lims = range(log10(unlist(lapply(TMB.plot, function(x) max(x[, "TMB"], na.rm = TRUE))) + 1)) + y_lims[1] = 0 + y_lims[2] = ceiling(max(y_lims)) + y_at = y_lims[1]:y_lims[2] + x_top_label <- as.numeric(unlist(lapply(TMB.plot, nrow))) + plot(NA, NA, xlim = c(0, length(TMB.plot)), ylim = y_lims, + xlab = NA, ylab = NA, xaxt="n", yaxt = "n", xaxs = "r", yaxs = "r") + rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4],col = "#EAE9E9", border = FALSE) + grid(col = "white", lty = 1, lwd = 1.5) # add grid + + par(new = TRUE, bty="o") + plot(NA, NA, + col = "white", + xlim = c(0, length(TMB.plot)), ylim = y_lims, + xlab = "", ylab = "", + xaxt = "n", yaxt = "n") + + text(x = length(TMB.plot)/2, y = y_lims[2] - 0.1, cex = 1.2, + label = paste0(statistic, " p = ", formatC(TMB.test, digits = 1, format = "e"))) + + # add median TMB + invisible(lapply(seq_len(nrow(TMB.med)), function(i) { + segments(x0 = i - 1, + x1 = i, + y0 = TMB.med[i, "Median_Mutations_log10"], + y1 = TMB.med[i, "Median_Mutations_log10"], + col = "#2b2d42", + lwd = 2)})) + + # add scatter TMB + invisible(lapply(seq_len(length(TMB.plot)), function(i){ + tmp = TMB.plot[[i]] + points(tmp$x, log10(tmp$TMB + 1), pch = 16, cex = 0.5, col = colvec[i]) + })) + + # modify axis + axis(side = 1, at = seq(0.5, length(TMB.plot) - 0.5, 1), labels = names(TMB.plot), + las = 1, tick = TRUE, line = 0) + axis(side = 2, at = y_at, las = 2, line = 0, tick = TRUE, labels = y_at) + if(show.size) { + axis(side = 3, at = seq(0.5, length(TMB.plot) - 0.5, 1), labels = paste0("n = ",x_top_label), + tick = TRUE, line = 0, cex.axis = 0.9) + } + mtext(text = bquote("log"[10]~"(TMB + 1)"), side = 2, line = 1.15, cex = 1.1) + + # add titv + invisible(lapply(seq_len(length(titv.dat2)), function(i){ + screen(i + 1, new = TRUE) + par(xpd = TRUE, mar = c(0, 0, 0, 0), oma = c(0, 0, 0, 0), bty = "o") + tmp <- titv.dat2[[i]] + barplot(tmp, col = col.titv[rownames(tmp)], names.arg = rep("", ncol(tmp)), + xaxs = "i", yaxs = "i", + axes = FALSE, space = 0, border = NA, lwd = 1.2) + box() + })) + + # add legend + screen(n.moic + 2, new = TRUE) + par(xpd = TRUE, mar = c(0, 0, 0, 0), oma = c(0, 0, 0, 0), bty = "n") + plot(NA, NA, xlim = c(0, 1), ylim = c(0, 1), axes = FALSE, xlab = NA, ylab = NA) + legend("center", + fill = col.titv, + legend = names(col.titv), + border = NA, + bty = "n", + cex = 0.8) + close.screen(all.screens = TRUE) + invisible(dev.copy2pdf(file = file.path(fig.path, outFig), width = width, height = height)) + + if(rmFLAGS) { + return(list(TMB.dat = TMB.dat, TMB.median = TMB.med, titv.dat = titv.dat.backup, maf.nonsilent = maf.nonsilent, maf.silent = maf.silent, maf.FLAGS = maf.FLAGS, FLAGS.count = as.data.frame(count.flags))) + } else { + return(list(TMB.dat = TMB.dat, TMB.median = TMB.med, titv.dat = titv.dat.backup, maf.nonsilent = maf.nonsilent, maf.silent = maf.silent)) + } +}