--- a
+++ b/R/compTMB.R
@@ -0,0 +1,401 @@
+#' @name compTMB
+#' @title Comparsion of total mutation burden
+#' @description This function calculates Total Mutation Burden (TMB) compares them among curent subtypes identified from multi-omics integrative clustering algorithms.
+#'
+#' @param moic.res An object returned by `getMOIC()` with one specified algorithm or `get\%algorithm_name\%` or `getConsensusMOIC()` with a list of multiple algorithms.
+#' @param maf A data frame of MAF file that has been already read with at least 10 columns as following: c('Tumor_Sample_Barcode', 'Hugo_Symbol', 'Chromosome', 'Start_Position', 'End_Position', 'Variant_Classification', 'Variant_Type', 'Reference_Allele', 'Tumor_Seq_Allele1', 'Tumor_Seq_Allele2')
+#' @param rmDup A logical value to indicate if removing repeated variants in a particuar sample, mapped to multiple transcripts of same Gene. TRUE by default.
+#' @param rmFLAGS A logical value to indicate if removing possible FLAGS. These FLAGS genes are often non-pathogenic and passengers, but are frequently mutated in most of the public exome studies, some of which are fishy. Examples of such genes include TTN, MUC16, etc. FALSE by default.
+#' @param nonSyn A string vector to indicate a list of variant claccifications that should be considered as non-synonymous and the rest will be considered synonymous (silent) variants. Default value of NULL uses Variant Classifications with High/Moderate variant consequences, including c('Frame_Shift_Del', 'Frame_Shift_Ins', 'Splice_Site', 'Translation_Start_Site', 'Nonsense_Mutation', 'Nonstop_Mutation', 'In_Frame_Del', 'In_Frame_Ins', 'Missense_Mutation'). See details at \url{http://asia.ensembl.org/Help/Glossary?id=535}
+#' @param clust.col A string vector storing colors for annotating each Subtype.
+#' @param test.method A string value to indicate the method for statistical testing. Allowed values contain c('nonparametric', 'parametric'); nonparametric means two-sample wilcoxon rank sum test for two subtypes and Kruskal-Wallis rank sum test for multiple subtypes; parametric means two-sample t-test when only two subtypes are identified, and anova for multiple subtypes comparison; "nonparametric" by default.
+#' @param show.size A logical value to indicate if showing the sample size within each subtype at the top of the figure. TRUE by default.
+#' @param fig.name  A string value to indicate the name of the boxviolin plot.
+#' @param fig.path A string value to indicate the output path for storing the boxviolin plot.
+#' @param exome.size An integer value to indicate the estimation of exome size. 38 by default (see \url{https://genomemedicine.biomedcentral.com/articles/10.1186/s13073-017-0424-2}).
+#' @param width A numeric value to indicate the width of boxviolin plot.
+#' @param height A numeric value to indicate the height of boxviolin plot.
+#' @return A figure of TMB and TiTv distribution (.pdf) and a list with the following components:
+#'
+#'         \code{TMB.dat}       a data.frame storing the TMB per sample within each subtype.
+#'
+#'         \code{TMB.median}    a data.frame storing the median of TMB for each subtype.
+#'
+#'         \code{titv.dat}      a data.frame storing the fraction contributions of TiTv per sample within each subtype.
+#'
+#'         \code{maf.nonsilent} a data.frame storing the information for non-synonymous mutations.
+#'
+#'         \code{maf.silent}    a data.frame storing the information for synonymous mutations.
+#'
+#'         \code{maf.FLAGS}     a data.frame storing the information for FLAGS mutations if\code{rmFLAGS = TRUE}.
+#'
+#'         \code{FLAGS.count}   a data.frame storing the summarization per FLAGS if\code{rmFLAGS = TRUE}.
+#' @export
+#' @importFrom maftools read.maf titv
+#' @importFrom dplyr %>% group_by summarise mutate n
+#' @importFrom grDevices dev.copy2pdf
+#' @examples # There is no example and please refer to vignette.
+#' @references Mayakonda A, Lin D, Assenov Y, Plass C, Koeffler PH (2018). Maftools: efficient and comprehensive analysis of somatic variants in cancer. Genome Res, 28(11): 1747-1756.
+#'
+#' Shyr C, Tarailo-Graovac M, Gottlieb M, Lee JJ, van Karnebeek C, Wasserman WW. (2014). FLAGS, frequently mutated genes in public exomes. BMC Med Genomics, 7(1): 1-14.
+#'
+#' Chalmers Z R, Connelly C F, Fabrizio D, et al. (2017). Analysis of 100,000 human cancer genomes reveals the landscape of tumor mutational burden. Genome Med, 9(1):34.
+compTMB <- function(moic.res    = NULL,
+                    maf         = NULL,
+                    rmDup       = TRUE,
+                    rmFLAGS     = FALSE,
+                    nonSyn      = NULL,
+                    exome.size  = 38,
+                    clust.col   = c("#2EC4B6","#E71D36","#FF9F1C","#BDD5EA","#FFA5AB","#011627","#023E8A","#9D4EDD"),
+                    test.method = "nonparametric",
+                    show.size   = TRUE,
+                    fig.path    = getwd(),
+                    fig.name    = NULL,
+                    width       = 6,
+                    height      = 6) {
+
+  label <- c("Tumor_Sample_Barcode",
+             "Hugo_Symbol",
+             "Chromosome",
+             "Start_Position",
+             "End_Position",
+             "Variant_Classification",
+             "Variant_Type",
+             "Reference_Allele",
+             "Tumor_Seq_Allele1",
+             "Tumor_Seq_Allele2")
+
+  maf <- as.data.frame(maf)
+
+  # check arguments
+  if(!all(is.element(label, colnames(maf)))) {
+    stop(paste0("maf data must have the following columns: \n ", paste(label, collapse = "\n "),"\n\nmissing required fields from maf: ", paste(setdiff(label, colnames(maf)), collapse = "\n")))
+  }
+
+  if(!is.element(test.method, c("nonparametric","parametric"))) {
+    stop("test.method can be one of nonparametric or parametric.")
+  }
+
+  # check data
+  comsam <- intersect(moic.res$clust.res$samID, unique(maf$Tumor_Sample_Barcode))
+  if(length(comsam) == nrow(moic.res$clust.res)) {
+    message("--all samples matched.")
+  } else {
+    message(paste0("--",(nrow(moic.res$clust.res)-length(comsam))," samples mismatched from current subtypes."))
+  }
+
+  maf <- maf[which(maf$Tumor_Sample_Barcode %in% comsam),]
+  clust.res <- moic.res$clust.res[comsam, , drop = FALSE]
+  n.moic <- length(unique(clust.res$clust))
+  colvec <- clust.col[1:n.moic]
+  names(colvec) <- paste0("CS",unique(clust.res$clust))
+  col.titv <- c("#E64B35CC", "#4DBBD5CC", "#00A087CC", "#3C5488CC", "#F39B7FCC", "#8491B4CC")
+  names(col.titv) <- c("C>T", "T>C", "C>A", "C>G", "T>A", "T>G")
+
+  if(rmFLAGS) {
+    FLAGS <- c("TTN",
+               "MUC16",
+               "OBSCN",
+               "AHNAK2",
+               "SYNE1",
+               "FLG",
+               "MUC5B",
+               "DNAH17",
+               "PLEC",
+               "DST",
+               "SYNE2",
+               "NEB",
+               "HSPG2",
+               "LAMA5",
+               "AHNAK",
+               "HMCN1",
+               "USH2A",
+               "DNAH11",
+               "MACF1",
+               "MUC17",
+               "DNAH5",
+               "GPR98",
+               "FAT1",
+               "PKD1",
+               "MDN1",
+               "RNF213",
+               "RYR1",
+               "DNAH2",
+               "DNAH3",
+               "DNAH8",
+               "DNAH1",
+               "DNAH9",
+               "ABCA13",
+               "APOB",
+               "SRRM2",
+               "CUBN",
+               "SPTBN5",
+               "PKHD1",
+               "LRP2",
+               "FBN3",
+               "CDH23",
+               "DNAH10",
+               "FAT4",
+               "RYR3",
+               "PKHD1L1",
+               "FAT2",
+               "CSMD1",
+               "PCNT",
+               "COL6A3",
+               "FRAS1",
+               "FCGBP",
+               "DNAH7",
+               "RP1L1",
+               "PCLO",
+               "ZFHX3",
+               "COL7A1",
+               "LRP1B",
+               "FAT3",
+               "EPPK1",
+               "VPS13C",
+               "HRNR",
+               "MKI67",
+               "MYO15A",
+               "STAB1",
+               "ZAN",
+               "UBR4",
+               "VPS13B",
+               "LAMA1",
+               "XIRP2",
+               "BSN",
+               "KMT2C",
+               "ALMS1",
+               "CELSR1",
+               "TG",
+               "LAMA3",
+               "DYNC2H1",
+               "KMT2D",
+               "BRCA2",
+               "CMYA5",
+               "SACS",
+               "STAB2",
+               "AKAP13",
+               "UTRN",
+               "VWF",
+               "VPS13D",
+               "ANK3",
+               "FREM2",
+               "PKD1L1",
+               "LAMA2",
+               "ABCA7",
+               "LRP1",
+               "ASPM",
+               "MYOM2",
+               "PDE4DIP",
+               "TACC2",
+               "MUC2",
+               "TEP1",
+               "HELZ2",
+               "HERC2",
+               "ABCA4")
+
+    if(sum(is.element(FLAGS, unique(maf$Hugo_Symbol))) > 0) {
+      maf.FLAGS <- maf[which(maf$Hugo_Symbol %in% FLAGS),]
+      maf.rmFLAGS <- maf[-which(maf$Hugo_Symbol %in% FLAGS),]
+      count.flags <- maf.FLAGS %>% group_by(Hugo_Symbol) %>% summarise(count = dplyr::n())
+      message("--remove possible FLAGS as below:")
+      head(count.flags)
+
+      maf.ob <- maftools::read.maf(maf                      = maf.rmFLAGS,
+                                   removeDuplicatedVariants = rmDup,
+                                   vc_nonSyn                = nonSyn)
+    } else {
+      maf.ob <- maftools::read.maf(maf                      = maf,
+                                   removeDuplicatedVariants = rmDup,
+                                   vc_nonSyn                = nonSyn)
+    }
+  }   else {
+      maf.ob <- maftools::read.maf(maf                      = maf,
+                                   removeDuplicatedVariants = rmDup,
+                                   vc_nonSyn                = nonSyn)
+  }
+
+  # classifies Single Nucleotide Variants into Transitions and Transversions
+  titv.dat <- maftools::titv(maf = maf.ob, plot = FALSE, useSyn = FALSE)$fraction.contribution %>%
+    as.data.frame() %>%
+    mutate(Subtype = paste0("CS",clust.res[as.character(.$Tumor_Sample_Barcode),"clust"]))
+  titv.dat.backup <- titv.dat
+  titv.dat <- split(titv.dat, f = titv.dat$Subtype)
+
+  # extract silent and nonsilent mutations
+  maf.silent    <- as.data.frame(maf.ob@maf.silent)
+  maf.nonsilent <- as.data.frame(maf.ob@data)
+
+  # extract total mutation burden
+  TMB.dat <- as.data.frame(maf.ob@variants.per.sample)
+  TMB.dat <- data.frame(samID            = as.character(TMB.dat$Tumor_Sample_Barcode),
+                        variants         = as.character(TMB.dat$Variants),
+                        TMB              = as.numeric(TMB.dat$Variants)/exome.size,
+                        log10TMB         = log10(as.numeric(TMB.dat$Variants)/exome.size + 1),
+                        Subtype          = paste0("CS", clust.res[as.character(TMB.dat$Tumor_Sample_Barcode), "clust"]),
+                        stringsAsFactors = FALSE)
+  TMB.dat <- TMB.dat[order(TMB.dat$Subtype), , drop = FALSE]
+  TMB.med <- TMB.dat %>% group_by(Subtype) %>% summarize(median = median(TMB)) %>% as.data.frame()
+  TMB.dat <- TMB.dat[order(TMB.dat$Subtype, TMB.dat$TMB, decreasing = FALSE), ]
+
+  # sample order in bottom panel
+  sampleorder <- TMB.dat %>% split(.$Subtype) %>% lapply("[[", 1) %>% lapply(., as.character)
+  TMB.plot <- split(TMB.dat, as.factor(TMB.dat$Subtype))
+  TMB.plot <- lapply(seq_len(length(TMB.plot)), function(i) {
+    x = TMB.plot[[i]]
+    x = data.frame(x = seq(i - 1, i, length.out = nrow(x)),
+                   TMB = x[, "TMB"],
+                   Subtype = x[, "Subtype"])
+    return(x)
+  })
+  names(TMB.plot) <- levels(as.factor(TMB.dat$Subtype))
+
+  # prepare titv data
+  titv.dat2 <- lapply(TMB.med$Subtype, function(x){
+    tmp <- titv.dat[[x]]
+    tmp <- tmp[match(sampleorder[[x]], as.character(tmp$Tumor_Sample_Barcode)), ]
+    return(tmp)
+    if (!all(tmp$Tumor_Sample_Barcode == sampleorder[[x]])){
+      stop("inconsistent sample order...")
+    }
+  })
+
+  names(titv.dat2) <- TMB.med$Subtype
+  titv.dat2 <- lapply(titv.dat2, function(x){
+    x <- as.data.frame(x)
+    rownames(x) <- x$Tumor_Sample_Barcode
+    x <- x[, setdiff(colnames(x), c("Tumor_Sample_Barcode","Subtype"))]
+    x <- t(x)
+    #delete samples without mutation
+    if (length(which(colSums(x) == 0)) > 0) {
+      x = x[, -which(colSums(x) == 0), drop = FALSE]
+    }
+    return(x)
+  })
+
+  TMB.med$Median_Mutations_log10 <- log10(TMB.med$median + 1)
+
+  # statistical testing
+  if(n.moic == 2 & test.method == "nonparametric") {
+    statistic <- "wilcox.test"
+    TMB.test  <- wilcox.test(TMB.dat$log10TMB ~ TMB.dat$Subtype)$p.value
+    cat(paste0("Wilcoxon rank sum test p value = ", formatC(TMB.test, format = "e", digits = 2)))
+  }
+
+  if(n.moic == 2 & test.method == "parametric") {
+    statistic <- "t.test"
+    TMB.test  <- t.test(TMB.dat$log10TMB ~ TMB.dat$Subtype)$p.value
+    cat(paste0("Student's t test p value = ", formatC(TMB.test, format = "e", digits = 2)))
+  }
+
+  if(n.moic > 2 & test.method == "nonparametric") {
+    statistic <- "kruskal.test"
+    TMB.test  <- kruskal.test(TMB.dat$log10TMB ~ TMB.dat$Subtype)$p.value
+    pairwise.TMB.test <- pairwise.wilcox.test(TMB.dat$log10TMB,TMB.dat$Subtype,p.adjust.method = "BH")
+    # pairwise.TMB.test <- dunnTest(log10TMB ~ as.factor(Subtype),
+    #                               data = TMB.dat,
+    #                               method = "bh")
+    cat(paste0("Kruskal-Wallis rank sum test p value = ", formatC(TMB.test, format = "e", digits = 2),"\npost-hoc pairwise wilcoxon rank sum test with Benjamini-Hochberg adjustment presents below:\n"))
+    print(formatC(pairwise.TMB.test$p.value, format = "e", digits = 2))
+  }
+
+  if(n.moic > 2 & test.method == "parametric") {
+    statistic <- "anova"
+    TMB.test  <- summary(aov(TMB.dat$log10TMB ~ TMB.dat$Subtype))[[1]][["Pr(>F)"]][1]
+    pairwise.TMB.test <- pairwise.t.test(TMB.dat$log10TMB,TMB.dat$Subtype,p.adjust.method = "BH")
+    cat(paste0("One-way anova test p value = ", formatC(TMB.test, format = "e", digits = 2),"\npost-hoc pairwise Student's t test with Benjamini-Hochberg adjustment presents below:\n"))
+    print(formatC(pairwise.TMB.test$p.value, format = "e", digits = 2))
+  }
+
+  # start illustration
+  if(is.null(fig.name)) {
+    outFig <- "distribution of TMB and titv.pdf"
+  } else {
+    outFig <- paste0(fig.name,".pdf")
+  }
+
+  # base layout
+  n1 <- seq(from = 0.105, to = 0.97-(0.97-0.05)*0.04, length.out = n.moic + 1)
+  n2 <- n1[2:length(n1)]
+  n  <- data.frame(n1 = n1[1:n.moic], n2 = n2, n3 = 0.05, n4 = 0.2) %>%
+    rbind(c(0.05, 0.97, 0.25, 1), ., c(0, 0.1, 0.05, 0.25)) %>% as.matrix()
+
+  opar <- par(no.readonly = TRUE)
+  invisible(suppressWarnings(split.screen(n, erase = TRUE)))
+  screen(1, new = TRUE)
+  par(xpd = TRUE, mar = c(3, 1, 2, 0), oma = c(0, 0, 0, 0),bty = "o", mgp = c(2, 0.5, 0), tcl=-.25)
+  y_lims = range(log10(unlist(lapply(TMB.plot, function(x) max(x[, "TMB"], na.rm = TRUE))) + 1))
+  y_lims[1] = 0
+  y_lims[2] = ceiling(max(y_lims))
+  y_at = y_lims[1]:y_lims[2]
+  x_top_label <- as.numeric(unlist(lapply(TMB.plot, nrow)))
+  plot(NA, NA, xlim = c(0, length(TMB.plot)), ylim = y_lims,
+       xlab = NA, ylab = NA, xaxt="n", yaxt = "n", xaxs = "r", yaxs = "r")
+  rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4],col = "#EAE9E9", border = FALSE)
+  grid(col = "white", lty = 1, lwd = 1.5) # add grid
+
+  par(new = TRUE, bty="o")
+  plot(NA, NA,
+       col = "white",
+       xlim = c(0, length(TMB.plot)), ylim = y_lims,
+       xlab = "", ylab = "",
+       xaxt = "n", yaxt = "n")
+
+  text(x = length(TMB.plot)/2, y = y_lims[2] - 0.1, cex = 1.2,
+       label = paste0(statistic, " p = ", formatC(TMB.test, digits = 1, format = "e")))
+
+  # add median TMB
+  invisible(lapply(seq_len(nrow(TMB.med)), function(i) {
+    segments(x0  = i - 1,
+             x1  = i,
+             y0  = TMB.med[i, "Median_Mutations_log10"],
+             y1  = TMB.med[i, "Median_Mutations_log10"],
+             col = "#2b2d42",
+             lwd = 2)}))
+
+  # add scatter TMB
+  invisible(lapply(seq_len(length(TMB.plot)), function(i){
+    tmp = TMB.plot[[i]]
+    points(tmp$x, log10(tmp$TMB + 1), pch = 16, cex = 0.5, col = colvec[i])
+  }))
+
+  # modify axis
+  axis(side = 1, at = seq(0.5, length(TMB.plot) - 0.5, 1), labels = names(TMB.plot),
+       las = 1, tick = TRUE, line = 0)
+  axis(side = 2, at = y_at, las = 2, line = 0, tick = TRUE, labels = y_at)
+  if(show.size) {
+    axis(side = 3, at = seq(0.5, length(TMB.plot) - 0.5, 1), labels = paste0("n = ",x_top_label),
+         tick = TRUE, line = 0, cex.axis = 0.9)
+  }
+  mtext(text = bquote("log"[10]~"(TMB + 1)"), side = 2, line = 1.15, cex = 1.1)
+
+  # add titv
+  invisible(lapply(seq_len(length(titv.dat2)), function(i){
+    screen(i + 1, new = TRUE)
+    par(xpd = TRUE, mar = c(0, 0, 0, 0), oma = c(0, 0, 0, 0), bty = "o")
+    tmp <- titv.dat2[[i]]
+    barplot(tmp, col = col.titv[rownames(tmp)], names.arg = rep("", ncol(tmp)),
+            xaxs = "i", yaxs = "i",
+            axes = FALSE, space = 0, border = NA, lwd = 1.2)
+    box()
+  }))
+
+  # add legend
+  screen(n.moic + 2, new = TRUE)
+  par(xpd = TRUE, mar = c(0, 0, 0, 0), oma = c(0, 0, 0, 0), bty = "n")
+  plot(NA, NA, xlim = c(0, 1), ylim = c(0, 1), axes = FALSE, xlab = NA, ylab = NA)
+  legend("center",
+         fill   = col.titv,
+         legend = names(col.titv),
+         border = NA,
+         bty    = "n",
+         cex    = 0.8)
+  close.screen(all.screens = TRUE)
+  invisible(dev.copy2pdf(file = file.path(fig.path, outFig), width = width, height = height))
+
+  if(rmFLAGS) {
+    return(list(TMB.dat = TMB.dat, TMB.median = TMB.med, titv.dat = titv.dat.backup, maf.nonsilent = maf.nonsilent, maf.silent = maf.silent, maf.FLAGS = maf.FLAGS, FLAGS.count = as.data.frame(count.flags)))
+  } else {
+    return(list(TMB.dat = TMB.dat, TMB.median = TMB.med, titv.dat = titv.dat.backup, maf.nonsilent = maf.nonsilent, maf.silent = maf.silent))
+  }
+}