Diff of /R/runMarker.R [000000] .. [494cbf]

Switch to side-by-side view

--- a
+++ b/R/runMarker.R
@@ -0,0 +1,240 @@
+#' @name runMarker
+#' @title Run identification of unique biomarkers
+#' @description This function aims to identify uniquely and significantly expressed (overexpressed or downexpressed) biomarkers for each subtype identified by multi-omics integrative clustering algorithms. Top markers will be chosen to generate a template so as to run nearest template prediction for subtype verification.
+#' @param moic.res An object returned by `getMOIC()` with one specified algorithm or `get\%algorithm_name\%` or `getConsensusMOIC()` with a list of multiple algorithms.
+#' @param dea.method A string value to indicate the algorithm for differential expression analysis. Allowed value contains c('deseq2', 'edger', 'limma').
+#' @param dat.path A string value to indicate the path for saving the files of differential expression analysis.
+#' @param res.path A string value to indicate the path for saving the results for identifying subtype-specific markers.
+#' @param p.cutoff A numeric value to indicate the nominal p value for identifying significant markers; pvalue < 0.05 by default.
+#' @param p.adj.cutoff A numeric value to indicate the adjusted p value for identifying significant markers; padj < 0.05 by default.
+#' @param dirct A string value to indicate the direction of identifying significant marker. Allowed values contain c('up', 'down'); `up` means up-regulated marker, and `down` means down-regulated marker.
+#' @param n.marker A integer value to indicate how many top markers sorted by log2fc should be identified for each subtype; 200 by default.
+#' @param norm.expr A matrix of normalized expression data with rows for genes and columns for samples; FPKM or TPM without log2 transformation is recommended.
+#' @param prefix A string value to indicate the prefix of differential expression file (use for searching files).
+#' @param doplot A logic value to indicate if generating heatmap by using subtype-specific markers. TRUE by default.
+#' @param annCol A data.frame storing annotation information for samples.
+#' @param annColors A list of string vectors for colors matched with annCol.
+#' @param clust.col A string vector storing colors for annotating each subtype at the top of heatmap.
+#' @param halfwidth A numeric vector to assign marginal cutoff for truncating values in data; 3 by default.
+#' @param centerFlag A logical vector to indicate if expression data should be centered; TRUE by default.
+#' @param scaleFlag A logical vector to indicate if expression data should be scaled; TRUE by default.
+#' @param color A string vector storing colors for heatmap.
+#' @param fig.path A string value to indicate the output path for storing the marker heatmap.
+#' @param fig.name A string value to indicate the name of the marker heatmap.
+#' @param width A numeric value to indicate the width of output figure.
+#' @param height A numeric value to indicate the height of output figure.
+#' @param show_rownames A logic value to indicate if showing rownames (feature names) in heatmap; FALSE by default.
+#' @param show_colnames A logic value to indicate if showing colnames (sample ID) in heatmap; FALSE by default.
+#' @param ... Additional parameters pass to \link[ComplexHeatmap]{pheatmap}.
+#' @return A figure of subtype-specific marker heatmap (.pdf) if \code{doPlot = TRUE} and a list with the following components:
+#'
+#'         \code{unqlist}   a string vector storing the unique marker across all subtypes.
+#'
+#'         \code{templates} a data.frame storing the the template information for nearest template prediction, which is used for verification in external cohort.
+#'
+#'         \code{dirct}     a string value indicating the direction for identifying subtype-specific markers.
+#'
+#'         \code{heatmap}   a complexheatmap object.
+#' @export
+#' @importFrom grDevices pdf dev.off colorRampPalette
+#' @importFrom ComplexHeatmap pheatmap draw
+#' @references Gu Z, Eils R, Schlesner M (2016). Complex heatmaps reveal patterns and correlations in multidimensional genomic data. Bioinformatics, 32(18):2847-2849.
+#' @examples # There is no example and please refer to vignette.
+runMarker <- function(moic.res      = NULL,
+                      dea.method    = c("deseq2", "edger", "limma"),
+                      prefix        = NULL,
+                      dat.path      = getwd(),
+                      res.path      = getwd(),
+                      p.cutoff      = 0.05,
+                      p.adj.cutoff  = 0.05,
+                      dirct         = "up",
+                      n.marker      = 200,
+                      doplot        = TRUE,
+                      norm.expr     = NULL,
+                      annCol        = NULL,
+                      annColors     = NULL,
+                      clust.col     = c("#2EC4B6","#E71D36","#FF9F1C","#BDD5EA","#FFA5AB","#011627","#023E8A","#9D4EDD"),
+                      halfwidth     = 3,
+                      centerFlag    = TRUE,
+                      scaleFlag     = TRUE,
+                      show_rownames = FALSE,
+                      show_colnames = FALSE,
+                      color         = c("#5bc0eb", "black", "#ECE700"),
+                      fig.path      = getwd(),
+                      fig.name      = NULL,
+                      width         = 8,
+                      height        = 8,
+                      ...) {
+
+  n.moic <- length(unique(moic.res$clust.res$clust))
+  mo.method <- moic.res$mo.method
+  DEpattern <- paste(mo.method, "_", ifelse(is.null(prefix),"",paste0(prefix,"_")), dea.method, ".*._vs_Others.txt$", sep="")
+  DEfiles <- dir(dat.path,pattern = DEpattern)
+
+  if(length(DEfiles) == 0) {stop("no DEfiles!")}
+  if(length(DEfiles) != n.moic) {stop("not all the multi-omics clusters have DEfile!")}
+  if (!is.element(dirct, c("up", "down"))) {stop( "dirct type error! Allowed value contains c('up', 'down').") }
+
+  if(dirct == "up") { outlabel <- "unique_upexpr_marker.txt" }
+  if(dirct == "down") { outlabel <- "unique_downexpr_marker.txt" }
+
+  genelist <- c()
+  for (filek in DEfiles) {
+    DEres <- read.table(file.path(dat.path, filek), header=TRUE, row.names=NULL, sep="\t", quote="", stringsAsFactors=FALSE)
+    DEres <- DEres[!duplicated(DEres[, 1]),]
+    DEres <- DEres[!is.na(DEres[, 1]), ]
+    rownames(DEres) <- DEres[, 1]
+    DEres <- DEres[, -1]
+
+    #rownames(DEres) <- toupper(rownames(DEres))
+    if (dirct == "up") {
+      genelist <- c( genelist, rownames(DEres[!is.na(DEres$padj) & DEres$pvalue < p.cutoff & DEres$padj < p.adj.cutoff & !is.na(DEres$log2fc) & DEres$log2fc > 0, ]) )
+    }
+    if (dirct == "down") {
+      genelist <- c( genelist, rownames(DEres[!is.na(DEres$padj) & DEres$pvalue < p.cutoff & DEres$padj < p.adj.cutoff & !is.na(DEres$log2fc) & DEres$log2fc < 0, ]) )
+    }
+  }
+  unqlist <- setdiff(genelist,genelist[duplicated(genelist)])
+
+  marker <- list()
+  for (filek in DEfiles) {
+    DEres <- read.table(file.path(dat.path, filek), header=TRUE, row.names=NULL, sep="\t", quote="", stringsAsFactors=FALSE)
+    DEres <- DEres[!duplicated(DEres[, 1]),]
+    DEres <- DEres[!is.na(DEres[, 1]), ]
+    rownames(DEres) <- DEres[, 1]
+    DEres <- DEres[, -1]
+
+    #rownames(DEres) <- toupper(rownames(DEres))
+    if(dirct == "up") {
+      outk <- intersect( unqlist, rownames(DEres[!is.na(DEres$padj) & DEres$pvalue < p.cutoff & DEres$padj < p.adj.cutoff & !is.na(DEres$log2fc) & DEres$log2fc > 0, ]) )
+      outk <- DEres[outk,]
+      outk <- outk[order(outk$log2fc, decreasing = TRUE),]
+
+      if(nrow(outk) > n.marker) {
+        marker[[filek]] <- outk[1:n.marker,]
+      } else {
+        marker[[filek]] <- outk
+      }
+      marker$dirct <- "up"
+    }
+    if(dirct == "down") {
+      outk <- intersect( unqlist, rownames(DEres[!is.na(DEres$padj) & DEres$pvalue < p.cutoff & DEres$padj < p.adj.cutoff & !is.na(DEres$log2fc) & DEres$log2fc < 0, ]) )
+      outk <- DEres[outk,]
+      outk <- outk[order(outk$log2fc, decreasing = FALSE),]
+
+      if(nrow(outk) > n.marker) {
+        marker[[filek]] <- outk[1:n.marker,]
+      } else {
+        marker[[filek]] <- outk
+      }
+      marker$dirct <- "down"
+    }
+    # write file
+    write.table(outk, file=file.path(res.path, paste(gsub("_vs_Others.txt","", filek, fixed = TRUE), outlabel, sep = "_")), row.names = TRUE, col.names = NA, sep = "\t", quote = FALSE)
+  }
+
+  # generate templates for nearest template prediction
+  templates <- NULL
+  for (filek in DEfiles) {
+    tmp <- data.frame(probe = rownames(marker[[filek]]),
+                      class = sub("_vs_Others.txt","",sub(".*.result.","",filek)),
+                      dirct = marker$dirct,
+                      stringsAsFactors = FALSE)
+    templates <- rbind.data.frame(templates, tmp, stringsAsFactors = FALSE)
+  }
+  write.table(templates, file=file.path(res.path, paste0(mo.method,"_",dea.method,"_",dirct,"regulated_marker_templates.txt")), row.names = FALSE, sep = "\t", quote = FALSE)
+
+  # generate heatmap with subtype-specific markers
+  if(doplot) {
+    if(is.null(norm.expr)) {
+      stop("please provide a matrix or data.frame of normalized expression data with rows for genes and columns for samples; FPKM or TPM without log2 transformation is recommended.")
+    }
+
+    # check data
+    comsam <- intersect(moic.res$clust.res$samID, colnames(norm.expr))
+    if(length(comsam) == nrow(moic.res$clust.res)) {
+      message("--all samples matched.")
+    } else {
+      message(paste0("--",(nrow(moic.res$clust.res)-length(comsam))," samples mismatched from current subtypes."))
+    }
+
+    moic.res$clust.res <- moic.res$clust.res[comsam, , drop = FALSE]
+    norm.expr <- norm.expr[,comsam]
+
+    if(is.null(fig.name)) {
+      outFig <- paste0("markerheatmap_using_",dirct,"regulated_genes.pdf")
+    } else {
+      outFig <- paste0(fig.name, "_using_", dirct,"regulated_genes.pdf")
+    }
+    sam.order <- moic.res$clust.res[order(moic.res$clust.res$clust, decreasing = FALSE), "samID"]
+    colvec <- clust.col[1:n.moic]
+    names(colvec) <- paste0("CS",1:n.moic)
+    if(!is.null(annCol) & !is.null(annColors)) {
+      annCol <- annCol[sam.order, , drop = FALSE]
+      annCol$Subtype <- paste0("CS",moic.res$clust.res[sam.order,"clust"])
+      annColors[["Subtype"]] <- colvec
+    } else {
+      annCol <- data.frame("Subtype" = paste0("CS",moic.res$clust.res[sam.order,"clust"]),
+                           row.names = sam.order,
+                           stringsAsFactors = FALSE)
+      annColors <- list("Subtype" = colvec)
+    }
+
+    if(max(norm.expr) < 25 | (max(norm.expr) >= 25 & min(norm.expr) < 0)) {
+      message("--expression profile seems to have been standardised (z-score or log transformation), no more action will be performed.")
+      gset <- norm.expr
+    }
+    if(max(norm.expr) >= 25 & min(norm.expr) >= 0){
+      message("--log2 transformation done for expression data.")
+      gset <- log2(norm.expr + 1)
+    }
+
+    standarize.fun <- function(indata=NULL, halfwidth=NULL, centerFlag=TRUE, scaleFlag=TRUE) {
+      outdata=t(scale(t(indata), center=centerFlag, scale=scaleFlag))
+      if (!is.null(halfwidth)) {
+        outdata[outdata>halfwidth]=halfwidth
+        outdata[outdata<(-halfwidth)]= -halfwidth
+      }
+      return(outdata)
+    }
+
+    plotdata <- standarize.fun(gset[intersect(templates$probe, rownames(gset)), sam.order],
+                               halfwidth = halfwidth,
+                               centerFlag = centerFlag,
+                               scaleFlag = scaleFlag)
+
+    if(!is.null(annCol) & !is.null(annColors)) {
+      for (i in names(annColors)) {
+        if(is.function(annColors[[i]])) {
+          annColors[[i]] <- annColors[[i]](pretty(range(annCol[,i]),n = 64)) # transformat colorRamp2 function to color vector
+        }
+      }
+    }
+
+    hm <- ComplexHeatmap::pheatmap(mat               = plotdata,
+                                   border_color      = NA,
+                                   cluster_cols      = FALSE,
+                                   cluster_rows      = FALSE,
+                                   annotation_col    = annCol,
+                                   annotation_colors = annColors,
+                                   legend_breaks     = pretty(c(-halfwidth,halfwidth)),
+                                   legend_labels     = pretty(c(-halfwidth,halfwidth)),
+                                   show_rownames     = show_rownames,
+                                   show_colnames     = show_colnames,
+                                   treeheight_col    = 0,
+                                   treeheight_row    = 0,
+                                   color             = grDevices::colorRampPalette(color)(64),
+                                   ...)
+
+    # save to pdf
+    pdf(file.path(fig.path, outFig), width = width, height = height)
+    draw(hm,annotation_legend_side = "left",heatmap_legend_side = "left")
+    invisible(dev.off())
+
+    # print to screen
+    draw(hm,annotation_legend_side = "left",heatmap_legend_side = "left")
+    return(list(unqlist = unqlist, templates = templates, dirct = dirct, heatmap = hm))
+  } else {
+    return(list(unqlist = unqlist, templates = templates, dirct = dirct))
+  }
+}