a b/R/runMarker.R
1
#' @name runMarker
2
#' @title Run identification of unique biomarkers
3
#' @description This function aims to identify uniquely and significantly expressed (overexpressed or downexpressed) biomarkers for each subtype identified by multi-omics integrative clustering algorithms. Top markers will be chosen to generate a template so as to run nearest template prediction for subtype verification.
4
#' @param moic.res An object returned by `getMOIC()` with one specified algorithm or `get\%algorithm_name\%` or `getConsensusMOIC()` with a list of multiple algorithms.
5
#' @param dea.method A string value to indicate the algorithm for differential expression analysis. Allowed value contains c('deseq2', 'edger', 'limma').
6
#' @param dat.path A string value to indicate the path for saving the files of differential expression analysis.
7
#' @param res.path A string value to indicate the path for saving the results for identifying subtype-specific markers.
8
#' @param p.cutoff A numeric value to indicate the nominal p value for identifying significant markers; pvalue < 0.05 by default.
9
#' @param p.adj.cutoff A numeric value to indicate the adjusted p value for identifying significant markers; padj < 0.05 by default.
10
#' @param dirct A string value to indicate the direction of identifying significant marker. Allowed values contain c('up', 'down'); `up` means up-regulated marker, and `down` means down-regulated marker.
11
#' @param n.marker A integer value to indicate how many top markers sorted by log2fc should be identified for each subtype; 200 by default.
12
#' @param norm.expr A matrix of normalized expression data with rows for genes and columns for samples; FPKM or TPM without log2 transformation is recommended.
13
#' @param prefix A string value to indicate the prefix of differential expression file (use for searching files).
14
#' @param doplot A logic value to indicate if generating heatmap by using subtype-specific markers. TRUE by default.
15
#' @param annCol A data.frame storing annotation information for samples.
16
#' @param annColors A list of string vectors for colors matched with annCol.
17
#' @param clust.col A string vector storing colors for annotating each subtype at the top of heatmap.
18
#' @param halfwidth A numeric vector to assign marginal cutoff for truncating values in data; 3 by default.
19
#' @param centerFlag A logical vector to indicate if expression data should be centered; TRUE by default.
20
#' @param scaleFlag A logical vector to indicate if expression data should be scaled; TRUE by default.
21
#' @param color A string vector storing colors for heatmap.
22
#' @param fig.path A string value to indicate the output path for storing the marker heatmap.
23
#' @param fig.name A string value to indicate the name of the marker heatmap.
24
#' @param width A numeric value to indicate the width of output figure.
25
#' @param height A numeric value to indicate the height of output figure.
26
#' @param show_rownames A logic value to indicate if showing rownames (feature names) in heatmap; FALSE by default.
27
#' @param show_colnames A logic value to indicate if showing colnames (sample ID) in heatmap; FALSE by default.
28
#' @param ... Additional parameters pass to \link[ComplexHeatmap]{pheatmap}.
29
#' @return A figure of subtype-specific marker heatmap (.pdf) if \code{doPlot = TRUE} and a list with the following components:
30
#'
31
#'         \code{unqlist}   a string vector storing the unique marker across all subtypes.
32
#'
33
#'         \code{templates} a data.frame storing the the template information for nearest template prediction, which is used for verification in external cohort.
34
#'
35
#'         \code{dirct}     a string value indicating the direction for identifying subtype-specific markers.
36
#'
37
#'         \code{heatmap}   a complexheatmap object.
38
#' @export
39
#' @importFrom grDevices pdf dev.off colorRampPalette
40
#' @importFrom ComplexHeatmap pheatmap draw
41
#' @references Gu Z, Eils R, Schlesner M (2016). Complex heatmaps reveal patterns and correlations in multidimensional genomic data. Bioinformatics, 32(18):2847-2849.
42
#' @examples # There is no example and please refer to vignette.
43
runMarker <- function(moic.res      = NULL,
44
                      dea.method    = c("deseq2", "edger", "limma"),
45
                      prefix        = NULL,
46
                      dat.path      = getwd(),
47
                      res.path      = getwd(),
48
                      p.cutoff      = 0.05,
49
                      p.adj.cutoff  = 0.05,
50
                      dirct         = "up",
51
                      n.marker      = 200,
52
                      doplot        = TRUE,
53
                      norm.expr     = NULL,
54
                      annCol        = NULL,
55
                      annColors     = NULL,
56
                      clust.col     = c("#2EC4B6","#E71D36","#FF9F1C","#BDD5EA","#FFA5AB","#011627","#023E8A","#9D4EDD"),
57
                      halfwidth     = 3,
58
                      centerFlag    = TRUE,
59
                      scaleFlag     = TRUE,
60
                      show_rownames = FALSE,
61
                      show_colnames = FALSE,
62
                      color         = c("#5bc0eb", "black", "#ECE700"),
63
                      fig.path      = getwd(),
64
                      fig.name      = NULL,
65
                      width         = 8,
66
                      height        = 8,
67
                      ...) {
68
69
  n.moic <- length(unique(moic.res$clust.res$clust))
70
  mo.method <- moic.res$mo.method
71
  DEpattern <- paste(mo.method, "_", ifelse(is.null(prefix),"",paste0(prefix,"_")), dea.method, ".*._vs_Others.txt$", sep="")
72
  DEfiles <- dir(dat.path,pattern = DEpattern)
73
74
  if(length(DEfiles) == 0) {stop("no DEfiles!")}
75
  if(length(DEfiles) != n.moic) {stop("not all the multi-omics clusters have DEfile!")}
76
  if (!is.element(dirct, c("up", "down"))) {stop( "dirct type error! Allowed value contains c('up', 'down').") }
77
78
  if(dirct == "up") { outlabel <- "unique_upexpr_marker.txt" }
79
  if(dirct == "down") { outlabel <- "unique_downexpr_marker.txt" }
80
81
  genelist <- c()
82
  for (filek in DEfiles) {
83
    DEres <- read.table(file.path(dat.path, filek), header=TRUE, row.names=NULL, sep="\t", quote="", stringsAsFactors=FALSE)
84
    DEres <- DEres[!duplicated(DEres[, 1]),]
85
    DEres <- DEres[!is.na(DEres[, 1]), ]
86
    rownames(DEres) <- DEres[, 1]
87
    DEres <- DEres[, -1]
88
89
    #rownames(DEres) <- toupper(rownames(DEres))
90
    if (dirct == "up") {
91
      genelist <- c( genelist, rownames(DEres[!is.na(DEres$padj) & DEres$pvalue < p.cutoff & DEres$padj < p.adj.cutoff & !is.na(DEres$log2fc) & DEres$log2fc > 0, ]) )
92
    }
93
    if (dirct == "down") {
94
      genelist <- c( genelist, rownames(DEres[!is.na(DEres$padj) & DEres$pvalue < p.cutoff & DEres$padj < p.adj.cutoff & !is.na(DEres$log2fc) & DEres$log2fc < 0, ]) )
95
    }
96
  }
97
  unqlist <- setdiff(genelist,genelist[duplicated(genelist)])
98
99
  marker <- list()
100
  for (filek in DEfiles) {
101
    DEres <- read.table(file.path(dat.path, filek), header=TRUE, row.names=NULL, sep="\t", quote="", stringsAsFactors=FALSE)
102
    DEres <- DEres[!duplicated(DEres[, 1]),]
103
    DEres <- DEres[!is.na(DEres[, 1]), ]
104
    rownames(DEres) <- DEres[, 1]
105
    DEres <- DEres[, -1]
106
107
    #rownames(DEres) <- toupper(rownames(DEres))
108
    if(dirct == "up") {
109
      outk <- intersect( unqlist, rownames(DEres[!is.na(DEres$padj) & DEres$pvalue < p.cutoff & DEres$padj < p.adj.cutoff & !is.na(DEres$log2fc) & DEres$log2fc > 0, ]) )
110
      outk <- DEres[outk,]
111
      outk <- outk[order(outk$log2fc, decreasing = TRUE),]
112
113
      if(nrow(outk) > n.marker) {
114
        marker[[filek]] <- outk[1:n.marker,]
115
      } else {
116
        marker[[filek]] <- outk
117
      }
118
      marker$dirct <- "up"
119
    }
120
    if(dirct == "down") {
121
      outk <- intersect( unqlist, rownames(DEres[!is.na(DEres$padj) & DEres$pvalue < p.cutoff & DEres$padj < p.adj.cutoff & !is.na(DEres$log2fc) & DEres$log2fc < 0, ]) )
122
      outk <- DEres[outk,]
123
      outk <- outk[order(outk$log2fc, decreasing = FALSE),]
124
125
      if(nrow(outk) > n.marker) {
126
        marker[[filek]] <- outk[1:n.marker,]
127
      } else {
128
        marker[[filek]] <- outk
129
      }
130
      marker$dirct <- "down"
131
    }
132
    # write file
133
    write.table(outk, file=file.path(res.path, paste(gsub("_vs_Others.txt","", filek, fixed = TRUE), outlabel, sep = "_")), row.names = TRUE, col.names = NA, sep = "\t", quote = FALSE)
134
  }
135
136
  # generate templates for nearest template prediction
137
  templates <- NULL
138
  for (filek in DEfiles) {
139
    tmp <- data.frame(probe = rownames(marker[[filek]]),
140
                      class = sub("_vs_Others.txt","",sub(".*.result.","",filek)),
141
                      dirct = marker$dirct,
142
                      stringsAsFactors = FALSE)
143
    templates <- rbind.data.frame(templates, tmp, stringsAsFactors = FALSE)
144
  }
145
  write.table(templates, file=file.path(res.path, paste0(mo.method,"_",dea.method,"_",dirct,"regulated_marker_templates.txt")), row.names = FALSE, sep = "\t", quote = FALSE)
146
147
  # generate heatmap with subtype-specific markers
148
  if(doplot) {
149
    if(is.null(norm.expr)) {
150
      stop("please provide a matrix or data.frame of normalized expression data with rows for genes and columns for samples; FPKM or TPM without log2 transformation is recommended.")
151
    }
152
153
    # check data
154
    comsam <- intersect(moic.res$clust.res$samID, colnames(norm.expr))
155
    if(length(comsam) == nrow(moic.res$clust.res)) {
156
      message("--all samples matched.")
157
    } else {
158
      message(paste0("--",(nrow(moic.res$clust.res)-length(comsam))," samples mismatched from current subtypes."))
159
    }
160
161
    moic.res$clust.res <- moic.res$clust.res[comsam, , drop = FALSE]
162
    norm.expr <- norm.expr[,comsam]
163
164
    if(is.null(fig.name)) {
165
      outFig <- paste0("markerheatmap_using_",dirct,"regulated_genes.pdf")
166
    } else {
167
      outFig <- paste0(fig.name, "_using_", dirct,"regulated_genes.pdf")
168
    }
169
    sam.order <- moic.res$clust.res[order(moic.res$clust.res$clust, decreasing = FALSE), "samID"]
170
    colvec <- clust.col[1:n.moic]
171
    names(colvec) <- paste0("CS",1:n.moic)
172
    if(!is.null(annCol) & !is.null(annColors)) {
173
      annCol <- annCol[sam.order, , drop = FALSE]
174
      annCol$Subtype <- paste0("CS",moic.res$clust.res[sam.order,"clust"])
175
      annColors[["Subtype"]] <- colvec
176
    } else {
177
      annCol <- data.frame("Subtype" = paste0("CS",moic.res$clust.res[sam.order,"clust"]),
178
                           row.names = sam.order,
179
                           stringsAsFactors = FALSE)
180
      annColors <- list("Subtype" = colvec)
181
    }
182
183
    if(max(norm.expr) < 25 | (max(norm.expr) >= 25 & min(norm.expr) < 0)) {
184
      message("--expression profile seems to have been standardised (z-score or log transformation), no more action will be performed.")
185
      gset <- norm.expr
186
    }
187
    if(max(norm.expr) >= 25 & min(norm.expr) >= 0){
188
      message("--log2 transformation done for expression data.")
189
      gset <- log2(norm.expr + 1)
190
    }
191
192
    standarize.fun <- function(indata=NULL, halfwidth=NULL, centerFlag=TRUE, scaleFlag=TRUE) {
193
      outdata=t(scale(t(indata), center=centerFlag, scale=scaleFlag))
194
      if (!is.null(halfwidth)) {
195
        outdata[outdata>halfwidth]=halfwidth
196
        outdata[outdata<(-halfwidth)]= -halfwidth
197
      }
198
      return(outdata)
199
    }
200
201
    plotdata <- standarize.fun(gset[intersect(templates$probe, rownames(gset)), sam.order],
202
                               halfwidth = halfwidth,
203
                               centerFlag = centerFlag,
204
                               scaleFlag = scaleFlag)
205
206
    if(!is.null(annCol) & !is.null(annColors)) {
207
      for (i in names(annColors)) {
208
        if(is.function(annColors[[i]])) {
209
          annColors[[i]] <- annColors[[i]](pretty(range(annCol[,i]),n = 64)) # transformat colorRamp2 function to color vector
210
        }
211
      }
212
    }
213
214
    hm <- ComplexHeatmap::pheatmap(mat               = plotdata,
215
                                   border_color      = NA,
216
                                   cluster_cols      = FALSE,
217
                                   cluster_rows      = FALSE,
218
                                   annotation_col    = annCol,
219
                                   annotation_colors = annColors,
220
                                   legend_breaks     = pretty(c(-halfwidth,halfwidth)),
221
                                   legend_labels     = pretty(c(-halfwidth,halfwidth)),
222
                                   show_rownames     = show_rownames,
223
                                   show_colnames     = show_colnames,
224
                                   treeheight_col    = 0,
225
                                   treeheight_row    = 0,
226
                                   color             = grDevices::colorRampPalette(color)(64),
227
                                   ...)
228
229
    # save to pdf
230
    pdf(file.path(fig.path, outFig), width = width, height = height)
231
    draw(hm,annotation_legend_side = "left",heatmap_legend_side = "left")
232
    invisible(dev.off())
233
234
    # print to screen
235
    draw(hm,annotation_legend_side = "left",heatmap_legend_side = "left")
236
    return(list(unqlist = unqlist, templates = templates, dirct = dirct, heatmap = hm))
237
  } else {
238
    return(list(unqlist = unqlist, templates = templates, dirct = dirct))
239
  }
240
}