[494cbf]: / R / runMarker.R

Download this file

241 lines (219 with data), 13.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
#' @name runMarker
#' @title Run identification of unique biomarkers
#' @description This function aims to identify uniquely and significantly expressed (overexpressed or downexpressed) biomarkers for each subtype identified by multi-omics integrative clustering algorithms. Top markers will be chosen to generate a template so as to run nearest template prediction for subtype verification.
#' @param moic.res An object returned by `getMOIC()` with one specified algorithm or `get\%algorithm_name\%` or `getConsensusMOIC()` with a list of multiple algorithms.
#' @param dea.method A string value to indicate the algorithm for differential expression analysis. Allowed value contains c('deseq2', 'edger', 'limma').
#' @param dat.path A string value to indicate the path for saving the files of differential expression analysis.
#' @param res.path A string value to indicate the path for saving the results for identifying subtype-specific markers.
#' @param p.cutoff A numeric value to indicate the nominal p value for identifying significant markers; pvalue < 0.05 by default.
#' @param p.adj.cutoff A numeric value to indicate the adjusted p value for identifying significant markers; padj < 0.05 by default.
#' @param dirct A string value to indicate the direction of identifying significant marker. Allowed values contain c('up', 'down'); `up` means up-regulated marker, and `down` means down-regulated marker.
#' @param n.marker A integer value to indicate how many top markers sorted by log2fc should be identified for each subtype; 200 by default.
#' @param norm.expr A matrix of normalized expression data with rows for genes and columns for samples; FPKM or TPM without log2 transformation is recommended.
#' @param prefix A string value to indicate the prefix of differential expression file (use for searching files).
#' @param doplot A logic value to indicate if generating heatmap by using subtype-specific markers. TRUE by default.
#' @param annCol A data.frame storing annotation information for samples.
#' @param annColors A list of string vectors for colors matched with annCol.
#' @param clust.col A string vector storing colors for annotating each subtype at the top of heatmap.
#' @param halfwidth A numeric vector to assign marginal cutoff for truncating values in data; 3 by default.
#' @param centerFlag A logical vector to indicate if expression data should be centered; TRUE by default.
#' @param scaleFlag A logical vector to indicate if expression data should be scaled; TRUE by default.
#' @param color A string vector storing colors for heatmap.
#' @param fig.path A string value to indicate the output path for storing the marker heatmap.
#' @param fig.name A string value to indicate the name of the marker heatmap.
#' @param width A numeric value to indicate the width of output figure.
#' @param height A numeric value to indicate the height of output figure.
#' @param show_rownames A logic value to indicate if showing rownames (feature names) in heatmap; FALSE by default.
#' @param show_colnames A logic value to indicate if showing colnames (sample ID) in heatmap; FALSE by default.
#' @param ... Additional parameters pass to \link[ComplexHeatmap]{pheatmap}.
#' @return A figure of subtype-specific marker heatmap (.pdf) if \code{doPlot = TRUE} and a list with the following components:
#'
#' \code{unqlist} a string vector storing the unique marker across all subtypes.
#'
#' \code{templates} a data.frame storing the the template information for nearest template prediction, which is used for verification in external cohort.
#'
#' \code{dirct} a string value indicating the direction for identifying subtype-specific markers.
#'
#' \code{heatmap} a complexheatmap object.
#' @export
#' @importFrom grDevices pdf dev.off colorRampPalette
#' @importFrom ComplexHeatmap pheatmap draw
#' @references Gu Z, Eils R, Schlesner M (2016). Complex heatmaps reveal patterns and correlations in multidimensional genomic data. Bioinformatics, 32(18):2847-2849.
#' @examples # There is no example and please refer to vignette.
runMarker <- function(moic.res = NULL,
dea.method = c("deseq2", "edger", "limma"),
prefix = NULL,
dat.path = getwd(),
res.path = getwd(),
p.cutoff = 0.05,
p.adj.cutoff = 0.05,
dirct = "up",
n.marker = 200,
doplot = TRUE,
norm.expr = NULL,
annCol = NULL,
annColors = NULL,
clust.col = c("#2EC4B6","#E71D36","#FF9F1C","#BDD5EA","#FFA5AB","#011627","#023E8A","#9D4EDD"),
halfwidth = 3,
centerFlag = TRUE,
scaleFlag = TRUE,
show_rownames = FALSE,
show_colnames = FALSE,
color = c("#5bc0eb", "black", "#ECE700"),
fig.path = getwd(),
fig.name = NULL,
width = 8,
height = 8,
...) {
n.moic <- length(unique(moic.res$clust.res$clust))
mo.method <- moic.res$mo.method
DEpattern <- paste(mo.method, "_", ifelse(is.null(prefix),"",paste0(prefix,"_")), dea.method, ".*._vs_Others.txt$", sep="")
DEfiles <- dir(dat.path,pattern = DEpattern)
if(length(DEfiles) == 0) {stop("no DEfiles!")}
if(length(DEfiles) != n.moic) {stop("not all the multi-omics clusters have DEfile!")}
if (!is.element(dirct, c("up", "down"))) {stop( "dirct type error! Allowed value contains c('up', 'down').") }
if(dirct == "up") { outlabel <- "unique_upexpr_marker.txt" }
if(dirct == "down") { outlabel <- "unique_downexpr_marker.txt" }
genelist <- c()
for (filek in DEfiles) {
DEres <- read.table(file.path(dat.path, filek), header=TRUE, row.names=NULL, sep="\t", quote="", stringsAsFactors=FALSE)
DEres <- DEres[!duplicated(DEres[, 1]),]
DEres <- DEres[!is.na(DEres[, 1]), ]
rownames(DEres) <- DEres[, 1]
DEres <- DEres[, -1]
#rownames(DEres) <- toupper(rownames(DEres))
if (dirct == "up") {
genelist <- c( genelist, rownames(DEres[!is.na(DEres$padj) & DEres$pvalue < p.cutoff & DEres$padj < p.adj.cutoff & !is.na(DEres$log2fc) & DEres$log2fc > 0, ]) )
}
if (dirct == "down") {
genelist <- c( genelist, rownames(DEres[!is.na(DEres$padj) & DEres$pvalue < p.cutoff & DEres$padj < p.adj.cutoff & !is.na(DEres$log2fc) & DEres$log2fc < 0, ]) )
}
}
unqlist <- setdiff(genelist,genelist[duplicated(genelist)])
marker <- list()
for (filek in DEfiles) {
DEres <- read.table(file.path(dat.path, filek), header=TRUE, row.names=NULL, sep="\t", quote="", stringsAsFactors=FALSE)
DEres <- DEres[!duplicated(DEres[, 1]),]
DEres <- DEres[!is.na(DEres[, 1]), ]
rownames(DEres) <- DEres[, 1]
DEres <- DEres[, -1]
#rownames(DEres) <- toupper(rownames(DEres))
if(dirct == "up") {
outk <- intersect( unqlist, rownames(DEres[!is.na(DEres$padj) & DEres$pvalue < p.cutoff & DEres$padj < p.adj.cutoff & !is.na(DEres$log2fc) & DEres$log2fc > 0, ]) )
outk <- DEres[outk,]
outk <- outk[order(outk$log2fc, decreasing = TRUE),]
if(nrow(outk) > n.marker) {
marker[[filek]] <- outk[1:n.marker,]
} else {
marker[[filek]] <- outk
}
marker$dirct <- "up"
}
if(dirct == "down") {
outk <- intersect( unqlist, rownames(DEres[!is.na(DEres$padj) & DEres$pvalue < p.cutoff & DEres$padj < p.adj.cutoff & !is.na(DEres$log2fc) & DEres$log2fc < 0, ]) )
outk <- DEres[outk,]
outk <- outk[order(outk$log2fc, decreasing = FALSE),]
if(nrow(outk) > n.marker) {
marker[[filek]] <- outk[1:n.marker,]
} else {
marker[[filek]] <- outk
}
marker$dirct <- "down"
}
# write file
write.table(outk, file=file.path(res.path, paste(gsub("_vs_Others.txt","", filek, fixed = TRUE), outlabel, sep = "_")), row.names = TRUE, col.names = NA, sep = "\t", quote = FALSE)
}
# generate templates for nearest template prediction
templates <- NULL
for (filek in DEfiles) {
tmp <- data.frame(probe = rownames(marker[[filek]]),
class = sub("_vs_Others.txt","",sub(".*.result.","",filek)),
dirct = marker$dirct,
stringsAsFactors = FALSE)
templates <- rbind.data.frame(templates, tmp, stringsAsFactors = FALSE)
}
write.table(templates, file=file.path(res.path, paste0(mo.method,"_",dea.method,"_",dirct,"regulated_marker_templates.txt")), row.names = FALSE, sep = "\t", quote = FALSE)
# generate heatmap with subtype-specific markers
if(doplot) {
if(is.null(norm.expr)) {
stop("please provide a matrix or data.frame of normalized expression data with rows for genes and columns for samples; FPKM or TPM without log2 transformation is recommended.")
}
# check data
comsam <- intersect(moic.res$clust.res$samID, colnames(norm.expr))
if(length(comsam) == nrow(moic.res$clust.res)) {
message("--all samples matched.")
} else {
message(paste0("--",(nrow(moic.res$clust.res)-length(comsam))," samples mismatched from current subtypes."))
}
moic.res$clust.res <- moic.res$clust.res[comsam, , drop = FALSE]
norm.expr <- norm.expr[,comsam]
if(is.null(fig.name)) {
outFig <- paste0("markerheatmap_using_",dirct,"regulated_genes.pdf")
} else {
outFig <- paste0(fig.name, "_using_", dirct,"regulated_genes.pdf")
}
sam.order <- moic.res$clust.res[order(moic.res$clust.res$clust, decreasing = FALSE), "samID"]
colvec <- clust.col[1:n.moic]
names(colvec) <- paste0("CS",1:n.moic)
if(!is.null(annCol) & !is.null(annColors)) {
annCol <- annCol[sam.order, , drop = FALSE]
annCol$Subtype <- paste0("CS",moic.res$clust.res[sam.order,"clust"])
annColors[["Subtype"]] <- colvec
} else {
annCol <- data.frame("Subtype" = paste0("CS",moic.res$clust.res[sam.order,"clust"]),
row.names = sam.order,
stringsAsFactors = FALSE)
annColors <- list("Subtype" = colvec)
}
if(max(norm.expr) < 25 | (max(norm.expr) >= 25 & min(norm.expr) < 0)) {
message("--expression profile seems to have been standardised (z-score or log transformation), no more action will be performed.")
gset <- norm.expr
}
if(max(norm.expr) >= 25 & min(norm.expr) >= 0){
message("--log2 transformation done for expression data.")
gset <- log2(norm.expr + 1)
}
standarize.fun <- function(indata=NULL, halfwidth=NULL, centerFlag=TRUE, scaleFlag=TRUE) {
outdata=t(scale(t(indata), center=centerFlag, scale=scaleFlag))
if (!is.null(halfwidth)) {
outdata[outdata>halfwidth]=halfwidth
outdata[outdata<(-halfwidth)]= -halfwidth
}
return(outdata)
}
plotdata <- standarize.fun(gset[intersect(templates$probe, rownames(gset)), sam.order],
halfwidth = halfwidth,
centerFlag = centerFlag,
scaleFlag = scaleFlag)
if(!is.null(annCol) & !is.null(annColors)) {
for (i in names(annColors)) {
if(is.function(annColors[[i]])) {
annColors[[i]] <- annColors[[i]](pretty(range(annCol[,i]),n = 64)) # transformat colorRamp2 function to color vector
}
}
}
hm <- ComplexHeatmap::pheatmap(mat = plotdata,
border_color = NA,
cluster_cols = FALSE,
cluster_rows = FALSE,
annotation_col = annCol,
annotation_colors = annColors,
legend_breaks = pretty(c(-halfwidth,halfwidth)),
legend_labels = pretty(c(-halfwidth,halfwidth)),
show_rownames = show_rownames,
show_colnames = show_colnames,
treeheight_col = 0,
treeheight_row = 0,
color = grDevices::colorRampPalette(color)(64),
...)
# save to pdf
pdf(file.path(fig.path, outFig), width = width, height = height)
draw(hm,annotation_legend_side = "left",heatmap_legend_side = "left")
invisible(dev.off())
# print to screen
draw(hm,annotation_legend_side = "left",heatmap_legend_side = "left")
return(list(unqlist = unqlist, templates = templates, dirct = dirct, heatmap = hm))
} else {
return(list(unqlist = unqlist, templates = templates, dirct = dirct))
}
}