Diff of /R/runGSEA.R [000000] .. [494cbf]

Switch to unified view

a b/R/runGSEA.R
1
#' @name runGSEA
2
#' @title Run identification of unique functional pathways
3
#' @description Use gene set enrichment analysis to identify subtype-specific (overexpressed or downexpressed) functional pathways for each subtype identified by multi-omics clustering algorithms.
4
#' @param moic.res An object returned by `getMOIC()` with one specified algorithm or `get\%algorithm_name\%` or `getConsensusMOIC()` with a list of multiple algorithms.
5
#' @param dea.method A string value to indicate the algorithm for differential expression analysis. Allowed value contains c('deseq2', 'edger', 'limma').
6
#' @param norm.expr A matrix of normalized expression data with rows for genes and columns for samples; FPKM or TPM without log2 transformation is recommended.
7
#' @param prefix A string value to indicate the prefix of differential expression file (use for searching files).
8
#' @param dat.path A string value to indicate the path for saving the files of differential expression analysis.
9
#' @param res.path A string value to indicate the path for saving the results for identifying subtype-specific functional pathways.
10
#' @param dirct A string value to indicate the direction of identifying significant pathway. Allowed values contain c('up', 'down'); `up` means up-regulated pathway, and `down` means down-regulated pathway; "up" by default.
11
#' @param n.path A integer value to indicate how many top pathways sorted by NES should be identified for each subtypes; 10 by default.
12
#' @param msigdb.path A string value to indicate ABSOULUTE PATH/NAME of MSigDB file (GMT file with gene symbols) downloaded from \url{https://www.gsea-msigdb.org/gsea/msigdb/collections.jsp#H}.
13
#' @param nPerm A integer value to indicate the number of permutations; 1000 by default and 10000 will be better for reproducibility.
14
#' @param minGSSize A integer value to indicate minimal size of each geneSet for analyzing; 10 by default.
15
#' @param maxGSSize A integer value to indicate maximal size of each geneSet for analyzing; 500 by default.
16
#' @param p.cutoff A numeric value to indicate the nominal p value for identifying significant pathways; pvalue < 0.05 by default.
17
#' @param p.adj.cutoff A numeric value to indicate the adjusted p value for identifying significant pathways; padj < 0.05 by default.
18
#' @param gsva.method A string value to indicate the method to employ in the estimation of gene-set enrichment scores per sample. By default this is set to gsva (Hänzelmann et al, 2013) and other options are ssgsea (Barbie et al, 2009), zscore (Lee et al, 2008) or plage (Tomfohr et al, 2005). The latter two standardize first expression profiles into z-scores over the samples and, in the case of zscore, it combines them together as their sum divided by the square-root of the size of the gene set, while in the case of plage they are used to calculate the singular value decomposition (SVD) over the genes in the gene set and use the coefficients of the first right-singular vector as pathway activity profile.
19
#' @param norm.method A string value to indicate how to calculate subtype-specific pathway enrichment scores. Allowed values contain c('mean', 'median'); mean by default.
20
#' @param clust.col A string vector storing colors for annotating each subtype at the top of heatmap.
21
#' @param color A string vector storing colors for heatmap.
22
#' @param fig.path A string value to indicate the output path for storing the pathway heatmap.
23
#' @param fig.name A string value to indicate the name of the pathway heatmap.
24
#' @param width A numeric value to indicate the width of output figure.
25
#' @param height A numeric value to indicate the height of output figure.
26
#' @return A figure of subtype-specific pathway heatmap (.pdf) and a list with the following components:
27
#'
28
#'         \code{gsea.list}  a list storing gsea object returned by \link[clusterProfiler]{GSEA} for each subtype.
29
#'
30
#'         \code{raw.es}     a data.frame storing raw enrichment score of identified subtype-specific pathways by using specified \code{gsva.method}.
31
#'
32
#'         \code{scaled.es}  a data.frame storing z-scored enrichment score of identified subtype-specific pathways by using specified \code{gsva.method}.
33
#'
34
#'         \code{grouped.es} a data.frame storing grouped enrichment score (mean or median value among each subtype) by using specified \code{norm.method}.
35
#'
36
#'         \code{heatmap}    a complexheatmap object.
37
#' @export
38
#' @importFrom grDevices pdf dev.off colorRampPalette
39
#' @importFrom ComplexHeatmap pheatmap draw
40
#' @importFrom GSVA gsva
41
#' @importFrom clusterProfiler GSEA read.gmt
42
#' @references Barbie, D.A. et al. (2009). Systematic RNA interference reveals that oncogenic KRAS-driven cancers require TBK1. Nature, 462(5):108-112.
43
#'
44
#' Hänzelmann, S., Castelo, R. and Guinney, J. (2013). GSVA: Gene set variation analysis for microarray and RNA-Seq data. BMC Bioinformatics, 14(1):7.
45
#'
46
#' Lee, E. et al. (2008). Inferring pathway activity toward precise disease classification. PLoS Comp Biol, 4(11):e1000217.
47
#'
48
#' Tomfohr, J. et al. (2005). Pathway level analysis of gene expression using singular value decomposition. BMC Bioinformatics, 6(1):1-11.
49
#'
50
#' Yu G, Wang L, Han Y, He Q (2012). clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS, 16(5):284-287.
51
#'
52
#' Gu Z, Eils R, Schlesner M (2016). Complex heatmaps reveal patterns and correlations in multidimensional genomic data. Bioinformatics, 32(18):2847–2849.
53
#' @examples # There is no example and please refer to vignette.
54
runGSEA <- function(moic.res     = NULL,
55
                    dea.method   = c("deseq2", "edger", "limma"),
56
                    norm.expr    = NULL,
57
                    prefix       = NULL,
58
                    dat.path     = getwd(),
59
                    res.path     = getwd(),
60
                    dirct        = "up",
61
                    n.path       = 10,
62
                    msigdb.path  = NULL,
63
                    nPerm        = 1000,
64
                    minGSSize    = 10,
65
                    maxGSSize    = 500,
66
                    p.cutoff     = 0.05,
67
                    p.adj.cutoff = 0.05,
68
                    gsva.method  = "gsva",
69
                    norm.method  = "mean",
70
                    clust.col    = c("#2EC4B6","#E71D36","#FF9F1C","#BDD5EA","#FFA5AB","#011627","#023E8A","#9D4EDD"),
71
                    color        = NULL,
72
                    fig.name     = NULL,
73
                    fig.path     = getwd(),
74
                    width        = 15,
75
                    height       = 10) {
76
77
  # check data
78
  comsam <- intersect(moic.res$clust.res$samID, colnames(norm.expr))
79
  if(length(comsam) == nrow(moic.res$clust.res)) {
80
    message("--all samples matched.")
81
  } else {
82
    message(paste0("--",(nrow(moic.res$clust.res)-length(comsam))," samples mismatched from current subtypes."))
83
  }
84
85
  moic.res$clust.res <- moic.res$clust.res[comsam, , drop = FALSE]
86
  norm.expr <- norm.expr[,comsam]
87
88
  n.moic <- length(unique(moic.res$clust.res$clust))
89
  mo.method <- moic.res$mo.method
90
  DEpattern <- paste(mo.method, "_", ifelse(is.null(prefix),"",paste0(prefix,"_")), dea.method, ".*._vs_Others.txt$", sep="")
91
  DEfiles <- dir(dat.path,pattern = DEpattern)
92
93
  if(length(DEfiles) == 0) {stop("no DEfiles!")}
94
  if(length(DEfiles) != n.moic) {stop("not all multi-omics clusters have DEfile!")}
95
  if (!is.element(dirct, c("up", "down"))) {stop( "dirct type error! Allowed value contains c('up', 'down').") }
96
  if (!is.element(gsva.method, c("gsva", "ssgsea", "zscore", "plage"))) {stop( "GSVA method error! Allowed value contains c('gsva', 'ssgsea', 'zscore', 'plage').") }
97
98
  if(dirct == "up") { outlabel <- "unique_upexpr_pathway.txt" }
99
  if(dirct == "down") { outlabel <- "unique_downexpr_pathway.txt" }
100
101
  gsea.list <- list()
102
  gseaidList <- c()
103
  for (filek in DEfiles) {
104
    DEres <- read.table(file.path(dat.path, filek), header=TRUE, row.names=NULL, sep="\t", quote="", stringsAsFactors=FALSE)
105
    DEres <- DEres[!duplicated(DEres[, 1]),]
106
    DEres <- DEres[!is.na(DEres[, 1]), ]
107
    rownames(DEres) <- DEres[, 1]
108
    DEres <- DEres[, -1]
109
110
    geneList <- DEres$log2fc; names(geneList) <- rownames(DEres)
111
    geneList <- sort(geneList,decreasing = TRUE) # ranked gene set
112
113
    # run gsea
114
    msigdb <- try(clusterProfiler::read.gmt(msigdb.path), silent = TRUE)
115
    if(class(msigdb) == "try-error") {stop("please provide correct ABSOLUTE PATH for MSigDB file.")}
116
117
    moic.lab <- paste0("CS",gsub("_vs_Others.txt","",sub(".*CS", "", filek)))
118
    gsea.list[[moic.lab]] <- suppressWarnings(clusterProfiler::GSEA(geneList     = geneList,
119
                                                                    TERM2GENE    = msigdb,
120
                                                                    nPerm        = nPerm,
121
                                                                    minGSSize    = minGSSize,
122
                                                                    maxGSSize    = maxGSSize,
123
                                                                    seed         = TRUE,
124
                                                                    verbose      = FALSE,
125
                                                                    pvalueCutoff = 1))
126
    gsea.dat <- as.data.frame(gsea.list[[moic.lab]])
127
    write.table(gsea.dat[,setdiff(colnames(gsea.dat),"ID")], file=file.path(res.path, paste(gsub("_vs_Others.txt","", filek, fixed = TRUE), "gsea_all_results.txt", sep = "_")),sep = "\t",row.names = FALSE,col.names = TRUE,quote = FALSE)
128
129
    if(dirct == "up") {
130
      gseaidList <- c(gseaidList, rownames(gsea.dat[which(gsea.dat$NES > 0 & gsea.dat$pvalue < p.cutoff & gsea.dat$p.adjust < p.adj.cutoff),]))
131
    }
132
    if(dirct == "down") {
133
      gseaidList <- c(gseaidList, rownames(gsea.dat[which(gsea.dat$NES < 0 & gsea.dat$pvalue < p.cutoff & gsea.dat$p.adjust < p.adj.cutoff),]))
134
    }
135
    unqlist <- setdiff(gseaidList,gseaidList[duplicated(gseaidList)])
136
  }
137
  message("GSEA done...")
138
139
  GSEApattern <- paste(mo.method, "_", ifelse(is.null(prefix),"",paste0(prefix,"_")), dea.method, ".*._gsea_all_results.txt$", sep="")
140
  GSEAfiles <- dir(res.path,pattern = GSEApattern)
141
142
  pathway <- pathcore <- list()
143
  pathnum <- c()
144
  for (filek in GSEAfiles) {
145
    GSEAres <- read.table(file.path(res.path, filek), header=TRUE, row.names=1, sep="\t", quote="", stringsAsFactors=FALSE)
146
147
    if(dirct == "up") {
148
      outk <- intersect( unqlist, rownames(GSEAres[which(GSEAres$NES > 0 & GSEAres$pvalue < p.cutoff & GSEAres$p.adjust < p.adj.cutoff),]) )
149
      outk <- GSEAres[outk,]
150
      outk <- outk[order(outk$NES, decreasing = TRUE),]
151
152
      if(nrow(outk) > n.path) {
153
        pathway[[filek]] <- outk[1:n.path,]
154
      } else {
155
        pathway[[filek]] <- outk
156
      }
157
      pathnum <- c(pathnum, nrow(pathway[[filek]]))
158
      pathway$dirct <- "up"
159
160
      for (i in rownames(pathway[[filek]])) {
161
        pathcore[[i]] <- msigdb[which(msigdb[,1] %in% i),"gene"]
162
      }
163
    }
164
    if(dirct=="down") {
165
      outk <- intersect( unqlist, rownames(GSEAres[which(GSEAres$NES < 0 & GSEAres$pvalue < p.cutoff & GSEAres$p.adjust < p.adj.cutoff),]) )
166
      outk <- GSEAres[outk,]
167
      outk <- outk[order(outk$NES, decreasing = FALSE),]
168
169
      if(nrow(outk) > n.path) {
170
        pathway[[filek]] <- outk[1:n.path,]
171
      } else {
172
        pathway[[filek]] <- outk
173
      }
174
      pathnum <- c(pathnum, nrow(pathway[[filek]]))
175
      pathway$dirct <- "down"
176
177
      for (i in rownames(pathway[[filek]])) {
178
        pathcore[[i]] <- msigdb[which(msigdb[,1] %in% i),"gene"]
179
      }
180
    }
181
    write.table(outk, file=file.path(res.path, paste(gsub("_gsea_all_results.txt","", filek, fixed = TRUE), outlabel, sep = "_")), row.names = TRUE, col.names = NA, sep = "\t", quote = FALSE)
182
  }
183
184
  # calculate single sample enrichment scores
185
  standarize.fun <- function(indata=NULL, halfwidth=NULL, centerFlag=TRUE, scaleFlag=TRUE) {
186
    outdata=t(scale(t(indata), center=centerFlag, scale=scaleFlag))
187
    if (!is.null(halfwidth)) {
188
      outdata[outdata>halfwidth]=halfwidth
189
      outdata[outdata<(-halfwidth)]= -halfwidth
190
    }
191
    return(outdata)
192
  }
193
194
  rowmean <- function(x) {
195
    return(apply(x, 1, mean))
196
  }
197
198
  rowmedian <- function(x) {
199
    return(apply(x, 1, median))
200
  }
201
202
  if(max(norm.expr) < 25 | (max(norm.expr) >= 25 & min(norm.expr) < 0)) {
203
    message("--expression profile seems to have been standardised (z-score or log transformation), no more action will be performed.")
204
    gset <- norm.expr
205
  }
206
  if(max(norm.expr) >= 25 & min(norm.expr) >= 0){
207
    message("--log2 transformation done for expression data.")
208
    gset <- log2(norm.expr + 1)
209
  }
210
211
  sam.order <- moic.res$clust.res[order(moic.res$clust.res$clust, decreasing = FALSE), "samID"]
212
  colvec <- clust.col[1:n.moic]
213
  names(colvec) <- paste0("CS",1:n.moic)
214
  annCol <- data.frame("Subtype" = paste0("CS",moic.res$clust.res[sam.order,"clust"]),
215
                       row.names = sam.order,
216
                       stringsAsFactors = FALSE)
217
  annColors <- list("Subtype" = colvec)
218
219
  es <- GSVA::gsva(expr          = as.matrix(gset[,rownames(annCol), drop = FALSE]),
220
                   gset.idx.list = pathcore,
221
                   method        = gsva.method,
222
                   parallel.sz   = 1)
223
  es.backup <- es
224
  es <- standarize.fun(es, halfwidth = 1, centerFlag = TRUE, scaleFlag = TRUE)
225
  message(gsva.method," done...")
226
227
  # calculate subtype-specific pathway enrichment score
228
  esm <- data.frame(row.names = rownames(es))
229
  if(norm.method == "mean") {
230
    for (i in paste0("CS",1:n.moic)) {
231
      esm <- cbind.data.frame(esm,
232
                              data.frame(rowmean(es[,rownames(annCol[which(annCol$Subtype == i), , drop = FALSE])])))
233
    }
234
  }
235
236
  if(norm.method == "median") {
237
    for (i in paste0("CS",1:n.moic)) {
238
      esm <- cbind.data.frame(esm,
239
                              data.frame(rowmedian(es[,rownames(annCol[which(annCol$Subtype == i), , drop = FALSE])])))
240
    }
241
  }
242
243
  colnames(esm) <- paste0("CS",1:n.moic)
244
245
  # generate heatmap
246
  annRow <- data.frame(Subtype = rep(paste0("CS",1:n.moic), pathnum),
247
                       row.names = rownames(esm),
248
                       stringsAsFactors = FALSE)
249
250
  if(is.null(color)) {
251
    mapcolor <- grDevices::colorRampPalette(c("#0000FF", "#8080FF", "#FFFFFF", "#FF8080", "#FF0000"))(64)
252
  } else {mapcolor <- grDevices::colorRampPalette(color)(64)}
253
  hm <- ComplexHeatmap::pheatmap(mat                  = as.matrix(esm),
254
                                 cluster_rows         = FALSE,
255
                                 cluster_cols         = FALSE,
256
                                 show_rownames        = TRUE,
257
                                 show_colnames        = TRUE,
258
                                 annotation_row       = annRow,
259
                                 annotation_colors    = annColors,
260
                                 annotation_names_row = FALSE,
261
                                 legend               = TRUE,
262
                                 color                = mapcolor,
263
                                 border_color         = "black",
264
                                 legend_breaks        = c(-1,-0.5,0,0.5,1),
265
                                 legend_labels        = c(-1,-0.5,0,0.5,1),
266
                                 cellwidth            = 15,
267
                                 cellheight           = 10)
268
  # print to screen
269
  draw(hm, annotation_legend_side = "left",heatmap_legend_side = "left")
270
271
  # save to pdf
272
  if(is.null(fig.name)) {
273
    outFig <- paste0("gseaheatmap_using_",dirct,"regulated_pathways.pdf")
274
  } else {
275
    outFig <- paste0(fig.name, "_using_", dirct,"regulated_pathways.pdf")
276
  }
277
  pdf(file.path(fig.path,outFig), width = width, height = height)
278
  draw(hm, annotation_legend_side = "left",heatmap_legend_side = "left")
279
  invisible(dev.off())
280
281
  message("heatmap done...")
282
283
  return(list(gsea.list = gsea.list, raw.es = es.backup, scaled.es = es, grouped.es = esm, heatmap = hm))
284
}