Diff of /R/runGSVA.R [000000] .. [494cbf]

Switch to unified view

a b/R/runGSVA.R
1
#' @name runGSVA
2
#' @title Run gene set variation analysis
3
#' @description Use gene set variation analysis to calculate enrichment score of each sample in each subtype based on given gene set list of interest.
4
#' @param moic.res An object returned by `getMOIC()` with one specified algorithm or `get\%algorithm_name\%` or `getConsensusMOIC()` with a list of multiple algorithms.
5
#' @param norm.expr A matrix of normalized expression data with rows for genes and columns for samples; FPKM or TPM without log2 transformation is recommended.
6
#' @param gset.gmt.path A string value to indicate ABSOULUTE PATH/NAME of gene sets of interest stored as GMT format \url{https://software.broadinstitute.org/cancer/software/gsea/wiki/index.php/Data_formats#GMT:_Gene_Matrix_Transposed_file_format_.28.2A.gmt.29}.
7
#' @param annCol A data.frame storing annotation information for samples.
8
#' @param annColors A list of string vectors for colors matched with annCol.
9
#' @param clust.col A string vector storing colors for annotating each subtype at the top of heatmap.
10
#' @param halfwidth A numeric value to assign marginal cutoff for truncating enrichment scores; 1 by default.
11
#' @param centerFlag A logical vector to indicate if enrichment scores should be centered; TRUE by default.
12
#' @param scaleFlag A logical vector to indicate if enrichment scores should be scaled; TRUE by default.
13
#' @param distance A string value of distance measurement for hierarchical clustering; 'euclidean' by default.
14
#' @param linkage A string value of clustering method for hierarchical clustering; 'ward.D' by default.
15
#' @param show_rownames A logic value to indicate if showing rownames (feature names) in heatmap; TRUE by default.
16
#' @param show_colnames A logic value to indicate if showing colnames (sample ID) in heatmap; FALSE by default.
17
#' @param color A string vector storing colors for heatmap.
18
#' @param fig.path A string value to indicate the output path for storing the enrichment heatmap.
19
#' @param fig.name A string value to indicate the name of the enrichment heatmap.
20
#' @param width A numeric value to indicate the width of output figure.
21
#' @param height A numeric value to indicate the height of output figure.
22
#' @param ... Additional parameters pass to \link[ComplexHeatmap]{pheatmap}.
23
#'
24
#' @return A figure of enrichment heatmap (.pdf) and a list with the following components:
25
#'
26
#'         \code{gset.list}  a list storing gene sets information converted from GMT format by \link[clusterProfiler]{read.gmt}.
27
#'
28
#'         \code{raw.es}     a data.frame storing raw enrichment score based on given gene sets of interest by using specified \code{gsva.method}.
29
#'
30
#'         \code{scaled.es}  a data.frame storing z-scored enrichment score based on given gene sets of interest by using specified \code{gsva.method}.
31
#'
32
#' @export
33
#' @importFrom ClassDiscovery distanceMatrix
34
#' @importFrom clusterProfiler read.gmt
35
#' @importFrom GSVA gsva
36
#' @importFrom ComplexHeatmap pheatmap draw ht_opt
37
#' @importFrom grDevices pdf dev.off colorRampPalette
38
#' @references Barbie, D.A. et al. (2009). Systematic RNA interference reveals that oncogenic KRAS-driven cancers require TBK1. Nature, 462(5):108-112.
39
#'
40
#' Hänzelmann, S., Castelo, R. and Guinney, J. (2013). GSVA: Gene set variation analysis for microarray and RNA-Seq data. BMC Bioinformatics, 14(1):7.
41
#'
42
#' Lee, E. et al. (2008). Inferring pathway activity toward precise disease classification. PLoS Comp Biol, 4(11):e1000217.
43
#'
44
#' Tomfohr, J. et al. (2005). Pathway level analysis of gene expression using singular value decomposition. BMC Bioinformatics, 6(1):1-11.
45
#'
46
#' Yu G, Wang L, Han Y, He Q (2012). clusterProfiler: an R package for comparing biological themes among gene clusters. OMICS, 16(5):284-287.
47
#' @examples # There is no example and please refer to vignette.
48
runGSVA <- function(moic.res      = NULL,
49
                    norm.expr     = NULL,
50
                    gset.gmt.path = NULL,
51
                    gsva.method   = "gsva",
52
                    centerFlag    = TRUE,
53
                    scaleFlag     = TRUE,
54
                    halfwidth     = 1,
55
                    annCol        = NULL,
56
                    annColors     = NULL,
57
                    clust.col     = c("#2EC4B6","#E71D36","#FF9F1C","#BDD5EA","#FFA5AB","#011627","#023E8A","#9D4EDD"),
58
                    distance      = "euclidean",
59
                    linkage       = "ward.D",
60
                    show_rownames = TRUE,
61
                    show_colnames = FALSE,
62
                    color         = c("#366A9B", "#4E98DE", "#DDDDDD", "#FBCFA7", "#F79C4A"),
63
                    fig.path      = getwd(),
64
                    fig.name      = NULL,
65
                    width         = 8,
66
                    height        = 8,
67
                    ...) {
68
  
69
  # standardize function
70
  standarize.fun <- function(indata=NULL, halfwidth=NULL, centerFlag=TRUE, scaleFlag=TRUE) {
71
    outdata=t(scale(t(indata), center=centerFlag, scale=scaleFlag))
72
    if (!is.null(halfwidth)) {
73
      outdata[outdata>halfwidth]=halfwidth
74
      outdata[outdata<(-halfwidth)]= -halfwidth
75
    }
76
    return(outdata)
77
  }
78
  
79
  # check data
80
  comsam <- intersect(moic.res$clust.res$samID, colnames(norm.expr))
81
  if(length(comsam) == nrow(moic.res$clust.res)) {
82
    message("--all samples matched.")
83
  } else {
84
    message(paste0("--",(nrow(moic.res$clust.res)-length(comsam))," samples mismatched from current subtypes."))
85
  }
86
  
87
  moic.res$clust.res <- moic.res$clust.res[comsam, , drop = FALSE]
88
  norm.expr <- norm.expr[,comsam]
89
  n.moic <- length(unique(moic.res$clust.res$clust))
90
  
91
  # load gene set data and convert gmt to data.frame
92
  gset <- try(clusterProfiler::read.gmt(gset.gmt.path), silent = TRUE)
93
  if(class(gset) == "try-error") {stop("please provide correct ABSOLUTE PATH for gene sets of interest.")}
94
  
95
  # convert data.frame to list
96
  term <- unique(gset[,1])
97
  gset.list <- list()
98
  for (i in term) {
99
    gset.list[[i]] <- gset[which(gset[,1] == i),2]
100
  }
101
  
102
  # calculate gene set enrichment scores
103
  if(max(norm.expr) < 25 | (max(norm.expr) >= 25 & min(norm.expr) < 0)) {
104
    message("--expression profile seems to have been standardised (z-score or log transformation), no more action will be performed.")
105
  }
106
  if(max(norm.expr) >= 25 & min(norm.expr) >= 0){
107
    message("--log2 transformation done for expression data.")
108
    norm.expr <- log2(norm.expr + 1)
109
  }
110
  
111
  es <- GSVA::gsva(expr          = as.matrix(norm.expr),
112
                   gset.idx.list = gset.list,
113
                   method        = gsva.method,
114
                   parallel.sz   = 1)
115
  es.backup <- es
116
  es <- standarize.fun(es, halfwidth = halfwidth, centerFlag = centerFlag, scaleFlag = scaleFlag)
117
  message(gsva.method," done...")
118
  
119
  if(is.null(fig.name)) {
120
    outFig <- paste0("enrichment_heatmap_using_", gsva.method, ".pdf")
121
  } else {
122
    outFig <- paste0(fig.name, "_", gsva.method, ".pdf")
123
  }
124
  
125
  sam.order <- moic.res$clust.res[order(moic.res$clust.res$clust, decreasing = FALSE), "samID"]
126
  colvec <- clust.col[1:n.moic]
127
  names(colvec) <- paste0("CS",1:n.moic)
128
  if(!is.null(annCol) & !is.null(annColors)) {
129
    annCol <- annCol[sam.order, , drop = FALSE]
130
    annCol$Subtype <- paste0("CS",moic.res$clust.res[sam.order,"clust"])
131
    annColors[["Subtype"]] <- colvec
132
  } else {
133
    annCol <- data.frame("Subtype" = paste0("CS",moic.res$clust.res[sam.order,"clust"]),
134
                         row.names = sam.order,
135
                         stringsAsFactors = FALSE)
136
    annColors <- list("Subtype" = colvec)
137
  }
138
  
139
  if(!is.null(annCol) & !is.null(annColors)) {
140
    for (i in names(annColors)) {
141
      if(is.function(annColors[[i]])) {
142
        annColors[[i]] <- annColors[[i]](pretty(range(annCol[,i]),n = 64)) # transformat colorRamp2 function to color vector
143
      }
144
    }
145
  }
146
  
147
  ht_opt$message = FALSE
148
  if(is.null(distance) | is.null(linkage)) {
149
    hcg <- FALSE
150
  } else {
151
    hcg <- hclust(ClassDiscovery::distanceMatrix(t(as.matrix(es[,sam.order])), distance), linkage)
152
  }
153
  hm <- ComplexHeatmap::pheatmap(mat               = es[,sam.order],
154
                                 border_color      = NA,
155
                                 cluster_cols      = FALSE,
156
                                 cluster_rows      = hcg,
157
                                 annotation_col    = annCol,
158
                                 annotation_colors = annColors,
159
                                 show_rownames     = show_rownames,
160
                                 show_colnames     = show_colnames,
161
                                 color             = grDevices::colorRampPalette(color)(64),
162
                                 ...)
163
  
164
  # save to pdf
165
  pdf(file.path(fig.path, outFig), width = width, height = height)
166
  draw(hm)
167
  invisible(dev.off())
168
  
169
  # print to screen
170
  draw(hm)
171
  
172
  return(list(gset.list = gset.list, raw.es = es.backup, scaled.es = es))
173
}