Diff of /R/compFGA.R [000000] .. [494cbf]

Switch to unified view

a b/R/compFGA.R
1
#' @name compFGA
2
#' @title Comparison of fraction genome altered
3
#' @description This function calculates Fraction Genome Altered (FGA), Fraction Genome Gained (FGG), and Fraction Genome Lost (FGL) seperately, and compares them among curent subtypes identified from multi-omics integrative clustering algorithms.
4
#' @param moic.res An object returned by `getMOIC()` with one specified algorithm or `get\%algorithm_name\%` or `getConsensusMOIC()` with a list of multiple algorithms.
5
#' @param segment A data frame containing segmented copy number and columns must exactly include the following elements: c('sample','chrom','start','end','value'). Column of `value` should be segments value when \code{iscopynumber = FALSE} but copy-number value when \code{iscopynumber = TRUE}. Copy-number will be converted to segments by log2(copy-number/2).
6
#' @param iscopynumber A logical value to indicate if the fifth column of segment input is copy-number. If segment file derived from CNV calling provides copy number instead of segment_mean value, this argument must be switched to TRUE. FALSE by default.
7
#' @param cnathreshold A numeric value to indicate the cutoff for identifying copy-number gain or loss. 0.2 by default.
8
#' @param test.method A string value to indicate the method for statistical testing. Allowed values contain c('nonparametric', 'parametric'); nonparametric means two-sample wilcoxon rank sum test for two subtypes and Kruskal-Wallis rank sum test for multiple subtypes; parametric means two-sample t-test when only two subtypes are identified, and anova for multiple subtypes comparison; "nonparametric" by default.
9
#' @param barcolor A string vector to indicate the mapping color for bars of FGA, FGG and FGL.
10
#' @param clust.col A string vector storing colors for each subtype.
11
#' @param fig.path A string value to indicate the output path for storing the barplot.
12
#' @param fig.name A string value to indicate the name of the barplot.
13
#' @param width A numeric value to indicate the width of barplot.
14
#' @param height A numeric value to indicate the height of barplot.
15
#' @return A list contains the following components:
16
#'
17
#'         \code{summary}           a table summarizing the measurements of FGA, FGG, and FGL per sample
18
#'
19
#'         \code{FGA.p.value}       a nominal p value quantifying the difference of FGA among current subtypes
20
#'
21
#'         \code{pairwise.FGA.test} a pairwise BH adjusted p value matrix for multiple comparisons of FGA if more than 2 subtypes were identified
22
#'
23
#'         \code{FGG.p.value}       a nominal p value quantifying the difference of FGG among current subtypes
24
#'
25
#'         \code{pairwise.FGG.test} a pairwise BH adjusted p value matrix for multiple comparisons of FGG if more than 2 subtypes were identified
26
#'
27
#'         \code{FGL.p.value}       a nominal p value quantifying the difference of FGL among current subtypes
28
#'
29
#'         \code{pairwise.FGL.test} a pairwise BH adjusted p value matrix for multiple comparisons of FGL if more than 2 subtypes were identified
30
#'
31
#'         \code{test.method}       a string value indicating the statistical testing method to calculate p values
32
#'
33
#' @export
34
#' @import ggplot2
35
#' @import patchwork
36
#' @importFrom dplyr group_by summarize %>%
37
#' @importFrom ggpubr stat_compare_means
38
#' @examples # There is no example and please refer to vignette.
39
#' @references Cerami E, Gao J, Dogrusoz U, et al. (2012). The cBio Cancer Genomics Portal: An Open Platform for Exploring Multidimensional Cancer Genomics Data. Cancer Discov, 2(5):401-404.
40
#'
41
#' Gao J, Aksoy B A, Dogrusoz U, et al. (2013). Integrative analysis of complex cancer genomics and clinical profiles using the cBioPortal. Sci Signal, 6(269):pl1-pl1.
42
compFGA <- function(moic.res     = NULL,
43
                    segment      = NULL,
44
                    iscopynumber = FALSE,
45
                    cnathreshold = 0.2,
46
                    test.method  = "nonparametric",
47
                    barcolor     = c("#008B8A", "#F2042C", "#21498D"),
48
                    clust.col    = c("#2EC4B6","#E71D36","#FF9F1C","#BDD5EA","#FFA5AB","#011627","#023E8A","#9D4EDD"),
49
                    fig.path     = getwd(),
50
                    fig.name     = NULL,
51
                    width        = 8,
52
                    height       = 4) {
53
54
  # check arguments
55
  if(!all(is.element(c("sample","chrom","start","end","value"), colnames(segment)))) {
56
    stop("segment data must have the following columns: sample, chrom, start, end, value.")
57
  }
58
59
  if(iscopynumber) {
60
    segment$value <- log2(segment$value/2) # convert copy-number to segment-mean value
61
  }
62
63
  comsam <- intersect(moic.res$clust.res$samID, unique(segment[,1]))
64
  # check data
65
  if(length(comsam) == nrow(moic.res$clust.res)) {
66
    message("--all samples matched.")
67
  } else {
68
    message(paste0("--",(nrow(moic.res$clust.res)-length(comsam))," samples mismatched from current subtypes."))
69
  }
70
71
  if(!is.element(test.method, c("nonparametric","parametric"))) {
72
    stop("test.method can be one of nonparametric or parametric.")
73
  }
74
75
  clust.res <- moic.res$clust.res[comsam, , drop = FALSE]
76
  segment <- segment[which(segment$sample %in% comsam),]
77
  n.moic <- length(unique(clust.res$clust))
78
79
  # data process
80
  segment$bases <- segment$end - segment$start
81
82
  # calculate FGA, FGG and FGL
83
  display.progress = function (index, totalN, breakN=20) {
84
    if ( index %% ceiling(totalN/breakN)  == 0  ) {
85
      cat(paste(round(index*100/totalN), "% ", sep = ""))
86
    }
87
  }
88
  std <- function(x, na.rm = TRUE) {
89
    if(na.rm) {
90
      x <- as.numeric(na.omit(x))
91
      sd(x)/sqrt(length(x))
92
    } else {sd(x)/sqrt(length(x))}
93
  }
94
95
  outTab <- data.frame()
96
  for (i in 1:length(unique(segment$sample))) {
97
    display.progress(index = i, totalN = length(unique(segment$sample)))
98
99
    tmp <- segment[segment$sample == names(table(segment$sample))[i],]
100
101
    if (length(tmp[abs(tmp$value) > cnathreshold,"bases"][6]) == 0) {
102
      FGA = 0
103
    } else {
104
      FGA = sum(tmp[abs(tmp$value) > cnathreshold,"bases"]) / sum(tmp[,"bases"])
105
    }
106
107
    if (length(tmp[tmp$value > cnathreshold,"bases"][6]) == 0) {
108
      FGG = 0
109
    } else {
110
      FGG = sum(tmp[tmp$value > cnathreshold,"bases"]) / sum(tmp[,"bases"])
111
    }
112
113
    if (length(tmp[tmp$value < (-cnathreshold),"bases"][6]) == 0) {
114
      FGL = 0
115
    } else {
116
      FGL = sum(tmp[tmp$value < (-cnathreshold),"bases"]) / sum(tmp[,"bases"])
117
    }
118
119
    tmp <- data.frame(samID            = names(table(segment$sample))[i],
120
                      FGA              = FGA,
121
                      FGG              = FGG,
122
                      FGL              = FGL,
123
                      stringsAsFactors = FALSE)
124
    outTab <- rbind.data.frame(outTab, tmp, stringsAsFactors = FALSE)
125
  }
126
  outTab$Subtype <- paste0("CS", clust.res[outTab$samID, "clust"])
127
128
  # calculate mean and se
129
  summaryFGA  <- outTab %>% group_by(Subtype) %>% dplyr::summarize(mean = mean(FGA, na.rm = TRUE), se = std(FGA, na.rm = TRUE))
130
  summaryFGG  <- outTab %>% group_by(Subtype) %>% dplyr::summarize(mean = mean(FGG, na.rm = TRUE), se = std(FGG, na.rm = TRUE))
131
  summaryFGL  <- outTab %>% group_by(Subtype) %>% dplyr::summarize(mean = mean(FGL, na.rm = TRUE), se = std(FGL, na.rm = TRUE))
132
  summaryFGGL <- data.frame(rbind.data.frame(summaryFGG,summaryFGL),class = rep(c("FGG","FGL"),c(nrow(summaryFGG),nrow(summaryFGL))),stringsAsFactors = FALSE)
133
134
  # statistical testing
135
  # generate boxviolin plot with statistical testing
136
  if(n.moic == 2 & test.method == "nonparametric") {
137
    statistic <- "wilcox.test"
138
    FGA.test  <- wilcox.test(outTab$FGA ~ outTab$Subtype)$p.value
139
    FGG.test  <- wilcox.test(outTab$FGG ~ outTab$Subtype)$p.value
140
    FGL.test  <- wilcox.test(outTab$FGL ~ outTab$Subtype)$p.value
141
  }
142
143
  if(n.moic == 2 & test.method == "parametric") {
144
    statistic <- "t.test"
145
    FGA.test  <- t.test(outTab$FGA ~ outTab$Subtype)$p.value
146
    FGG.test  <- t.test(outTab$FGG ~ outTab$Subtype)$p.value
147
    FGL.test  <- t.test(outTab$FGL ~ outTab$Subtype)$p.value
148
  }
149
150
  if(n.moic > 2 & test.method == "nonparametric") {
151
    statistic <- "kruskal.test"
152
    FGA.test  <- kruskal.test(outTab$FGA ~ outTab$Subtype)$p.value
153
    FGG.test  <- kruskal.test(outTab$FGG ~ outTab$Subtype)$p.value
154
    FGL.test  <- kruskal.test(outTab$FGL ~ outTab$Subtype)$p.value
155
    pairwise.FGA.test <- pairwise.wilcox.test(outTab$FGA,outTab$Subtype,p.adjust.method = "BH")
156
    pairwise.FGG.test <- pairwise.wilcox.test(outTab$FGG,outTab$Subtype,p.adjust.method = "BH")
157
    pairwise.FGL.test <- pairwise.wilcox.test(outTab$FGL,outTab$Subtype,p.adjust.method = "BH")
158
  }
159
160
  if(n.moic > 2 & test.method == "parametric") {
161
    statistic <- "anova"
162
    FGA.test  <- summary(aov(outTab$FGA ~ outTab$Subtype))[[1]][["Pr(>F)"]][1]
163
    FGG.test  <- summary(aov(outTab$FGG ~ outTab$Subtype))[[1]][["Pr(>F)"]][1]
164
    FGL.test  <- summary(aov(outTab$FGL ~ outTab$Subtype))[[1]][["Pr(>F)"]][1]
165
    pairwise.FGA.test <- pairwise.t.test(outTab$FGA,outTab$Subtype,p.adjust.method = "BH")
166
    pairwise.FGG.test <- pairwise.t.test(outTab$FGG,outTab$Subtype,p.adjust.method = "BH")
167
    pairwise.FGL.test <- pairwise.t.test(outTab$FGL,outTab$Subtype,p.adjust.method = "BH")
168
  }
169
170
  # generate barplot
171
  FGA.col <- barcolor[1]
172
  FGG.col <- barcolor[2]
173
  FGL.col <- barcolor[3]
174
175
  p1 <- ggplot(summaryFGA, aes(x = Subtype, y = mean,fill=rep("0",nrow(summaryFGA)))) +
176
    geom_bar(stat = 'identity') +
177
    geom_errorbar(aes(ymax = mean+se, ymin = mean-se),position = position_dodge(0.9), width = 0.15) +
178
    annotate(geom="text",
179
             #hjust = ifelse(n.moic %% 2 == 0, 0.5, 0),
180
             x = n.moic / 2 + 0.5,
181
             #y = as.numeric(summaryFGA[which.max(summaryFGA$mean),"mean"] + summaryFGA[which.max(summaryFGA$mean),"se"]),
182
             y = as.numeric(summaryFGA[which.max(summaryFGA$mean),"mean"]),
183
             size = 8, angle = 90, fontface = "bold",
184
             #label = paste0(statistic, " p = ", formatC(FGA.test, digits = 1, format = "e")),
185
             label = cut(FGA.test,c(0,0.001,0.01,0.05,0.1,1),labels = c("****","***","**","*","."))) +
186
    scale_x_discrete(name = "",position = "top") +
187
    theme_bw() +
188
    theme(axis.line.y = element_line(size = 0.8),
189
          axis.ticks.y = element_line(size = 0.2),
190
          axis.text.y = element_blank(),
191
          axis.title.x = element_text(vjust = -0.3,size = 12),
192
          axis.text.x = element_text(size = 10, color = "black"),
193
          plot.margin = unit(c(0.3, -1.7, 0.3, 0.3), "lines"),
194
          legend.title = element_blank()) +
195
    coord_flip() +
196
    scale_fill_manual(values  = FGA.col, breaks = c("0"), labels = c("Copy number-altered genome")) +
197
    scale_y_reverse(expand = c(0.01,0),
198
                    name = "FGA (Fraction of Genome Altered)", position = "left")
199
200
  p2 <- ggplot(summaryFGGL, aes(x = Subtype, y = ifelse(class == 'FGG',mean,-mean),fill=class)) +
201
    geom_bar(stat = 'identity') +
202
    geom_errorbar(data=summaryFGGL[summaryFGGL$class=='FGG',],aes(ymax = mean+se, ymin =mean-se),position = position_dodge(0.9), width = 0.15) +
203
    geom_errorbar(data=summaryFGGL[summaryFGGL$class=='FGL',],aes(ymax = -mean-se, ymin =-mean+se),position = position_dodge(0.9), width = 0.15) +
204
    annotate(geom="text",
205
206
             x = n.moic / 2 + 0.5,
207
             y = -as.numeric(summaryFGL[which.max(summaryFGL$mean),"mean"]),
208
             size = 8, angle = 90, fontface = "bold",
209
             label = cut(FGL.test,c(0,0.001,0.01,0.05,0.1,1),labels = c("****","***","**","*","."))) +
210
    annotate(geom="text",
211
             x = n.moic / 2 + 0.5,
212
             y = as.numeric(summaryFGG[which.max(summaryFGG$mean),"mean"]),
213
             size = 8, angle = 90, fontface = "bold",
214
             label = cut(FGG.test,c(0,0.001,0.01,0.05,0.1,1),labels = c("****","***","**","*","."))) +
215
    scale_x_discrete(name = "") +
216
    theme_bw() +
217
    theme(axis.line.y = element_line(size = 0.8),
218
          axis.ticks.y = element_line(size = 0.2),
219
          axis.text.y = element_blank(),
220
          axis.title.x = element_text(vjust = -0.3, size = 12),
221
          axis.text.x = element_text(size = 10, color = "black"),
222
          plot.margin = unit(c(0.3, 0.3, 0.3, -1), "lines"),
223
          legend.title = element_blank()) +
224
    coord_flip() +
225
    scale_fill_manual(values  = c(FGL.col, FGG.col), breaks = c("FGL","FGG"),
226
                      labels = c("Copy number-lost genome","Copy number-gained genome")) +
227
    scale_y_continuous(expand = c(0.01,0),
228
                       name = "FGL or FGG (Fraction of Genome Lost or Gained)")
229
230
  pp <- ggplot() +
231
    # geom_text(data = summaryFGGL,
232
    #           aes(label = Subtype, x=Subtype), y = 0.5,
233
    #           size = 0.8*11/.pt, # match font size to theme
234
    #           hjust = 0.5, vjust = 0.5) +
235
    geom_label(data = summaryFGGL,
236
               aes(label = Subtype, x = Subtype, fill = Subtype),
237
               y = 0.5,
238
               color = "white",
239
               size = 0.9*11/.pt, # match font size to theme
240
               hjust = 0.4, vjust = 0.5) +
241
    scale_fill_manual(values = clust.col) +
242
    theme_minimal()+
243
    theme(axis.line.y =element_blank(),
244
          axis.ticks.y =element_blank(),
245
          axis.text.y =element_blank(),
246
          axis.title.y =element_blank(),
247
          axis.title.x =element_blank(),
248
          plot.margin = unit(c(0.3, 0, 0.3, 0), "lines")
249
    ) +
250
    guides(fill = FALSE) +
251
    coord_flip() +
252
    scale_y_reverse()
253
254
  pal <- p1 + pp + p2 +
255
    plot_layout(widths = c(7,1,7), guides = 'collect') & theme(legend.position = 'top')
256
257
  # save to pdf
258
  if(is.null(fig.name)) {
259
    outFig <- "barplot of FGA.pdf"
260
  } else {
261
    outFig <- paste0(fig.name,".pdf")
262
  }
263
  ggsave(file.path(fig.path, outFig), width = width, height = height)
264
265
  # print to screen
266
  print(pal)
267
268
  if(n.moic > 2) {
269
    return(list(summary = outTab,
270
                FGA.p.value = FGA.test,
271
                pairwise.FGA.test = pairwise.FGA.test,
272
                FGG.p.value = FGG.test,
273
                pairwise.FGG.test = pairwise.FGG.test,
274
                FGL.p.value = FGL.test,
275
                pairwise.FGL.test = pairwise.FGL.test,
276
                test.method = statistic))
277
  } else {
278
    return(list(summary = outTab,
279
                FGA.p.value = FGA.test,
280
                FGG.p.value = FGG.test,
281
                FGL.p.value = FGL.test,
282
                test.method = statistic))
283
  }
284
}