|
a |
|
b/R/compFGA.R |
|
|
1 |
#' @name compFGA |
|
|
2 |
#' @title Comparison of fraction genome altered |
|
|
3 |
#' @description This function calculates Fraction Genome Altered (FGA), Fraction Genome Gained (FGG), and Fraction Genome Lost (FGL) seperately, and compares them among curent subtypes identified from multi-omics integrative clustering algorithms. |
|
|
4 |
#' @param moic.res An object returned by `getMOIC()` with one specified algorithm or `get\%algorithm_name\%` or `getConsensusMOIC()` with a list of multiple algorithms. |
|
|
5 |
#' @param segment A data frame containing segmented copy number and columns must exactly include the following elements: c('sample','chrom','start','end','value'). Column of `value` should be segments value when \code{iscopynumber = FALSE} but copy-number value when \code{iscopynumber = TRUE}. Copy-number will be converted to segments by log2(copy-number/2). |
|
|
6 |
#' @param iscopynumber A logical value to indicate if the fifth column of segment input is copy-number. If segment file derived from CNV calling provides copy number instead of segment_mean value, this argument must be switched to TRUE. FALSE by default. |
|
|
7 |
#' @param cnathreshold A numeric value to indicate the cutoff for identifying copy-number gain or loss. 0.2 by default. |
|
|
8 |
#' @param test.method A string value to indicate the method for statistical testing. Allowed values contain c('nonparametric', 'parametric'); nonparametric means two-sample wilcoxon rank sum test for two subtypes and Kruskal-Wallis rank sum test for multiple subtypes; parametric means two-sample t-test when only two subtypes are identified, and anova for multiple subtypes comparison; "nonparametric" by default. |
|
|
9 |
#' @param barcolor A string vector to indicate the mapping color for bars of FGA, FGG and FGL. |
|
|
10 |
#' @param clust.col A string vector storing colors for each subtype. |
|
|
11 |
#' @param fig.path A string value to indicate the output path for storing the barplot. |
|
|
12 |
#' @param fig.name A string value to indicate the name of the barplot. |
|
|
13 |
#' @param width A numeric value to indicate the width of barplot. |
|
|
14 |
#' @param height A numeric value to indicate the height of barplot. |
|
|
15 |
#' @return A list contains the following components: |
|
|
16 |
#' |
|
|
17 |
#' \code{summary} a table summarizing the measurements of FGA, FGG, and FGL per sample |
|
|
18 |
#' |
|
|
19 |
#' \code{FGA.p.value} a nominal p value quantifying the difference of FGA among current subtypes |
|
|
20 |
#' |
|
|
21 |
#' \code{pairwise.FGA.test} a pairwise BH adjusted p value matrix for multiple comparisons of FGA if more than 2 subtypes were identified |
|
|
22 |
#' |
|
|
23 |
#' \code{FGG.p.value} a nominal p value quantifying the difference of FGG among current subtypes |
|
|
24 |
#' |
|
|
25 |
#' \code{pairwise.FGG.test} a pairwise BH adjusted p value matrix for multiple comparisons of FGG if more than 2 subtypes were identified |
|
|
26 |
#' |
|
|
27 |
#' \code{FGL.p.value} a nominal p value quantifying the difference of FGL among current subtypes |
|
|
28 |
#' |
|
|
29 |
#' \code{pairwise.FGL.test} a pairwise BH adjusted p value matrix for multiple comparisons of FGL if more than 2 subtypes were identified |
|
|
30 |
#' |
|
|
31 |
#' \code{test.method} a string value indicating the statistical testing method to calculate p values |
|
|
32 |
#' |
|
|
33 |
#' @export |
|
|
34 |
#' @import ggplot2 |
|
|
35 |
#' @import patchwork |
|
|
36 |
#' @importFrom dplyr group_by summarize %>% |
|
|
37 |
#' @importFrom ggpubr stat_compare_means |
|
|
38 |
#' @examples # There is no example and please refer to vignette. |
|
|
39 |
#' @references Cerami E, Gao J, Dogrusoz U, et al. (2012). The cBio Cancer Genomics Portal: An Open Platform for Exploring Multidimensional Cancer Genomics Data. Cancer Discov, 2(5):401-404. |
|
|
40 |
#' |
|
|
41 |
#' Gao J, Aksoy B A, Dogrusoz U, et al. (2013). Integrative analysis of complex cancer genomics and clinical profiles using the cBioPortal. Sci Signal, 6(269):pl1-pl1. |
|
|
42 |
compFGA <- function(moic.res = NULL, |
|
|
43 |
segment = NULL, |
|
|
44 |
iscopynumber = FALSE, |
|
|
45 |
cnathreshold = 0.2, |
|
|
46 |
test.method = "nonparametric", |
|
|
47 |
barcolor = c("#008B8A", "#F2042C", "#21498D"), |
|
|
48 |
clust.col = c("#2EC4B6","#E71D36","#FF9F1C","#BDD5EA","#FFA5AB","#011627","#023E8A","#9D4EDD"), |
|
|
49 |
fig.path = getwd(), |
|
|
50 |
fig.name = NULL, |
|
|
51 |
width = 8, |
|
|
52 |
height = 4) { |
|
|
53 |
|
|
|
54 |
# check arguments |
|
|
55 |
if(!all(is.element(c("sample","chrom","start","end","value"), colnames(segment)))) { |
|
|
56 |
stop("segment data must have the following columns: sample, chrom, start, end, value.") |
|
|
57 |
} |
|
|
58 |
|
|
|
59 |
if(iscopynumber) { |
|
|
60 |
segment$value <- log2(segment$value/2) # convert copy-number to segment-mean value |
|
|
61 |
} |
|
|
62 |
|
|
|
63 |
comsam <- intersect(moic.res$clust.res$samID, unique(segment[,1])) |
|
|
64 |
# check data |
|
|
65 |
if(length(comsam) == nrow(moic.res$clust.res)) { |
|
|
66 |
message("--all samples matched.") |
|
|
67 |
} else { |
|
|
68 |
message(paste0("--",(nrow(moic.res$clust.res)-length(comsam))," samples mismatched from current subtypes.")) |
|
|
69 |
} |
|
|
70 |
|
|
|
71 |
if(!is.element(test.method, c("nonparametric","parametric"))) { |
|
|
72 |
stop("test.method can be one of nonparametric or parametric.") |
|
|
73 |
} |
|
|
74 |
|
|
|
75 |
clust.res <- moic.res$clust.res[comsam, , drop = FALSE] |
|
|
76 |
segment <- segment[which(segment$sample %in% comsam),] |
|
|
77 |
n.moic <- length(unique(clust.res$clust)) |
|
|
78 |
|
|
|
79 |
# data process |
|
|
80 |
segment$bases <- segment$end - segment$start |
|
|
81 |
|
|
|
82 |
# calculate FGA, FGG and FGL |
|
|
83 |
display.progress = function (index, totalN, breakN=20) { |
|
|
84 |
if ( index %% ceiling(totalN/breakN) == 0 ) { |
|
|
85 |
cat(paste(round(index*100/totalN), "% ", sep = "")) |
|
|
86 |
} |
|
|
87 |
} |
|
|
88 |
std <- function(x, na.rm = TRUE) { |
|
|
89 |
if(na.rm) { |
|
|
90 |
x <- as.numeric(na.omit(x)) |
|
|
91 |
sd(x)/sqrt(length(x)) |
|
|
92 |
} else {sd(x)/sqrt(length(x))} |
|
|
93 |
} |
|
|
94 |
|
|
|
95 |
outTab <- data.frame() |
|
|
96 |
for (i in 1:length(unique(segment$sample))) { |
|
|
97 |
display.progress(index = i, totalN = length(unique(segment$sample))) |
|
|
98 |
|
|
|
99 |
tmp <- segment[segment$sample == names(table(segment$sample))[i],] |
|
|
100 |
|
|
|
101 |
if (length(tmp[abs(tmp$value) > cnathreshold,"bases"][6]) == 0) { |
|
|
102 |
FGA = 0 |
|
|
103 |
} else { |
|
|
104 |
FGA = sum(tmp[abs(tmp$value) > cnathreshold,"bases"]) / sum(tmp[,"bases"]) |
|
|
105 |
} |
|
|
106 |
|
|
|
107 |
if (length(tmp[tmp$value > cnathreshold,"bases"][6]) == 0) { |
|
|
108 |
FGG = 0 |
|
|
109 |
} else { |
|
|
110 |
FGG = sum(tmp[tmp$value > cnathreshold,"bases"]) / sum(tmp[,"bases"]) |
|
|
111 |
} |
|
|
112 |
|
|
|
113 |
if (length(tmp[tmp$value < (-cnathreshold),"bases"][6]) == 0) { |
|
|
114 |
FGL = 0 |
|
|
115 |
} else { |
|
|
116 |
FGL = sum(tmp[tmp$value < (-cnathreshold),"bases"]) / sum(tmp[,"bases"]) |
|
|
117 |
} |
|
|
118 |
|
|
|
119 |
tmp <- data.frame(samID = names(table(segment$sample))[i], |
|
|
120 |
FGA = FGA, |
|
|
121 |
FGG = FGG, |
|
|
122 |
FGL = FGL, |
|
|
123 |
stringsAsFactors = FALSE) |
|
|
124 |
outTab <- rbind.data.frame(outTab, tmp, stringsAsFactors = FALSE) |
|
|
125 |
} |
|
|
126 |
outTab$Subtype <- paste0("CS", clust.res[outTab$samID, "clust"]) |
|
|
127 |
|
|
|
128 |
# calculate mean and se |
|
|
129 |
summaryFGA <- outTab %>% group_by(Subtype) %>% dplyr::summarize(mean = mean(FGA, na.rm = TRUE), se = std(FGA, na.rm = TRUE)) |
|
|
130 |
summaryFGG <- outTab %>% group_by(Subtype) %>% dplyr::summarize(mean = mean(FGG, na.rm = TRUE), se = std(FGG, na.rm = TRUE)) |
|
|
131 |
summaryFGL <- outTab %>% group_by(Subtype) %>% dplyr::summarize(mean = mean(FGL, na.rm = TRUE), se = std(FGL, na.rm = TRUE)) |
|
|
132 |
summaryFGGL <- data.frame(rbind.data.frame(summaryFGG,summaryFGL),class = rep(c("FGG","FGL"),c(nrow(summaryFGG),nrow(summaryFGL))),stringsAsFactors = FALSE) |
|
|
133 |
|
|
|
134 |
# statistical testing |
|
|
135 |
# generate boxviolin plot with statistical testing |
|
|
136 |
if(n.moic == 2 & test.method == "nonparametric") { |
|
|
137 |
statistic <- "wilcox.test" |
|
|
138 |
FGA.test <- wilcox.test(outTab$FGA ~ outTab$Subtype)$p.value |
|
|
139 |
FGG.test <- wilcox.test(outTab$FGG ~ outTab$Subtype)$p.value |
|
|
140 |
FGL.test <- wilcox.test(outTab$FGL ~ outTab$Subtype)$p.value |
|
|
141 |
} |
|
|
142 |
|
|
|
143 |
if(n.moic == 2 & test.method == "parametric") { |
|
|
144 |
statistic <- "t.test" |
|
|
145 |
FGA.test <- t.test(outTab$FGA ~ outTab$Subtype)$p.value |
|
|
146 |
FGG.test <- t.test(outTab$FGG ~ outTab$Subtype)$p.value |
|
|
147 |
FGL.test <- t.test(outTab$FGL ~ outTab$Subtype)$p.value |
|
|
148 |
} |
|
|
149 |
|
|
|
150 |
if(n.moic > 2 & test.method == "nonparametric") { |
|
|
151 |
statistic <- "kruskal.test" |
|
|
152 |
FGA.test <- kruskal.test(outTab$FGA ~ outTab$Subtype)$p.value |
|
|
153 |
FGG.test <- kruskal.test(outTab$FGG ~ outTab$Subtype)$p.value |
|
|
154 |
FGL.test <- kruskal.test(outTab$FGL ~ outTab$Subtype)$p.value |
|
|
155 |
pairwise.FGA.test <- pairwise.wilcox.test(outTab$FGA,outTab$Subtype,p.adjust.method = "BH") |
|
|
156 |
pairwise.FGG.test <- pairwise.wilcox.test(outTab$FGG,outTab$Subtype,p.adjust.method = "BH") |
|
|
157 |
pairwise.FGL.test <- pairwise.wilcox.test(outTab$FGL,outTab$Subtype,p.adjust.method = "BH") |
|
|
158 |
} |
|
|
159 |
|
|
|
160 |
if(n.moic > 2 & test.method == "parametric") { |
|
|
161 |
statistic <- "anova" |
|
|
162 |
FGA.test <- summary(aov(outTab$FGA ~ outTab$Subtype))[[1]][["Pr(>F)"]][1] |
|
|
163 |
FGG.test <- summary(aov(outTab$FGG ~ outTab$Subtype))[[1]][["Pr(>F)"]][1] |
|
|
164 |
FGL.test <- summary(aov(outTab$FGL ~ outTab$Subtype))[[1]][["Pr(>F)"]][1] |
|
|
165 |
pairwise.FGA.test <- pairwise.t.test(outTab$FGA,outTab$Subtype,p.adjust.method = "BH") |
|
|
166 |
pairwise.FGG.test <- pairwise.t.test(outTab$FGG,outTab$Subtype,p.adjust.method = "BH") |
|
|
167 |
pairwise.FGL.test <- pairwise.t.test(outTab$FGL,outTab$Subtype,p.adjust.method = "BH") |
|
|
168 |
} |
|
|
169 |
|
|
|
170 |
# generate barplot |
|
|
171 |
FGA.col <- barcolor[1] |
|
|
172 |
FGG.col <- barcolor[2] |
|
|
173 |
FGL.col <- barcolor[3] |
|
|
174 |
|
|
|
175 |
p1 <- ggplot(summaryFGA, aes(x = Subtype, y = mean,fill=rep("0",nrow(summaryFGA)))) + |
|
|
176 |
geom_bar(stat = 'identity') + |
|
|
177 |
geom_errorbar(aes(ymax = mean+se, ymin = mean-se),position = position_dodge(0.9), width = 0.15) + |
|
|
178 |
annotate(geom="text", |
|
|
179 |
#hjust = ifelse(n.moic %% 2 == 0, 0.5, 0), |
|
|
180 |
x = n.moic / 2 + 0.5, |
|
|
181 |
#y = as.numeric(summaryFGA[which.max(summaryFGA$mean),"mean"] + summaryFGA[which.max(summaryFGA$mean),"se"]), |
|
|
182 |
y = as.numeric(summaryFGA[which.max(summaryFGA$mean),"mean"]), |
|
|
183 |
size = 8, angle = 90, fontface = "bold", |
|
|
184 |
#label = paste0(statistic, " p = ", formatC(FGA.test, digits = 1, format = "e")), |
|
|
185 |
label = cut(FGA.test,c(0,0.001,0.01,0.05,0.1,1),labels = c("****","***","**","*","."))) + |
|
|
186 |
scale_x_discrete(name = "",position = "top") + |
|
|
187 |
theme_bw() + |
|
|
188 |
theme(axis.line.y = element_line(size = 0.8), |
|
|
189 |
axis.ticks.y = element_line(size = 0.2), |
|
|
190 |
axis.text.y = element_blank(), |
|
|
191 |
axis.title.x = element_text(vjust = -0.3,size = 12), |
|
|
192 |
axis.text.x = element_text(size = 10, color = "black"), |
|
|
193 |
plot.margin = unit(c(0.3, -1.7, 0.3, 0.3), "lines"), |
|
|
194 |
legend.title = element_blank()) + |
|
|
195 |
coord_flip() + |
|
|
196 |
scale_fill_manual(values = FGA.col, breaks = c("0"), labels = c("Copy number-altered genome")) + |
|
|
197 |
scale_y_reverse(expand = c(0.01,0), |
|
|
198 |
name = "FGA (Fraction of Genome Altered)", position = "left") |
|
|
199 |
|
|
|
200 |
p2 <- ggplot(summaryFGGL, aes(x = Subtype, y = ifelse(class == 'FGG',mean,-mean),fill=class)) + |
|
|
201 |
geom_bar(stat = 'identity') + |
|
|
202 |
geom_errorbar(data=summaryFGGL[summaryFGGL$class=='FGG',],aes(ymax = mean+se, ymin =mean-se),position = position_dodge(0.9), width = 0.15) + |
|
|
203 |
geom_errorbar(data=summaryFGGL[summaryFGGL$class=='FGL',],aes(ymax = -mean-se, ymin =-mean+se),position = position_dodge(0.9), width = 0.15) + |
|
|
204 |
annotate(geom="text", |
|
|
205 |
|
|
|
206 |
x = n.moic / 2 + 0.5, |
|
|
207 |
y = -as.numeric(summaryFGL[which.max(summaryFGL$mean),"mean"]), |
|
|
208 |
size = 8, angle = 90, fontface = "bold", |
|
|
209 |
label = cut(FGL.test,c(0,0.001,0.01,0.05,0.1,1),labels = c("****","***","**","*","."))) + |
|
|
210 |
annotate(geom="text", |
|
|
211 |
x = n.moic / 2 + 0.5, |
|
|
212 |
y = as.numeric(summaryFGG[which.max(summaryFGG$mean),"mean"]), |
|
|
213 |
size = 8, angle = 90, fontface = "bold", |
|
|
214 |
label = cut(FGG.test,c(0,0.001,0.01,0.05,0.1,1),labels = c("****","***","**","*","."))) + |
|
|
215 |
scale_x_discrete(name = "") + |
|
|
216 |
theme_bw() + |
|
|
217 |
theme(axis.line.y = element_line(size = 0.8), |
|
|
218 |
axis.ticks.y = element_line(size = 0.2), |
|
|
219 |
axis.text.y = element_blank(), |
|
|
220 |
axis.title.x = element_text(vjust = -0.3, size = 12), |
|
|
221 |
axis.text.x = element_text(size = 10, color = "black"), |
|
|
222 |
plot.margin = unit(c(0.3, 0.3, 0.3, -1), "lines"), |
|
|
223 |
legend.title = element_blank()) + |
|
|
224 |
coord_flip() + |
|
|
225 |
scale_fill_manual(values = c(FGL.col, FGG.col), breaks = c("FGL","FGG"), |
|
|
226 |
labels = c("Copy number-lost genome","Copy number-gained genome")) + |
|
|
227 |
scale_y_continuous(expand = c(0.01,0), |
|
|
228 |
name = "FGL or FGG (Fraction of Genome Lost or Gained)") |
|
|
229 |
|
|
|
230 |
pp <- ggplot() + |
|
|
231 |
# geom_text(data = summaryFGGL, |
|
|
232 |
# aes(label = Subtype, x=Subtype), y = 0.5, |
|
|
233 |
# size = 0.8*11/.pt, # match font size to theme |
|
|
234 |
# hjust = 0.5, vjust = 0.5) + |
|
|
235 |
geom_label(data = summaryFGGL, |
|
|
236 |
aes(label = Subtype, x = Subtype, fill = Subtype), |
|
|
237 |
y = 0.5, |
|
|
238 |
color = "white", |
|
|
239 |
size = 0.9*11/.pt, # match font size to theme |
|
|
240 |
hjust = 0.4, vjust = 0.5) + |
|
|
241 |
scale_fill_manual(values = clust.col) + |
|
|
242 |
theme_minimal()+ |
|
|
243 |
theme(axis.line.y =element_blank(), |
|
|
244 |
axis.ticks.y =element_blank(), |
|
|
245 |
axis.text.y =element_blank(), |
|
|
246 |
axis.title.y =element_blank(), |
|
|
247 |
axis.title.x =element_blank(), |
|
|
248 |
plot.margin = unit(c(0.3, 0, 0.3, 0), "lines") |
|
|
249 |
) + |
|
|
250 |
guides(fill = FALSE) + |
|
|
251 |
coord_flip() + |
|
|
252 |
scale_y_reverse() |
|
|
253 |
|
|
|
254 |
pal <- p1 + pp + p2 + |
|
|
255 |
plot_layout(widths = c(7,1,7), guides = 'collect') & theme(legend.position = 'top') |
|
|
256 |
|
|
|
257 |
# save to pdf |
|
|
258 |
if(is.null(fig.name)) { |
|
|
259 |
outFig <- "barplot of FGA.pdf" |
|
|
260 |
} else { |
|
|
261 |
outFig <- paste0(fig.name,".pdf") |
|
|
262 |
} |
|
|
263 |
ggsave(file.path(fig.path, outFig), width = width, height = height) |
|
|
264 |
|
|
|
265 |
# print to screen |
|
|
266 |
print(pal) |
|
|
267 |
|
|
|
268 |
if(n.moic > 2) { |
|
|
269 |
return(list(summary = outTab, |
|
|
270 |
FGA.p.value = FGA.test, |
|
|
271 |
pairwise.FGA.test = pairwise.FGA.test, |
|
|
272 |
FGG.p.value = FGG.test, |
|
|
273 |
pairwise.FGG.test = pairwise.FGG.test, |
|
|
274 |
FGL.p.value = FGL.test, |
|
|
275 |
pairwise.FGL.test = pairwise.FGL.test, |
|
|
276 |
test.method = statistic)) |
|
|
277 |
} else { |
|
|
278 |
return(list(summary = outTab, |
|
|
279 |
FGA.p.value = FGA.test, |
|
|
280 |
FGG.p.value = FGG.test, |
|
|
281 |
FGL.p.value = FGL.test, |
|
|
282 |
test.method = statistic)) |
|
|
283 |
} |
|
|
284 |
} |