|
a |
|
b/Functions/analysis.diff.survival.TCGA.R |
|
|
1 |
analysis.diff.survival.TCGA <- function(interest.gene, diff.gene.pro, exp.data.process, clin.data, group.by = "median", EnhancedVolcano.plot = T, Box.plot = T, main = NULL, meta.signature = F, single.signature = T){ |
|
|
2 |
require(ggpubr) |
|
|
3 |
|
|
|
4 |
# require interest.gene to be data.frame |
|
|
5 |
if(EnhancedVolcano.plot){ |
|
|
6 |
require(EnhancedVolcano) |
|
|
7 |
gene.overlap <- intersect(rownames(diff.gene.pro), rownames(interest.gene)) |
|
|
8 |
diff.gene.pro.sig.gene <- diff.gene.pro[gene.overlap,] |
|
|
9 |
interest.gene.sig <- interest.gene[gene.overlap,] |
|
|
10 |
|
|
|
11 |
up.gene <- which(diff.gene.pro.sig.gene$log2FoldChange>0) |
|
|
12 |
if(length(up.gene)>0){ |
|
|
13 |
down.gene <- setdiff(1:nrow(diff.gene.pro.sig.gene), up.gene) |
|
|
14 |
interest.gene.sig$label <- rep("down", nrow(diff.gene.pro.sig.gene)) |
|
|
15 |
interest.gene.sig$label[up.gene] <- "up" |
|
|
16 |
}else{ |
|
|
17 |
down.gene <- 1:nrow(diff.gene.pro.sig.gene) |
|
|
18 |
interest.gene.sig$label <- rep("down", nrow(diff.gene.pro.sig.gene)) |
|
|
19 |
} |
|
|
20 |
|
|
|
21 |
interest.gene.sig$TCGA_log2FoldChange <- diff.gene.pro.sig.gene$log2FoldChange |
|
|
22 |
interest.gene.sig$TCGA_padj <- diff.gene.pro.sig.gene$padj |
|
|
23 |
|
|
|
24 |
p <- EnhancedVolcano(diff.gene.pro.sig.gene, |
|
|
25 |
lab = rownames(diff.gene.pro.sig.gene), |
|
|
26 |
selectLab = rownames(diff.gene.pro.sig.gene), |
|
|
27 |
x = 'log2FoldChange', |
|
|
28 |
y = 'padj', pCutoff = 0.05, drawConnectors = TRUE, FCcutoff = 1, |
|
|
29 |
widthConnectors = 0.2, |
|
|
30 |
title = "Tumor VS Normal", xlab = "", ylab = "-Log10(padj)") |
|
|
31 |
print(p) |
|
|
32 |
|
|
|
33 |
if(length(gene.overlap) < nrow(interest.gene)){ |
|
|
34 |
a <- setdiff(rownames(interest.gene), gene.overlap) |
|
|
35 |
cat("gene ", paste(a, sep = ";"), " not in TCGA!\n") |
|
|
36 |
}else{ |
|
|
37 |
cat("All gene in TCGA data!\n") |
|
|
38 |
} |
|
|
39 |
}else{ |
|
|
40 |
#Require interest.gene to be a character vector |
|
|
41 |
gene.overlap <- intersect(interest.gene, rownames(diff.gene.pro)) |
|
|
42 |
diff.gene.pro.sig.gene <- diff.gene.pro[gene.overlap,] |
|
|
43 |
interest.gene.sig <- data.frame(Gene = gene.overlap) |
|
|
44 |
if(length(gene.overlap) < length(interest.gene)){ |
|
|
45 |
a <- setdiff(interest.gene, gene.overlap) |
|
|
46 |
cat("gene ", paste(a, sep = ";"), " not in TCGA!\n") |
|
|
47 |
}else{ |
|
|
48 |
cat("All gene in TCGA data!\n") |
|
|
49 |
} |
|
|
50 |
} |
|
|
51 |
|
|
|
52 |
##Draw the expression barplot of each gene |
|
|
53 |
exp.data.interest <- exp.data.process[gene.overlap,] |
|
|
54 |
sample.id <- colnames(exp.data.process) |
|
|
55 |
sample.id <- substr(sample.id, 1, 16) |
|
|
56 |
sample.type <- substr(sample.id, 14, 15) |
|
|
57 |
|
|
|
58 |
normal.index <- which(sample.type == "11") #72 |
|
|
59 |
cancer.index <- which(sample.type == "01") #533 |
|
|
60 |
patient.type <- rep("Tumor", ncol(exp.data.process)) |
|
|
61 |
patient.type[normal.index] <- "Normal" |
|
|
62 |
|
|
|
63 |
if(Box.plot){ |
|
|
64 |
cat("Normal sample: ", length(normal.index), "\n") |
|
|
65 |
cat("Tumor sample: ", length(cancer.index), "\n") |
|
|
66 |
box.res <- sapply(gene.overlap, function(x){ |
|
|
67 |
interest.gene.matrix <- exp.data.process[x,] |
|
|
68 |
genes <- data.frame(type = patient.type, expressionValue = interest.gene.matrix) |
|
|
69 |
p <- ggboxplot(genes, x = "type", y = "expressionValue", ylab = "Normalized Count", color = "type", palette = "jco", add = "jitter", title = x) + stat_compare_means() |
|
|
70 |
print(p) |
|
|
71 |
}) |
|
|
72 |
} |
|
|
73 |
|
|
|
74 |
##对应表达和生存数据 |
|
|
75 |
sample.id <- substr(sample.id, 1, 15) |
|
|
76 |
patient.overlap <- intersect(clin.data$Sample, sample.id) # 533 tumor samples |
|
|
77 |
exp.data.process.pro <- exp.data.process[,match(patient.overlap, sample.id)] |
|
|
78 |
clin.info <- clin.data[match(patient.overlap, clin.data$Sample),] |
|
|
79 |
|
|
|
80 |
#all gene as the signature |
|
|
81 |
if(meta.signature){ |
|
|
82 |
interest.matrix <- exp.data.process.pro[gene.overlap,] |
|
|
83 |
signature.score <- colSums(interest.matrix)/nrow(interest.matrix) |
|
|
84 |
median.score <- median(signature.score) |
|
|
85 |
high.group <- which(signature.score > median.score) |
|
|
86 |
sample.label <- rep("Low group", length(signature.score)) |
|
|
87 |
sample.label[high.group] <- "High group" |
|
|
88 |
OS.data <- data.frame(Patient_ID = patient.overlap, event = clin.info$OS, time = clin.info$OS_time, sample.label = sample.label) |
|
|
89 |
DFS.data <- data.frame(Patient_ID = patient.overlap, event = clin.info$DFS, time = clin.info$DFS_time, sample.label = sample.label) |
|
|
90 |
p1 <- plot.surv(OS.data, risk.table = T, HR = T, ylab = "Overall Survival", main = main, surv.median.line = "hv", xlab = "Time (Month)") |
|
|
91 |
print(p1) |
|
|
92 |
p2 <- plot.surv(DFS.data, risk.table = T, HR = T, ylab = "Disease-Free Survival", main = main, surv.median.line = "hv", xlab = "Time (Month)") |
|
|
93 |
print(p2) |
|
|
94 |
} |
|
|
95 |
|
|
|
96 |
## single gene model |
|
|
97 |
if(single.signature){ |
|
|
98 |
suv.res <- sapply(gene.overlap, function(x){ |
|
|
99 |
gene.expression <- as.numeric(exp.data.process.pro[x,]) |
|
|
100 |
|
|
|
101 |
if(group.by == "top33"){ |
|
|
102 |
quantile.value <- quantile(gene.expression, seq(0,1,0.33)) |
|
|
103 |
top33 <- as.numeric(quantile.value[3]) |
|
|
104 |
bottom33 <- as.numeric(quantile.value[2]) |
|
|
105 |
index1 <- which(gene.expression >= top33) |
|
|
106 |
index2 <- which(gene.expression <= bottom33) |
|
|
107 |
sample.label <- c(rep("Low group", length(index1)), rep("High group", length(index2))) |
|
|
108 |
clin.info <- clin.info[c(index2,index1),] |
|
|
109 |
OS.data <- data.frame(Patient_ID = patient.overlap[c(index2,index1)], event = clin.info$OS, time = clin.info$OS_time, sample.label = sample.label) |
|
|
110 |
DFS.data <- data.frame(Patient_ID = patient.overlap[c(index2,index1)], event = clin.info$DFS, time = clin.info$DFS_time, sample.label = sample.label) |
|
|
111 |
}else{ |
|
|
112 |
# median as threshold |
|
|
113 |
med.exp <- median(gene.expression) |
|
|
114 |
high.group <- which(gene.expression>med.exp) |
|
|
115 |
sample.label <- rep("Low group", length(gene.expression)) |
|
|
116 |
sample.label[high.group] <- "High group" |
|
|
117 |
OS.data <- data.frame(Patient_ID = patient.overlap, event = clin.info$OS, time = clin.info$OS_time, sample.label = sample.label) |
|
|
118 |
DFS.data <- data.frame(Patient_ID = patient.overlap, event = clin.info$DFS, time = clin.info$DFS_time, sample.label = sample.label) |
|
|
119 |
} |
|
|
120 |
p1 <- plot.surv(OS.data, risk.table = T, HR = T, ylab = "Overall Survival", main = x, surv.median.line = "hv", xlab = "Time (Month)") |
|
|
121 |
print(p1) |
|
|
122 |
p2 <- plot.surv(DFS.data, risk.table = T, HR = T, ylab = "Disease-Free Survival", main = x, surv.median.line = "hv", xlab = "Time (Month)") |
|
|
123 |
print(p2) |
|
|
124 |
|
|
|
125 |
OS.obj <- coxph(Surv(time, event)~sample.label, data=OS.data) |
|
|
126 |
DFS.obj <- coxph(Surv(time, event)~sample.label, data=DFS.data) |
|
|
127 |
return(c(summary(OS.obj)$logtest[3], summary(DFS.obj)$logtest[3])) |
|
|
128 |
}) |
|
|
129 |
interest.gene.sig$OS_logrank_p <- suv.res[1,] |
|
|
130 |
interest.gene.sig$DFS_logrank_p <- suv.res[2,] |
|
|
131 |
} |
|
|
132 |
|
|
|
133 |
return(interest.gene.sig) |
|
|
134 |
} |
|
|
135 |
|
|
|
136 |
plot.surv <- function(clinical.data, upper.time = NULL, xscale = 1, xlab = "Time", median.time = TRUE, |
|
|
137 |
surv.median.line = "none", HR = FALSE, risk.table = TRUE, pval = TRUE, |
|
|
138 |
conf.int = FALSE, main = NULL, ylab = "Survival probability", colors = c("#D95319", "#3B6793","#EA4335","#4285F4","#34A853","#000000")) { |
|
|
139 |
|
|
|
140 |
#load R package |
|
|
141 |
require(survival) |
|
|
142 |
require(survminer) |
|
|
143 |
require(RColorBrewer) |
|
|
144 |
require(gridExtra) |
|
|
145 |
|
|
|
146 |
# Determine the type of event and the unit of time |
|
|
147 |
# survival.event <- survival.event[1]; |
|
|
148 |
# unit.xlabel <- unit.xlabel[1]; |
|
|
149 |
|
|
|
150 |
# If upper.time is set, the samples whose survival time exceeds upper.time will be removed |
|
|
151 |
if (!is.null(upper.time)) clinical.data <- clinical.data[clinical.data$time <= upper.time,] |
|
|
152 |
|
|
|
153 |
# #date format |
|
|
154 |
# xSL <- data.frame(xScale=c(1,7,30,365.25),xLab=c("Days","Weeks","Months","Years"), stringsAsFactors=FALSE) |
|
|
155 |
# switch(unit.xlabel, year={xScale <- 365.25;}, month={xScale <- 30;}, week={xScale <- 7;}, day={xScale <- 1}) |
|
|
156 |
# xLab <- xSL[which(xSL[,1]==xScale),2]; |
|
|
157 |
|
|
|
158 |
# color |
|
|
159 |
if (!is.factor(clinical.data$sample.label)) |
|
|
160 |
clinical.data$sample.label <- as.factor(clinical.data$sample.label) |
|
|
161 |
|
|
|
162 |
t.name <- levels(clinical.data$sample.label) |
|
|
163 |
|
|
|
164 |
if (length(t.name) > 6) stop("Sample grouping >6, exceeding the function acceptance range!") |
|
|
165 |
t.col <- colors[1:length(t.name)] |
|
|
166 |
|
|
|
167 |
# 构造生存对象 |
|
|
168 |
km.curves <- survfit(Surv(time, event)~sample.label, data=clinical.data) |
|
|
169 |
|
|
|
170 |
# HR and 95%CI |
|
|
171 |
if (length(t.name) == 2) { |
|
|
172 |
if (HR) { |
|
|
173 |
cox.obj <- coxph(Surv(time, event)~sample.label, data=clinical.data) |
|
|
174 |
tmp <- summary(cox.obj) |
|
|
175 |
HRs <- round(tmp$coefficients[ ,2], digits = 2) |
|
|
176 |
HR.confint.lower <- round(tmp$conf.int[,"lower .95"], 2) |
|
|
177 |
HR.confint.upper <- round(tmp$conf.int[,"upper .95"], 2) |
|
|
178 |
HRs <- paste0(HRs, " (", HR.confint.lower, "-", HR.confint.upper, ")") |
|
|
179 |
} |
|
|
180 |
} |
|
|
181 |
|
|
|
182 |
# Construct the legend display text in the survival image |
|
|
183 |
legend.content <- substr(names(km.curves$strata), start = 14, stop = 1000) |
|
|
184 |
|
|
|
185 |
# x-axis scale unit conversion |
|
|
186 |
if (is.numeric(xscale) | (xscale %in% c("d_m", "d_y", "m_d", "m_y", "y_d", "y_m"))) { |
|
|
187 |
xscale = xscale |
|
|
188 |
} else { |
|
|
189 |
stop('xscale should be numeric or one of c("d_m", "d_y", "m_d", "m_y", "y_d", "y_m").') |
|
|
190 |
} |
|
|
191 |
|
|
|
192 |
# conversion of survival time units |
|
|
193 |
.format_xticklabels <- function(labels, xscale){ |
|
|
194 |
# 1 year = 365.25 days |
|
|
195 |
# 1 month = 365.25/12 = 30.4375 days |
|
|
196 |
if (is.numeric(xscale)) xtrans <- 1/xscale |
|
|
197 |
else |
|
|
198 |
xtrans <- switch(xscale, |
|
|
199 |
d_m = 12/365.25, |
|
|
200 |
d_y = 1/365.25, |
|
|
201 |
m_d = 365.25/12, |
|
|
202 |
m_y = 1/12, |
|
|
203 |
y_d = 365.25, |
|
|
204 |
y_m = 12, |
|
|
205 |
1 |
|
|
206 |
) |
|
|
207 |
round(labels*xtrans,2) |
|
|
208 |
} |
|
|
209 |
|
|
|
210 |
# Add the median survival time and its 95% CI in the figure and place it in the subtitle position |
|
|
211 |
subtitle <- NULL |
|
|
212 |
if (median.time) { |
|
|
213 |
if (is.numeric(xscale)) { |
|
|
214 |
median.km.obj = km.curves |
|
|
215 |
} else if (xscale %in% c("d_m", "d_y", "m_d", "m_y", "y_d", "y_m")) { |
|
|
216 |
clinical.data$time <- .format_xticklabels(labels = clinical.data$time, xscale = xscale) |
|
|
217 |
median.km.obj <- survfit(Surv(time, event)~sample.label, data=clinical.data) |
|
|
218 |
} |
|
|
219 |
survival.time.info <- NULL |
|
|
220 |
survival.time.info <- rbind(survival.time.info, summary(median.km.obj)$table) |
|
|
221 |
median.survival <- round(survival.time.info[!duplicated(survival.time.info[,7:9]),7:9], digits = 2) # 注意:这里取得的置信区间上界可能为NA |
|
|
222 |
if (length(levels(clinical.data$sample.label)) == 1) { |
|
|
223 |
tmp1 <- levels(clinical.data$sample.label) |
|
|
224 |
} else { |
|
|
225 |
tmp1 <- do.call(rbind,strsplit(rownames(summary(median.km.obj)$table), split = "="))[,2] |
|
|
226 |
} |
|
|
227 |
tmp2 <- paste(median.survival[,1], "(", median.survival[,2], "-", median.survival[,3], ")") |
|
|
228 |
subtitle <- paste(tmp1, tmp2, sep = ":", collapse = "\n") |
|
|
229 |
} |
|
|
230 |
|
|
|
231 |
# ggsurvplot |
|
|
232 |
ggsurv <- ggsurvplot(km.curves, # survfit object with calculated statistics. |
|
|
233 |
data = clinical.data, # data used to fit survival curves. |
|
|
234 |
palette = t.col, |
|
|
235 |
|
|
|
236 |
risk.table = risk.table, # show risk table. |
|
|
237 |
pval = pval, # show p-value of log-rank test. |
|
|
238 |
surv.median.line = surv.median.line, # add the median survival pointer. |
|
|
239 |
title = main, #main title |
|
|
240 |
subtitle = subtitle, #sub title |
|
|
241 |
font.main = 15, |
|
|
242 |
xlab = xlab, # customize X axis label. |
|
|
243 |
ylab = ylab, # customize Y axis label |
|
|
244 |
xscale = xscale, |
|
|
245 |
|
|
|
246 |
|
|
|
247 |
#legend |
|
|
248 |
legend.title = "", |
|
|
249 |
legend.labs = legend.content, |
|
|
250 |
legend = c(0.8,0.9), |
|
|
251 |
font.legend = 9, |
|
|
252 |
|
|
|
253 |
#risk table |
|
|
254 |
tables.theme = theme_cleantable(), |
|
|
255 |
risk.table.title = "No. at risk:", |
|
|
256 |
risk.table.y.text.col = T, |
|
|
257 |
risk.table.y.text = FALSE, |
|
|
258 |
tables.height = 0.15, |
|
|
259 |
risk.table.fontsize = 3 |
|
|
260 |
) |
|
|
261 |
if (length(t.name) == 2) { |
|
|
262 |
if (HR) |
|
|
263 |
ggsurv$plot <- ggsurv$plot + ggplot2::annotate("text", x = max(km.curves$time)/12, |
|
|
264 |
y = 0.15, size = 5, label = paste("HR=", HRs)) |
|
|
265 |
} |
|
|
266 |
|
|
|
267 |
ggsurv$plot <- ggsurv$plot + theme(plot.title = element_text(hjust = 0.5), plot.subtitle = element_text(size = 10), |
|
|
268 |
plot.margin = unit(c(5.5, 5.5, 5.5, 50), "points")) |
|
|
269 |
|
|
|
270 |
ggsurv$table <- ggsurv$table + theme(plot.title = element_text(hjust = -0.04), |
|
|
271 |
plot.margin = unit(c(5.5, 5.5, 5.5, 50), "points")) |
|
|
272 |
|
|
|
273 |
|
|
|
274 |
if(length(t.name) > 2) { |
|
|
275 |
# pairwise: log rank P value |
|
|
276 |
res <- pairwise_survdiff(Surv(time, event)~sample.label, data=clinical.data); |
|
|
277 |
pairwise.pvalue <- round(res$p.value, digits = 4); |
|
|
278 |
pairwise.pvalue[which(pairwise.pvalue < 0.0001)] <- "<0.0001"; |
|
|
279 |
pairwise.pvalue[is.na(pairwise.pvalue)] <- "-" |
|
|
280 |
|
|
|
281 |
# add table |
|
|
282 |
tt <- ttheme_minimal(core = list(fg_params = list(col = "black"),bg_params = list(fill = NA, col = "black")), |
|
|
283 |
colhead = list(fg_params = list(col = NA),bg_params = list(fill = t.col, col = "black")), |
|
|
284 |
rowhead = list(fg_params = list(col = NA, hjust = 1),bg_params = list(fill = c("white",t.col[-1]), col = "black")) |
|
|
285 |
) |
|
|
286 |
pairwise.table <- tableGrob(pairwise.pvalue, theme = tt) |
|
|
287 |
ggsurv <- ggarrange(ggarrange(ggsurv$plot, ggsurv$table, nrow=2, heights=c(2,0.5)), |
|
|
288 |
pairwise.table, nrow=2, heights = c(2,0.5), |
|
|
289 |
labels = c("","p from pairwise comparisons"), |
|
|
290 |
hjust = 0, font.label = list(size = 15, face = "plain")) |
|
|
291 |
} |
|
|
292 |
|
|
|
293 |
ggsurv |
|
|
294 |
} |