[2345e0]: / experiments / Results / LUAD / .Rhistory

Download this file

220 lines (219 with data), 9.9 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
install.packages(c("ggplot2", "ggpubr", "readr", "reshape2", "reticulate", "rstudioapi", "stringr", "survminer"))
library(readr)
library(stringr)
library(ggplot2)
library(reshape2)
options(stringsAsFactors = F)
analyses_dir = dirname(rstudioapi::getSourceEditorContext()$path)
workdir = paste0(analyses_dir, '/../../Results/LUAD/')
setwd(workdir)
datasetting = dir(".")[grepl("RNAseq", dir(".")) | grepl("tmb", dir("."))]
datasetting = dir(".")[grepl("RNAseq", dir("."))]
ci.train.mat = NULL
ci.test.mat = NULL
pval.train.mat = NULL
pval.test.mat = NULL
for (ds in datasetting){
ci.train.5 = NULL
ci.test.5 = NULL
pval.train.5 = NULL
pval.test.5 = NULL
f = dir(ds)
f = f[grep("fold_1", f)]
str = read_file(paste0(ds, "/", f,"/mainlog.log"))
str = strsplit(str, "\n")
str = str[[1]][sapply(str, function(x) grepl("Final", x))]
for (i in 1:length(str)){
values = regmatches(str[i],gregexpr("[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?",str[i]))[[1]]
if (i%%2 == 1){
ci.train.5 = as.numeric(c(ci.train.5, values[1]))
pval.train.5 = as.numeric(c(pval.train.5, values[2]))
} else {
ci.test.5 = as.numeric(c(ci.test.5, values[1]))
pval.test.5 = as.numeric(c(pval.test.5, values[2]))
}
}
ci.train.mat = cbind(ci.train.mat, ci.train.5)
ci.test.mat = cbind(ci.test.mat, ci.test.5)
pval.train.mat = cbind(pval.train.mat, pval.train.5)
pval.test.mat = cbind(pval.test.mat, pval.test.5)
}
colnames(ci.train.mat) = datasetting
colnames(ci.test.mat) = datasetting
colnames(pval.train.mat) = datasetting
colnames(pval.test.mat) = datasetting
colMeans(as.matrix(ci.train.mat), na.rm = T)
colMeans(as.matrix(ci.test.mat), na.rm = T)
colMeans(as.matrix(pval.train.mat), na.rm = T)
colMeans(as.matrix(pval.test.mat), na.rm = T)
colnames(ci.test.mat) = c("mRNA","miRNA", "mRNA+miRNA", "mRNA+miRNA+CNB+TMB","mRNA+miRNA+Age+Gender","mRNA+miRNA+CNB+TMB+Age+Gender")
# ci.test.mat = ci.test.mat[,1:4]
# boxplot(ci.test.mat)
p1 <- ggplot(melt(ci.test.mat), aes(x=Var2, y=value, fill = Var2)) +
# geom_violin(trim=T) +
geom_boxplot(width=0.2, color="black", size = 0.7, outlier.shape = 21) +
# geom_jitter(shape=16, position=position_jitter(0.2)) +
# stat_summary(fun.y=mean, geom="pointrange", color="red") +
theme_bw() +
scale_y_continuous(limits = c(0.62, 0.85)) +
labs(title = "(A) Performances of SALMON by Integrating Multi-omics Data",
x = "", y = "Concordance Index", fill = "Datasets") +
theme(axis.text.x=element_blank(),
# axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(size=14, face="bold", hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
legend.position="bottom"
) +
scale_fill_brewer(palette="Set2") +
# annotate("text", x = 1:6, y = rep(0.525,6), label = rep("Mean", 6)) +
annotate("point", color = "blue", x = 1:dim(ci.test.mat)[2], y = apply(ci.test.mat, 2, median)) +
annotate("line", color = "blue", x = 1:dim(ci.test.mat)[2], y = apply(ci.test.mat, 2, median)) +
# annotate("text", color = "black", x = 1:dim(ci.test.mat)[2], y = 0.58, label = rep("Mean",dim(ci.test.mat)[2])) +
annotate("text", color = "black", x = 1:dim(ci.test.mat)[2], y = 0.64, label = sprintf("Mean: %.4f", colMeans(ci.test.mat))) +
annotate("text", color = "blue", x = 1:dim(ci.test.mat)[2], y = apply(ci.test.mat, 2, median)+0.02, label = sprintf("Median: %.4f", apply(ci.test.mat, 2, median)))
p1
ggsave("performance_boxplot.png", plot = p1,
scale = 1, width = 10, height = 5, units = "in",
dpi = 300)
################## pairwise paired t-test
# pair-wised pair t-test: CI
pw.pttest.t = matrix(nrow = dim(ci.test.mat)[2], ncol = dim(ci.test.mat)[2])
pw.pttest.pval = matrix(nrow = dim(ci.test.mat)[2], ncol = dim(ci.test.mat)[2])
colnames(pw.pttest.t) = colnames(ci.test.mat)
rownames(pw.pttest.t) = colnames(ci.test.mat)
colnames(pw.pttest.pval) = colnames(ci.test.mat)
rownames(pw.pttest.pval) = colnames(ci.test.mat)
for(i in 1:dim(ci.test.mat)[2]){
for(j in 1:dim(ci.test.mat)[2]){
res = t.test(ci.test.mat[,i], ci.test.mat[,j], paired=T, conf.level=0.95)
pw.pttest.t[i,j] = res$statistic
pw.pttest.pval[i,j] = res$p.value
}
}
write.table(pw.pttest.t, "pw.pttest.t.csv", sep = ",", col.names = NA)
write.table(pw.pttest.pval, "pw.pttest.pval.csv", sep = ",", col.names = NA)
####################################
# Analyse network weights
####################################
workdir.feature.weight = "/home/zhihuan/Documents/SALMON/experiments/Results/LUAD/7_RNAseq+miRNAseq+cnb+tmb+clinical"
setwd(workdir.feature.weight)
Sys.sleep(1)
folds = dir(".")[grepl("run_", dir("."))]
feature_corresponding_CI = data.frame(matrix(0, ncol = 75, nrow = 5))
for (i in 1:length(folds)){
original_CI = read.table(paste0(folds[i], "/leave_one_out_CIndex.txt"), nrows=1)$V6
text = read.table(paste0(folds[i], "/leave_one_out_CIndex.txt"), skip = 1)
feature_names = str_remove_all(text$V5, ',')
feature_corresponding_CI[i,] = text[,8] - original_CI
}
# feature_corresponding_CI = feature_corresponding_CI[,1:74]
feature.median = apply(feature_corresponding_CI, 2, median)
feature.max = apply(feature_corresponding_CI, 2, max)
feature.order = sort.int(feature.median, index.return = T, decreasing = F)$ix
feature.median.ordered = feature.median[feature.order]
feature_corresponding_CI = cbind(c("fold 1", "fold 2", "fold 3", "fold 4", "fold 5"), feature_corresponding_CI)
colnames(feature_corresponding_CI) = c("fold", rep(1:75))
feature_corresponding_CI.m = melt(feature_corresponding_CI)
feature.color = c(rep("yellowgreen", 58), rep("red",13), rep("turquoise",2),rep("pink",2))[feature.order]
feature.color = feature.color[1:(dim(feature_corresponding_CI)[2]-1)]
p3 <- ggplot(feature_corresponding_CI.m, aes(x = factor(variable, levels = feature.order[1:(dim(feature_corresponding_CI)[2]-1)]),
y = value, fill = variable)) +
geom_boxplot(width=0.6, fill = feature.color, color="black", size = 0.6, outlier.shape = NA) +
labs(title = "Feature Importance from SALMON (LUAD)",
x = "Feature IDs", y = "Concordance Index Decreased", fill = "Methods") +
theme(axis.text.x = element_text(angle = 0, hjust = 1, size = 9),
axis.text = element_text(size = 12),
plot.title = element_text(size=14, face="bold", hjust = 0.5),
plot.subtitle = element_text(hjust = 0.5),
legend.position = c(0.9, 0.2)) +
ylim(-0.04, 0.02) +
annotate("text", angle=90, x = 1:length(feature.median.ordered), y = feature.median.ordered+0.005,
color = "red", label = sprintf("%.4f", feature.median.ordered)) +
annotate("rect", fill = "white", xmin = 49.5, xmax = 70.5, ymin = -0.015, ymax = -0.032, alpha = .8)+
annotate("text", color = "black", size = 5, x = 60, y = -0.018, label = "Boxplot Colors") +
annotate("rect", fill = "yellowgreen", xmin = 50, xmax = 70, ymin = -0.02, ymax = -0.022, alpha = .8)+
annotate("text", color = "black", x = 60, y = -0.021, label = "58 mRNA-seq coexpression modules (ID 1-58)") +
annotate("rect", fill = "red", xmin = 50, xmax = 70, ymin = -0.023, ymax = -0.025, alpha = .8)+
annotate("text", color = "white", x = 60, y = -0.024, label = "13 miRNA-seq coexpression modules (ID 59-71)") +
annotate("rect", fill = "turquoise", xmin = 50, xmax = 70, ymin = -0.026, ymax = -0.028, alpha = .8)+
annotate("text", color = "black", x = 60, y = -0.027, label = "CNB and TMB Features (ID 72-73)") +
annotate("rect", fill = "pink", xmin = 50, xmax = 70, ymin = -0.029, ymax = -0.031, alpha = .8)+
annotate("text", color = "black", x = 60, y = -0.030, label = "Gender, age (ID 74-75)")
p3
ggsave("feature_importance.png", plot = p3,
scale = 1, width = 15, height = 8, units = "in",
dpi = 300)
################
# KM plots
################
workdir = paste0(analyses_dir, '/../../Results/LUAD/')
setwd(workdir)
library(survival)
library(survminer)
library(reticulate)
use_python("/home/zhihuan/anaconda3/bin/python")
py_load_object <- function(filename, pickle = "pickle") {
builtins <- import_builtins()
pickle <- import(pickle)
handle <- builtins$open(filename, "rb")
on.exit(handle$close(), add = TRUE)
pickle$load(handle)
}
# log-ranked p-value
logrankp <- function(res, OS, OS.Status, title, fname){
mv = median(res)
group = cbind(length(OS))
for(j in 1:length(OS)){
if(res[j] < mv){
group[j] = 1
}else{
group[j] = 2
}
}
mySurvTest = Surv(OS, OS.Status)
# logrank
log1 = survdiff(mySurvTest ~ group)
logrank.p = pchisq(log1$chisq, 1, lower.tail=FALSE)
# KM Curve
fit = survfit(mySurvTest ~ group)
n1 = sum(group==1)
leg1 = paste("Low risk (", n1, ")", sep = "")
n2 = sum(group==2)
leg2 = paste("High risk (", n2, ")", sep = "")
png(filename = fname,
width = 7, height = 6.5,
units = "cm", res = 300, pointsize = 7)
plot(fit, mark.time=TRUE, lty = 1:2,
col = 1:2, cex = 0.5)
title(main = title, xlab = "Months", ylab = "Survival", mgp=c(2,2,0))
grid()
legend(x = "topright", legend = c(leg1, leg2), lty = 1:2,
col = 1:2, cex = 0.65)
text(10, 0.1, paste("p=", formatC(logrank.p, format="g", digits = 4), sep = ""),
pos = 4, cex = 1)
p <- recordPlot()
dev.off()
return(p)
}
datasets = dir(".")[grepl("RNAseq", dir(".")) | grepl("tmb", dir("."))]#[1:5]
titles = c("(A) mRNA","(B) miRNA","(C) mRNA+miRNA","(D) mRNA+miRNA+CNB+TMB",
"(E) mRNA+miRNA+Gender+Age","(F) CNB+TMB+Gender+Age","(G) mRNA+miRNA+CNB+TMB+Gender+Age")
kmplots = NULL
for (ds in 1:length(datasets)){
folds = dir(datasets[ds])[grepl("run_", dir(datasets[ds]))]
hr.all = NULL
e.all = NULL
t.all = NULL
for (i in 1:length(folds)){
hr = py_load_object(paste0(datasets[ds],"/",folds[i], "/hazard_ratios_lbl_pred_all_test.pickle"))
e = as.character(py_load_object(paste0(datasets[ds],"/",folds[i], "/OS_event_test.pickle")))
e = as.numeric(regmatches(e,gregexpr("[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?",e))[[1]])
t = as.character(py_load_object(paste0(datasets[ds],"/",folds[i], "/OS_test.pickle")))
t = as.numeric(regmatches(t,gregexpr("[-+]?[0-9]*\\.?[0-9]+([eE][-+]?[0-9]+)?",t))[[1]])
hr.all = c(hr.all, hr)
e.all = c(e.all, e)
t.all = c(t.all, t)
}
fname = paste0(workdir,"/kmcurve_testing/KMCurve_",ds,".png")
logrankp(hr.all, t.all, e.all, titles[ds], fname)
}