[daa031]: / function / reliability_diagram.R

Download this file

149 lines (137 with data), 5.9 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
reliability_diagram = function(data_ls,u, stat_type,bins,
title_ls,color_ls,title) {
data_ls_len = length(data_ls)
### create bin averages
for (data_i in 1:data_ls_len){
colnames(data_ls[[data_i]]) = c("status","survival","pred")
temp_res = reliability_datapts(data_ls[[data_i]], u = u,bins = bins, stat_type = stat_type)
temp_hl = hosmer_lemeshow(data_ls[[data_i]], u = u,bins = bins, stat_type = stat_type)
assign(paste("recal_bins", data_i, sep=""),temp_res)
assign(paste("HL_pvalue", data_i, sep=""),temp_hl[1])
assign(paste("HL_statistic", data_i, sep=""),temp_hl[2])
}
hl_dt = data.frame(x=numeric(0),y=numeric(0),variable= character(0), label = character(0),HLpval = numeric(0),HLstatistic =numeric(0) )
for (data_i in 1:data_ls_len){
temp_res = get(paste('recal_bins', data_i, sep=''))
temp_res$variable <- paste("Vol.x", data_i, sep='')
colnames(temp_res)[colnames(temp_res)=='V1'] = 'value'
assign(paste('melt', data_i, sep=''), temp_res)
hl_p = get(paste('HL_pvalue', data_i, sep=''))
hl_s = get(paste('HL_statistic', data_i, sep=''))
hl_dt[data_i,] = cbind(0.05,1-0.1*data_i,paste("Vol.x", data_i, sep=''),title_ls[data_i],hl_p[[1]],hl_s[[1]])
}
hl_dt$x = as.numeric(hl_dt$x)
hl_dt$y = as.numeric(hl_dt$y)
hl_dt$HLpval = as.numeric(hl_dt$HLpval)
hl_dt$HLstatistic = as.numeric(hl_dt$HLstatistic)
data = melt1
if (data_ls_len > 1){
for (data_i in 2:data_ls_len){
data = rbind(data, get(paste('melt', data_i, sep='')))
}
}
line_plot = ggplot(data, aes(x=pred, y=obs,color=variable,group= variable)) +
geom_point(aes(shape = variable),size = 3)+
geom_line(linetype = "solid")+
geom_text(data = hl_dt,aes(x=x,y=y,color = variable,hjust=0,
label = paste0(label," H-L test p=",round(HLpval,2),", C-statistic=",round(HLstatistic,2))))+
scale_color_manual(labels = title_ls,
values = color_ls) +
scale_shape_discrete(labels = title_ls)+
scale_linetype_discrete(labels = title_ls)+
guides(color=guide_legend(" "),shape= guide_legend(" "),linetype=guide_legend(" ")) +
xlab(paste("Predicted Survival Probability at",title)) + ylab(paste("Observed Proportion at",title)) +
geom_abline(intercept = 0, slope = 1, color="black",
linetype="dashed", size=1) +
lims(x=c(0,1),y=c(0,1))+
theme_bw()+
theme(
legend.position="bottom",
axis.title = element_text(size=12),
axis.text = element_text(size=12),
legend.text = element_text(size=15),
legend.title = element_text(size=15),
text = element_text(size=12)
)
line_plot
if (!file.exists("results/LOO_calibration")) dir.create('results/LOO_calibration')
ggsave(paste0("results/LOO_calibration/",title,".pdf"),device = 'pdf',width = 6,height = 6)
}
reliability_datapts <- function(ndata, u,bins=10, stat_type ='C') {
min.pred <- min(ndata$pred)
max.pred <- max(ndata$pred)
min.max.diff <- max.pred - min.pred
if (stat_type == 'H'){
ndata = ndata[order(ndata$pred),]
res = data.frame(obs= numeric(0), pred = numeric(0),obs_lower=numeric(0),obs_upper=numeric(0))
split_mtx = split(ndata, cut(ndata$pred, seq(0,1,1/bins), include.lowest=TRUE))
for (i in 1:length(split_mtx)){
sfit = summary(survfit(Surv(survival, status) ~ 1,data=split_mtx[[i]]),times=u,extend=T)
obs = sfit[['surv']]
obs_upper = sfit[['upper']]
obs_lower = sfit[['lower']]
pred = mean(split_mtx[[i]]$pred)
if (sum(is.na(col_mean)) > 0) {
next
}
res[i,] = c(obs,pred,obs_lower,obs_upper)
}
}else{
## C statistics, same number of instances in each bin
mtx = ndata[order(ndata$pred),]
n <- length(ndata$pred)/bins
nr <- nrow(mtx)
split_mtx = split(mtx, rep(1:ceiling(nr/n), each=n, length.out=nr))
res = data.frame(obs= numeric(0), pred = numeric(0),obs_lower=numeric(0),obs_upper=numeric(0))
for (i in 1:length(split_mtx)){
sfit = summary(survfit(Surv(survival, status) ~ 1,data=split_mtx[[i]]),times=u,extend=T)
obs = sfit[['surv']]
obs_upper = sfit[['upper']]
obs_lower = sfit[['lower']]
pred = mean(split_mtx[[i]]$pred)
res[i,] = c(obs,pred,obs_lower,obs_upper)
}
}
res
}
hosmer_lemeshow <- function(ndata, u,bins=10, stat_type ='C') {
min.pred <- min(ndata$pred)
max.pred <- max(ndata$pred)
min.max.diff <- max.pred - min.pred
if (stat_type == 'H'){
ndata = ndata[order(ndata$pred),]
res = data.frame(obs= numeric(0), pred = numeric(0),obs_lower=numeric(0),obs_upper=numeric(0))
split_mtx = split(ndata, cut(ndata$pred, seq(0,1,1/bins), include.lowest=TRUE))
}else{
mtx = ndata[order(ndata$pred),]
n <- length(ndata$pred)/bins
nr <- nrow(mtx)
split_mtx = split(mtx, rep(1:ceiling(nr/n), each=n, length.out=nr))
}
H_stat = 0
for (i in 1:length(split_mtx)){
sfit = summary(survfit(Surv(survival, status) ~ 1,data=split_mtx[[i]]),times=u,extend=T)
obs = sfit[['surv']]
exp = mean(split_mtx[[i]]$pred)
obs_not = 1-obs
exp_not = 1-exp
if (exp == 0 || exp_not == 0){
next
}
bin_sum = ((obs - exp)**2)/exp + ((obs_not - exp_not)**2)/exp_not
H_stat = H_stat + bin_sum
}
PVAL = 1 - pchisq(H_stat, bins - 2)
cat('PVALUE', PVAL, '\n')
cat('stat', H_stat, '\n')
return(c(PVAL,H_stat))
}
#### test #######
# data_ls = list(subset(dt_risk_LOO,!is.na(prob1)) %>% select(DFS_status,DFS,prob1) ,
# subset(dt_risk_LOO,!is.na(cox1_prob1))%>% select(DFS_status,DFS,cox1_prob1))
# title_ls= c('Joint model', 'Cox model')
# color_ls = c('blue', 'red')
# limits=c(0,1)
# u=15
# bins = 5
# title = "15 Months"