[0375db]: / overview / missing-values-risk.R

Download this file

177 lines (149 with data), 5.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
source('../lib/handymedical.R', chdir = TRUE)
requirePlus('survminer', 'cowplot')
models.base <- '../../output'
cox.missing.filename <- 'caliber-replicate-with-missing-model-survreg-bootstrap-1.rds'
cox.missing.riskdists.filename <- 'caliber-replicate-with-missing-survreg-3-risk-violins.csv'
cox.missing.riskcats.filename <- 'caliber-replicate-with-missing-survreg-3-risk-cats.csv'
cox.imp.filename <- 'caliber-replicate-imputed-survreg-4-surv-boot-imp.rds'
cox.disc.filename <- 'all-cv-survreg-boot-try5-surv-boot.csv'
data.filename <- '../../data/cohort-sanitised.csv'
survPlots <- function(...) {
surv.fits <- list(...)
df <- data.frame()
for(i in 1:length(surv.fits)) {
df <-
rbind(
df,
data.frame(
variable = as.character(surv.fits[[i]]$call)[2],
stratum = '1',
time = surv.fits[[i]][1]$time,
surv = surv.fits[[i]][1]$surv,
lower = surv.fits[[i]][1]$lower,
upper = surv.fits[[i]][1]$upper
),
data.frame(
variable = as.character(surv.fits[[i]]$call)[2],
stratum = '2',
time = surv.fits[[i]][2]$time,
surv = surv.fits[[i]][2]$surv,
lower = surv.fits[[i]][2]$lower,
upper = surv.fits[[i]][2]$upper
)
)
}
ggplot(
df,
aes(x = time, y = surv, ymin = lower, ymax = upper)
) +
geom_line(aes(colour = stratum)) +
geom_ribbon(aes(fill = stratum), alpha = 0.4) +
facet_grid(variable ~ .)
}
cox.missing.boot <- readRDS(file.path(models.base, cox.missing.filename))
fit.risks.miss <- bootStats(cox.missing.boot, '95ci')
fit.risks.miss$var <- rownames(fit.risks.miss)
cox.imp.boot <- readRDS(file.path(models.base, cox.imp.filename))
fit.risks.imp <- bootMIStats(cox.imp.boot, '95ci')
fit.risks.imp$var <- rownames(fit.risks.imp)
cox.disc.boot <- read.csv(file.path(models.base, cox.disc.filename))
fit.risks.disc <- bootStatsDf(cox.disc.boot)
fit.risks.disc$var <- rownames(fit.risks.disc)
# Create two data frames, one for missing vs imputed and one for discrete
fit.risks.imp.vs.miss <- merge(fit.risks.imp, fit.risks.miss, by = c('var'))
fit.risks.imp.vs.miss$model <- 'miss'
# This is a slight cheat... The discrete model here was done by my home-made
# bootstrap function, whose variable/level names have been preprocessed with
# make.names for slightly annoying internal reasons. Luckily, all the logical
# variables escape unscathed from this, and they're the only ones we can compare
# anyway!
fit.risks.imp.vs.disc <- merge(fit.risks.imp, fit.risks.disc, by = c('var'))
fit.risks.imp.vs.disc$model <- 'disc'
fit.risks <- rbind(fit.risks.imp.vs.miss, fit.risks.imp.vs.disc)
# Lose a few irrelevant variables
fit.risks <-
subset(
fit.risks, !(var %in% c('(Intercept)', 'c.index', 'calibration.score'))
)
# Make them
disc.vs.cont.risks.plot <-
ggplot(
fit.risks,
aes(
x = val.x, xmin = lower.x, xmax = upper.x,
y = val.y, ymin = lower.y, ymax = upper.y,
label = var, colour = model
)
) +
geom_point() +
geom_errorbar() +
geom_errorbarh() +
geom_text() +
# Remove the legend, massively compresses the plot horizontally
theme(legend.position="none")
risk.dist.by.var <-
read.csv(file.path(models.base, cox.missing.riskdists.filename))
risk.cats <- read.csv(file.path(models.base, cox.missing.riskcats.filename))
# Plot the results
risk.violins.plot <-
ggplot() +
# First, and therefore at the bottom, draw the reference line at risk = 1
geom_hline(yintercept = 1) +
# Then, on top of that, draw the violin plot of the risk from the data
geom_violin(data = risk.dist.by.var, aes(x = quantity, y = risk)) +
geom_pointrange(
data = risk.cats,
aes(x = quantity, y = our_value, ymin = our_lower,
ymax = our_upper),
position = position_jitter(width = 0.1)
) +
geom_text(
data = risk.cats,
aes(
x = quantity,
y = our_value,
label = quantity.level
)
) +
scale_y_continuous(breaks = c(0.75, 1.0, 1.25, 1.5))
# Kaplan-Meier survival curves for a few example variables being missing
COHORT <- fread(data.filename)
COHORT <- subset(COHORT, !exclude & time_death > 0)
# Calculate the curves
km.hdl <-
survfit(
Surv(time_death, endpoint_death == 'Death') ~ is.na(hdl_6mo),
data = COHORT
)
km.total_chol <-
survfit(
Surv(time_death, endpoint_death == 'Death') ~ is.na(total_chol_6mo),
data = COHORT
)
km.crea <-
survfit(
Surv(time_death, endpoint_death == 'Death') ~ is.na(crea_6mo),
data = COHORT
)
km.missingness <-
survPlots(km.hdl, km.total_chol, km.crea) +
theme(
legend.position = "none",
# Remove grey labels on facets
strip.background = element_blank(),
strip.text = element_blank()
)
theme_set(theme_cowplot(font_size = 10))
plot_grid(
disc.vs.cont.risks.plot, risk.violins.plot,
km.missingness,
labels = c("A", "B", "C"),
align = "v", nrow = 1
)
ggsave(
'../../output/missing-values-risk.pdf',
width = 16,
height = 5,
units = 'cm',
useDingbats = FALSE
)