|
a |
|
b/cox-ph/cox-discretised.R |
|
|
1 |
#+ knitr_setup, include = FALSE |
|
|
2 |
|
|
|
3 |
# Whether to cache the intensive code sections. Set to FALSE to recalculate |
|
|
4 |
# everything afresh. |
|
|
5 |
cacheoption <- FALSE |
|
|
6 |
# Disable lazy caching globally, because it fails for large objects, and all the |
|
|
7 |
# objects we wish to cache are large... |
|
|
8 |
opts_chunk$set(cache.lazy = FALSE) |
|
|
9 |
|
|
|
10 |
#' # Cross-validating discretisation of input variables in a survival model |
|
|
11 |
#' |
|
|
12 |
#' |
|
|
13 |
|
|
|
14 |
calibration.filename <- '../../output/survreg-crossvalidation-try5.csv' |
|
|
15 |
caliber.missing.coefficients.filename <- |
|
|
16 |
'../../output/caliber-replicate-with-missing-survreg-bootstrap-coeffs-1.csv' |
|
|
17 |
comparison.filename <- |
|
|
18 |
'../../output/caliber-replicate-with-missing-var-imp-try2.csv' |
|
|
19 |
# The first part of the filename for any output |
|
|
20 |
output.filename.base <- '../../output/all-cv-survreg-boot-try5' |
|
|
21 |
|
|
|
22 |
|
|
|
23 |
# What kind of model to fit to...currently 'cph' (Cox model), 'ranger' or |
|
|
24 |
# 'rfsrc' (two implementations of random survival forests) |
|
|
25 |
model.type <- 'survreg' |
|
|
26 |
|
|
|
27 |
# If surv.vars is defined as a character vector here, the model only uses those |
|
|
28 |
# variables specified, eg c('age') would build a model purely based on age. If |
|
|
29 |
# not specified (ie commented out), it will use the defaults. |
|
|
30 |
# surv.predict <- c('age') |
|
|
31 |
|
|
|
32 |
#' ## Do the cross-validation |
|
|
33 |
#' |
|
|
34 |
#' The steps for this are common regardless of model type, so run the script to |
|
|
35 |
#' get a cross-validated model to further analyse... |
|
|
36 |
#+ cox_discretised_cv, cache=cacheoption |
|
|
37 |
|
|
|
38 |
source('../lib/all-cv-bootstrap.R', chdir = TRUE) |
|
|
39 |
|
|
|
40 |
#' # Results |
|
|
41 |
#' |
|
|
42 |
#' ## Performance |
|
|
43 |
#' |
|
|
44 |
#' ### C-index |
|
|
45 |
#' |
|
|
46 |
#' C-index is **`r round(surv.model.fit.coeffs['c.index', 'val'], 3)` |
|
|
47 |
#' (`r round(surv.model.fit.coeffs['c.index', 'lower'], 3)` - |
|
|
48 |
#' `r round(surv.model.fit.coeffs['c.index', 'upper'], 3)`)** on the held-out |
|
|
49 |
#' test set. |
|
|
50 |
#' |
|
|
51 |
#' |
|
|
52 |
#' ### Calibration |
|
|
53 |
#' |
|
|
54 |
#' The bootstrapped calibration score is |
|
|
55 |
#' **`r round(surv.model.fit.coeffs['calibration.score', 'val'], 3)` |
|
|
56 |
#' (`r round(surv.model.fit.coeffs['calibration.score', 'lower'], 3)` - |
|
|
57 |
#' `r round(surv.model.fit.coeffs['calibration.score', 'upper'], 3)`)**. |
|
|
58 |
#' |
|
|
59 |
#' Let's draw a representative curve from the unbootstrapped fit... (It would be |
|
|
60 |
#' better to draw all the curves from the bootstrap fit to get an idea of |
|
|
61 |
#' variability, but I've not implemented this yet.) |
|
|
62 |
#' |
|
|
63 |
#+ calibration_plot |
|
|
64 |
|
|
|
65 |
calibration.table <- |
|
|
66 |
calibrationTable(surv.model.fit, COHORT.optimised[test.set, ]) |
|
|
67 |
|
|
|
68 |
calibration.score <- calibrationScore(calibration.table) |
|
|
69 |
|
|
|
70 |
calibrationPlot(calibration.table) |
|
|
71 |
|
|
|
72 |
#' |
|
|
73 |
#' ## Model fit |
|
|
74 |
#' |
|
|
75 |
#+ resulting_fit |
|
|
76 |
|
|
|
77 |
print(surv.model.fit) |
|
|
78 |
|
|
|
79 |
#' ## Cox coefficients |
|
|
80 |
#' |
|
|
81 |
#+ cox_coefficients_plot |
|
|
82 |
|
|
|
83 |
|
|
|
84 |
# Save bootstrapped performance values |
|
|
85 |
varsToTable( |
|
|
86 |
data.frame( |
|
|
87 |
model = 'cox', |
|
|
88 |
imputation = FALSE, |
|
|
89 |
discretised = TRUE, |
|
|
90 |
c.index = surv.model.fit.coeffs['c.index', 'val'], |
|
|
91 |
c.index.lower = surv.model.fit.coeffs['c.index', 'lower'], |
|
|
92 |
c.index.upper = surv.model.fit.coeffs['c.index', 'upper'], |
|
|
93 |
calibration.score = surv.model.fit.coeffs['calibration.score', 'val'], |
|
|
94 |
calibration.score.lower = |
|
|
95 |
surv.model.fit.coeffs['calibration.score', 'lower'], |
|
|
96 |
calibration.score.upper = |
|
|
97 |
surv.model.fit.coeffs['calibration.score', 'upper'] |
|
|
98 |
), |
|
|
99 |
performance.file, |
|
|
100 |
index.cols = c('model', 'imputation', 'discretised') |
|
|
101 |
) |
|
|
102 |
|
|
|
103 |
# Unpackage the uncertainties again, this time transformed because survreg |
|
|
104 |
# returns negative values |
|
|
105 |
surv.boot.ests <- bootStatsDf(surv.model.params.boot, transform = `-`) |
|
|
106 |
|
|
|
107 |
#' First, plot the factors and logicals as a scatter plot to compare with the |
|
|
108 |
#' continuous Cox model... |
|
|
109 |
|
|
|
110 |
# Pull coefficients from model with missing data |
|
|
111 |
caliber.missing.coeffs <- read.csv(caliber.missing.coefficients.filename) |
|
|
112 |
|
|
|
113 |
# Rename surv.boot.ests ready for merging |
|
|
114 |
names(surv.boot.ests) <- |
|
|
115 |
c('cox_discrete_value', 'cox_discrete_lower', 'cox_discrete_upper') |
|
|
116 |
surv.boot.ests$quantity.level <- rownames(surv.boot.ests) |
|
|
117 |
# Convert variablemissing to variable_missingTRUE for compatibility |
|
|
118 |
vars.with.missing <- endsWith(surv.boot.ests$quantity.level, 'missing') |
|
|
119 |
surv.boot.ests$quantity.level[vars.with.missing] <- |
|
|
120 |
paste0( |
|
|
121 |
substr( |
|
|
122 |
surv.boot.ests$quantity.level[vars.with.missing], |
|
|
123 |
1, |
|
|
124 |
nchar(surv.boot.ests$quantity.level[vars.with.missing]) - nchar('missing') |
|
|
125 |
), |
|
|
126 |
'_missingTRUE' |
|
|
127 |
) |
|
|
128 |
|
|
|
129 |
# Create a data frame comparing them |
|
|
130 |
compare.coefficients <- merge(caliber.missing.coeffs, surv.boot.ests) |
|
|
131 |
|
|
|
132 |
ggplot( |
|
|
133 |
compare.coefficients, |
|
|
134 |
aes(x = our_value, y = cox_discrete_value, colour = unit == 'missing') |
|
|
135 |
) + |
|
|
136 |
geom_abline(intercept = 0, slope = 1) + |
|
|
137 |
geom_hline(yintercept = 1, colour = 'grey') + |
|
|
138 |
geom_vline(xintercept = 1, colour = 'grey') + |
|
|
139 |
geom_point() + |
|
|
140 |
geom_errorbar(aes(ymin = cox_discrete_lower, ymax = cox_discrete_upper)) + |
|
|
141 |
geom_errorbarh(aes(xmin = our_lower, xmax = our_upper)) + |
|
|
142 |
geom_text_repel(aes(label = long_name)) + |
|
|
143 |
theme_classic(base_size = 8) |
|
|
144 |
|
|
|
145 |
# Unpack variable and level names |
|
|
146 |
cph.coeffs <- cphCoeffs( |
|
|
147 |
bootStats(surv.model.fit.boot, uncertainty = '95ci', transform = `-`), |
|
|
148 |
COHORT.optimised, surv.predict, model.type = 'boot.survreg' |
|
|
149 |
) |
|
|
150 |
|
|
|
151 |
# We'll need the CALIBER scaling functions for plotting |
|
|
152 |
source('../cox-ph/caliber-scale.R') |
|
|
153 |
|
|
|
154 |
# set up list to store the plots |
|
|
155 |
cox.discrete.plots <- list() |
|
|
156 |
# Add dummy columns for x-position of missing values |
|
|
157 |
cph.coeffs$missing.x.pos.cont <- NA |
|
|
158 |
cph.coeffs$missing.x.pos.disc <- NA |
|
|
159 |
|
|
|
160 |
for(variable in unique(cph.coeffs$var)) { |
|
|
161 |
# If it's a continuous variable, get the real centres of the bins |
|
|
162 |
if(variable %in% process.settings$var) { |
|
|
163 |
process.i <- which(variable == process.settings$var) |
|
|
164 |
|
|
|
165 |
if(process.settings$method[[process.i]] == 'binByQuantile') { |
|
|
166 |
|
|
|
167 |
variable.quantiles <- |
|
|
168 |
getQuantiles( |
|
|
169 |
COHORT.use[, variable], |
|
|
170 |
process.settings$settings[[process.i]] |
|
|
171 |
) |
|
|
172 |
# For those rows which relate to this variable, and whose level isn't |
|
|
173 |
# missing, put in the appropriate quantile boundaries for plotting |
|
|
174 |
cph.coeffs$bin.min[cph.coeffs$var == variable & |
|
|
175 |
cph.coeffs$level != 'missing'] <- |
|
|
176 |
variable.quantiles[1:(length(variable.quantiles) - 1)] |
|
|
177 |
cph.coeffs$bin.max[cph.coeffs$var == variable & |
|
|
178 |
cph.coeffs$level != 'missing'] <- |
|
|
179 |
variable.quantiles[2:length(variable.quantiles)] |
|
|
180 |
# Make the final bin the 99th percentile |
|
|
181 |
cph.coeffs$bin.max[cph.coeffs$var == variable & |
|
|
182 |
cph.coeffs$level != 'missing'][ |
|
|
183 |
length(variable.quantiles) - 1] <- |
|
|
184 |
quantile(COHORT.use[, variable], 0.99, na.rm = TRUE) |
|
|
185 |
|
|
|
186 |
# Add a fake data point at the highest value to finish the graph |
|
|
187 |
cph.coeffs <- |
|
|
188 |
rbind( |
|
|
189 |
cph.coeffs, |
|
|
190 |
cph.coeffs[cph.coeffs$var == variable & |
|
|
191 |
cph.coeffs$level != 'missing', ][ |
|
|
192 |
length(variable.quantiles) - 1, ] |
|
|
193 |
) |
|
|
194 |
# Change it so that bin.min is bin.max from the old one |
|
|
195 |
cph.coeffs$bin.min[nrow(cph.coeffs)] <- |
|
|
196 |
cph.coeffs$bin.max[cph.coeffs$var == variable & |
|
|
197 |
cph.coeffs$level != 'missing'][ |
|
|
198 |
length(variable.quantiles) - 1] |
|
|
199 |
|
|
|
200 |
# Work out data range by taking the 1st and 99th percentiles |
|
|
201 |
# Use the max to provide a max value for the final bin |
|
|
202 |
# Also use for x-axis limits, unless there are missing values to |
|
|
203 |
# accommodate on the right-hand edge. |
|
|
204 |
x.data.range <- |
|
|
205 |
quantile(COHORT.use[, variable], c(0.01, 0.99), na.rm = TRUE) |
|
|
206 |
x.axis.limits <- x.data.range |
|
|
207 |
|
|
|
208 |
|
|
|
209 |
# Finally, we need to scale this such that the baseline value is equal |
|
|
210 |
# to the value for the equivalent place in the Cox model, to make the |
|
|
211 |
# risks comparable... |
|
|
212 |
|
|
|
213 |
# First, we need to find the average value of this variable in the lowest |
|
|
214 |
# bin (which is always the baseline here) |
|
|
215 |
baseline.bin <- variable.quantiles[1:2] |
|
|
216 |
baseline.bin.avg <- |
|
|
217 |
mean( |
|
|
218 |
# Take only those values of the variable which are in the range |
|
|
219 |
COHORT.use[ |
|
|
220 |
inRange(COHORT.use[, variable], baseline.bin, na.false = TRUE), |
|
|
221 |
variable |
|
|
222 |
] |
|
|
223 |
) |
|
|
224 |
# Then, scale it with the caliber scaling |
|
|
225 |
baseline.bin.val <- |
|
|
226 |
caliberScaleUnits(baseline.bin.avg, variable) * |
|
|
227 |
caliber.missing.coeffs$our_value[ |
|
|
228 |
caliber.missing.coeffs$quantity == variable |
|
|
229 |
] |
|
|
230 |
|
|
|
231 |
# And now, add all the discretised values to that value to make them |
|
|
232 |
# comparable... |
|
|
233 |
cph.coeffs[cph.coeffs$var == variable, c('val', 'lower', 'upper')] <- |
|
|
234 |
cph.coeffs[cph.coeffs$var == variable, c('val', 'lower', 'upper')] - |
|
|
235 |
baseline.bin.val |
|
|
236 |
|
|
|
237 |
# Now, plot this variable as a stepped line plot using those quantile |
|
|
238 |
# boundaries |
|
|
239 |
cox.discrete.plot <- |
|
|
240 |
ggplot( |
|
|
241 |
subset(cph.coeffs, var == variable), |
|
|
242 |
aes(x = bin.min, y = val) |
|
|
243 |
) + |
|
|
244 |
geom_step() + |
|
|
245 |
geom_step(aes(y = lower), colour = 'grey') + |
|
|
246 |
geom_step(aes(y = upper), colour = 'grey') + |
|
|
247 |
ggtitle(variable) |
|
|
248 |
|
|
|
249 |
# If there's a missing value risk, add it |
|
|
250 |
if(any(cph.coeffs$var == variable & cph.coeffs$level == 'missing')) { |
|
|
251 |
# Expand the x-axis to squeeze the missing values in |
|
|
252 |
x.axis.limits[2] <- |
|
|
253 |
x.axis.limits[2] + diff(x.data.range) * missing.padding |
|
|
254 |
# Put this missing value a third of the way into the missing area |
|
|
255 |
cph.coeffs$missing.x.pos.disc[ |
|
|
256 |
cph.coeffs$var == variable & |
|
|
257 |
cph.coeffs$level == 'missing'] <- |
|
|
258 |
x.axis.limits[2] + diff(x.data.range) * missing.padding / 3 |
|
|
259 |
|
|
|
260 |
# Add the point to the graph (we'll set axis limits later) |
|
|
261 |
cox.discrete.plot <- |
|
|
262 |
cox.discrete.plot + |
|
|
263 |
geom_pointrange( |
|
|
264 |
data = cph.coeffs[cph.coeffs$var == variable & |
|
|
265 |
cph.coeffs$level == 'missing', ], |
|
|
266 |
aes( |
|
|
267 |
x = missing.x.pos.disc, |
|
|
268 |
y = val, ymin = lower, |
|
|
269 |
ymax = upper |
|
|
270 |
), |
|
|
271 |
colour = 'red' |
|
|
272 |
) |
|
|
273 |
} |
|
|
274 |
|
|
|
275 |
# Now, let's add the line from the continuous Cox model. We only need two |
|
|
276 |
# points because the lines are straight! |
|
|
277 |
continuous.cox <- |
|
|
278 |
data.frame( |
|
|
279 |
var.x.values = x.data.range |
|
|
280 |
) |
|
|
281 |
# Scale the x-values |
|
|
282 |
continuous.cox$var.x.scaled <- |
|
|
283 |
caliberScaleUnits(continuous.cox$var.x.values, variable) |
|
|
284 |
# Use the risks to calculate risk per x for central estimate and errors |
|
|
285 |
continuous.cox$y <- |
|
|
286 |
-caliber.missing.coeffs$our_value[ |
|
|
287 |
caliber.missing.coeffs$quantity == variable |
|
|
288 |
] * continuous.cox$var.x.scaled |
|
|
289 |
continuous.cox$upper <- |
|
|
290 |
-caliber.missing.coeffs$our_upper[ |
|
|
291 |
caliber.missing.coeffs$quantity == variable |
|
|
292 |
] * continuous.cox$var.x.scaled |
|
|
293 |
continuous.cox$lower <- |
|
|
294 |
-caliber.missing.coeffs$our_lower[ |
|
|
295 |
caliber.missing.coeffs$quantity == variable |
|
|
296 |
] * continuous.cox$var.x.scaled |
|
|
297 |
|
|
|
298 |
cox.discrete.plot <- |
|
|
299 |
cox.discrete.plot + |
|
|
300 |
geom_line( |
|
|
301 |
data = continuous.cox, |
|
|
302 |
aes(x = var.x.values, y = y), |
|
|
303 |
colour = 'blue' |
|
|
304 |
) + |
|
|
305 |
geom_line( |
|
|
306 |
data = continuous.cox, |
|
|
307 |
aes(x = var.x.values, y = upper), |
|
|
308 |
colour = 'lightblue' |
|
|
309 |
) + |
|
|
310 |
geom_line( |
|
|
311 |
data = continuous.cox, |
|
|
312 |
aes(x = var.x.values, y = lower), |
|
|
313 |
colour = 'lightblue' |
|
|
314 |
) |
|
|
315 |
|
|
|
316 |
# If there is one, add missing value risk from the continuous model |
|
|
317 |
if(any(caliber.missing.coeffs$quantity == paste0(variable, '_missing') & |
|
|
318 |
caliber.missing.coeffs$unit == 'missing')) { |
|
|
319 |
# Expand the x-axis to squeeze the missing values in |
|
|
320 |
x.axis.limits[2] <- |
|
|
321 |
x.axis.limits[2] + diff(x.data.range) * missing.padding |
|
|
322 |
# Put this missing value 2/3rds of the way into the missing area |
|
|
323 |
cph.coeffs$missing.x.pos.cont[ |
|
|
324 |
cph.coeffs$var == variable & |
|
|
325 |
cph.coeffs$level == 'missing'] <- |
|
|
326 |
x.axis.limits[2] + diff(x.data.range) * missing.padding / 3 |
|
|
327 |
x.axis.limits[2] + 2 * diff(x.data.range) * missing.padding / 3 |
|
|
328 |
|
|
|
329 |
cox.discrete.plot <- |
|
|
330 |
cox.discrete.plot + |
|
|
331 |
geom_pointrange( |
|
|
332 |
data = cph.coeffs[ |
|
|
333 |
cph.coeffs$var == variable & |
|
|
334 |
cph.coeffs$level == 'missing', |
|
|
335 |
], |
|
|
336 |
aes( |
|
|
337 |
x = missing.x.pos.cont, |
|
|
338 |
y = val, ymin = lower, ymax = upper |
|
|
339 |
), |
|
|
340 |
colour = 'blue' |
|
|
341 |
) |
|
|
342 |
} |
|
|
343 |
|
|
|
344 |
# Finally, set the x-axis limits; will just be the data range, or data |
|
|
345 |
# range plus a bit if there are missing values to squeeze in |
|
|
346 |
cox.discrete.plot <- |
|
|
347 |
cox.discrete.plot + |
|
|
348 |
coord_cartesian(xlim = x.axis.limits) |
|
|
349 |
|
|
|
350 |
cox.discrete.plots[[variable]] <- cox.discrete.plot |
|
|
351 |
} |
|
|
352 |
} |
|
|
353 |
} |
|
|
354 |
|
|
|
355 |
print(cox.discrete.plots) |