Diff of /cox-ph/cox-discretised.R [000000] .. [0375db]

Switch to unified view

a b/cox-ph/cox-discretised.R
1
#+ knitr_setup, include = FALSE
2
3
# Whether to cache the intensive code sections. Set to FALSE to recalculate
4
# everything afresh.
5
cacheoption <- FALSE
6
# Disable lazy caching globally, because it fails for large objects, and all the
7
# objects we wish to cache are large...
8
opts_chunk$set(cache.lazy = FALSE)
9
10
#' # Cross-validating discretisation of input variables in a survival model
11
#' 
12
#' 
13
14
calibration.filename <- '../../output/survreg-crossvalidation-try5.csv'
15
caliber.missing.coefficients.filename <-
16
  '../../output/caliber-replicate-with-missing-survreg-bootstrap-coeffs-1.csv'
17
comparison.filename <-
18
  '../../output/caliber-replicate-with-missing-var-imp-try2.csv'
19
# The first part of the filename for any output
20
output.filename.base <- '../../output/all-cv-survreg-boot-try5'
21
22
23
# What kind of model to fit to...currently 'cph' (Cox model), 'ranger' or
24
# 'rfsrc' (two implementations of random survival forests)
25
model.type <- 'survreg'
26
27
# If surv.vars is defined as a character vector here, the model only uses those
28
# variables specified, eg c('age') would build a model purely based on age. If
29
# not specified (ie commented out), it will use the defaults.
30
# surv.predict <- c('age')
31
32
#' ## Do the cross-validation
33
#' 
34
#' The steps for this are common regardless of model type, so run the script to
35
#' get a cross-validated model to further analyse...
36
#+ cox_discretised_cv, cache=cacheoption
37
38
source('../lib/all-cv-bootstrap.R', chdir = TRUE)
39
40
#' # Results
41
#' 
42
#' ## Performance
43
#' 
44
#' ### C-index
45
#' 
46
#' C-index is **`r round(surv.model.fit.coeffs['c.index', 'val'], 3)`
47
#' (`r round(surv.model.fit.coeffs['c.index', 'lower'], 3)` - 
48
#' `r round(surv.model.fit.coeffs['c.index', 'upper'], 3)`)** on the held-out
49
#' test set.
50
#' 
51
#'
52
#' ### Calibration
53
#' 
54
#' The bootstrapped calibration score is
55
#' **`r round(surv.model.fit.coeffs['calibration.score', 'val'], 3)`
56
#' (`r round(surv.model.fit.coeffs['calibration.score', 'lower'], 3)` - 
57
#' `r round(surv.model.fit.coeffs['calibration.score', 'upper'], 3)`)**.
58
#' 
59
#' Let's draw a representative curve from the unbootstrapped fit... (It would be
60
#' better to draw all the curves from the bootstrap fit to get an idea of
61
#' variability, but I've not implemented this yet.)
62
#' 
63
#+ calibration_plot
64
65
calibration.table <-
66
  calibrationTable(surv.model.fit, COHORT.optimised[test.set, ])
67
68
calibration.score <- calibrationScore(calibration.table)
69
70
calibrationPlot(calibration.table)
71
72
#' 
73
#' ## Model fit
74
#'
75
#+ resulting_fit
76
77
print(surv.model.fit)
78
79
#' ## Cox coefficients
80
#'
81
#+ cox_coefficients_plot
82
83
84
# Save bootstrapped performance values
85
varsToTable(
86
  data.frame(
87
    model = 'cox',
88
    imputation = FALSE,
89
    discretised = TRUE,
90
    c.index = surv.model.fit.coeffs['c.index', 'val'],
91
    c.index.lower = surv.model.fit.coeffs['c.index', 'lower'],
92
    c.index.upper = surv.model.fit.coeffs['c.index', 'upper'],
93
    calibration.score = surv.model.fit.coeffs['calibration.score', 'val'],
94
    calibration.score.lower =
95
      surv.model.fit.coeffs['calibration.score', 'lower'],
96
    calibration.score.upper =
97
      surv.model.fit.coeffs['calibration.score', 'upper']
98
  ),
99
  performance.file,
100
  index.cols = c('model', 'imputation', 'discretised')
101
)
102
103
# Unpackage the uncertainties again, this time transformed because survreg
104
# returns negative values
105
surv.boot.ests <- bootStatsDf(surv.model.params.boot, transform = `-`)
106
107
#' First, plot the factors and logicals as a scatter plot to compare with the
108
#' continuous Cox model...
109
110
# Pull coefficients from model with missing data
111
caliber.missing.coeffs <- read.csv(caliber.missing.coefficients.filename)
112
113
# Rename surv.boot.ests ready for merging
114
names(surv.boot.ests) <-
115
  c('cox_discrete_value', 'cox_discrete_lower', 'cox_discrete_upper')
116
surv.boot.ests$quantity.level <- rownames(surv.boot.ests)
117
# Convert variablemissing to variable_missingTRUE for compatibility
118
vars.with.missing <- endsWith(surv.boot.ests$quantity.level, 'missing')
119
surv.boot.ests$quantity.level[vars.with.missing] <-
120
  paste0(
121
    substr(
122
      surv.boot.ests$quantity.level[vars.with.missing],
123
      1,
124
      nchar(surv.boot.ests$quantity.level[vars.with.missing]) - nchar('missing')
125
    ),
126
    '_missingTRUE'
127
  )
128
129
# Create a data frame comparing them
130
compare.coefficients <- merge(caliber.missing.coeffs, surv.boot.ests)
131
132
ggplot(
133
    compare.coefficients,
134
    aes(x = our_value, y = cox_discrete_value, colour = unit == 'missing')
135
  ) +
136
  geom_abline(intercept = 0, slope = 1) +
137
  geom_hline(yintercept = 1, colour = 'grey') +
138
  geom_vline(xintercept = 1, colour = 'grey') +
139
  geom_point() +
140
  geom_errorbar(aes(ymin = cox_discrete_lower, ymax = cox_discrete_upper)) +
141
  geom_errorbarh(aes(xmin = our_lower, xmax = our_upper)) +
142
  geom_text_repel(aes(label = long_name)) +
143
  theme_classic(base_size = 8)
144
145
# Unpack variable and level names
146
cph.coeffs <- cphCoeffs(
147
  bootStats(surv.model.fit.boot, uncertainty = '95ci', transform = `-`),
148
  COHORT.optimised, surv.predict, model.type = 'boot.survreg'
149
)
150
151
# We'll need the CALIBER scaling functions for plotting
152
source('../cox-ph/caliber-scale.R')
153
154
# set up list to store the plots
155
cox.discrete.plots <- list()
156
# Add dummy columns for x-position of missing values
157
cph.coeffs$missing.x.pos.cont <- NA
158
cph.coeffs$missing.x.pos.disc <- NA
159
160
for(variable in unique(cph.coeffs$var)) {
161
  # If it's a continuous variable, get the real centres of the bins
162
  if(variable %in% process.settings$var) {
163
    process.i <- which(variable == process.settings$var)
164
    
165
    if(process.settings$method[[process.i]] == 'binByQuantile') {
166
      
167
      variable.quantiles <-
168
        getQuantiles(
169
          COHORT.use[, variable],
170
          process.settings$settings[[process.i]]
171
        )
172
      # For those rows which relate to this variable, and whose level isn't
173
      # missing, put in the appropriate quantile boundaries for plotting
174
      cph.coeffs$bin.min[cph.coeffs$var == variable & 
175
                           cph.coeffs$level != 'missing'] <-
176
        variable.quantiles[1:(length(variable.quantiles) - 1)]
177
      cph.coeffs$bin.max[cph.coeffs$var == variable & 
178
                           cph.coeffs$level != 'missing'] <-
179
        variable.quantiles[2:length(variable.quantiles)]
180
      # Make the final bin the 99th percentile
181
      cph.coeffs$bin.max[cph.coeffs$var == variable & 
182
                           cph.coeffs$level != 'missing'][
183
                             length(variable.quantiles) - 1] <-
184
        quantile(COHORT.use[, variable], 0.99, na.rm = TRUE)
185
      
186
      # Add a fake data point at the highest value to finish the graph
187
      cph.coeffs <-
188
        rbind(
189
          cph.coeffs,
190
          cph.coeffs[cph.coeffs$var == variable & 
191
                       cph.coeffs$level != 'missing', ][
192
                         length(variable.quantiles) - 1, ]
193
        )
194
      # Change it so that bin.min is bin.max from the old one
195
      cph.coeffs$bin.min[nrow(cph.coeffs)] <-
196
        cph.coeffs$bin.max[cph.coeffs$var == variable & 
197
                             cph.coeffs$level != 'missing'][
198
                               length(variable.quantiles) - 1]
199
      
200
      # Work out data range by taking the 1st and 99th percentiles
201
      # Use the max to provide a max value for the final bin
202
      # Also use for x-axis limits, unless there are missing values to
203
      # accommodate on the right-hand edge.
204
      x.data.range <-
205
        quantile(COHORT.use[, variable], c(0.01, 0.99), na.rm = TRUE)
206
      x.axis.limits <- x.data.range
207
      
208
      
209
      # Finally, we need to scale this such that the baseline value is equal
210
      # to the value for the equivalent place in the Cox model, to make the
211
      # risks comparable...
212
      
213
      # First, we need to find the average value of this variable in the lowest
214
      # bin (which is always the baseline here)
215
      baseline.bin <- variable.quantiles[1:2]
216
      baseline.bin.avg <- 
217
        mean(
218
          # Take only those values of the variable which are in the range
219
          COHORT.use[
220
            inRange(COHORT.use[, variable], baseline.bin, na.false = TRUE),
221
            variable
222
            ]
223
        )
224
      # Then, scale it with the caliber scaling
225
      baseline.bin.val <-
226
        caliberScaleUnits(baseline.bin.avg, variable) * 
227
        caliber.missing.coeffs$our_value[
228
          caliber.missing.coeffs$quantity == variable
229
          ]
230
      
231
      # And now, add all the discretised values to that value to make them
232
      # comparable...
233
      cph.coeffs[cph.coeffs$var == variable, c('val', 'lower', 'upper')] <-
234
        cph.coeffs[cph.coeffs$var == variable, c('val', 'lower', 'upper')] -
235
        baseline.bin.val
236
      
237
      # Now, plot this variable as a stepped line plot using those quantile
238
      # boundaries
239
      cox.discrete.plot <-
240
        ggplot(
241
          subset(cph.coeffs, var == variable),
242
          aes(x = bin.min, y = val)
243
        ) +   
244
        geom_step() +
245
        geom_step(aes(y = lower), colour = 'grey') +
246
        geom_step(aes(y = upper), colour = 'grey') +
247
        ggtitle(variable)
248
      
249
      # If there's a missing value risk, add it
250
      if(any(cph.coeffs$var == variable & cph.coeffs$level == 'missing')) {
251
        # Expand the x-axis to squeeze the missing values in
252
        x.axis.limits[2] <- 
253
          x.axis.limits[2] + diff(x.data.range) * missing.padding
254
        # Put this missing value a third of the way into the missing area
255
        cph.coeffs$missing.x.pos.disc[
256
          cph.coeffs$var == variable &
257
            cph.coeffs$level == 'missing'] <-
258
          x.axis.limits[2] + diff(x.data.range) * missing.padding / 3
259
        
260
        # Add the point to the graph (we'll set axis limits later)
261
        cox.discrete.plot <-
262
          cox.discrete.plot +
263
          geom_pointrange(
264
            data = cph.coeffs[cph.coeffs$var == variable & 
265
                                cph.coeffs$level == 'missing', ],
266
            aes(
267
              x = missing.x.pos.disc,
268
              y = val, ymin = lower,
269
              ymax = upper
270
            ),
271
            colour = 'red'
272
          )
273
      }
274
      
275
      # Now, let's add the line from the continuous Cox model. We only need two
276
      # points because the lines are straight!
277
      continuous.cox <-
278
        data.frame(
279
          var.x.values = x.data.range
280
        )
281
      # Scale the x-values
282
      continuous.cox$var.x.scaled <-
283
        caliberScaleUnits(continuous.cox$var.x.values, variable)
284
      # Use the risks to calculate risk per x for central estimate and errors
285
      continuous.cox$y <-
286
        -caliber.missing.coeffs$our_value[
287
          caliber.missing.coeffs$quantity == variable
288
          ] * continuous.cox$var.x.scaled
289
      continuous.cox$upper <-
290
        -caliber.missing.coeffs$our_upper[
291
          caliber.missing.coeffs$quantity == variable
292
          ] * continuous.cox$var.x.scaled
293
      continuous.cox$lower <-
294
        -caliber.missing.coeffs$our_lower[
295
          caliber.missing.coeffs$quantity == variable
296
          ] * continuous.cox$var.x.scaled
297
      
298
      cox.discrete.plot <-
299
        cox.discrete.plot +
300
        geom_line(
301
          data = continuous.cox,
302
          aes(x = var.x.values, y = y),
303
          colour = 'blue'
304
        ) +
305
        geom_line(
306
          data = continuous.cox,
307
          aes(x = var.x.values, y = upper),
308
          colour = 'lightblue'
309
        ) +
310
        geom_line(
311
          data = continuous.cox,
312
          aes(x = var.x.values, y = lower),
313
          colour = 'lightblue'
314
        )
315
      
316
      # If there is one, add missing value risk from the continuous model
317
      if(any(caliber.missing.coeffs$quantity == paste0(variable, '_missing') &
318
             caliber.missing.coeffs$unit == 'missing')) {
319
        # Expand the x-axis to squeeze the missing values in
320
        x.axis.limits[2] <- 
321
          x.axis.limits[2] + diff(x.data.range) * missing.padding
322
        # Put this missing value 2/3rds of the way into the missing area
323
        cph.coeffs$missing.x.pos.cont[
324
          cph.coeffs$var == variable &
325
            cph.coeffs$level == 'missing'] <-
326
          x.axis.limits[2] + diff(x.data.range) * missing.padding / 3
327
        x.axis.limits[2] + 2 * diff(x.data.range) * missing.padding / 3
328
        
329
        cox.discrete.plot <-
330
          cox.discrete.plot +
331
          geom_pointrange(
332
            data = cph.coeffs[
333
              cph.coeffs$var == variable &
334
                cph.coeffs$level == 'missing',
335
              ],
336
            aes(
337
              x = missing.x.pos.cont,
338
              y = val, ymin = lower, ymax = upper
339
            ),
340
            colour = 'blue'
341
          )
342
      }
343
      
344
      # Finally, set the x-axis limits; will just be the data range, or data
345
      # range plus a bit if there are missing values to squeeze in
346
      cox.discrete.plot <-
347
        cox.discrete.plot +
348
        coord_cartesian(xlim = x.axis.limits)
349
      
350
      cox.discrete.plots[[variable]] <- cox.discrete.plot
351
    }
352
  }
353
}
354
355
print(cox.discrete.plots)