# cox-ph/cox-discretised-imputed.R

#+ knitr_setup, include = FALSE

# Whether to cache the intensive code sections. Set to FALSE to recalculate
# everything afresh.
cacheoption <- FALSE
# Disable lazy caching globally, because it fails for large objects, and all the
# objects we wish to cache are large...
opts_chunk$set(cache.lazy = FALSE)

#' # Discrete Cox model with imputed data
#' 
#' The discrete Cox model performs very similarly to the normal Cox model, even
#' without performing imputation first. Let's try it with imputation and see
#' whether performance improves.
#' 
#' We're going to use the same parameters for discretising as were found for the
#' data with missing values. On the one hand, this is slightly unfair, because
#' those values may be suboptimal now that the data are imputed; on the other,
#' it allows a direct comparison. If we cross-validated again, we would be very
#' likely to find different parameters (there are far more combinations than can
#' plausibly be tried), which could make performance better or worse essentially
#' at random.

# Calibration from cross-validation performed on data before imputation
calibration.filename <- '../../output/survreg-crossvalidation-try4.csv'
# The first part of the filename for any output
output.filename.base <- '../../output/survreg-discrete-imputed-try1'
imputed.data.filename <- '../../data/COHORT_complete.rds'
boot.filename <- paste0(output.filename.base, '-boot.rds')

n.threads <- 20
bootstraps <- 50

model.type <- 'survreg'

#' ## Discretise data
#' 
#' First, let's load the results from the calibrations and find the parameters
#' for discretisation.

source('../lib/shared.R')

continuous.vars <-
  c(
    'age', 'total_chol_6mo', 'hdl_6mo', 'pulse_6mo', 'crea_6mo',
    'total_wbc_6mo', 'haemoglobin_6mo'
  )

# Read the file containing the calibrations
cv.performance <- read.csv(calibration.filename)

# Find the best calibration...
# First, average performance across cross-validation folds
cv.performance.average <-
  aggregate(
    c.index.val ~ calibration,
    data = cv.performance,
    mean
  )
# Then, find the calibration with the highest average validation c-index
best.calibration <-
  cv.performance.average$calibration[
    which.max(cv.performance.average$c.index.val)
  ]
# And finally, find the first row of that calibration to get the n.bins values
best.calibration.row1 <-
  min(which(cv.performance$calibration == best.calibration))

# Get its parameters
n.bins <-
  t(
    cv.performance[best.calibration.row1, continuous.vars]
  )

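#' As a sanity check on the logic above, here is a toy sketch (not evaluated)
#' of the layout this assumes for the cross-validation results: one row per
#' calibration per fold, a validation c-index column, and one column per
#' continuous variable giving its number of bins. The `cv.fold` column and all
#' the values below are made up purely for illustration.

#+ cv_results_layout_sketch, eval = FALSE
toy.cv <- data.frame(
  calibration = c(1, 1, 2, 2),       # calibration ID, repeated across folds
  cv.fold     = c(1, 2, 1, 2),       # hypothetical fold identifier
  c.index.val = c(0.71, 0.73, 0.74, 0.76),
  age         = c(10, 10, 15, 15),   # bins per variable for that calibration
  hdl_6mo     = c(5, 5, 8, 8)
)
# Averaging over folds and taking which.max would pick calibration 2 here
aggregate(c.index.val ~ calibration, data = toy.cv, mean)
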
# Reset process settings with the base settings
process.settings <-
  list(
    var        = c('anonpatid', 'time_death', 'imd_score', 'exclude'),
    method     = c(NA, NA, NA, NA),
    settings   = list(NA, NA, NA, NA)
  )
for(j in 1:length(continuous.vars)) {
  process.settings$var <- c(process.settings$var, continuous.vars[j])
  process.settings$method <- c(process.settings$method, 'binByQuantile')
  process.settings$settings <-
    c(
      process.settings$settings,
      list(
        seq(
          # Quantiles are between 0 and 1
          0, 1,
          # Use the number of bins from the best calibration found above
          length.out = n.bins[j]
        )
      )
    )
}
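
#' `binByQuantile` itself is defined in `../lib/shared.R`, so its actual
#' implementation isn't shown here. As a rough sketch of the behaviour assumed
#' above (not evaluated), binning by quantile amounts to cutting a continuous
#' variable at evenly spaced sample quantiles, giving roughly equal-sized bins:

#+ bin_by_quantile_sketch, eval = FALSE
x <- rnorm(1000, mean = 60, sd = 10)   # a fake continuous variable, eg age
breaks <- quantile(x, probs = seq(0, 1, length.out = 10), na.rm = TRUE)
x.binned <- cut(x, breaks = unique(breaks), include.lowest = TRUE)
table(x.binned)                        # roughly equal counts per bin
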
#' ## Load imputed data
#' 
#' Load the data and prepare it with the settings above

# Load the data from its RDS file
imputed.data <- readRDS(imputed.data.filename)

# Remove rows with death time of 0 to avoid fitting errors, and get the
# survival columns ready
for(i in 1:length(imputed.data)) {
  imputed.data[[i]] <- imputed.data[[i]][imputed.data[[i]][, surv.time] > 0, ]
  # Put in a dummy exclude column for the next function (exclusions have
  # already been applied to the imputed dataset)
  imputed.data[[i]]$exclude <- FALSE
  imputed.data[[i]]$imd_score <- as.numeric(imputed.data[[i]]$imd_score)
  imputed.data[[i]] <-
    prepData(
      imputed.data[[i]],
      cols.keep,
      process.settings,
      'time_death', 'endpoint_death', 1,
      extra.fun = caliberExtraPrep
    )
}

# Define test set
test.set <- testSetIndices(imputed.data[[1]], random.seed = 78361)

# Do a quick and dirty fit on a single imputed dataset, to draw a calibration
# curve from
time.start <- handyTimer()
surv.model.fit <-
  survivalFit(
    surv.predict,
    imputed.data[[1]][-test.set, ], # Training set from the first imputation
    model.type = model.type,
    n.threads = n.threads
  )
time.fit <- handyTimer(time.start)

# Perform bootstrap fitting for every multiply imputed dataset
surv.fit.boot <- list()
time.start <- handyTimer()
for(i in 1:length(imputed.data)) {
  surv.fit.boot[[i]] <-
    survivalBootstrap(
      surv.predict,
      imputed.data[[i]][-test.set, ],
      imputed.data[[i]][test.set, ],
      model.type = model.type,
      bootstraps = bootstraps,
      n.threads = n.threads
    )
}
time.boot <- handyTimer(time.start)

# Save the fits, because it might've taken a while!
saveRDS(surv.fit.boot, boot.filename)

#' Model fitted in `r round(time.fit)` seconds, and `r bootstraps` fits
#' performed on `r length(imputed.data)` imputed datasets in
#' `r round(time.boot)` seconds.

# Unpack the pooled estimates and uncertainties from the bootstrapped fits
surv.fit.boot.ests <- bootMIStats(surv.fit.boot)

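#' `bootMIStats` is defined in the shared library, so its exact behaviour
#' isn't shown here. As a hedged sketch of the kind of combination involved,
#' Rubin's rules pool a statistic across imputations by averaging the point
#' estimates and combining within- and between-imputation variance (all
#' numbers below are made up):

#+ rubin_pooling_sketch, eval = FALSE
est        <- c(0.78, 0.79, 0.77)     # eg c-index from each imputation
var.within <- c(4e-4, 5e-4, 4e-4)     # eg bootstrap variance within each
m <- length(est)
pooled     <- mean(est)
var.total  <- mean(var.within) + (1 + 1 / m) * var(est)
c(val   = pooled,
  lower = pooled - 1.96 * sqrt(var.total),
  upper = pooled + 1.96 * sqrt(var.total))
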
# Save bootstrapped performance values
varsToTable(
  data.frame(
    model = 'cox',
    imputation = TRUE,
    discretised = TRUE,
    c.index = surv.fit.boot.ests['c.test', 'val'],
    c.index.lower = surv.fit.boot.ests['c.test', 'lower'],
    c.index.upper = surv.fit.boot.ests['c.test', 'upper'],
    calibration.score = surv.fit.boot.ests['calibration.score', 'val'],
    calibration.score.lower = surv.fit.boot.ests['calibration.score', 'lower'],
    calibration.score.upper = surv.fit.boot.ests['calibration.score', 'upper']
  ),
  performance.file,
  index.cols = c('model', 'imputation', 'discretised')
)

#' # Results
#' 
#' ## Performance
#' 
#' Having fitted the Cox model, how did we do? The c-indices were calculated as
#' part of the bootstrapping, so we just need to take a look at those...
#' 
#' C-indices are **`r round(surv.fit.boot.ests['c.train', 'val'], 3)`
#' (`r round(surv.fit.boot.ests['c.train', 'lower'], 3)` -
#' `r round(surv.fit.boot.ests['c.train', 'upper'], 3)`)** on the training set and
#' **`r round(surv.fit.boot.ests['c.test', 'val'], 3)`
#' (`r round(surv.fit.boot.ests['c.test', 'lower'], 3)` -
#' `r round(surv.fit.boot.ests['c.test', 'upper'], 3)`)** on the test set.
#' 
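#' (As a reminder of what the c-index measures: over all comparable pairs of
#' patients, it is the fraction where the model's predicted risk ordering
#' matches the observed survival ordering. A minimal sketch, ignoring
#' censoring and using made-up numbers:)

#+ c_index_sketch, eval = FALSE
time <- c(5, 8, 3, 10, 7)              # observed survival times
risk <- c(0.9, 0.4, 0.8, 0.2, 0.5)     # higher risk should mean shorter time
pairs <- combn(length(time), 2)
concordant <- mapply(
  function(i, j) (time[i] - time[j]) * (risk[i] - risk[j]) < 0,
  pairs[1, ], pairs[2, ]
)
mean(concordant)                       # toy c-index
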
#' ### Calibration
#' 
#' The bootstrapped calibration score is
#' **`r round(surv.fit.boot.ests['calibration.score', 'val'], 3)`
#' (`r round(surv.fit.boot.ests['calibration.score', 'lower'], 3)` -
#' `r round(surv.fit.boot.ests['calibration.score', 'upper'], 3)`)**.
#' 
#' Let's draw a representative curve from the unbootstrapped fit... (It would be
#' better to draw all the curves from the bootstrap fit to get an idea of
#' variability, but I've not implemented this yet.)
#' 
#+ calibration_plot

calibration.table <-
  calibrationTable(surv.model.fit, imputed.data[[1]][test.set, ])

calibration.score <- calibrationScore(calibration.table)

calibrationPlot(calibration.table)

#' The area between the calibration curve and the diagonal is
#' **`r round(calibration.score[['area']], 3)`** +/-
#' **`r round(calibration.score[['se']], 3)`**.
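#' 
#' (`calibrationScore` comes from the shared library, so the exact method
#' isn't shown here. As a rough sketch of what "area between the calibration
#' curve and the diagonal" means, one could approximate it with a trapezoidal
#' rule over made-up risk bins:)

#+ calibration_area_sketch, eval = FALSE
predicted <- c(0.05, 0.15, 0.30, 0.50, 0.80)   # mean predicted risk per bin
observed  <- c(0.07, 0.13, 0.33, 0.45, 0.85)   # observed risk per bin
gap <- abs(observed - predicted)               # distance from the diagonal
# Trapezoidal integral of the gap along the predicted-risk axis
sum(diff(predicted) * (head(gap, -1) + tail(gap, -1)) / 2)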