# cox-ph/cox-discretised-imputed.R

#+ knitr_setup, include = FALSE
# Whether to cache the intensive code sections. Set to FALSE to recalculate
# everything afresh.
cacheoption <- FALSE
# Disable lazy caching globally, because it fails for large objects, and all the
# objects we wish to cache are large...
library(knitr) # opts_chunk is provided by knitr
opts_chunk$set(cache.lazy = FALSE)
#' # Discrete Cox model with imputed data
#'
#' The discrete Cox model performs very similarly to the normal Cox model, even
#' without performing imputation first. Let's try it with imputation, and see
#' if the performance is boosted.
#'
#' We're going to use the same parameters for discretising as were found for the
#' data with missing values. On the one hand, this is slightly unfair because
#' these values might be suboptimal now that the data are imputed but, on the
#' other, it does allow for direct comparison. If we cross-validated again, we
#' would be very likely to find different parameters (there are far more
#' combinations than could plausibly be tried), which could make performance
#' better or worse entirely at random.
# Calibration from cross-validation performed on data before imputation
calibration.filename <- '../../output/survreg-crossvalidation-try4.csv'
# The first part of the filename for any output
output.filename.base <- '../../output/survreg-discrete-imputed-try1'
imputed.data.filename <- '../../data/COHORT_complete.rds'
boot.filename <- paste0(output.filename.base, '-boot.rds')
n.threads <- 20
bootstraps <- 50
model.type <- 'survreg'
#' ## Discretise data
#'
#' First, let's load the results from the calibrations and find the parameters
#' for discretisation.
source('../lib/shared.R')
continuous.vars <-
  c(
    'age', 'total_chol_6mo', 'hdl_6mo', 'pulse_6mo', 'crea_6mo',
    'total_wbc_6mo', 'haemoglobin_6mo'
  )
# read file containing calibrations
cv.performance <- read.csv(calibration.filename)
# Find the best calibration...
# First, average performance across cross-validation folds
cv.performance.average <-
  aggregate(
    c.index.val ~ calibration,
    data = cv.performance,
    mean
  )
# Find the highest value
best.calibration <-
  cv.performance.average$calibration[
    which.max(cv.performance.average$c.index.val)
  ]
# And finally, find the first row of that calibration to get the n.bins values
best.calibration.row1 <-
  min(which(cv.performance$calibration == best.calibration))
# Get its parameters
n.bins <-
  t(
    cv.performance[best.calibration.row1, continuous.vars]
  )
# Reset process settings to the base settings
process.settings <-
  list(
    var = c('anonpatid', 'time_death', 'imd_score', 'exclude'),
    method = c(NA, NA, NA, NA),
    settings = list(NA, NA, NA, NA)
  )
for(j in 1:length(continuous.vars)) {
  process.settings$var <- c(process.settings$var, continuous.vars[j])
  process.settings$method <- c(process.settings$method, 'binByQuantile')
  process.settings$settings <-
    c(
      process.settings$settings,
      list(
        seq(
          # Quantiles are between 0 and 1
          0, 1,
          # The number of breaks comes from the best calibration found above
          # (see the illustrative sketch after this loop)
          length.out = n.bins[j]
        )
      )
    )
}
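#'
#' `binByQuantile` is defined in the project's `../lib/shared.R`, so its
#' implementation isn't shown here. Purely as an illustrative sketch (not the
#' project's code, and not evaluated when this script is knitted), quantile
#' binning with a given number of breaks might look something like this in
#' base R:
#+ quantile_binning_sketch, eval = FALSE
# Toy continuous variable standing in for one of the columns above
x <- rnorm(1000, mean = 5, sd = 2)
# Quantile positions, mirroring seq(0, 1, length.out = n.bins[j]) above
breaks.quantiles <- seq(0, 1, length.out = 10)
# Convert quantile positions into cut points on the scale of x; unique() guards
# against duplicate cut points when the variable has many tied values
cut.points <- unique(quantile(x, probs = breaks.quantiles, na.rm = TRUE))
# Discretise: each value of x becomes the factor level of the bin it falls in
x.binned <- cut(x, breaks = cut.points, include.lowest = TRUE)
table(x.binned)
#'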
#' ## Load imputed data
#'
#' Load the data and prepare it with the settings above
# Load the data from its RDS file
imputed.data <- readRDS(imputed.data.filename)
# Remove rows with death time of 0 to avoid fitting errors, and get the survival
# columns ready
for(i in 1:length(imputed.data)) {
  imputed.data[[i]] <- imputed.data[[i]][imputed.data[[i]][, surv.time] > 0, ]
  # Put in a dummy exclude column for the next function (the exclusions have
  # already been applied in the imputed dataset)
  imputed.data[[i]]$exclude <- FALSE
  imputed.data[[i]]$imd_score <- as.numeric(imputed.data[[i]]$imd_score)
  imputed.data[[i]] <-
    prepData(
      imputed.data[[i]],
      cols.keep,
      process.settings,
      'time_death', 'endpoint_death', 1,
      extra.fun = caliberExtraPrep
    )
}
# Define test set
test.set <- testSetIndices(imputed.data[[1]], random.seed = 78361)
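#'
#' (`testSetIndices` comes from the project library; as a non-authoritative
#' sketch of the kind of split it performs, not evaluated here, a simple random
#' split by row index might look like the following. The test fraction is an
#' assumption for illustration, not necessarily the project's choice.)
#+ test_split_sketch, eval = FALSE
# Illustrative only: reserve a random subset of rows as a test set
n.rows <- nrow(imputed.data[[1]])
test.fraction <- 1 / 3 # assumed fraction for illustration
test.set.sketch <- sample(n.rows, round(n.rows * test.fraction))
# The remaining rows, -test.set.sketch, would form the training set
#'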
# Do a quick and dirty fit on a single imputed dataset, to draw a calibration
# curve from
time.start <- handyTimer()
surv.model.fit <-
  survivalFit(
    surv.predict,
    imputed.data[[1]][-test.set, ], # training set from the first imputed dataset
    model.type = model.type,
    n.threads = n.threads
  )
time.fit <- handyTimer(time.start)
# Perform bootstrap fitting for every multiply imputed dataset
surv.fit.boot <- list()
time.start <- handyTimer()
for(i in 1:length(imputed.data)) {
  surv.fit.boot[[i]] <-
    survivalBootstrap(
      surv.predict,
      imputed.data[[i]][-test.set, ],
      imputed.data[[i]][test.set, ],
      model.type = model.type,
      bootstraps = bootstraps,
      n.threads = n.threads
    )
}
time.boot <- handyTimer(time.start)
# Save the fits, because it might've taken a while!
saveRDS(surv.fit.boot, boot.filename)
#' Model fitted in `r round(time.fit)` seconds, and `r bootstraps` fits
#' performed on `r length(imputed.data)` imputed datasets in
#' `r round(time.boot)` seconds.
# Unpackage the uncertainties from the bootstrapped data
surv.fit.boot.ests <- bootMIStats(surv.fit.boot)
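#'
#' `bootMIStats` is a project function whose details aren't shown in this
#' script. As a rough, non-authoritative sketch of the kind of pooling
#' involved (not evaluated here), per-imputation estimates and bootstrap
#' variances can be combined with Rubin's rules, using a simple normal
#' approximation for the interval:
#+ rubins_rules_sketch, eval = FALSE
# Illustrative only: pool a statistic estimated on m imputed datasets.
# 'estimates' are per-imputation point estimates (e.g. bootstrap means of the
# test-set C-index); 'variances' are the corresponding bootstrap variances.
poolRubin <- function(estimates, variances) {
  m <- length(estimates)
  q.bar <- mean(estimates)              # pooled point estimate
  u.bar <- mean(variances)              # average within-imputation variance
  b <- var(estimates)                   # between-imputation variance
  total.var <- u.bar + (1 + 1 / m) * b  # total variance by Rubin's rules
  c(
    val = q.bar,
    lower = q.bar - 1.96 * sqrt(total.var),
    upper = q.bar + 1.96 * sqrt(total.var)
  )
}
# e.g. poolRubin(c(0.78, 0.79, 0.77), c(1e-4, 1.2e-4, 9e-5))
#'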
# Save bootstrapped performance values
varsToTable(
  data.frame(
    model = 'cox',
    imputation = TRUE,
    discretised = TRUE,
    c.index = surv.fit.boot.ests['c.test', 'val'],
    c.index.lower = surv.fit.boot.ests['c.test', 'lower'],
    c.index.upper = surv.fit.boot.ests['c.test', 'upper'],
    calibration.score = surv.fit.boot.ests['calibration.score', 'val'],
    calibration.score.lower = surv.fit.boot.ests['calibration.score', 'lower'],
    calibration.score.upper = surv.fit.boot.ests['calibration.score', 'upper']
  ),
  performance.file,
  index.cols = c('model', 'imputation', 'discretised')
)
#' # Results
#'
#' ## Performance
#'
#' Having fitted the Cox model, how did we do? The c-indices were calculated as
#' part of the bootstrapping, so we just need to take a look at those...
#'
#' C-indices are **`r round(surv.fit.boot.ests['c.train', 'val'], 3)`
#' (`r round(surv.fit.boot.ests['c.train', 'lower'], 3)` -
#' `r round(surv.fit.boot.ests['c.train', 'upper'], 3)`)** on the training set and
#' **`r round(surv.fit.boot.ests['c.test', 'val'], 3)`
#' (`r round(surv.fit.boot.ests['c.test', 'lower'], 3)` -
#' `r round(surv.fit.boot.ests['c.test', 'upper'], 3)`)** on the test set.
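#'
#' (The c-indices above come from the project's bootstrap helpers. Purely for
#' reference, and not evaluated here, a concordance index can also be computed
#' directly with the survival package; this sketch uses a toy model on the
#' built-in `lung` data rather than the fits above.)
#+ c_index_sketch, eval = FALSE
library(survival)
# Fit a simple Cox model on the package's example data
toy.fit <- coxph(Surv(time, status) ~ age + sex, data = lung)
# concordance() reports Harrell's C and its standard error
concordance(toy.fit)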
#'
#'
#' ### Calibration
#'
#' The bootstrapped calibration score is
#' **`r round(surv.fit.boot.ests['calibration.score', 'val'], 3)`
#' (`r round(surv.fit.boot.ests['calibration.score', 'lower'], 3)` -
#' `r round(surv.fit.boot.ests['calibration.score', 'upper'], 3)`)**.
#'
#' Let's draw a representative curve from the unbootstrapped fit... (It would be
#' better to draw all the curves from the bootstrap fit to get an idea of
#' variability, but I've not implemented this yet.)
#'
#+ calibration_plot
calibration.table <-
  calibrationTable(surv.model.fit, imputed.data[[1]][test.set, ])
calibration.score <- calibrationScore(calibration.table)
calibrationPlot(calibration.table)
#' The area between the calibration curve and the diagonal is
#' **`r round(calibration.score[['area']], 3)`** +/-
#' **`r round(calibration.score[['se']], 3)`**.
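#'
#' (`calibrationScore` is a project function; for intuition only, and not
#' evaluated here, the quantity it reports, the area between the calibration
#' curve and the diagonal, could be approximated as in the sketch below. The
#' toy table and its column names are assumptions, not the actual output of
#' `calibrationTable`.)
#+ calibration_area_sketch, eval = FALSE
# Illustrative only: area between a calibration curve and the line of identity,
# approximated by the trapezium rule over (predicted risk, observed risk) pairs
toy.calibration <- data.frame(
  risk.predicted = seq(0.05, 0.95, by = 0.1),
  risk.observed  = c(0.04, 0.13, 0.22, 0.36, 0.44, 0.58, 0.63, 0.77, 0.82, 0.97)
)
# Absolute deviation from the diagonal at each point on the curve
deviation <- abs(toy.calibration$risk.observed - toy.calibration$risk.predicted)
# Trapezium-rule integral of the deviation over predicted risk
dx <- diff(toy.calibration$risk.predicted)
area <- sum(dx * (head(deviation, -1) + tail(deviation, -1)) / 2)
area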