#+ knitr_setup, include = FALSE
# Whether to cache the intensive code sections. Set to FALSE to recalculate
# everything afresh.
cacheoption <- FALSE
# Disable lazy caching globally, because it fails for large objects, and all the
# objects we wish to cache are large...
opts_chunk$set(cache.lazy = FALSE)
#' # Replicating Rapsomaniki _et al._ 2014
#'
#' ## User variables
#'
#' First, define some variables...
#+ define_vars
imputed.data.filename <- '../../data/COHORT_complete.rds'
n.data <- NA # Size of the full dataset; further rows may be excluded in prep
endpoint <- 'death.imputed'
old.coefficients.filename <- 'rapsomaniki-cox-values-from-paper.csv'
output.filename.base <- '../../output/caliber-replicate-imputed-survreg-4'
cox.var.imp.perm.filename <-
  '../../output/caliber-replicate-imputed-survreg-bootstrap-var-imp-perm-1.csv'
cox.var.imp.perm.missing.filename <-
  '../../output/caliber-replicate-with-missing-survreg-bootstrap-var-imp-perm-1.csv'
bootstraps <- 100
n.threads <- 10
#' ## Setup
#+ setup, message=FALSE
source('../lib/shared.R')
require(xtable)
require(ggrepel)
# Load the list of multiply imputed datasets (kept as data frames to make the
# column-selecting code in prepData simpler)
imputed.data <- readRDS(imputed.data.filename)
# Remove rows with death time of 0 to avoid fitting errors
for(i in 1:length(imputed.data)) {
  imputed.data[[i]] <-
    imputed.data[[i]][imputed.data[[i]][, surv.time] > 0, ]
}
# Define n.data based on the imputed data, which has already been preprocessed
n.data <- nrow(imputed.data[[1]])
# Define indices of test set
test.set <- testSetIndices(imputed.data[[1]], random.seed = 78361)
#' OK, we've now got **`r n.data`** patients, split into a training set of
#' `r n.data - length(test.set)` and a test set of `r length(test.set)`.
#'
#'
#' ## Transform variables
#'
#' The model uses variables which have been standardised in various ways, so
#' let's go through and transform our input variables in the same way...
source('caliber-scale.R')
# Remove the age splining
ageSpline <- identity
for(i in 1:length(imputed.data)) {
  imputed.data[[i]] <- caliberScale(imputed.data[[i]], surv.time, surv.event)
}
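#'
#' As an aside, the continuous predictors end up expressed per the increments
#' used in the paper (the same increments listed in the model formula comments
#' below). A hypothetical sketch of that kind of rescaling, purely for
#' illustration; the real transformations, including any centring, live in
#' `caliber-scale.R`:
#+ scaling_sketch, eval=FALSE
rescaleExample <- function(df) {
  # Illustrative only: divide by the per-unit increments from the paper
  df$pulse_6mo     <- df$pulse_6mo / 10       # heart rate per 10 b.p.m.
  df$crea_6mo      <- df$crea_6mo / 30        # creatinine per 30 μmol/L
  df$hdl_6mo       <- df$hdl_6mo / 0.5        # HDL per 0.5 mmol/L
  df$total_wbc_6mo <- df$total_wbc_6mo / 1.5  # white cells per 1.5 × 10⁹/L
  df
}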
#' ## Survival fitting
#'
#' Fit a Cox model to the preprocessed data. As in the paper, we use a
#' proportional hazards model with an exponential baseline hazard. The paper
#' calculated standard errors from 200 bootstrap samples; here we use
#' `r bootstraps` bootstrap samples per imputed dataset.
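#'
#' Aside: `survreg` with `dist = 'exponential'` reports coefficients on the
#' log(time) scale, so hazard ratios comparable to a Cox model's are
#' `exp(-coefficient)`; this is why the `negExp` transform is applied to the
#' pooled coefficients further down. A quick self-contained illustration (not
#' part of the analysis) using the `lung` data shipped with the survival
#' package:
#+ hazard_ratio_example, eval=FALSE
library(survival)
toy.fit <- survreg(
  Surv(time, status) ~ age + sex, data = lung, dist = 'exponential'
)
exp(-coef(toy.fit))  # hazard ratios, comparable to exp(coef(coxph(...)))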
#+ fit_cox_model, cache=cacheoption
surv.formula <-
  Surv(surv_time, surv_event) ~
    ### Sociodemographic characteristics #######################################
    ## Age in men, per year
    ## Age in women, per year
    ## Women vs. men
    # i.e. include the interaction between age and gender!
    age*gender +
    ## Most deprived quintile, yes vs. no
    most_deprived +
    ### SCAD diagnosis and severity ############################################
    ## Other CHD vs. stable angina
    ## Unstable angina vs. stable angina
    ## NSTEMI vs. stable angina
    ## STEMI vs. stable angina
    diagnosis +
    #diagnosis_missing +
    ## PCI in last 6 months, yes vs. no
    pci_6mo +
    ## CABG in last 6 months, yes vs. no
    cabg_6mo +
    ## Previous/recurrent MI, yes vs. no
    hx_mi +
    ## Use of nitrates, yes vs. no
    long_nitrate +
    ### CVD risk factors #######################################################
    ## Ex-smoker / current smoker / missing data vs. never
    smokstatus +
    ## Hypertension, present vs. absent
    hypertension +
    ## Diabetes mellitus, present vs. absent
    diabetes_logical +
    ## Total cholesterol, per 1 mmol/L increase
    total_chol_6mo +
    ## HDL, per 0.5 mmol/L increase
    hdl_6mo +
    ### CVD co-morbidities #####################################################
    ## Heart failure, present vs. absent
    heart_failure +
    ## Peripheral arterial disease, present vs. absent
    pad +
    ## Atrial fibrillation, present vs. absent
    hx_af +
    ## Stroke, present vs. absent
    hx_stroke +
    ### Non-CVD comorbidities ##################################################
    ## Chronic kidney disease, present vs. absent
    hx_renal +
    ## Chronic obstructive pulmonary disease, present vs. absent
    hx_copd +
    ## Cancer, present vs. absent
    hx_cancer +
    ## Chronic liver disease, present vs. absent
    hx_liver +
    ### Psychosocial characteristics ###########################################
    ## Depression at diagnosis, present vs. absent
    hx_depression +
    ## Anxiety at diagnosis, present vs. absent
    hx_anxiety +
    ### Biomarkers #############################################################
    ## Heart rate, per 10 b.p.m. increase
    pulse_6mo +
    ## Creatinine, per 30 μmol/L increase
    crea_6mo +
    ## White cell count, per 1.5 × 10⁹/L increase
    total_wbc_6mo +
    ## Haemoglobin, per 1.5 g/dL increase
    haemoglobin_6mo
# Do a quick and dirty fit on a single imputed dataset, from which to draw a
# calibration curve later
fit.exp <- survreg(
  formula = surv.formula,
  data = imputed.data[[1]][-test.set, ],
  dist = "exponential"
)
fit.exp.boot <- list()
# Perform bootstrap fitting for every multiply imputed dataset
for(i in 1:length(imputed.data)) {
  fit.exp.boot[[i]] <-
    boot(
      formula = surv.formula,
      data = imputed.data[[i]][-test.set, ],
      statistic = bootstrapFitSurvreg,
      R = bootstraps,
      parallel = 'multicore',
      ncpus = n.threads,
      test.data = imputed.data[[i]][test.set, ]
    )
}
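# For reference only: boot() calls its statistic as statistic(data, indices, ...),
# with extra arguments such as formula and test.data forwarded from the boot()
# call above. The real bootstrapFitSurvreg() is defined in ../lib/shared.R; a
# minimal statistic of that general shape (an illustrative sketch, not the
# actual helper) might refit the model on the resampled rows and return, say, a
# test-set C-index per replicate:
exampleBootStatistic <- function(data, indices, formula, test.data) {
  fit <- survreg(formula, data = data[indices, ], dist = 'exponential')
  c(c.test = concordance(fit, newdata = test.data)$concordance)
}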
# Save the fits, because it might've taken a while!
saveRDS(fit.exp.boot, paste0(output.filename.base, '-surv-boot-imp.rds'))
# Unpackage the uncertainties from the bootstrapped data
fit.exp.boot.ests <- bootMIStats(fit.exp.boot)
# Save bootstrapped performance values
varsToTable(
  data.frame(
    model = 'cox',
    imputation = TRUE,
    discretised = FALSE,
    c.index = fit.exp.boot.ests['c.index', 'val'],
    c.index.lower = fit.exp.boot.ests['c.index', 'lower'],
    c.index.upper = fit.exp.boot.ests['c.index', 'upper'],
    calibration.score = fit.exp.boot.ests['calibration.score', 'val'],
    calibration.score.lower = fit.exp.boot.ests['calibration.score', 'lower'],
    calibration.score.upper = fit.exp.boot.ests['calibration.score', 'upper']
  ),
  performance.file,
  index.cols = c('model', 'imputation', 'discretised')
)
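#'
#' Aside (illustration only): `bootMIStats()` lives in `../lib/shared.R`. One
#' simple pooling of this kind stacks the bootstrap replicate matrices from
#' every imputed dataset and takes percentile intervals per statistic; the
#' shared helper may pool differently.
#+ pooling_sketch, eval=FALSE
# Stack all bootstrap replicates across imputations, then summarise each column
stacked <- do.call(rbind, lapply(fit.exp.boot, function(b) b$t))
apply(stacked, 2, quantile, probs = c(0.025, 0.5, 0.975), na.rm = TRUE)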
#' ## Performance
#'
#' Having fitted the Cox model, how did we do? The c-indices were calculated as
#' part of the bootstrapping, so we just need to take a look at those...
#'
#' C-indices are **`r round(fit.exp.boot.ests['c.train', 'val'], 3)`
#' (`r round(fit.exp.boot.ests['c.train', 'lower'], 3)` -
#' `r round(fit.exp.boot.ests['c.train', 'upper'], 3)`)** on the training set and
#' **`r round(fit.exp.boot.ests['c.test', 'val'], 3)`
#' (`r round(fit.exp.boot.ests['c.test', 'lower'], 3)` -
#' `r round(fit.exp.boot.ests['c.test', 'upper'], 3)`)** on the test set.
#' Not too bad!
#'
#'
#' ### Calibration
#'
#' The bootstrapped calibration score is
#' **`r round(fit.exp.boot.ests['calibration.score', 'val'], 3)`
#' (`r round(fit.exp.boot.ests['calibration.score', 'lower'], 3)` -
#' `r round(fit.exp.boot.ests['calibration.score', 'upper'], 3)`)**.
#'
#' Let's draw a representative curve from the unbootstrapped fit... (It would be
#' better to draw all the curves from the bootstrap fit to get an idea of
#' variability, but I've not implemented this yet.)
#'
#+ calibration_plot
# Use the test rows of the imputed dataset that fit.exp was trained on
calibration.table <-
  calibrationTable(fit.exp, imputed.data[[1]][test.set, ])
calibration.score <- calibrationScore(calibration.table)
calibrationPlot(calibration.table)
#' The area between the calibration curve and the diagonal is
#' **`r round(calibration.score, 3)`**.
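#'
#' For intuition, here is a generic sketch of how a calibration table of this
#' kind can be built (illustration only; the real `calibrationTable()` and
#' `calibrationPlot()` are defined in the shared library, and the time horizon
#' below is an assumption): group patients by decile of predicted risk at a
#' fixed horizon and compare with the Kaplan-Meier observed risk.
#+ calibration_sketch, eval=FALSE
horizon <- 5 * 365.25  # assumed 5-year horizon, in days
test.df <- imputed.data[[1]][test.set, ]
# Under the exponential model, S(t) = exp(-t * exp(-lp)), so predicted risk at
# the horizon is 1 - S(horizon)
lp <- predict(fit.exp, newdata = test.df, type = 'lp')
pred.risk <- 1 - exp(-horizon * exp(-lp))
risk.decile <- cut(
  pred.risk, quantile(pred.risk, 0:10 / 10), include.lowest = TRUE
)
# Observed risk per decile, from Kaplan-Meier estimates at the horizon
obs.risk <- sapply(
  split(test.df, risk.decile),
  function(grp) {
    km <- survfit(Surv(surv_time, surv_event) ~ 1, data = grp)
    1 - summary(km, times = horizon, extend = TRUE)$surv
  }
)
plot(
  tapply(pred.risk, risk.decile, mean), obs.risk,
  xlab = 'Mean predicted risk', ylab = 'Observed risk (Kaplan-Meier)'
)
abline(0, 1)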
#'
#' ## Coefficients
#'
#' As well as obtaining comparable C-indices, it's worth checking how the
#' fitted risk coefficients compare to those reported in the original
#' paper. Let's compare...
# Load CSV of values from paper
old.coefficients <- read.csv(old.coefficients.filename)
# Get coefficients from this fit
new.coefficients <-
  bootMIStats(fit.exp.boot, uncertainty = '95ci', transform = negExp)
names(new.coefficients) <- c('our_value', 'our_lower', 'our_upper')
new.coefficients$quantity.level <- rownames(new.coefficients)
# Create a data frame comparing them
compare.coefficients <- merge(old.coefficients, new.coefficients)
# Kludge: age:genderWomen is the pure interaction term, not the risk for a
# woman per unit of advancing spline-transformed age, so multiply it by the
# main age effect to get a comparable quantity
compare.coefficients[
  compare.coefficients$quantity.level == 'age:genderWomen', 'our_value'
] <-
  compare.coefficients[
    compare.coefficients$quantity.level == 'age:genderWomen', 'our_value'
  ] *
  compare.coefficients[
    compare.coefficients$quantity.level == 'age', 'our_value'
  ]
# Save CSV of results (filename derived from output.filename.base)
write.csv(
  compare.coefficients, paste0(output.filename.base, '-coefficients.csv')
)
# Plot a graph by which to judge success
ggplot(compare.coefficients, aes(x = their_value, y = our_value)) +
  geom_abline(intercept = 0, slope = 1) +
  geom_hline(yintercept = 1, colour = 'grey') +
  geom_vline(xintercept = 1, colour = 'grey') +
  geom_point() +
  geom_errorbar(aes(ymin = our_lower, ymax = our_upper)) +
  geom_errorbarh(aes(xmin = their_lower, xmax = their_upper)) +
  geom_text_repel(aes(label = long_name)) +
  theme_classic(base_size = 8)
#+ coefficients_table, results='asis'
print(
  xtable(
    data.frame(
      variable =
        paste(
          compare.coefficients$long_name, compare.coefficients$unit, sep = ', '
        ),
      compare.coefficients[c('our_value', 'their_value')]
    ),
    digits = c(0, 0, 3, 3)
  ),
  type = 'html',
  include.rownames = FALSE
)
#' ### Variable importance
#'
#' Let's compare the variable importance from this method with that obtained
#' when missing values are modelled explicitly. This is a slight kludge, as it
#' uses only one imputed dataset and a fit based on another, but it should give
#' some idea.
#'
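#' For reference, permutation importance scores a variable by the drop in
#' test-set C-index after randomly permuting that variable's column. A generic
#' sketch of the idea (illustration only; the real `generalVarImp()` is defined
#' in the shared library and may differ in detail):
#+ perm_imp_sketch, eval=FALSE
permImportance <- function(fit, data, vars) {
  # Baseline concordance on unpermuted data
  base.c <- concordance(fit, newdata = data)$concordance
  sapply(vars, function(v) {
    shuffled <- data
    shuffled[[v]] <- sample(shuffled[[v]])
    base.c - concordance(fit, newdata = shuffled)$concordance
  })
}
# e.g. permImportance(fit.exp, imputed.data[[2]][test.set, ], c('age', 'pulse_6mo'))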
#+ cox_variable_importance
cox.var.imp.perm <-
  generalVarImp(
    fit.exp, imputed.data[[2]][test.set, ], model.type = 'survreg'
  )
write.csv(cox.var.imp.perm, cox.var.imp.perm.filename, row.names = FALSE)
cox.var.imp.perm.missing <- read.csv(cox.var.imp.perm.missing.filename)
cox.var.imp.comparison <-
  merge(
    cox.var.imp.perm,
    cox.var.imp.perm.missing,
    by = 'var',
    suffixes = c('', '.missing')
  )
ggplot(cox.var.imp.comparison, aes(x = var.imp.missing, y = var.imp)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10()
#' There's a good correlation!