Switch to unified view

a b/cox-ph/caliber-replicate-with-imputation.R
1
#+ knitr_setup, include = FALSE
2
3
# Whether to cache the intensive code sections. Set to FALSE to recalculate
4
# everything afresh.
5
cacheoption <- FALSE
6
# Disable lazy caching globally, because it fails for large objects, and all the
7
# objects we wish to cache are large...
8
opts_chunk$set(cache.lazy = FALSE)
9
10
#' # Replicating Rapsomaniki _et al._ 2014
11
#' 
12
#' ## User variables
13
#' 
14
#' First, define some variables...
15
16
#+ define_vars
17
18
imputed.data.filename <- '../../data/COHORT_complete.rds'
19
n.data <- NA # This is of full dataset...further rows may be excluded in prep
20
endpoint <- 'death.imputed'
21
22
old.coefficients.filename <- 'rapsomaniki-cox-values-from-paper.csv'
23
24
output.filename.base <- '../../output/caliber-replicate-imputed-survreg-4'
25
26
27
cox.var.imp.perm.filename <-
28
  '../../output/caliber-replicate-imputed-survreg-bootstrap-var-imp-perm-1.csv'
29
cox.var.imp.perm.missing.filename <-
30
  '../../output/caliber-replicate-with-missing-survreg-bootstrap-var-imp-perm-1.csv'
31
32
33
bootstraps <- 100
34
n.threads <- 10
35
36
#' ## Setup
37
38
#+ setup, message=FALSE
39
40
source('../lib/shared.R')
41
require(xtable)
42
require(ggrepel)
43
44
# Load the data and convert to data frame to make column-selecting code in
45
# prepData simpler
46
imputed.data <- readRDS(imputed.data.filename)
47
48
# Remove rows with death time of 0 to avoid fitting errors
49
for(i in 1:length(imputed.data)) {
50
  imputed.data[[i]] <- imputed.data[[i]][imputed.data[[i]][, surv.time] > 0, ]
51
}
52
53
# Define n.data based on the imputed data, which has already been preprocessed
54
n.data <- nrow(imputed.data[[1]])
55
# Define indices of test set
56
test.set <- testSetIndices(imputed.data[[1]], random.seed = 78361)
57
58
#' OK, we've now got **`r n.data`** patients, split into a training set of
59
#' `r n.data - length(test.set)` and a test set of `r length(test.set)`.
60
#'
61
#'
62
#' ## Transform variables
63
#' 
64
#' The model uses variables which have been standardised in various ways, so
65
#' let's go through and transform our input variables in the same way...
66
67
source('caliber-scale.R')
68
# Remove the age splining
69
ageSpline <- identity
70
71
for(i in 1:length(imputed.data)) {
72
  imputed.data[[i]] <- caliberScale(imputed.data[[i]], surv.time, surv.event)
73
}
74
75
#' ## Survival fitting
76
#' 
77
#' Fit a Cox model to the preprocessed data. The paper uses a Cox model with an
78
#' exponential baseline hazard, as here. The standard errors were calculated
79
#' with 200 bootstrap samples, which we're also doing here.
80
81
#+ fit_cox_model, cache=cacheoption
82
83
surv.formula <-
84
  Surv(surv_time, surv_event) ~
85
    ### Sociodemographic characteristics #######################################
86
    ## Age in men, per year
87
    ## Age in women, per year
88
    ## Women vs. men
89
    # ie include interaction between age and gender!
90
    age*gender +
91
    ## Most deprived quintile, yes vs. no
92
    most_deprived +
93
    ### SCAD diagnosis and severity ############################################
94
    ## Other CHD vs. stable angina
95
    ## Unstable angina vs. stable angina
96
    ## NSTEMI vs. stable angina
97
    ## STEMI vs. stable angina
98
    diagnosis +
99
    #diagnosis_missing +
100
    ## PCI in last 6 months, yes vs. no
101
    pci_6mo +
102
    ## CABG in last 6 months, yes vs. no
103
    cabg_6mo +
104
    ## Previous/recurrent MI, yes vs. no
105
    hx_mi +
106
    ## Use of nitrates, yes vs. no
107
    long_nitrate +
108
    ### CVD risk factors #######################################################
109
    ## Ex-smoker / current smoker / missing data vs. never
110
    smokstatus +
111
    ## Hypertension, present vs. absent
112
    hypertension +
113
    ## Diabetes mellitus, present vs. absent
114
    diabetes_logical +
115
    ## Total cholesterol, per 1 mmol/L increase
116
    total_chol_6mo +
117
    ## HDL, per 0.5 mmol/L increase
118
    hdl_6mo +
119
    ### CVD co-morbidities #####################################################
120
    ## Heart failure, present vs. absent
121
    heart_failure +
122
    ## Peripheral arterial disease, present vs. absent
123
    pad +
124
    ## Atrial fibrillation, present vs. absent
125
    hx_af +
126
    ## Stroke, present vs. absent
127
    hx_stroke +
128
    ### Non-CVD comorbidities ##################################################
129
    ## Chronic kidney disease, present vs. absent
130
    hx_renal +
131
    ## Chronic obstructive pulmonary disease, present vs. absent
132
    hx_copd +
133
    ## Cancer, present vs. absent
134
    hx_cancer +
135
    ## Chronic liver disease, present vs. absent
136
    hx_liver +
137
    ### Psychosocial characteristics ###########################################
138
    ## Depression at diagnosis, present vs. absent
139
    hx_depression +
140
    ## Anxiety at diagnosis, present vs. absent
141
    hx_anxiety +
142
    ### Biomarkers #############################################################
143
    ## Heart rate, per 10 b.p.m increase
144
    pulse_6mo +
145
    ## Creatinine, per 30 μmol/L increase
146
    crea_6mo +
147
    ## White cell count, per 1.5 109/L increase
148
    total_wbc_6mo +
149
    ## Haemoglobin, per 1.5 g/dL increase
150
    haemoglobin_6mo
151
152
# Do a quick and dirty fit on a single imputed dataset, to draw calibration
153
# curve from
154
fit.exp <- survreg(
155
  formula = surv.formula,
156
  data = imputed.data[[1]][-test.set, ],
157
  dist = "exponential"
158
)
159
160
fit.exp.boot <- list()
161
162
# Perform bootstrap fitting for every multiply imputed dataset
163
for(i in 1:length(imputed.data)) {
164
  fit.exp.boot[[i]] <-
165
    boot(
166
      formula = surv.formula,
167
      data = imputed.data[[i]][-test.set, ],
168
      statistic = bootstrapFitSurvreg,
169
      R = bootstraps,
170
      parallel = 'multicore',
171
      ncpus = n.threads,
172
      test.data = imputed.data[[i]][test.set, ]
173
    )
174
}
175
176
# Save the fits, because it might've taken a while!
177
saveRDS(fit.exp.boot, paste0(output.filename.base, '-surv-boot-imp.rds'))
178
179
# Unpackage the uncertainties from the bootstrapped data
180
fit.exp.boot.ests <-  bootMIStats(fit.exp.boot)
181
182
# Save bootstrapped performance values
183
varsToTable(
184
  data.frame(
185
    model = 'cox',
186
    imputation = TRUE,
187
    discretised = FALSE,
188
    c.index = fit.exp.boot.ests['c.index', 'val'],
189
    c.index.lower = fit.exp.boot.ests['c.index', 'lower'],
190
    c.index.upper = fit.exp.boot.ests['c.index', 'upper'],
191
    calibration.score = fit.exp.boot.ests['calibration.score', 'val'],
192
    calibration.score.lower = fit.exp.boot.ests['calibration.score', 'lower'],
193
    calibration.score.upper = fit.exp.boot.ests['calibration.score', 'upper']
194
  ),
195
  performance.file,
196
  index.cols = c('model', 'imputation', 'discretised')
197
)
198
199
#' ## Performance
200
#' 
201
#' Having fitted the Cox model, how did we do? The c-indices were calculated as
202
#' part of the bootstrapping, so we just need to take a look at those...
203
#' 
204
#' C-indices are **`r round(fit.exp.boot.ests['c.train', 'val'], 3)`
205
#' (`r round(fit.exp.boot.ests['c.train', 'lower'], 3)` - 
206
#' `r round(fit.exp.boot.ests['c.train', 'upper'], 3)`)** on the training set and
207
#' **`r round(fit.exp.boot.ests['c.test', 'val'], 3)`
208
#' (`r round(fit.exp.boot.ests['c.test', 'lower'], 3)` - 
209
#' `r round(fit.exp.boot.ests['c.test', 'upper'], 3)`)** on the test set.
210
#' Not too bad!
211
#' 
212
#' 
213
#' ### Calibration
214
#' 
215
#' The bootstrapped calibration score is
216
#' **`r round(fit.exp.boot.ests['calibration.score', 'val'], 3)`
217
#' (`r round(fit.exp.boot.ests['calibration.score', 'lower'], 3)` - 
218
#' `r round(fit.exp.boot.ests['calibration.score', 'upper'], 3)`)**.
219
#' 
220
#' Let's draw a representative curve from the unbootstrapped fit... (It would be
221
#' better to draw all the curves from the bootstrap fit to get an idea of
222
#' variability, but I've not implemented this yet.)
223
#' 
224
#+ calibration_plot
225
226
calibration.table <-
227
  calibrationTable(fit.exp, imputed.data[[i]][test.set, ])
228
229
calibration.score <- calibrationScore(calibration.table)
230
231
calibrationPlot(calibration.table)
232
233
#' The area between the calibration curve and the diagonal is 
234
#' **`r round(calibration.score, 3)`**.
235
#'  
236
#' ## Coefficients
237
#' 
238
#' As well as getting comparable C-indices, it's also worth checking to see how
239
#' the risk coefficients calculated compare to those found in the original
240
#' paper. Let's compare...
241
242
# Load CSV of values from paper
243
old.coefficients <- read.csv(old.coefficients.filename)
244
245
# Get coefficients from this fit
246
new.coefficients <-
247
  bootMIStats(fit.exp.boot, uncertainty = '95ci', transform = negExp)
248
names(new.coefficients) <- c('our_value', 'our_lower', 'our_upper')
249
new.coefficients$quantity.level <- rownames(new.coefficients)
250
251
# Create a data frame comparing them
252
compare.coefficients <- merge(old.coefficients, new.coefficients)
253
254
# Kludge because age:genderWomen is the pure interaction term, not the risk for
255
# a woman per unit of advancing spline-transformed age
256
compare.coefficients[
257
  compare.coefficients$quantity.level == 'age:genderWomen', 'our_value'
258
  ] <-
259
  compare.coefficients[
260
    compare.coefficients$quantity.level == 'age:genderWomen', 'our_value'
261
    ] *
262
  compare.coefficients[
263
    compare.coefficients$quantity.level == 'age', 'our_value'
264
    ]
265
266
# Save CSV of results
267
write.csv(compare.coefficients, output.filename)
268
269
# Plot a graph by which to judge success
270
ggplot(compare.coefficients, aes(x = their_value, y = our_value)) +
271
  geom_abline(intercept = 0, slope = 1) +
272
  geom_hline(yintercept = 1, colour = 'grey') +
273
  geom_vline(xintercept = 1, colour = 'grey') +
274
  geom_point() +
275
  geom_errorbar(aes(ymin = our_lower, ymax = our_upper)) +
276
  geom_errorbarh(aes(xmin = their_lower, xmax = their_upper)) +
277
  geom_text_repel(aes(label = long_name)) +
278
  theme_classic(base_size = 8)
279
280
#+ coefficients_table, results='asis'
281
282
print(
283
  xtable(
284
    data.frame(
285
      variable =
286
        paste(
287
          compare.coefficients$long_name, compare.coefficients$unit, sep=', '
288
        ),
289
      compare.coefficients[c('our_value', 'their_value')]
290
    ),
291
    digits = c(0,0,3,3)
292
  ),
293
  type = 'html',
294
  include.rownames = FALSE
295
)
296
297
#' ### Variable importance
298
#' 
299
#' Let's compare the variable importance from this method with accounting for
300
#' missing values explicitly. Slight kludge as it's only using one imputed
301
#' dataset and a fit based on another, but should give some idea.
302
#' 
303
#+ cox_variable_importance
304
305
cox.var.imp.perm <- 
306
  generalVarImp(
307
    fit.exp, imputed.data[[2]][test.set, ], model.type = 'survreg'
308
  )
309
310
write.csv(cox.var.imp.perm, cox.var.imp.perm.filename, row.names = FALSE)
311
312
cox.var.imp.perm.missing <- read.csv(cox.var.imp.perm.missing.filename)
313
314
cox.var.imp.comparison <-
315
  merge(
316
    cox.var.imp.perm,
317
    cox.var.imp.perm.missing,
318
    by = 'var',
319
    suffixes = c('', '.missing')
320
  )
321
322
ggplot(cox.var.imp.comparison, aes(x = var.imp.missing, y = var.imp)) +
323
  geom_point() +
324
  scale_x_log10() +
325
  scale_y_log10()
326
327
#' There's a good correlation!