#+ knitr_setup, include = FALSE
# Whether to cache the intensive code sections. Set to FALSE to recalculate
# everything afresh.
cacheoption <- FALSE
# Disable lazy caching globally, because it fails for large objects, and all the
# objects we wish to cache are large...
opts_chunk$set(cache.lazy = FALSE)
#' # Replicating Rapsomaniki _et al._ 2014 without imputation
#'
#' Rapsomaniki and co-workers' paper builds a Cox proportional hazards model to
#' predict time to death for patients, using electronic health record data.
#'
#' Because such data
#' are gathered as part of routine clinical care, some values may be missing.
#' For example, covariates include blood tests such as creatinine level, and use
#' the most recent value no older than six months. A patient may simply not have
#' had this test in that time period.
#'
#' The approach adopted in this situation, in common with many similar
#' studies, is to impute missing values: in essence, finding patients whose
#' other values are similar to make a best guess at the likely value of a
#' missing datapoint. However, in the case of health records data,
#' missingness can itself be informative: the fact that a certain test wasn't
#' ordered could mean that a doctor didn't think it necessary, so the very
#' absence of a value carries information.
#'
#' Whilst there are many approaches which could be employed here, this program
#' represents arguably the simplest: we keep the Cox model as close to the
#' published version as possible but, instead of imputing the missing data, we
#' include it explicitly.
#'
#' ## User variables
#'
#' First, define some variables...
#+ define_vars
n.data <- NA # Size of sample from the full dataset; NA = use all. Further rows may be excluded in prep.
endpoint <- 'death'
old.coefficients.filename <- 'rapsomaniki-cox-values-from-paper.csv'
output.filename.base <- '../../output/caliber-replicate-with-missing-survreg-6-linear-age'
bootstraps <- 200
n.threads <- 20
# Create a few filenames from the base
new.coefficients.filename <- paste0(output.filename.base, '-coeffs-3.csv')
compare.coefficients.filename <-
  paste0(output.filename.base, '-bootstrap-3.csv')
cox.var.imp.perm.filename <- paste0(output.filename.base, '-var-imp-perm-3.csv')
model.filename <- paste0(output.filename.base, '-surv-boot.rds')
#' ## Setup
#+ setup, message=FALSE
source('../lib/shared.R')
require(xtable)
require(ggrepel)
source('caliber-scale.R')
# Load the data and convert to data frame to make column-selecting code in
# prepData simpler
COHORT.full <- data.frame(fread(data.filename))
# Remove the patients we shouldn't include
COHORT.full <-
  COHORT.full[
    # remove negative times to death
    COHORT.full$time_death > 0 &
      # remove patients who should be excluded
      !COHORT.full$exclude
    ,
  ]
# If n.data was specified...
if(!is.na(n.data)){
  # Take a subset n.data in size
  COHORT.use <- sample.df(COHORT.full, n.data)
  rm(COHORT.full)
} else {
  # Use all the data
  COHORT.use <- COHORT.full
  rm(COHORT.full)
}
# Redefine n.data just in case we lost any rows
n.data <- nrow(COHORT.use)
# Define indices of test set
test.set <- testSetIndices(COHORT.use, random.seed = 78361)
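#' (`testSetIndices` is defined in the shared library; a minimal sketch of a
#' seeded random split, with the test fraction an assumption for
#' illustration:)
#'
#' ```r
#' testSetIndicesSketch <- function(df, random.seed, test.fraction = 1/3) {
#'   set.seed(random.seed)  # reproducible split
#'   sample(nrow(df), round(test.fraction * nrow(df)))
#' }
#' ```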
#' OK, we've now got **`r n.data`** patients, split into a training set of
#' `r n.data - length(test.set)` and a test set of `r length(test.set)`.
#'
#'
#' ## Transform variables
#'
#' The model uses variables which have been standardised in various ways, so
#' let's go through and transform our input variables in the same way...
#'
#' ### Age
#'
#' There seem to be some issues with the age spline function, so let's just fit
#' to a linear one as a check...
ageSpline <- identity
#'
#' ### Other variables
#'
#' * Most other variables in the model are simply normalised by a factor with
#'   an offset.
#' * For variables which are factors (such as diagnosis), we need to make sure
#'   that the first level is the one which was used as the baseline in the
#'   paper.
#' * The IMD (social deprivation) score is used by flagging those patients who
#'   are in the bottom quintile.
COHORT.scaled <-
  caliberScale(COHORT.use, surv.time, surv.event)
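#' (`caliberScale` is defined in `caliber-scale.R`; schematically, each
#' continuous variable is shifted by an offset and divided by a scaling
#' factor, e.g. creatinine enters the model per 30 μmol/L above a 60 μmol/L
#' baseline, values used again towards the end of this script. A sketch, not
#' the actual helper:)
#'
#' ```r
#' # Offset-and-factor normalisation
#' scaleVar <- function(x, offset, scale.factor) (x - offset) / scale.factor
#' # e.g. crea.scaled <- scaleVar(COHORT.use$crea_6mo, 60, 30)
#' ```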
#' ## Missing values
#'
#' To incorporate missing values, we need to do different things depending on
#' the variable type.
#'
#' * If the variable is a factor, we can simply add an extra factor level
#'   'missing' to account for missing values.
#' * For logical or numerical values, we first make a new column of logicals to
#'   indicate whether the value was missing or not. Those logicals will then
#'   themselves be variables with associated beta values, giving the risk
#'   associated with having a value missing. Then, the logical is set to FALSE
#'   or the numeric to 0, the baseline value, so that the missing value itself
#'   has no effect on the final survival curve. (See the sketch below.)
# Specify the missing columns explicitly: diagnosis has only a handful of
# missing values, so a sampled training set sometimes contains none of them,
# in which case prepCoxMissing wouldn't fix it.
missing.cols <-
  c(
    "diagnosis", "most_deprived", "smokstatus", "total_chol_6mo", "hdl_6mo",
    "pulse_6mo", "crea_6mo", "total_wbc_6mo", "haemoglobin_6mo"
  )
COHORT.scaled.demissed <- prepCoxMissing(COHORT.scaled, missing.cols)
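#' (`prepCoxMissing` lives in the shared library; a minimal sketch of the
#' transformation described above, assuming a data frame `df`:)
#'
#' ```r
#' # Numeric column: flag the missing values, then set them to the baseline, 0
#' df$crea_6mo_missing <- is.na(df$crea_6mo)
#' df$crea_6mo[df$crea_6mo_missing] <- 0
#' # Factor column: add an explicit 'missing' level
#' levels(df$smokstatus) <- c(levels(df$smokstatus), 'missing')
#' df$smokstatus[is.na(df$smokstatus)] <- 'missing'
#' ```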
#' ## Survival fitting
#'
#' Fit a Cox model to the preprocessed data. As in the paper, we use an
#' exponential baseline hazard, and standard errors are estimated from 200
#' bootstrap samples.
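#'
#' (Note that `survreg` fits an accelerated failure time model; for the
#' exponential distribution this is equivalent to a proportional hazards
#' model with hazard ratios HR = exp(-β), which is why the coefficients are
#' transformed with `negExp` when they're unpacked below.)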
#+ fit_cox_model, cache=cacheoption
surv.formula <-
  Surv(surv_time, surv_event) ~
    ### Sociodemographic characteristics #####################################
    ## Age in men, per year
    ## Age in women, per year
    ## Women vs. men
    # i.e. include the interaction between age and gender!
    age * gender +
    ## Most deprived quintile, yes vs. no
    most_deprived +
    most_deprived_missing +
    ### SCAD diagnosis and severity ##########################################
    ## Other CHD vs. stable angina
    ## Unstable angina vs. stable angina
    ## NSTEMI vs. stable angina
    ## STEMI vs. stable angina
    diagnosis +
    #diagnosis_missing +
    ## PCI in last 6 months, yes vs. no
    pci_6mo +
    ## CABG in last 6 months, yes vs. no
    cabg_6mo +
    ## Previous/recurrent MI, yes vs. no
    #hx_mi +
    ## Use of nitrates, yes vs. no
    long_nitrate +
    ### CVD risk factors #####################################################
    ## Ex-smoker / current smoker / missing data vs. never
    smokstatus +
    ## Hypertension, present vs. absent
    hypertension +
    ## Diabetes mellitus, present vs. absent
    diabetes_logical +
    ## Total cholesterol, per 1 mmol/L increase
    total_chol_6mo +
    total_chol_6mo_missing +
    ## HDL, per 0.5 mmol/L increase
    hdl_6mo +
    hdl_6mo_missing +
    ### CVD co-morbidities ###################################################
    ## Heart failure, present vs. absent
    heart_failure +
    ## Peripheral arterial disease, present vs. absent
    pad +
    ## Atrial fibrillation, present vs. absent
    hx_af +
    ## Stroke, present vs. absent
    hx_stroke +
    ### Non-CVD comorbidities ################################################
    ## Chronic kidney disease, present vs. absent
    hx_renal +
    ## Chronic obstructive pulmonary disease, present vs. absent
    hx_copd +
    ## Cancer, present vs. absent
    hx_cancer +
    ## Chronic liver disease, present vs. absent
    hx_liver +
    ### Psychosocial characteristics #########################################
    ## Depression at diagnosis, present vs. absent
    hx_depression +
    ## Anxiety at diagnosis, present vs. absent
    hx_anxiety +
    ### Biomarkers ###########################################################
    ## Heart rate, per 10 b.p.m. increase
    pulse_6mo +
    pulse_6mo_missing +
    ## Creatinine, per 30 μmol/L increase
    crea_6mo +
    crea_6mo_missing +
    ## White cell count, per 1.5 × 10⁹/L increase
    total_wbc_6mo +
    total_wbc_6mo_missing +
    ## Haemoglobin, per 1.5 g/dL increase
    haemoglobin_6mo +
    haemoglobin_6mo_missing
# Fit the main model on the full training set (the bootstrap below uses
# resamples)
fit.exp <- survreg(
  formula = surv.formula,
  data = COHORT.scaled.demissed[-test.set, ],
  dist = "exponential"
)
# Run a bootstrap on the model to find uncertainties
fit.exp.boot <-
  boot(
    formula = surv.formula,
    data = COHORT.scaled.demissed[-test.set, ],
    statistic = bootstrapFitSurvreg,
    R = bootstraps,
    parallel = 'multicore',
    ncpus = n.threads,
    test.data = COHORT.scaled.demissed[test.set, ]
  )
# Save the fit, because it might've taken a while!
saveRDS(fit.exp.boot, model.filename)
# Unpackage the uncertainties from the bootstrapped data
fit.exp.boot.ests <- bootStats(fit.exp.boot, uncertainty = '95ci')
# Write the raw bootstrap estimates to a file
write.csv(
  fit.exp.boot.ests, paste0(output.filename.base, '-bootstrap-coeffs.csv')
)
# Save bootstrapped performance values
varsToTable(
  data.frame(
    model = 'cox',
    imputation = FALSE,
    discretised = FALSE,
    c.index = fit.exp.boot.ests['c.index', 'val'],
    c.index.lower = fit.exp.boot.ests['c.index', 'lower'],
    c.index.upper = fit.exp.boot.ests['c.index', 'upper'],
    calibration.score = fit.exp.boot.ests['calibration.score', 'val'],
    calibration.score.lower = fit.exp.boot.ests['calibration.score', 'lower'],
    calibration.score.upper = fit.exp.boot.ests['calibration.score', 'upper']
  ),
  performance.file,
  index.cols = c('model', 'imputation', 'discretised')
)
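#' (`bootstrapFitSurvreg`, used above, comes from the shared library.
#' Schematically, a `boot` statistic for this job might look like the
#' following sketch; the real helper also computes a calibration score:)
#'
#' ```r
#' bootstrapFitSurvregSketch <- function(data, indices, formula, test.data) {
#'   # Refit the exponential model on the resampled rows
#'   fit <- survreg(formula, data = data[indices, ], dist = 'exponential')
#'   # C-index on the held-out test set: concordance between the linear
#'   # predictor (higher = longer predicted survival) and observed outcomes
#'   c.index <- survival::concordance(
#'     Surv(surv_time, surv_event) ~
#'       predict(fit, newdata = test.data, type = 'lp'),
#'     data = test.data
#'   )$concordance
#'   c(coef(fit), c.index = c.index)
#' }
#' ```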
#' ## Performance
#'
#' ### C-index
#'
#' Having fitted the Cox model, how did we do? The c-indices were calculated as
#' part of the bootstrapping, so we just need to take a look at those...
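#' (The C-index is the probability that, for a randomly chosen pair of
#' patients, the model assigns the higher risk to the one who dies first:
#' 0.5 is chance performance, 1 is perfect discrimination.)
#'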
#'
#' C-indices are **`r round(fit.exp.boot.ests['c.index', 'val'], 3)`
#' (`r round(fit.exp.boot.ests['c.index', 'lower'], 3)` -
#' `r round(fit.exp.boot.ests['c.index', 'upper'], 3)`)** on the held-out test
#' set.
#'
#' ### Calibration
#'
#' The bootstrapped calibration score is
#' **`r round(fit.exp.boot.ests['calibration.score', 'val'], 3)`
#' (`r round(fit.exp.boot.ests['calibration.score', 'lower'], 3)` -
#' `r round(fit.exp.boot.ests['calibration.score', 'upper'], 3)`)**.
#'
#' Let's draw a representative curve from the unbootstrapped fit... (It would be
#' better to draw all the curves from the bootstrap fit to get an idea of
#' variability, but I've not implemented this yet.)
#'
#+ calibration_plot
calibration.table <-
  calibrationTable(fit.exp, COHORT.scaled.demissed[test.set, ])
calibrationPlot(calibration.table)
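#'
#' (`calibrationTable` and `calibrationPlot` are shared-library helpers;
#' conceptually, the check bins patients by predicted risk of death at a
#' fixed time horizon and compares each bin's mean predicted risk with the
#' Kaplan-Meier observed risk. A sketch, with the horizon as an assumption:)
#'
#' ```r
#' t.horizon <- 5 * 365.25  # five years in days; assumed for illustration
#' test.df <- COHORT.scaled.demissed[test.set, ]
#' # For an exponential model, the predicted mean survival time implies a
#' # risk of death by the horizon of 1 - exp(-t / mean)
#' pred.time <- predict(fit.exp, newdata = test.df, type = 'response')
#' pred.risk <- 1 - exp(-t.horizon / pred.time)
#' # Bin into deciles of predicted risk, ready to compare against the
#' # observed (Kaplan-Meier) risk per bin
#' risk.bins <- cut(pred.risk, quantile(pred.risk, seq(0, 1, 0.1)),
#'                  include.lowest = TRUE)
#' ```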
#'
#' ## Coefficients
#'
#' As well as getting comparable C-indices, it's also worth checking how the
#' fitted risk coefficients compare to those found in the original paper.
#' Let's compare...
# Load CSV of values from paper
old.coefficients <- read.csv(old.coefficients.filename)
# Get coefficients from this fit
new.coefficients <-
  bootStats(fit.exp.boot, uncertainty = '95ci', transform = negExp)
names(new.coefficients) <- c('our_value', 'our_lower', 'our_upper')
new.coefficients$quantity.level <- rownames(new.coefficients)
# Create a data frame comparing them
compare.coefficients <- merge(old.coefficients, new.coefficients)
# Kludge: age:genderWomen is the pure interaction term, not the overall risk
# for a woman per unit of advancing (spline-transformed) age, so multiply it
# by the age coefficient to get the quantity reported in the paper.
compare.coefficients[
  compare.coefficients$quantity.level == 'age:genderWomen', 'our_value'
] <-
  compare.coefficients[
    compare.coefficients$quantity.level == 'age:genderWomen', 'our_value'
  ] *
  compare.coefficients[
    compare.coefficients$quantity.level == 'age', 'our_value'
  ]
# Save CSV of results
write.csv(compare.coefficients, new.coefficients.filename)
# Plot a graph by which to judge success
ggplot(compare.coefficients, aes(x = their_value, y = our_value)) +
  geom_abline(intercept = 0, slope = 1) +
  geom_hline(yintercept = 1, colour = 'grey') +
  geom_vline(xintercept = 1, colour = 'grey') +
  geom_point() +
  geom_errorbar(aes(ymin = our_lower, ymax = our_upper)) +
  geom_errorbarh(aes(xmin = their_lower, xmax = their_upper)) +
  geom_text_repel(aes(label = long_name)) +
  theme_classic(base_size = 8)
#+ coefficients_table, results='asis'
print(
  xtable(
    data.frame(
      variable =
        paste(
          compare.coefficients$long_name, compare.coefficients$unit, sep = ', '
        ),
      compare.coefficients[c('our_value', 'their_value')]
    ),
    digits = c(0, 0, 3, 3)
  ),
  type = 'html',
  include.rownames = FALSE
)
#' ### Variable importance
#'
#' To establish how important a given variable is in determining outcome (and to
#' compare with other measures such as variable importance with random forests),
#' it would be worthwhile to calculate an equivalent measure for a Cox model.
#' The easiest way to do this across techniques is to look at it by permutation:
#' randomly permute values in each variable, and see how much worse it makes the
#' prediction.
#'
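#' (`generalVarImp` is a shared-library helper; a sketch of the permutation
#' idea for a single variable, measuring importance as the drop in test-set
#' C-index after shuffling that variable:)
#'
#' ```r
#' permVarImpSketch <- function(fit, df, var) {
#'   cIdx <- function(d)
#'     survival::concordance(
#'       Surv(surv_time, surv_event) ~
#'         predict(fit, newdata = d, type = 'lp'),
#'       data = d
#'     )$concordance
#'   permuted <- df
#'   permuted[[var]] <- sample(permuted[[var]])  # break any real association
#'   cIdx(df) - cIdx(permuted)                   # bigger drop = more important
#' }
#' ```
#'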
#+ cox_variable_importance
cox.var.imp.perm <-
  generalVarImp(
    fit.exp, COHORT.scaled.demissed[test.set, ], model.type = 'survreg'
  )
write.csv(cox.var.imp.perm, cox.var.imp.perm.filename, row.names = FALSE)
#' ## Strange things about gender
#'
#' The only huge outlier in this comparison is gender: according to the paper,
#' being a woman is significantly safer than being a man, whereas we
#' don't find a striking difference between genders.
#'
#' Let's try some simple fits to see if this is explicable.
#'
#' ### Fit based only on gender
print(
  exp(
    -survreg(
      Surv(surv_time, surv_event) ~ gender,
      data = COHORT.scaled[-test.set, ],
      dist = "exponential"
    )$coeff
  )
)
#' According to a model based only on gender, women are at higher risk than men.
#' This suggests that there are indeed confounding factors in this dataset.
#'
#' ### Fit based on age and gender
print(
  exp(
    -survreg(
      Surv(surv_time, surv_event) ~ age + gender,
      data = COHORT.scaled[-test.set, ],
      dist = "exponential"
    )$coeff
  )
)
#' Once age is taken into account, it is (obviously) more dangerous to be older,
#' and it is once again safer to be female.
ggplot(COHORT.use, aes(x = age, group = gender, fill = gender)) +
  geom_histogram(alpha = 0.8, position = 'dodge', binwidth = 1)
#' And, as we can see from this histogram, this is explained by the fact that
#' the women in this dataset do indeed tend to be older than the men.
print(
  exp(
    -survreg(
      Surv(surv_time, surv_event) ~ age * gender,
      data = COHORT.scaled[-test.set, ],
      dist = "exponential"
    )$coeff
  )
)
#' Finally, looking at a model where age and gender are allowed to interact,
#' the results are once again a little confusing: being older is worse, which
#' makes sense, but being a woman again appears worse. This is then offset by
#' the age-gender interaction, which acts in the protective direction, so the
#' overall effect of being a somewhat older woman is protective (especially
#' because the age term grows much larger with advancing age).
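#'
#' (Concretely, because log-hazard terms add, the exponentiated coefficients
#' multiply: the hazard ratio for a woman relative to a man of the same
#' scaled age a is HR_genderWomen × HR_age:genderWomen^a.)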
#'
#' It's important to remember that the output of any regression model with many
#' terms gives the strength of a given relationship having already controlled
#' for variation due to those other terms. In this case, even just using two
#' terms can give counter-intuitive results if you try to interpret coefficients
#' in isolation.
#'
#' ## Coefficients for missing values
#'
#' Let's see how the coefficients for missing values compare to the range of
#' risks implied by the observed values of each variable...
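#'
#' (These "equivalent values" simply invert the scaling: if HR is the fitted
#' hazard ratio per 30 μmol/L of creatinine above the 60 μmol/L baseline,
#' then a raw value x carries relative risk r when HR^((x - 60) / 30) = r,
#' i.e. x = 60 + 30 · log(r) / log(HR). Setting r to the missing-value hazard
#' ratio gives the creatinine reading "equivalent" to having no reading at
#' all.)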
# Value of creatinine which would result in a 25% increased risk of death
creatinine.25pc.risk <- 60 + 30 * log(1.25) / log(compare.coefficients[
  compare.coefficients$quantity.level == 'crea_6mo', 'our_value'
])
# Equivalent value of creatinine for patients with missing data
creatinine.missing <- 60 + 30 *
  log(compare.coefficients[
    compare.coefficients$quantity.level == 'crea_6mo_missingTRUE', 'our_value'
  ]) /
  log(compare.coefficients[
    compare.coefficients$quantity.level == 'crea_6mo', 'our_value'
  ])
text.y.pos <- 3500
ggplot(
  # Only plot the lowest 95 percentiles of data due to outliers
  subset(COHORT.use, crea_6mo < quantile(crea_6mo, 0.95, na.rm = TRUE)),
  aes(x = crea_6mo)
) +
  geom_histogram(bins = 30) +
  geom_vline(xintercept = 60) +
  annotate("text", x = 60, y = text.y.pos, angle = 270, hjust = 0, vjust = 1,
           label = "Baseline") +
  geom_vline(xintercept = creatinine.25pc.risk) +
  annotate("text", x = creatinine.25pc.risk, y = text.y.pos, angle = 270,
           hjust = 0, vjust = 1, label = "25% more risk") +
  geom_vline(xintercept = creatinine.missing) +
  annotate("text", x = creatinine.missing, y = text.y.pos, angle = 270,
           hjust = 0, vjust = 1, label = "missing data eqv")
#' So, having a missing creatinine value is slightly safer than having a
#' baseline reading, which is on the low end of observed values in this cohort.
#' Even an extremely high creatinine reading doesn't confer more than 25%
#' additional risk.
# Value of HDL which would result in a 10% increased risk of death
hdl.10pc.risk <- 1.5 + 0.5 * log(1.1) / log(compare.coefficients[
  compare.coefficients$quantity.level == 'hdl_6mo', 'our_value'
])
# Equivalent value of HDL for patients with missing data
hdl.missing <- 1.5 + 0.5 *
  log(compare.coefficients[
    compare.coefficients$quantity.level == 'hdl_6mo_missingTRUE', 'our_value'
  ]) /
  log(compare.coefficients[
    compare.coefficients$quantity.level == 'hdl_6mo', 'our_value'
  ])
text.y.pos <- 4000
ggplot(
  # Only plot the lowest 95 percentiles of data due to outliers
  subset(COHORT.use, hdl_6mo < quantile(hdl_6mo, 0.95, na.rm = TRUE)),
  aes(x = hdl_6mo)
) +
  geom_histogram(bins = 30) +
  geom_vline(xintercept = 1.5) +
  annotate("text", x = 1.5, y = text.y.pos, angle = 270, hjust = 0, vjust = 1,
           label = "Baseline") +
  geom_vline(xintercept = hdl.10pc.risk) +
  annotate("text", x = hdl.10pc.risk, y = text.y.pos, angle = 270, hjust = 0,
           vjust = 1, label = "10% more risk") +
  geom_vline(xintercept = hdl.missing) +
  annotate("text", x = hdl.missing, y = text.y.pos, angle = 270, hjust = 0,
           vjust = 1, label = "missing data eqv")
#' Missing HDL values seem to have the opposite effect: it's substantially more
#' risky to have a blank value here. One possible explanation is that this is
#' such a common blood test that its absence indicates that the patient is not
#' seeking or receiving adequate medical care.
#'
#' ## Systematic analysis of missing values
#'
#' Clearly the danger (or otherwise) of having missing values differs by
#' variable, according to this model. Let's look at all of the variables, to see
#' how all missing values compare, and whether they make sense.
#'
#+ missing_values_vs_ranges
# For continuous variables, get risk values for all patients for violin plots
risk.dist.by.var <- data.frame()
# For categorical variables, let's get all levels of the factor
risk.cats <- data.frame()
# Quantities not to plot in this graph
exclude.quantities <-
  c(
    'age', # too large a risk range, and no missing values anyway
    'gender', 'diagnosis', # no missing values
    'diabetes' # is converted to diabetes_logical so causes an error if included
  )
for(quantity in surv.predict) {
  # If we're not excluding this quantity...
  if(!(quantity %in% exclude.quantities)) {
    # If it's numeric
    if(is.numeric(COHORT.scaled[, quantity])) {
      # Compute per-patient risks by raising this quantity's hazard ratio to
      # the power of the scaled values (NOT the missing-processed values, or
      # there would be a lot of 0s in there)
      risk <-
        new.coefficients$our_value[new.coefficients$quantity.level == quantity] ^
        COHORT.scaled[, quantity]
      # Discard outliers with absurd values
      inliers <- inRange(risk, quantile(risk, c(0.01, 0.99), na.rm = TRUE))
      risk <- risk[inliers]
      risk.dist.by.var <-
        rbind(
          risk.dist.by.var,
          data.frame(quantity, risk)
        )
      risk.cats <-
        rbind(
          risk.cats,
          data.frame(
            quantity,
            new.coefficients[
              new.coefficients$quantity.level ==
                paste0(quantity, '_missingTRUE'),
            ]
          )
        )
    } else if(is.factor(COHORT.scaled[, quantity])) {
      risk.cats <-
        rbind(
          risk.cats,
          data.frame(
            quantity,
            new.coefficients[
              startsWith(as.character(new.coefficients$quantity.level), quantity),
            ]
          )
        )
    } else {
      # Logicals are not included in this plot: the logical variables used
      # here cannot, by definition, be missing. There shouldn't be any other
      # kinds of variables; if your code requires them, handle them here.
    }
  }
}
# Save the results
write.csv(risk.dist.by.var, paste0(output.filename.base, '-risk-violins.csv'))
write.csv(risk.cats, paste0(output.filename.base, '-risk-cats.csv'))
# Plot the results
ggplot() +
  # First, and therefore at the bottom, draw the reference line at risk = 1
  geom_hline(yintercept = 1) +
  # Then, on top of that, draw the violin plot of the risk from the data
  geom_violin(data = risk.dist.by.var, aes(x = quantity, y = risk)) +
  geom_pointrange(
    data = risk.cats,
    aes(x = quantity, y = our_value, ymin = our_lower, ymax = our_upper),
    position = position_jitter(width = 0.1)
  ) +
  geom_text(
    data = risk.cats,
    aes(
      x = quantity,
      y = our_value,
      label = quantity.level
    )
  )