# cox-ph/caliber-replicate-with-imputation.R

#+ knitr_setup, include = FALSE

# Whether to cache the intensive code sections. Set to FALSE to recalculate
# everything afresh.
cacheoption <- FALSE
# Disable lazy caching globally, because it fails for large objects, and all the
# objects we wish to cache are large...
opts_chunk$set(cache.lazy = FALSE)

#' # Replicating Rapsomaniki _et al._ 2014
#'
#' ## User variables
#'
#' First, define some variables...

#+ define_vars

imputed.data.filename <- '../../data/COHORT_complete.rds'
n.data <- NA # Size of the full dataset; further rows may be excluded in prep
endpoint <- 'death.imputed'

old.coefficients.filename <- 'rapsomaniki-cox-values-from-paper.csv'

output.filename.base <- '../../output/caliber-replicate-imputed-survreg-4'

cox.var.imp.perm.filename <-
  '../../output/caliber-replicate-imputed-survreg-bootstrap-var-imp-perm-1.csv'
cox.var.imp.perm.missing.filename <-
  '../../output/caliber-replicate-with-missing-survreg-bootstrap-var-imp-perm-1.csv'

bootstraps <- 100
n.threads <- 10

#' ## Setup

#+ setup, message=FALSE

source('../lib/shared.R')
require(xtable)
require(ggrepel)

# Load the list of imputed datasets
imputed.data <- readRDS(imputed.data.filename)

# Remove rows with death time of 0 to avoid fitting errors
for(i in 1:length(imputed.data)) {
  imputed.data[[i]] <- imputed.data[[i]][imputed.data[[i]][, surv.time] > 0, ]
}

# Define n.data based on the imputed data, which has already been preprocessed
n.data <- nrow(imputed.data[[1]])
# Define indices of test set
test.set <- testSetIndices(imputed.data[[1]], random.seed = 78361)

#' OK, we've now got **`r n.data`** patients, split into a training set of
#' `r n.data - length(test.set)` and a test set of `r length(test.set)`.
#'
#'
#' ## Transform variables
#'
#' The model uses variables which have been standardised in various ways, so
#' let's go through and transform our input variables in the same way...

source('caliber-scale.R')
# Remove the age splining
ageSpline <- identity

for(i in 1:length(imputed.data)) {
  imputed.data[[i]] <- caliberScale(imputed.data[[i]], surv.time, surv.event)
}
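
#' (For orientation: `caliberScale` is defined in `caliber-scale.R` and not
#' reproduced here. Conceptually, it rescales each predictor to the per-unit
#' increases quoted in the model formula below; a hypothetical sketch of that
#' kind of transformation, not the actual implementation:)

#+ caliber_scale_sketch, eval=FALSE
scaleSketch <- function(df) {
  within(df, {
    pulse_6mo       <- pulse_6mo / 10         # heart rate, per 10 b.p.m.
    crea_6mo        <- crea_6mo / 30          # creatinine, per 30 μmol/L
    hdl_6mo         <- hdl_6mo / 0.5          # HDL, per 0.5 mmol/L
    haemoglobin_6mo <- haemoglobin_6mo / 1.5  # haemoglobin, per 1.5 g/dL
  })
}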
|
|

#' ## Survival fitting
#'
#' Fit a Cox model to the preprocessed data. The paper uses a Cox model with an
#' exponential baseline hazard, as here. The paper calculated its standard
#' errors with 200 bootstrap samples; here we use `r bootstraps`.

#+ fit_cox_model, cache=cacheoption

surv.formula <-
  Surv(surv_time, surv_event) ~
    ### Sociodemographic characteristics #######################################
    ## Age in men, per year
    ## Age in women, per year
    ## Women vs. men
    # i.e. include an interaction between age and gender!
    age*gender +
    ## Most deprived quintile, yes vs. no
    most_deprived +
    ### SCAD diagnosis and severity ############################################
    ## Other CHD vs. stable angina
    ## Unstable angina vs. stable angina
    ## NSTEMI vs. stable angina
    ## STEMI vs. stable angina
    diagnosis +
    #diagnosis_missing +
    ## PCI in last 6 months, yes vs. no
    pci_6mo +
    ## CABG in last 6 months, yes vs. no
    cabg_6mo +
    ## Previous/recurrent MI, yes vs. no
    hx_mi +
    ## Use of nitrates, yes vs. no
    long_nitrate +
    ### CVD risk factors #######################################################
    ## Ex-smoker / current smoker / missing data vs. never
    smokstatus +
    ## Hypertension, present vs. absent
    hypertension +
    ## Diabetes mellitus, present vs. absent
    diabetes_logical +
    ## Total cholesterol, per 1 mmol/L increase
    total_chol_6mo +
    ## HDL, per 0.5 mmol/L increase
    hdl_6mo +
    ### CVD co-morbidities #####################################################
    ## Heart failure, present vs. absent
    heart_failure +
    ## Peripheral arterial disease, present vs. absent
    pad +
    ## Atrial fibrillation, present vs. absent
    hx_af +
    ## Stroke, present vs. absent
    hx_stroke +
    ### Non-CVD comorbidities ##################################################
    ## Chronic kidney disease, present vs. absent
    hx_renal +
    ## Chronic obstructive pulmonary disease, present vs. absent
    hx_copd +
    ## Cancer, present vs. absent
    hx_cancer +
    ## Chronic liver disease, present vs. absent
    hx_liver +
    ### Psychosocial characteristics ###########################################
    ## Depression at diagnosis, present vs. absent
    hx_depression +
    ## Anxiety at diagnosis, present vs. absent
    hx_anxiety +
    ### Biomarkers #############################################################
    ## Heart rate, per 10 b.p.m. increase
    pulse_6mo +
    ## Creatinine, per 30 μmol/L increase
    crea_6mo +
    ## White cell count, per 1.5 × 10⁹/L increase
    total_wbc_6mo +
    ## Haemoglobin, per 1.5 g/dL increase
    haemoglobin_6mo
|
|

# Do a quick-and-dirty fit on a single imputed dataset, to draw a calibration
# curve from
fit.exp <- survreg(
  formula = surv.formula,
  data = imputed.data[[1]][-test.set, ],
  dist = "exponential"
)

fit.exp.boot <- list()

# Perform bootstrap fitting for every multiply imputed dataset
for(i in 1:length(imputed.data)) {
  fit.exp.boot[[i]] <-
    boot(
      formula = surv.formula,
      data = imputed.data[[i]][-test.set, ],
      statistic = bootstrapFitSurvreg,
      R = bootstraps,
      parallel = 'multicore',
      ncpus = n.threads,
      test.data = imputed.data[[i]][test.set, ]
    )
}
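
#' `bootstrapFitSurvreg` comes from `../lib/shared.R`. For orientation, `boot`
#' calls its `statistic` with the data and a vector of resampled row indices,
#' plus any extra named arguments (here `formula` and `test.data`). A minimal
#' hypothetical sketch of a statistic with that shape, not the actual
#' implementation:

#+ bootstrap_statistic_sketch, eval=FALSE
bootstrapFitSurvregSketch <- function(data, indices, formula, test.data) {
  # Refit the exponential model on the bootstrap resample
  fit <- survreg(formula, data = data[indices, ], dist = 'exponential')
  # Return a named vector of everything whose uncertainty we want pooled,
  # e.g. the coefficients plus a test-set concordance
  c(
    coef(fit),
    c.test = survival::concordance(fit, newdata = test.data)$concordance
  )
}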
|
|

# Save the fits, because it might've taken a while!
saveRDS(fit.exp.boot, paste0(output.filename.base, '-surv-boot-imp.rds'))

# Unpackage the uncertainties from the bootstrapped data
fit.exp.boot.ests <- bootMIStats(fit.exp.boot)
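
#' `bootMIStats` is also defined in `../lib/shared.R`. The gist, as a
#' hypothetical sketch rather than its actual code: stack the bootstrap
#' replicates (`boot` stores them in `$t`) across all the imputed datasets,
#' then summarise each statistic by its median and a 95% percentile interval.

#+ boot_mi_pooling_sketch, eval=FALSE
pooled.replicates <- do.call(rbind, lapply(fit.exp.boot, function(x) x$t))
pooled.summary <- t(apply(
  pooled.replicates, 2, quantile, probs = c(0.5, 0.025, 0.975), na.rm = TRUE
))
colnames(pooled.summary) <- c('val', 'lower', 'upper')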
|
|

# Save bootstrapped performance values
varsToTable(
  data.frame(
    model = 'cox',
    imputation = TRUE,
    discretised = FALSE,
    c.index = fit.exp.boot.ests['c.index', 'val'],
    c.index.lower = fit.exp.boot.ests['c.index', 'lower'],
    c.index.upper = fit.exp.boot.ests['c.index', 'upper'],
    calibration.score = fit.exp.boot.ests['calibration.score', 'val'],
    calibration.score.lower = fit.exp.boot.ests['calibration.score', 'lower'],
    calibration.score.upper = fit.exp.boot.ests['calibration.score', 'upper']
  ),
  performance.file,
  index.cols = c('model', 'imputation', 'discretised')
)

#' ## Performance
#'
#' Having fitted the Cox model, how did we do? The c-indices were calculated as
#' part of the bootstrapping, so we just need to take a look at those...
#'
#' C-indices are **`r round(fit.exp.boot.ests['c.train', 'val'], 3)`
#' (`r round(fit.exp.boot.ests['c.train', 'lower'], 3)` -
#' `r round(fit.exp.boot.ests['c.train', 'upper'], 3)`)** on the training set and
#' **`r round(fit.exp.boot.ests['c.test', 'val'], 3)`
#' (`r round(fit.exp.boot.ests['c.test', 'lower'], 3)` -
#' `r round(fit.exp.boot.ests['c.test', 'upper'], 3)`)** on the test set.
#' Not too bad!
#'
#'
#' ### Calibration
#'
#' The bootstrapped calibration score is
#' **`r round(fit.exp.boot.ests['calibration.score', 'val'], 3)`
#' (`r round(fit.exp.boot.ests['calibration.score', 'lower'], 3)` -
#' `r round(fit.exp.boot.ests['calibration.score', 'upper'], 3)`)**.
#'
#' Let's draw a representative curve from the unbootstrapped fit... (It would be
#' better to draw all the curves from the bootstrap fit to get an idea of
#' variability, but I've not implemented this yet; there's a sketch of the idea
#' after the plot.)
#'
#+ calibration_plot

calibration.table <-
  # Use the same imputed dataset the quick fit was trained on
  calibrationTable(fit.exp, imputed.data[[1]][test.set, ])

calibration.score <- calibrationScore(calibration.table)

calibrationPlot(calibration.table)
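
#' (A rough sketch of the bootstrap-curve idea for later: refit on resamples of
#' the training data and collect a calibration table per refit. Hypothetical
#' and unevaluated; it assumes `calibrationTable` is cheap enough to call in a
#' loop.)

#+ bootstrap_calibration_sketch, eval=FALSE
boot.calibration.tables <- lapply(1:20, function(b) {
  training <- imputed.data[[1]][-test.set, ]
  # Resample the training set with replacement and refit
  resample <- training[sample(nrow(training), replace = TRUE), ]
  fit.b <- survreg(surv.formula, data = resample, dist = 'exponential')
  calibrationTable(fit.b, imputed.data[[1]][test.set, ])
})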
|
|

#' The area between the calibration curve and the diagonal is
#' **`r round(calibration.score, 3)`**.
#'
#' ## Coefficients
#'
#' As well as getting comparable C-indices, it's also worth checking how the
#' risk coefficients we calculated compare to those found in the original
#' paper. Let's compare...

# Load CSV of values from paper
old.coefficients <- read.csv(old.coefficients.filename)

# Get coefficients from this fit
new.coefficients <-
  bootMIStats(fit.exp.boot, uncertainty = '95ci', transform = negExp)
names(new.coefficients) <- c('our_value', 'our_lower', 'our_upper')
new.coefficients$quantity.level <- rownames(new.coefficients)

# Create a data frame comparing them
compare.coefficients <- merge(old.coefficients, new.coefficients)

# Kludge because age:genderWomen is the pure interaction term, not the risk for
# a woman per unit of advancing spline-transformed age
age.gender.row <- compare.coefficients$quantity.level == 'age:genderWomen'
age.row <- compare.coefficients$quantity.level == 'age'
compare.coefficients[age.gender.row, 'our_value'] <-
  compare.coefficients[age.gender.row, 'our_value'] *
  compare.coefficients[age.row, 'our_value']
|
|

# Save CSV of results
write.csv(compare.coefficients, paste0(output.filename.base, '.csv'))

# Plot a graph by which to judge success
ggplot(compare.coefficients, aes(x = their_value, y = our_value)) +
  geom_abline(intercept = 0, slope = 1) +
  geom_hline(yintercept = 1, colour = 'grey') +
  geom_vline(xintercept = 1, colour = 'grey') +
  geom_point() +
  geom_errorbar(aes(ymin = our_lower, ymax = our_upper)) +
  geom_errorbarh(aes(xmin = their_lower, xmax = their_upper)) +
  geom_text_repel(aes(label = long_name)) +
  theme_classic(base_size = 8)

#+ coefficients_table, results='asis'

print(
  xtable(
    data.frame(
      variable =
        paste(
          compare.coefficients$long_name, compare.coefficients$unit, sep = ', '
        ),
      compare.coefficients[c('our_value', 'their_value')]
    ),
    digits = c(0, 0, 3, 3)
  ),
  type = 'html',
  include.rownames = FALSE
)

#' ### Variable importance
#'
#' Let's compare the variable importance from this method with the run which
#' accounts for missing values explicitly. This is a slight kludge, as it uses
#' one imputed dataset and a fit based on another, but it should give some
#' idea.
#'
#+ cox_variable_importance

cox.var.imp.perm <-
  generalVarImp(
    fit.exp, imputed.data[[2]][test.set, ], model.type = 'survreg'
  )
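
#' `generalVarImp` comes from the shared library and, going by the 'perm' in
#' the output filenames, reports permutation importance. The underlying idea,
#' as a hypothetical sketch rather than its actual code: permute one predictor
#' at a time in the test data and record how much the concordance drops.

#+ perm_imp_sketch, eval=FALSE
permImpSketch <- function(fit, test.data, vars) {
  baseline <- survival::concordance(fit, newdata = test.data)$concordance
  sapply(vars, function(v) {
    shuffled <- test.data
    shuffled[[v]] <- sample(shuffled[[v]])
    # Importance = loss of discrimination when this variable is scrambled
    baseline - survival::concordance(fit, newdata = shuffled)$concordance
  })
}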
|
|

write.csv(cox.var.imp.perm, cox.var.imp.perm.filename, row.names = FALSE)

cox.var.imp.perm.missing <- read.csv(cox.var.imp.perm.missing.filename)

cox.var.imp.comparison <-
  merge(
    cox.var.imp.perm,
    cox.var.imp.perm.missing,
    by = 'var',
    suffixes = c('', '.missing')
  )

ggplot(cox.var.imp.comparison, aes(x = var.imp.missing, y = var.imp)) +
  geom_point() +
  scale_x_log10() +
  scale_y_log10()

#' There's a good correlation!