# cox-ph/caliber-replicate-with-missing.R

#+ knitr_setup, include = FALSE

# Whether to cache the intensive code sections. Set to FALSE to recalculate
# everything afresh.
cacheoption <- FALSE
# Disable lazy caching globally, because it fails for large objects, and all the
# objects we wish to cache are large...
opts_chunk$set(cache.lazy = FALSE)

#' # Replicating Rapsomaniki _et al._ 2014 without imputation
#'
#' Rapsomaniki and co-workers' paper fits a Cox proportional hazards model to
#' predict time to death for patients using electronic health record data.
#'
#' Because such data are gathered as part of routine clinical care, some values
#' may be missing. For example, covariates include blood tests such as
#' creatinine level, where the model uses the most recent value no more than
#' six months old. A patient may simply not have had this test in that time
#' period.
#'
#' The approach adopted in this situation, in common with many other similar
#' studies, is to impute missing values: find patients whose other values are
#' similar, and use them to make a best guess at the likely value of the
#' missing datapoint. However, in the case of health records data, the
#' missingness of a value may itself be informative: the fact that a certain
#' test wasn't ordered could result from a doctor not thinking that test
#' necessary, meaning that its absence carries information.
#'
#' Whilst there are many approaches which could be employed here, this program
#' represents arguably the simplest: we keep the Cox model as close to the
#' published version as possible but, instead of imputing the missing data, we
#' include it explicitly.
#'
#' ## User variables
#'
#' First, define some variables...

#+ define_vars

# Number of rows of the full dataset to use (NA means all of them); further
# rows may be excluded in data preparation.
n.data <- NA
endpoint <- 'death'

old.coefficients.filename <- 'rapsomaniki-cox-values-from-paper.csv'

output.filename.base <- '../../output/caliber-replicate-with-missing-survreg-6-linear-age'

bootstraps <- 200
n.threads <- 20

# Create a few filenames from the base
new.coefficients.filename <- paste0(output.filename.base, '-coeffs-3.csv')
compare.coefficients.filename <-
  paste0(output.filename.base, '-bootstrap-3.csv')
cox.var.imp.perm.filename <- paste0(output.filename.base, '-var-imp-perm-3.csv')
model.filename <- paste0(output.filename.base, '-surv-boot.rds')

#' ## Setup

#+ setup, message=FALSE

source('../lib/shared.R')
require(xtable)
require(ggrepel)
source('caliber-scale.R')

# Load the data and convert to data frame to make column-selecting code in
# prepData simpler
COHORT.full <- data.frame(fread(data.filename))

# Remove the patients we shouldn't include
COHORT.full <-
  COHORT.full[
    # remove negative times to death
    COHORT.full$time_death > 0 &
    # remove patients who should be excluded
    !COHORT.full$exclude
    ,
  ]

# If n.data was specified...
if(!is.na(n.data)){
  # Take a subset n.data in size
  COHORT.use <- sample.df(COHORT.full, n.data)
  rm(COHORT.full)
} else {
  # Use all the data
  COHORT.use <- COHORT.full
  rm(COHORT.full)
}

# Redefine n.data just in case we lost any rows
n.data <- nrow(COHORT.use)
# Define indices of test set
test.set <- testSetIndices(COHORT.use, random.seed = 78361)

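#' `testSetIndices` is defined in the shared library. As a rough illustration
#' of the kind of split it performs (a hypothetical sketch, not the actual
#' implementation), a simple random split might look like this:

#+ test_set_sketch, eval=FALSE

# Hypothetical sketch: sample a fixed fraction of row indices for the test set
testSetIndicesSketch <- function(df, fraction = 1/3, random.seed = NA) {
  if(!is.na(random.seed)) set.seed(random.seed)
  sample(nrow(df), round(nrow(df) * fraction))
}
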
#' OK, we've now got **`r n.data`** patients, split into a training set of
#' `r n.data - length(test.set)` and a test set of `r length(test.set)`.
#'
#'
#' ## Transform variables
#'
#' The model uses variables which have been standardised in various ways, so
#' let's go through and transform our input variables in the same way...

#' ### Age
#'
#' There seem to be some issues with the age spline function, so let's just fit
#' a linear age term as a check...

ageSpline <- identity

#'
#' ### Other variables
#'
#' * Most other variables in the model are simply normalised by a factor with
#' an offset.
#' * Variables which are factors (such as diagnosis) need their first level to
#' be the one which was used as the baseline in the paper.
#' * The IMD (social deprivation) score is used by flagging those patients who
#' are in the bottom quintile.

COHORT.scaled <-
  caliberScale(COHORT.use, surv.time, surv.event)

#' ## Missing values
#'
#' To incorporate missing values, we need to do different things depending on
#' the variable type.
#'
#' * If the variable is a factor, we can simply add an extra factor level
#' 'missing' to account for missing values.
#' * For logical or numerical values, we first make a new column of logicals
#' to indicate whether the value was missing or not. Those logicals will then
#' themselves be variables with associated beta values, giving the risk
#' associated with having a value missing. Then, we set the missing value
#' itself to FALSE for a logical or 0 for a numeric, ie the baseline value, so
#' it has no effect on the final survival curve. (A sketch of this
#' preprocessing appears after the call to prepCoxMissing below.)

# Specify missing columns - diagnosis only has a handful of missing values so
# sometimes doesn't have missing ones in the sampled training set, meaning
# prepCoxMissing wouldn't fix it.
missing.cols <-
  c(
    "diagnosis", "most_deprived", "smokstatus", "total_chol_6mo", "hdl_6mo",
    "pulse_6mo", "crea_6mo", "total_wbc_6mo", "haemoglobin_6mo"
  )
COHORT.scaled.demissed <- prepCoxMissing(COHORT.scaled, missing.cols)

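#' `prepCoxMissing` is defined in the shared library. As a rough sketch of the
#' preprocessing just described (a hypothetical illustration, not the actual
#' implementation):

#+ prep_missing_sketch, eval=FALSE

# Hypothetical sketch: add a 'missing' level to factors; for other columns,
# flag missingness in a new logical column, then set the missing values
# themselves to the baseline value
prepMissingSketch <- function(df, cols) {
  for (col in cols) {
    if (is.factor(df[, col])) {
      levels(df[, col]) <- c(levels(df[, col]), 'missing')
      df[is.na(df[, col]), col] <- 'missing'
    } else {
      df[, paste0(col, '_missing')] <- is.na(df[, col])
      df[is.na(df[, col]), col] <- if (is.logical(df[, col])) FALSE else 0
    }
  }
  df
}
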
#' ## Survival fitting
#'
#' Fit a Cox model to the preprocessed data. The paper uses a Cox model with an
#' exponential baseline hazard, which is what we fit here (survreg with
#' dist = 'exponential'). The standard errors were calculated with 200
#' bootstrap samples, which we're also doing here.

#+ fit_cox_model, cache=cacheoption

surv.formula <-
  Surv(surv_time, surv_event) ~
    ### Sociodemographic characteristics #######################################
    ## Age in men, per year
    ## Age in women, per year
    ## Women vs. men
    # ie include interaction between age and gender!
    age * gender +
    ## Most deprived quintile, yes vs. no
    most_deprived +
    most_deprived_missing +
    ### SCAD diagnosis and severity ############################################
    ## Other CHD vs. stable angina
    ## Unstable angina vs. stable angina
    ## NSTEMI vs. stable angina
    ## STEMI vs. stable angina
    diagnosis +
    #diagnosis_missing +
    ## PCI in last 6 months, yes vs. no
    pci_6mo +
    ## CABG in last 6 months, yes vs. no
    cabg_6mo +
    ## Previous/recurrent MI, yes vs. no
    #hx_mi +
    ## Use of nitrates, yes vs. no
    long_nitrate +
    ### CVD risk factors #######################################################
    ## Ex-smoker / current smoker / missing data vs. never
    smokstatus +
    ## Hypertension, present vs. absent
    hypertension +
    ## Diabetes mellitus, present vs. absent
    diabetes_logical +
    ## Total cholesterol, per 1 mmol/L increase
    total_chol_6mo +
    total_chol_6mo_missing +
    ## HDL, per 0.5 mmol/L increase
    hdl_6mo +
    hdl_6mo_missing +
    ### CVD co-morbidities #####################################################
    ## Heart failure, present vs. absent
    heart_failure +
    ## Peripheral arterial disease, present vs. absent
    pad +
    ## Atrial fibrillation, present vs. absent
    hx_af +
    ## Stroke, present vs. absent
    hx_stroke +
    ### Non-CVD comorbidities ##################################################
    ## Chronic kidney disease, present vs. absent
    hx_renal +
    ## Chronic obstructive pulmonary disease, present vs. absent
    hx_copd +
    ## Cancer, present vs. absent
    hx_cancer +
    ## Chronic liver disease, present vs. absent
    hx_liver +
    ### Psychosocial characteristics ###########################################
    ## Depression at diagnosis, present vs. absent
    hx_depression +
    ## Anxiety at diagnosis, present vs. absent
    hx_anxiety +
    ### Biomarkers #############################################################
    ## Heart rate, per 10 b.p.m. increase
    pulse_6mo +
    pulse_6mo_missing +
    ## Creatinine, per 30 μmol/L increase
    crea_6mo +
    crea_6mo_missing +
    ## White cell count, per 1.5 × 10^9/L increase
    total_wbc_6mo +
    total_wbc_6mo_missing +
    ## Haemoglobin, per 1.5 g/dL increase
    haemoglobin_6mo +
    haemoglobin_6mo_missing

# Fit the main model with all data
fit.exp <- survreg(
  formula = surv.formula,
  data = COHORT.scaled.demissed[-test.set, ],
  dist = "exponential"
)

# Run a bootstrap on the model to find uncertainties
fit.exp.boot <-
  boot(
    formula = surv.formula,
    data = COHORT.scaled.demissed[-test.set, ],
    statistic = bootstrapFitSurvreg,
    R = bootstraps,
    parallel = 'multicore',
    ncpus = n.threads,
    test.data = COHORT.scaled.demissed[test.set, ]
  )

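#' `bootstrapFitSurvreg` is defined in the shared library; `boot` passes each
#' resample's indices to it, along with the extra `formula` and `test.data`
#' arguments. A minimal sketch of such a statistic function (hypothetical; the
#' real one also returns test-set performance measures):

#+ boot_statistic_sketch, eval=FALSE

library(survival)
library(boot)

# Hypothetical sketch: refit the model on the resampled rows and return its
# coefficients, so boot() collects one coefficient vector per replicate
bootstrapFitSurvregSketch <- function(data, indices, formula, test.data) {
  fit <- survreg(formula, data = data[indices, ], dist = 'exponential')
  coef(fit)
}
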
# Save the fit, because it might've taken a while!
saveRDS(fit.exp.boot, model.filename)

# Unpack the uncertainties from the bootstrapped data
fit.exp.boot.ests <- bootStats(fit.exp.boot, uncertainty = '95ci')

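#' `bootStats` is also from the shared library. As an illustration of what the
#' '95ci' uncertainty means here, percentile intervals can be pulled out of a
#' `boot` object roughly like this (a sketch, assuming `boot.fit$t` holds one
#' row of statistics per replicate):

#+ boot_stats_sketch, eval=FALSE

# Hypothetical sketch: point estimates from the original fit, bounds from the
# 2.5% and 97.5% quantiles of the bootstrap replicates
bootStatsSketch <- function(boot.fit, transform = identity) {
  t.tr <- transform(boot.fit$t)
  data.frame(
    val   = transform(boot.fit$t0),
    lower = apply(t.tr, 2, quantile, probs = 0.025, na.rm = TRUE),
    upper = apply(t.tr, 2, quantile, probs = 0.975, na.rm = TRUE)
  )
}
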
# Write the raw bootstrap estimates to a file
write.csv(
  fit.exp.boot.ests, paste0(output.filename.base, '-bootstrap-coeffs.csv')
)

# Save bootstrapped performance values
varsToTable(
  data.frame(
    model = 'cox',
    imputation = FALSE,
    discretised = FALSE,
    c.index = fit.exp.boot.ests['c.index', 'val'],
    c.index.lower = fit.exp.boot.ests['c.index', 'lower'],
    c.index.upper = fit.exp.boot.ests['c.index', 'upper'],
    calibration.score = fit.exp.boot.ests['calibration.score', 'val'],
    calibration.score.lower = fit.exp.boot.ests['calibration.score', 'lower'],
    calibration.score.upper = fit.exp.boot.ests['calibration.score', 'upper']
  ),
  performance.file,
  index.cols = c('model', 'imputation', 'discretised')
)

#' ## Performance
#'
#' ### C-index
#'
#' Having fitted the Cox model, how did we do? The c-indices were calculated as
#' part of the bootstrapping, so we just need to take a look at those...
#'
#' C-indices are **`r round(fit.exp.boot.ests['c.index', 'val'], 3)`
#' (`r round(fit.exp.boot.ests['c.index', 'lower'], 3)` -
#' `r round(fit.exp.boot.ests['c.index', 'upper'], 3)`)** on the held-out test
#' set.
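#'
#' For reference, the c-index of a `survreg` fit can be computed on the test
#' set along these lines (a sketch using `survival::concordance`; the actual
#' calculation happens inside the bootstrap statistic):

#+ c_index_sketch, eval=FALSE

library(survival)

# Hypothetical sketch: for survreg, a higher linear predictor means longer
# predicted survival, so plain concordance against the outcome is the c-index
cIndexSketch <- function(fit, test.data) {
  lp <- predict(fit, newdata = test.data, type = 'lp')
  concordance(Surv(surv_time, surv_event) ~ lp, data = test.data)$concordance
}
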
#'
#' ### Calibration
#'
#' The bootstrapped calibration score is
#' **`r round(fit.exp.boot.ests['calibration.score', 'val'], 3)`
#' (`r round(fit.exp.boot.ests['calibration.score', 'lower'], 3)` -
#' `r round(fit.exp.boot.ests['calibration.score', 'upper'], 3)`)**.
#'
#' Let's draw a representative curve from the unbootstrapped fit... (It would
#' be better to draw all the curves from the bootstrap fit to get an idea of
#' variability, but I've not implemented this yet.)
#'
#+ calibration_plot

calibration.table <-
  calibrationTable(fit.exp, COHORT.scaled.demissed[test.set, ])

calibrationPlot(calibration.table)

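#' `calibrationTable` and `calibrationPlot` come from the shared library. The
#' general idea, sketched under the exponential model's survival function
#' S(t) = exp(-t / exp(lp)) (a hypothetical illustration with an assumed
#' five-year horizon, not the library's actual method):

#+ calibration_sketch, eval=FALSE

library(survival)

# Hypothetical sketch: bin patients by predicted risk at a time horizon, then
# compare the mean predicted risk in each bin with the Kaplan-Meier estimate
calibrationSketch <- function(fit, test.data, time.horizon = 5 * 365.25) {
  lp <- predict(fit, newdata = test.data, type = 'lp')
  predicted <- 1 - exp(-time.horizon / exp(lp))
  bins <- cut(predicted, quantile(predicted, seq(0, 1, 0.1)),
              include.lowest = TRUE)
  observed <- sapply(
    split(test.data, bins),
    function(df) {
      km <- survfit(Surv(surv_time, surv_event) ~ 1, data = df)
      1 - summary(km, times = time.horizon, extend = TRUE)$surv
    }
  )
  data.frame(
    predicted = tapply(predicted, bins, mean),
    observed  = observed
  )
}
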
#'
#' ## Coefficients
#'
#' As well as getting comparable C-indices, it's also worth checking to see how
#' the calculated risk coefficients compare to those found in the original
#' paper. Let's compare...

# Load CSV of values from paper
old.coefficients <- read.csv(old.coefficients.filename)

# Get coefficients from this fit
new.coefficients <-
  bootStats(fit.exp.boot, uncertainty = '95ci', transform = negExp)
names(new.coefficients) <- c('our_value', 'our_lower', 'our_upper')
new.coefficients$quantity.level <- rownames(new.coefficients)

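#' `negExp` is from the shared library. For a `survreg` fit with an exponential
#' distribution, minus the coefficients are log hazard ratios, so the transform
#' is presumably just (an assumption based on its use here):

#+ neg_exp_sketch, eval=FALSE

# Hypothetical sketch: map survreg coefficients to hazard-ratio-scale values
negExp <- function(x) exp(-x)
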
# Create a data frame comparing them
compare.coefficients <- merge(old.coefficients, new.coefficients)

# Kludge because age:genderWomen is the pure interaction term, not the risk for
# a woman per unit of advancing spline-transformed age
compare.coefficients[
  compare.coefficients$quantity.level == 'age:genderWomen', 'our_value'
] <-
  compare.coefficients[
    compare.coefficients$quantity.level == 'age:genderWomen', 'our_value'
  ] *
  compare.coefficients[
    compare.coefficients$quantity.level == 'age', 'our_value'
  ]

# Save CSV of results
write.csv(compare.coefficients, new.coefficients.filename)

# Plot a graph by which to judge success
ggplot(compare.coefficients, aes(x = their_value, y = our_value)) +
  geom_abline(intercept = 0, slope = 1) +
  geom_hline(yintercept = 1, colour = 'grey') +
  geom_vline(xintercept = 1, colour = 'grey') +
  geom_point() +
  geom_errorbar(aes(ymin = our_lower, ymax = our_upper)) +
  geom_errorbarh(aes(xmin = their_lower, xmax = their_upper)) +
  geom_text_repel(aes(label = long_name)) +
  theme_classic(base_size = 8)

#+ coefficients_table, results='asis'

print(
  xtable(
    data.frame(
      variable =
        paste(
          compare.coefficients$long_name, compare.coefficients$unit, sep = ', '
        ),
      compare.coefficients[c('our_value', 'their_value')]
    ),
    digits = c(0, 0, 3, 3)
  ),
  type = 'html',
  include.rownames = FALSE
)

#' ### Variable importance
#'
#' To establish how important a given variable is in determining outcome (and
#' to compare with other measures such as variable importance from random
#' forests), it would be worthwhile to calculate an equivalent measure for a
#' Cox model. The easiest way to do this consistently across techniques is by
#' permutation: randomly permute the values in each variable, and see how much
#' worse it makes the prediction.
#'
#+ cox_variable_importance

cox.var.imp.perm <-
  generalVarImp(
    fit.exp, COHORT.scaled.demissed[test.set, ], model.type = 'survreg'
  )

write.csv(cox.var.imp.perm, cox.var.imp.perm.filename, row.names = FALSE)

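#' `generalVarImp` lives in the shared library. The underlying permutation idea,
#' sketched (hypothetical; assumes the `cIndexSketch` helper from the c-index
#' sketch above):

#+ perm_imp_sketch, eval=FALSE

# Hypothetical sketch: permute one column at a time and record how much the
# test-set c-index drops relative to the unpermuted baseline
permImpSketch <- function(fit, test.data, vars) {
  baseline <- cIndexSketch(fit, test.data)
  sapply(vars, function(v) {
    shuffled <- test.data
    shuffled[, v] <- sample(shuffled[, v])
    baseline - cIndexSketch(fit, shuffled)
  })
}
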
#' ## Strange things about gender
#'
#' The only huge outlier in this comparison is gender: according to the paper,
#' being a woman is significantly safer than being a man, whereas we don't find
#' a striking difference between genders.
#'
#' Let's try some simple fits to see if this is explicable.
#'
#' ### Fit based only on gender

print(
  exp(
    -survreg(
      Surv(surv_time, surv_event) ~ gender,
      data = COHORT.scaled[-test.set, ],
      dist = "exponential"
    )$coeff
  )
)

#' According to a model based only on gender, women are at higher risk than
#' men. This suggests that there are indeed confounding factors in this
#' dataset.
#'
#' ### Fit based on age and gender

print(
  exp(
    -survreg(
      Surv(surv_time, surv_event) ~ age + gender,
      data = COHORT.scaled[-test.set, ],
      dist = "exponential"
    )$coeff
  )
)

#' Once age is taken into account, it is (obviously) more dangerous to be
#' older, and it is once again safer to be female.

ggplot(COHORT.use, aes(x = age, group = gender, fill = gender)) +
  geom_histogram(alpha = 0.8, position = 'dodge', binwidth = 1)

#' And, as we can see from this histogram, this is explained by the fact that
#' the women in this dataset do indeed tend to be older than the men.

print(
  exp(
    -survreg(
      Surv(surv_time, surv_event) ~ age * gender,
      data = COHORT.scaled[-test.set, ],
      dist = "exponential"
    )$coeff
  )
)

#' Finally, looking at a model where age and gender are allowed to interact,
#' the results are once again a little confusing: being older is worse, which
#' makes sense, but being a woman is again worse. This is then offset by the
#' slightly positive interaction between age and being a woman, meaning that
#' the overall effect of being a slightly older woman is positive (especially
#' because the spline function gets much larger with advancing age).
#'
#' It's important to remember that the output of any regression model with many
#' terms gives the strength of a given relationship having already controlled
#' for variation due to the other terms. In this case, even just using two
#' terms can give counter-intuitive results if you try to interpret
#' coefficients in isolation.
#'
#' ## Coefficients for missing values
#'
#' Let's see how the coefficients for missing values compare to the range of
#' risks implied by the range of values present in the data... Since creatinine
#' enters the model per 30 μmol/L above a baseline of 60 μmol/L, a value
#' carrying relative risk r solves HR^((v - 60)/30) = r, ie
#' v = 60 + 30 * log(r)/log(HR), where HR is the fitted per-unit hazard ratio.

# Value of creatinine which would result in a 25% increased risk of death
creatinine.25pc.risk <- 60 + 30 * log(1.25)/log(compare.coefficients[
  compare.coefficients$quantity.level == 'crea_6mo', 'our_value'
])

# Equivalent value of creatinine for patients with missing data
creatinine.missing <- 60 + 30 *
  log(compare.coefficients[
    compare.coefficients$quantity.level == 'crea_6mo_missingTRUE', 'our_value'
  ]) /
  log(compare.coefficients[
    compare.coefficients$quantity.level == 'crea_6mo', 'our_value'
  ])

text.y.pos <- 3500

ggplot(
  # Only plot the lowest 95 percentiles of the data due to outliers
  subset(COHORT.use, crea_6mo < quantile(crea_6mo, 0.95, na.rm = TRUE)),
  aes(x = crea_6mo)
) +
  geom_histogram(bins = 30) +
  geom_vline(xintercept = 60) +
  annotate("text", x = 60, y = text.y.pos, angle = 270, hjust = 0, vjust = 1,
           label = "Baseline") +
  geom_vline(xintercept = creatinine.25pc.risk) +
  annotate("text", x = creatinine.25pc.risk, y = text.y.pos, angle = 270,
           hjust = 0, vjust = 1, label = "25% more risk") +
  geom_vline(xintercept = creatinine.missing) +
  annotate("text", x = creatinine.missing, y = text.y.pos, angle = 270,
           hjust = 0, vjust = 1, label = "missing data eqv")

#' So, having a missing creatinine value is slightly safer than having a
#' baseline reading, which is on the low end of observed values in this cohort.
#' Even an extremely high creatinine reading doesn't confer more than 25%
#' additional risk.

# Value of HDL which would result in a 10% increased risk of death
hdl.10pc.risk <- 1.5 + 0.5 * log(1.1)/log(compare.coefficients[
  compare.coefficients$quantity.level == 'hdl_6mo', 'our_value'
])

# Equivalent value of HDL for patients with missing data
hdl.missing <- 1.5 + 0.5 *
  log(compare.coefficients[
    compare.coefficients$quantity.level == 'hdl_6mo_missingTRUE', 'our_value'
  ]) /
  log(compare.coefficients[
    compare.coefficients$quantity.level == 'hdl_6mo', 'our_value'
  ])

text.y.pos <- 4000

ggplot(
  # Only plot the lowest 95 percentiles of the data due to outliers
  subset(COHORT.use, hdl_6mo < quantile(hdl_6mo, 0.95, na.rm = TRUE)),
  aes(x = hdl_6mo)
) +
  geom_histogram(bins = 30) +
  geom_vline(xintercept = 1.5) +
  annotate("text", x = 1.5, y = text.y.pos, angle = 270, hjust = 0, vjust = 1,
           label = "Baseline") +
  geom_vline(xintercept = hdl.10pc.risk) +
  annotate("text", x = hdl.10pc.risk, y = text.y.pos, angle = 270, hjust = 0,
           vjust = 1, label = "10% more risk") +
  geom_vline(xintercept = hdl.missing) +
  annotate("text", x = hdl.missing, y = text.y.pos, angle = 270, hjust = 0,
           vjust = 1, label = "missing data eqv")

#' Missing HDL values seem to have the opposite effect: it's substantially more
#' risky to have a blank value here. One possible explanation is that this is
#' such a common blood test that its absence indicates that the patient is not
#' seeking or receiving adequate medical care.
#'
#' ## Systematic analysis of missing values
#'
#' Clearly the danger (or otherwise) of having missing values differs by
#' variable, according to this model. Let's look at all of the variables, to
#' see how all the missing values compare, and whether they make sense.
#'
#+ missing_values_vs_ranges

# For continuous variables, get risk values for all patients for violin plots
risk.dist.by.var <- data.frame()
# For categorical variables, let's get all levels of the factor
risk.cats <- data.frame()
# Quantities not to plot in this graph
exclude.quantities <-
  c(
    'age',    # too large a risk range, and no missing values anyway
    'gender', 'diagnosis', # no missing values
    'diabetes' # is converted to diabetes_logical so causes an error if included
  )

for(quantity in surv.predict) {
  # If we're not excluding this quantity...
  if(!(quantity %in% exclude.quantities)) {
    # If it's numeric
    if(is.numeric(COHORT.scaled[, quantity])) {
      # Get each patient's risk by raising the quantity's per-unit risk to the
      # power of their scaled value. Use the scaled values, NOT the ones
      # processed for missingness, or there would be a lot of 0s in there.
      risk <-
        new.coefficients$our_value[new.coefficients$quantity.level == quantity] ^
        COHORT.scaled[, quantity]
      # Discard outliers with absurd values
      inliers <- inRange(risk, quantile(risk, c(0.01, 0.99), na.rm = TRUE))
      risk <- risk[inliers]
      risk.dist.by.var <-
        rbind(
          risk.dist.by.var,
          data.frame(quantity, risk)
        )
      risk.cats <-
        rbind(
          risk.cats,
          data.frame(
            quantity,
            new.coefficients[
              new.coefficients$quantity.level ==
                paste0(quantity, '_missingTRUE'),
            ]
          )
        )
    } else if(is.factor(COHORT.scaled[, quantity])) {
      risk.cats <-
        rbind(
          risk.cats,
          data.frame(
            quantity,
            new.coefficients[
              startsWith(as.character(new.coefficients$quantity.level), quantity),
            ]
          )
        )
    } else {
      # Logicals aren't included in this plot: by definition, the logicals
      # used here cannot be missing. There shouldn't be any other kinds of
      # variable; if your code requires them, handle them here.
    }
  }
}

# Save the results
write.csv(risk.dist.by.var, paste0(output.filename.base, '-risk-violins.csv'))
write.csv(risk.cats, paste0(output.filename.base, '-risk-cats.csv'))

# Plot the results
ggplot() +
  # First, and therefore at the bottom, draw the reference line at risk = 1
  geom_hline(yintercept = 1) +
  # Then, on top of that, draw the violin plot of the risk from the data
  geom_violin(data = risk.dist.by.var, aes(x = quantity, y = risk)) +
  geom_pointrange(
    data = risk.cats,
    aes(x = quantity, y = our_value, ymin = our_lower, ymax = our_upper),
    position = position_jitter(width = 0.1)
  ) +
  geom_text(
    data = risk.cats,
    aes(
      x = quantity,
      y = our_value,
      label = quantity.level
    )
  )