#+ knitr_setup, include = FALSE
# Whether to cache the intensive code sections. Set to FALSE to recalculate
# everything afresh.
cacheoption <- FALSE
# Disable lazy caching globally, because it fails for large objects, and all the
# objects we wish to cache are large...
knitr::opts_chunk$set(cache.lazy = FALSE)
#' # Replicating Rapsomaniki _et al._ 2014 without imputation
#'
#' Rapsomaniki and co-workers' paper fits a Cox proportional hazards model to
#' predict time to death for patients using electronic health record data.
#'
#' Because such data
#' are gathered as part of routine clinical care, some values may be missing.
#' For example, the covariates include blood tests such as creatinine level, and
#' use the most recent value no older than six months. A patient may simply not
#' have had that test in the relevant time period.
#'
#' The approach adopted in this situation, in common with many similar studies,
#' is to impute missing values: in essence, finding patients whose other
#' covariates are similar and using them to make a best guess at the likely
#' value of a missing datapoint. However, in health records data the missingness
#' of a value may itself be informative: a test may not have been ordered
#' because the doctor did not think it necessary, meaning that its absence
#' carries information.
#'
#' Whilst there are many approaches which could be employed here, this program
#' represents arguably the simplest: we keep the Cox model as close to the
#' published version as possible but, instead of imputing the missing data, we
#' model the missingness explicitly.
#'
#' ## User variables
#'
#' First, define some variables...
#+ define_vars
n.data <- NA # NA means use the full dataset; further rows may be excluded in prep
endpoint <- 'death'
old.coefficients.filename <- 'rapsomaniki-cox-values-from-paper.csv'
output.filename.base <- '../../output/caliber-replicate-with-missing-survreg-6-linear-age'
bootstraps <- 200
n.threads <- 20
# Create a few filenames from the base
new.coefficients.filename <- paste0(output.filename.base, '-coeffs-3.csv')
compare.coefficients.filename <-
paste0(output.filename.base, '-bootstrap-3.csv')
cox.var.imp.perm.filename <- paste0(output.filename.base, '-var-imp-perm-3.csv')
model.filename <- paste0(output.filename.base, '-surv-boot.rds')
#' ## Setup
#+ setup, message=FALSE
source('../lib/shared.R')
library(xtable)
library(ggrepel)
source('caliber-scale.R')
# Load the data and convert to data frame to make column-selecting code in
# prepData simpler
COHORT.full <- data.frame(fread(data.filename))
# Remove the patients we shouldn't include
COHORT.full <-
COHORT.full[
# remove negative times to death
COHORT.full$time_death > 0 &
# remove patients who should be excluded
!COHORT.full$exclude
,
]
# If n.data was specified...
if(!is.na(n.data)){
# Take a subset n.data in size
COHORT.use <- sample.df(COHORT.full, n.data)
rm(COHORT.full)
} else {
# Use all the data
COHORT.use <- COHORT.full
rm(COHORT.full)
}
# Redefine n.data just in case we lost any rows
n.data <- nrow(COHORT.use)
# Define indices of test set
test.set <- testSetIndices(COHORT.use, random.seed = 78361)
#' OK, we've now got **`r n.data`** patients, split into a training set of
#' `r n.data - length(test.set)` and a test set of `r length(test.set)`.
#'
#'
#' ## Transform variables
#'
#' The model uses variables which have been standardised in various ways, so
#' let's go through and transform our input variables in the same way...
#'
#' ### Age
#'
#' There seem to be some issues with the age spline function, so let's just fit
#' to a linear one as a check...
ageSpline <- identity
#'
#' ### Other variables
#'
#' * Most other variables in the model are simply normalised by a factor with an
#' offset (sketched below for creatinine).
#' * For variables which are factors (such as diagnosis), we need to make sure
#' that the first level is the one which was used as the baseline in the paper.
#' * The IMD (social deprivation) score is used by flagging those patients who
#' are in the bottom quintile.
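#'
#' For example, the biomarker scalings have roughly this shape (a sketch only;
#' caliberScale in caliber-scale.R is the actual implementation, and the
#' creatinine baseline of 60 μmol/L and unit of 30 μmol/L are the values used
#' later in this script):
#+ scaling_sketch, eval = FALSE
# Hypothetical example: creatinine enters the model per 30 μmol/L above a
# 60 μmol/L baseline, so the scaled value is simply
crea_6mo.scaled <- (COHORT.use$crea_6mo - 60) / 30
#'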
COHORT.scaled <-
caliberScale(COHORT.use, surv.time, surv.event)
#' ## Missing values
#'
#' To incorporate missing values, we need to do different things depending on
#' the variable type.
#'
#' * If the variable is a factor, we can simply add an extra factor level
#' 'missing' to account for missing values.
#' * For logical or numerical values, we first make a new column of logicals to
#' indicate whether the value was missing or not. Those logicals then become
#' variables in their own right, with associated beta values giving the risk
#' associated with having a value missing. The original value is then set to
#' FALSE (for logicals) or 0 (for numerics), i.e. the baseline, so that it has
#' no effect on the final survival curve; a minimal sketch follows this list.
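#'
#' A minimal sketch of that transformation for one numeric column (the real
#' work is done by prepCoxMissing from our shared library, whose internals may
#' differ; demissNumeric here is just an illustrative name):
#+ missingness_sketch, eval = FALSE
demissNumeric <- function(df, col) {
  # Flag the rows where the value was missing...
  df[[paste0(col, '_missing')]] <- is.na(df[[col]])
  # ...then set those values to 0, the post-scaling baseline, so they
  # contribute nothing to the linear predictor beyond the missingness flag
  df[[col]][is.na(df[[col]])] <- 0
  df
}
#'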
# Specify missing columns - diagnosis has only a handful of missing values, so
# a sampled training set sometimes contains none of them, in which case
# prepCoxMissing wouldn't create its missingness columns automatically.
missing.cols <-
c(
"diagnosis", "most_deprived", "smokstatus", "total_chol_6mo", "hdl_6mo",
"pulse_6mo", "crea_6mo", "total_wbc_6mo", "haemoglobin_6mo"
)
COHORT.scaled.demissed <- prepCoxMissing(COHORT.scaled, missing.cols)
#' ## Survival fitting
#'
#' Fit a survival model to the preprocessed data. The paper uses a Cox model
#' with an exponential baseline hazard, which is what fitting survreg with
#' dist = 'exponential' gives us here. The paper calculated standard errors
#' with 200 bootstrap samples, and we do the same.
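#'
#' An aside on parameterisation: survreg fits the accelerated failure time form
#' log T = Xβ + ε, which for the exponential distribution implies a constant
#' hazard exp(-Xβ). The hazard ratio per unit of a covariate is therefore
#' exp(-β), which is why coefficients are transformed with negExp before being
#' compared with the paper's values below.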
#+ fit_cox_model, cache=cacheoption
surv.formula <-
Surv(surv_time, surv_event) ~
### Sociodemographic characteristics #######################################
## Age in men, per year
## Age in women, per year
## Women vs. men
# ie include interaction between age and gender!
age * gender +
## Most deprived quintile, yes vs. no
most_deprived +
most_deprived_missing +
### SCAD diagnosis and severity ############################################
## Other CHD vs. stable angina
## Unstable angina vs. stable angina
## NSTEMI vs. stable angina
## STEMI vs. stable angina
diagnosis +
#diagnosis_missing +
## PCI in last 6 months, yes vs. no
pci_6mo +
## CABG in last 6 months, yes vs. no
cabg_6mo +
## Previous/recurrent MI, yes vs. no
#hx_mi +
## Use of nitrates, yes vs. no
long_nitrate +
### CVD risk factors #######################################################
## Ex-smoker / current smoker / missing data vs. never
smokstatus +
## Hypertension, present vs. absent
hypertension +
## Diabetes mellitus, present vs. absent
diabetes_logical +
## Total cholesterol, per 1 mmol/L increase
total_chol_6mo +
total_chol_6mo_missing +
## HDL, per 0.5 mmol/L increase
hdl_6mo +
hdl_6mo_missing +
### CVD co-morbidities #####################################################
## Heart failure, present vs. absent
heart_failure +
## Peripheral arterial disease, present vs. absent
pad +
## Atrial fibrillation, present vs. absent
hx_af +
## Stroke, present vs. absent
hx_stroke +
### Non-CVD comorbidities ##################################################
## Chronic kidney disease, present vs. absent
hx_renal +
## Chronic obstructive pulmonary disease, present vs. absent
hx_copd +
## Cancer, present vs. absent
hx_cancer +
## Chronic liver disease, present vs. absent
hx_liver +
### Psychosocial characteristics ###########################################
## Depression at diagnosis, present vs. absent
hx_depression +
## Anxiety at diagnosis, present vs. absent
hx_anxiety +
### Biomarkers #############################################################
## Heart rate, per 10 b.p.m increase
pulse_6mo +
pulse_6mo_missing +
## Creatinine, per 30 μmol/L increase
crea_6mo +
crea_6mo_missing +
    ## White cell count, per 1.5 × 10^9/L increase
total_wbc_6mo +
total_wbc_6mo_missing +
## Haemoglobin, per 1.5 g/dL increase
haemoglobin_6mo +
haemoglobin_6mo_missing
# Fit the main model with all data
fit.exp <- survreg(
formula = surv.formula,
data = COHORT.scaled.demissed[-test.set, ],
dist = "exponential"
)
# Run a bootstrap on the model to find uncertainties
fit.exp.boot <-
boot(
formula = surv.formula,
data = COHORT.scaled.demissed[-test.set, ],
statistic = bootstrapFitSurvreg,
R = bootstraps,
parallel = 'multicore',
ncpus = n.threads,
test.data = COHORT.scaled.demissed[test.set, ]
)
# Save the fit, because it might've taken a while!
saveRDS(fit.exp.boot, model.filename)
# Unpackage the uncertainties from the bootstrapped data
fit.exp.boot.ests <- bootStats(fit.exp.boot, uncertainty = '95ci')
# Write the raw bootstrap estimates to a file
write.csv(
fit.exp.boot.ests, paste0(output.filename.base, '-bootstrap-coeffs.csv')
)
# Save bootstrapped performance values
varsToTable(
data.frame(
model = 'cox',
imputation = FALSE,
discretised = FALSE,
c.index = fit.exp.boot.ests['c.index', 'val'],
c.index.lower = fit.exp.boot.ests['c.index', 'lower'],
c.index.upper = fit.exp.boot.ests['c.index', 'upper'],
calibration.score = fit.exp.boot.ests['calibration.score', 'val'],
calibration.score.lower = fit.exp.boot.ests['calibration.score', 'lower'],
calibration.score.upper = fit.exp.boot.ests['calibration.score', 'upper']
),
performance.file,
index.cols = c('model', 'imputation', 'discretised')
)
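#'
#' For reference, the statistic function passed to boot above has roughly this
#' shape (a sketch only; the actual bootstrapFitSurvreg in our shared library
#' also computes test-set performance statistics):
#+ boot_statistic_sketch, eval = FALSE
bootStatSketch <- function(data, indices, test.data) {
  # Refit the model on the resampled rows...
  fit <- survreg(surv.formula, data = data[indices, ], dist = 'exponential')
  # ...and return its coefficients; boot() aggregates these across replicates
  coef(fit)
}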
#' ## Performance
#'
#' ### C-index
#'
#' Having fitted the Cox model, how did we do? The C-indices were calculated as
#' part of the bootstrapping, so we just need to take a look at those...
#'
#' The C-index is **`r round(fit.exp.boot.ests['c.index', 'val'], 3)`
#' (95% CI `r round(fit.exp.boot.ests['c.index', 'lower'], 3)` -
#' `r round(fit.exp.boot.ests['c.index', 'upper'], 3)`)** on the held-out test
#' set.
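#'
#' As a sanity check, the point estimate can be recomputed directly from the
#' unbootstrapped fit (a sketch, assuming a recent enough survival package for
#' concordance() to accept survreg fits and newdata):
#+ cindex_sanity, eval = FALSE
concordance(fit.exp, newdata = COHORT.scaled.demissed[test.set, ])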
#'
#' ### Calibration
#'
#' The bootstrapped calibration score is
#' **`r round(fit.exp.boot.ests['calibration.score', 'val'], 3)`
#' (95% CI `r round(fit.exp.boot.ests['calibration.score', 'lower'], 3)` -
#' `r round(fit.exp.boot.ests['calibration.score', 'upper'], 3)`)**.
#'
#' Let's draw a representative curve from the unbootstrapped fit... (It would be
#' better to draw all the curves from the bootstrap fit to get an idea of
#' variability, but I've not implemented this yet.)
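#'
#' In outline, a calibration curve bins patients by predicted risk and compares
#' the mean predicted risk in each bin with the observed event fraction (a
#' generic sketch which ignores censoring; calibrationTable and calibrationPlot
#' in our shared library do the real work):
#+ calibration_sketch, eval = FALSE
sketchCalibration <- function(pred.risk, event) {
  # Bin patients into deciles of predicted risk...
  decile <- cut(pred.risk, quantile(pred.risk, 0:10 / 10), include.lowest = TRUE)
  # ...then compare mean prediction with observed event rate per bin
  data.frame(
    predicted = tapply(pred.risk, decile, mean),
    observed  = tapply(event, decile, mean)
  )
}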
#'
#+ calibration_plot
calibration.table <-
calibrationTable(fit.exp, COHORT.scaled.demissed[test.set, ])
calibrationPlot(calibration.table)
#'
#' ## Coefficients
#'
#' As well as obtaining comparable C-indices, it's worth checking how the risk
#' coefficients we calculated compare to those found in the original paper.
#' Let's compare...
# Load CSV of values from paper
old.coefficients <- read.csv(old.coefficients.filename)
# Get coefficients from this fit
new.coefficients <-
bootStats(fit.exp.boot, uncertainty = '95ci', transform = negExp)
names(new.coefficients) <- c('our_value', 'our_lower', 'our_upper')
new.coefficients$quantity.level <- rownames(new.coefficients)
# Create a data frame comparing them
compare.coefficients <- merge(old.coefficients, new.coefficients)
# Kludge because age:genderWomen is the pure interaction term, not the risk for
# a woman per unit of advancing spline-transformed age
compare.coefficients[
compare.coefficients$quantity.level == 'age:genderWomen', 'our_value'
] <-
compare.coefficients[
compare.coefficients$quantity.level == 'age:genderWomen', 'our_value'
] *
compare.coefficients[
compare.coefficients$quantity.level == 'age', 'our_value'
]
# Save CSV of results
write.csv(compare.coefficients, new.coefficients.filename)
# Plot a graph by which to judge success
ggplot(compare.coefficients, aes(x = their_value, y = our_value)) +
geom_abline(intercept = 0, slope = 1) +
geom_hline(yintercept = 1, colour = 'grey') +
geom_vline(xintercept = 1, colour = 'grey') +
geom_point() +
geom_errorbar(aes(ymin = our_lower, ymax = our_upper)) +
geom_errorbarh(aes(xmin = their_lower, xmax = their_upper)) +
geom_text_repel(aes(label = long_name)) +
theme_classic(base_size = 8)
#+ coefficients_table, results='asis'
print(
xtable(
data.frame(
variable =
paste(
compare.coefficients$long_name, compare.coefficients$unit, sep=', '
),
compare.coefficients[c('our_value', 'their_value')]
),
digits = c(0,0,3,3)
),
type = 'html',
include.rownames = FALSE
)
#' ### Variable importance
#'
#' To establish how important a given variable is in determining outcome (and to
#' compare with other measures, such as variable importance from random
#' forests), it is worth calculating an equivalent measure for the Cox model.
#' The easiest way to do this across techniques is by permutation: randomly
#' permute the values of each variable in turn, and see how much worse the
#' prediction becomes (sketched after the results below).
#'
#+ cox_variable_importance
cox.var.imp.perm <-
generalVarImp(
fit.exp, COHORT.scaled.demissed[test.set, ], model.type = 'survreg'
)
write.csv(cox.var.imp.perm, cox.var.imp.perm.filename, row.names = FALSE)
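#'
#' In outline, the permutation scheme looks like this (a sketch of the idea
#' only; generalVarImp in our shared library is the actual implementation, and
#' the concordance() calls assume a recent survival package):
#+ perm_imp_sketch, eval = FALSE
permImpSketch <- function(fit, df, var) {
  # Performance with the data intact...
  c.orig <- concordance(fit, newdata = df)$concordance
  # ...then with the variable of interest shuffled, breaking its link with
  # the outcome while preserving its marginal distribution
  df.perm <- df
  df.perm[[var]] <- sample(df.perm[[var]])
  c.perm <- concordance(fit, newdata = df.perm)$concordance
  # Importance = the drop in C-index caused by permutation
  c.orig - c.perm
}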
#' ## Strange things about gender
#'
#' The only huge outlier in this comparison is gender: according to the paper,
#' being a woman is significantly safer than being a man, whereas we
#' don't find a striking difference between genders.
#'
#' Let's try some simple fits to see if this is explicable.
#'
#' ### Fit based only on gender
print(
exp(
-survreg(
Surv(surv_time, surv_event) ~ gender,
data = COHORT.scaled[-test.set, ],
dist = "exponential"
)$coeff
)
)
#' According to a model based only on gender, women are at higher risk than men.
#' This suggests that there are indeed confounding factors in this dataset.
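#'
#' As a cross-check on the exp(-coefficient) transformation, the same hazard
#' ratio can be obtained from a semi-parametric Cox fit (a sketch; under the
#' exponential model the two parameterisations should agree closely):
#+ gender_coxph_check, eval = FALSE
exp(coef(coxph(
  Surv(surv_time, surv_event) ~ gender,
  data = COHORT.scaled[-test.set, ]
)))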
#'
#' ### Fit based on age and gender
print(
exp(
-survreg(
Surv(surv_time, surv_event) ~ age + gender,
data = COHORT.scaled[-test.set, ],
dist = "exponential"
)$coeff
)
)
#' Once age is taken into account, it is (obviously) more dangerous to be older,
#' and it is once again safer to be female.
ggplot(COHORT.use, aes(x = age, group = gender, fill = gender)) +
geom_histogram(alpha = 0.8, position = 'dodge', binwidth = 1)
#' And, as we can see from this histogram, this is explained by the fact that
#' the women in this dataset do indeed tend to be older than the men.
print(
exp(
-survreg(
Surv(surv_time, surv_event) ~ age * gender,
data = COHORT.scaled[-test.set, ],
dist = "exponential"
)$coeff
)
)
#' Finally, looking at a model where age and gender are allowed to interact, the
#' results are once again a little confusing: being older is worse, which makes
#' sense, but being a woman now appears worse too. This is then offset by the
#' slightly positive interaction between age and being a woman, meaning that the
#' overall effect of being a slightly older woman is protective (especially
#' because the age term itself grows with advancing age).
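#'
#' Concretely, with an interaction the combined gender effect for a woman aged
#' a (relative to a man of the same age) is beta_women + beta_age:women * a on
#' the linear predictor scale, so even a harmful-looking main effect for women
#' is overturned once the offsetting interaction term has grown large enough
#' with age.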
#'
#' It's important to remember that the output of any regression model with many
#' terms gives the strength of a given relationship having already controlled
#' for variation due to those other terms. In this case, even just using two
#' terms can give counter-intuitive results if you try to interpret coefficients
#' in isolation.
#'
#' ## Coefficients for missing values
#'
#' Let's see how the coefficients for missing values compare to the range of
#' risks implied by the observed range of values in the data...
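#'
#' Under the model, the relative risk at a creatinine level c is
#' HR^((c - 60) / 30), since 60 μmol/L is the baseline and the hazard ratio HR
#' applies per 30 μmol/L increase. Setting this equal to 1.25 and solving gives
#' c = 60 + 30 * log(1.25) / log(HR), which is the first calculation below; the
#' missing-data equivalent replaces log(1.25) with the log of the missingness
#' coefficient.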
# Value of creatinine which would result in a 25% increased risk of death
creatinine.25pc.risk <- 60 + 30 * log(1.25)/log(compare.coefficients[
compare.coefficients$quantity.level == 'crea_6mo', 'our_value'
])
# Equivalent value of creatinine for patients with missing data
creatinine.missing <- 60 + 30 *
log(compare.coefficients[
compare.coefficients$quantity.level == 'crea_6mo_missingTRUE', 'our_value'
]) /
log(compare.coefficients[
compare.coefficients$quantity.level == 'crea_6mo', 'our_value'
])
text.y.pos <- 3500
ggplot(
# Only plot the lowest 95 percentiles of data due to outliers
subset(COHORT.use, crea_6mo < quantile(crea_6mo, 0.95, na.rm = TRUE)),
aes(x = crea_6mo)
) +
geom_histogram(bins = 30) +
geom_vline(xintercept = 60) +
annotate("text", x = 60, y = text.y.pos, angle = 270, hjust = 0, vjust = 1,
label = "Baseline") +
geom_vline(xintercept = creatinine.25pc.risk) +
annotate("text", x = creatinine.25pc.risk, y = text.y.pos, angle = 270,
hjust = 0, vjust = 1, label = "25% more risk") +
geom_vline(xintercept = creatinine.missing) +
annotate("text", x = creatinine.missing, y = text.y.pos, angle = 270,
hjust = 0, vjust = 1, label = "missing data eqv")
#' So, having a missing creatinine value is slightly safer than having a
#' baseline reading, which is on the low end of observed values in this cohort.
#' Even an extremely high creatinine reading doesn't confer more than 25%
#' additional risk.
# Value of HDL which would result in a 10% increased risk of death
hdl.10pc.risk <- 1.5 + 0.5 * log(1.1)/log(compare.coefficients[
compare.coefficients$quantity.level == 'hdl_6mo', 'our_value'
])
# Equivalent value for patients with missing data
hdl.missing <- 1.5 + 0.5 *
log(compare.coefficients[
compare.coefficients$quantity.level == 'hdl_6mo_missingTRUE', 'our_value'
]) /
log(compare.coefficients[
compare.coefficients$quantity.level == 'hdl_6mo', 'our_value'
])
text.y.pos <- 4000
ggplot(
# Only plot the lowest 95 percentiles of data due to outliers
subset(COHORT.use, hdl_6mo < quantile(hdl_6mo, 0.95, na.rm = TRUE)),
aes(x = hdl_6mo)
) +
geom_histogram(bins = 30) +
geom_vline(xintercept = 1.5) +
annotate("text", x = 1.5, y = text.y.pos, angle = 270, hjust = 0, vjust = 1,
label = "Baseline") +
geom_vline(xintercept = hdl.10pc.risk) +
annotate("text", x = hdl.10pc.risk, y = text.y.pos, angle = 270, hjust = 0,
vjust = 1, label = "10% more risk") +
geom_vline(xintercept = hdl.missing) +
annotate("text", x = hdl.missing, y = text.y.pos, angle = 270, hjust = 0,
vjust = 1, label = "missing data eqv")
#' Missing HDL values seem to have the opposite effect: it's substantially more
#' risky to have a blank value here. One possible explanation is that this is
#' such a common blood test that its absence indicates that the patient is not
#' seeking or receiving adequate medical care.
#'
#' ## Systematic analysis of missing values
#'
#' Clearly the danger (or otherwise) of having missing values differs by
#' variable, according to this model. Let's look at all of the variables, to see
#' how all missing values compare, and whether they make sense.
#'
#+ missing_values_vs_ranges
# For continuous variables, get risk values for all patients for violin plots
risk.dist.by.var <- data.frame()
# For categorical variables, let's get all levels of the factor
risk.cats <- data.frame()
# Quantities not to plot in this graph
exclude.quantities <-
c(
'age', # too large a risk range, and no missing values anyway
'gender', 'diagnosis', # no missing values
'diabetes' # is converted to diabetes_logical so causes an error if included
)
for(quantity in surv.predict) {
# If we're not excluding..
if(!(quantity %in% exclude.quantities)) {
# If it's numeric
    if(is.numeric(COHORT.scaled[, quantity])) {
      # Get each patient's relative risk by raising this quantity's hazard
      # ratio to the power of their scaled value. Use the scaled data NOT
      # processed for missingness, or the imputed 0s would pile up at risk = 1.
risk <-
new.coefficients$our_value[new.coefficients$quantity.level == quantity] ^
COHORT.scaled[, quantity]
# Discard outliers with absurd values
inliers <- inRange(risk, quantile(risk, c(0.01, 0.99), na.rm = TRUE))
risk <- risk[inliers]
risk.dist.by.var <-
rbind(
risk.dist.by.var,
data.frame(quantity, risk)
)
risk.cats <-
rbind(
risk.cats,
data.frame(
quantity,
new.coefficients[
new.coefficients$quantity.level ==
paste0(quantity, '_missingTRUE'),
]
)
)
} else if(is.factor(COHORT.scaled[, quantity])) {
risk.cats <-
rbind(
risk.cats,
data.frame(
quantity,
new.coefficients[
            startsWith(as.character(new.coefficients$quantity.level), quantity),
]
)
)
    } else {
      # Logicals are skipped here: the logical variables in this model cannot
      # be missing by definition, so they have no missingness coefficients to
      # plot. There shouldn't be any other kinds of variable; if your code
      # requires them, handle them here.
    }
}
}
# Save the results
write.csv(risk.dist.by.var, paste0(output.filename.base, '-risk-violins.csv'))
write.csv(risk.cats, paste0(output.filename.base, '-risk-cats.csv'))
# Plot the results
ggplot() +
# First, and therefore at the bottom, draw the reference line at risk = 1
geom_hline(yintercept = 1) +
# Then, on top of that, draw the violin plot of the risk from the data
geom_violin(data = risk.dist.by.var, aes(x = quantity, y = risk)) +
geom_pointrange(
data = risk.cats,
aes(x = quantity, y = our_value, ymin = our_lower,
ymax = our_upper),
position = position_jitter(width = 0.1)
) +
geom_text(
data = risk.cats,
aes(
x = quantity,
y = our_value,
label = quantity.level
)
)