#+ knitr_setup, include = FALSE

# Whether to cache the intensive code sections. Set to FALSE to recalculate
# everything afresh.
cacheoption <- FALSE
# Disable lazy caching globally, because it fails for large objects, and all
# the objects we wish to cache are large...
knitr::opts_chunk$set(cache.lazy = FALSE)

#' # Replicating Rapsomaniki _et al._ 2014 without imputation
#'
#' Rapsomaniki and co-workers' paper creates a Cox proportional hazards model
#' to predict time to death for patients using electronic health record data.
#'
#' Because such data are gathered as part of routine clinical care, some
#' values may be missing. For example, covariates include blood tests such as
#' creatinine level, and use the most recent value no older than six months. A
#' patient may simply not have had this test in that time period.
#'
#' The approach adopted in this situation, in common with many other similar
#' studies, is to impute missing values. This essentially involves finding
#' patients whose other values are similar, and using those patients to make a
#' best guess at the likely value of the missing datapoint. However, in the
#' case of health records data, the missingness of a value may itself be
#' informative: the fact that a certain test wasn't ordered could result from
#' a doctor not thinking that test necessary, meaning that its absence carries
#' information.
#'
#' Whilst there are many approaches which could be employed here, this program
#' represents arguably the simplest: we keep the Cox model as close to the
#' published version as possible but, instead of imputing the missing data, we
#' include the missingness explicitly.
#'
#' ## User variables
#'
#' First, define some variables...

#+ define_vars

# Size of sample to draw from the full dataset (NA means use everything);
# further rows may be excluded during data preparation
n.data <- NA
endpoint <- 'death'

old.coefficients.filename <- 'rapsomaniki-cox-values-from-paper.csv'

output.filename.base <-
  '../../output/caliber-replicate-with-missing-survreg-6-linear-age'

bootstraps <- 200
n.threads <- 20

# Create a few filenames from the base
new.coefficients.filename <- paste0(output.filename.base, '-coeffs-3.csv')
compare.coefficients.filename <-
  paste0(output.filename.base, '-bootstrap-3.csv')
cox.var.imp.perm.filename <- paste0(output.filename.base, '-var-imp-perm-3.csv')
model.filename <- paste0(output.filename.base, '-surv-boot.rds')

#' ## Setup

#+ setup, message=FALSE

source('../lib/shared.R')
require(xtable)
require(ggrepel)
source('caliber-scale.R')

# Load the data (data.filename comes from the sourced helper files) and
# convert to data frame to make column-selecting code in prepData simpler
COHORT.full <- data.frame(fread(data.filename))

# Remove the patients we shouldn't include
COHORT.full <-
  COHORT.full[
    # remove negative times to death
    COHORT.full$time_death > 0 &
    # remove patients who should be excluded
    !COHORT.full$exclude
    ,
  ]

# If n.data was specified, take a subset of that size; otherwise use all of it
if(!is.na(n.data)){
  COHORT.use <- sample.df(COHORT.full, n.data)
} else {
  COHORT.use <- COHORT.full
}
rm(COHORT.full)

# Redefine n.data just in case we lost any rows
n.data <- nrow(COHORT.use)
# Define indices of test set
test.set <- testSetIndices(COHORT.use, random.seed = 78361)

#' OK, we've now got **`r n.data`** patients, split into a training set of
#' `r n.data - length(test.set)` and a test set of `r length(test.set)`.
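#'
#' For reference, `testSetIndices` comes from `../lib/shared.R` and isn't
#' shown here. A minimal sketch of what a reproducible hold-out split of this
#' kind might look like in base R (the function name and the one-third
#' fraction are illustrative assumptions, not the helper's actual definition):

#+ test_split_sketch
# Hypothetical stand-in for testSetIndices: fix the seed, then sample a fixed
# fraction of row indices without replacement
testSetIndicesSketch <- function(df, random.seed, fraction = 1/3) {
  set.seed(random.seed)
  sample(seq_len(nrow(df)), size = round(fraction * nrow(df)))
}
# Usage (commented out so the real helper above remains authoritative):
# test.set <- testSetIndicesSketch(COHORT.use, random.seed = 78361)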
#'
#' ## Transform variables
#'
#' The model uses variables which have been standardised in various ways, so
#' let's go through and transform our input variables in the same way...
#'
#' ### Age
#'
#' There seem to be some issues with the age spline function, so let's just
#' fit to a linear one as a check...

ageSpline <- identity

#'
#' ### Other variables
#'
#' * Most other variables in the model are simply normalised by a factor with
#'   an offset.
#' * For variables which are factors (such as diagnosis), we need to make sure
#'   that the first level is the one which was used as the baseline in the
#'   paper.
#' * The IMD (social deprivation) score is used by flagging those patients who
#'   are in the bottom quintile.

COHORT.scaled <-
  caliberScale(COHORT.use, surv.time, surv.event)

#' ## Missing values
#'
#' To incorporate missing values, we need to do different things depending on
#' the variable type.
#'
#' * If the variable is a factor, we can simply add an extra factor level
#'   'missing' to account for missing values.
#' * For logical or numerical values, we first make a new column of logicals
#'   to indicate whether the value was missing or not. Those logicals will
#'   then themselves be variables with associated beta values, giving the risk
#'   associated with having a value missing. Then, we set a missing logical to
#'   FALSE or a missing numeric to 0, the baseline value, so that it has no
#'   effect on the final survival curve.
#'
#' (A minimal sketch of this scheme is given at the end of this document.)

# Specify the columns with missing values explicitly: diagnosis has only a
# handful of missing values, so a sampled training set sometimes contains none
# of them, in which case prepCoxMissing wouldn't create the indicator for it.
missing.cols <-
  c(
    "diagnosis", "most_deprived", "smokstatus", "total_chol_6mo", "hdl_6mo",
    "pulse_6mo", "crea_6mo", "total_wbc_6mo", "haemoglobin_6mo"
  )
COHORT.scaled.demissed <- prepCoxMissing(COHORT.scaled, missing.cols)

#' ## Survival fitting
#'
#' Fit a Cox model to the preprocessed data. As in the paper, the model has an
#' exponential baseline hazard, and the standard errors are calculated with
#' 200 bootstrap samples.

#+ fit_cox_model, cache=cacheoption

surv.formula <-
  Surv(surv_time, surv_event) ~
    ### Sociodemographic characteristics #######################################
    ## Age in men, per year
    ## Age in women, per year
    ## Women vs. men
    # ie include interaction between age and gender!
    age * gender +
    ## Most deprived quintile, yes vs. no
    most_deprived +
    most_deprived_missing +
    ### SCAD diagnosis and severity ############################################
    ## Other CHD vs. stable angina
    ## Unstable angina vs. stable angina
    ## NSTEMI vs. stable angina
    ## STEMI vs. stable angina
    diagnosis +
    #diagnosis_missing +
    ## PCI in last 6 months, yes vs. no
    pci_6mo +
    ## CABG in last 6 months, yes vs. no
    cabg_6mo +
    ## Previous/recurrent MI, yes vs. no
    #hx_mi +
    ## Use of nitrates, yes vs. no
    long_nitrate +
    ### CVD risk factors #######################################################
    ## Ex-smoker / current smoker / missing data vs. never
    smokstatus +
    ## Hypertension, present vs. absent
    hypertension +
    ## Diabetes mellitus, present vs. absent
    diabetes_logical +
    ## Total cholesterol, per 1 mmol/L increase
    total_chol_6mo +
    total_chol_6mo_missing +
    ## HDL, per 0.5 mmol/L increase
    hdl_6mo +
    hdl_6mo_missing +
    ### CVD co-morbidities #####################################################
    ## Heart failure, present vs. absent
    heart_failure +
    ## Peripheral arterial disease, present vs. absent
    pad +
    ## Atrial fibrillation, present vs. absent
    hx_af +
    ## Stroke, present vs. absent
    hx_stroke +
    ### Non-CVD comorbidities ##################################################
    ## Chronic kidney disease, present vs. absent
    hx_renal +
    ## Chronic obstructive pulmonary disease, present vs. absent
    hx_copd +
    ## Cancer, present vs. absent
    hx_cancer +
    ## Chronic liver disease, present vs. absent
    hx_liver +
    ### Psychosocial characteristics ###########################################
    ## Depression at diagnosis, present vs. absent
    hx_depression +
    ## Anxiety at diagnosis, present vs. absent
    hx_anxiety +
    ### Biomarkers #############################################################
    ## Heart rate, per 10 b.p.m. increase
    pulse_6mo +
    pulse_6mo_missing +
    ## Creatinine, per 30 μmol/L increase
    crea_6mo +
    crea_6mo_missing +
    ## White cell count, per 1.5 × 10^9/L increase
    total_wbc_6mo +
    total_wbc_6mo_missing +
    ## Haemoglobin, per 1.5 g/dL increase
    haemoglobin_6mo +
    haemoglobin_6mo_missing

# Fit the main model with all data
fit.exp <- survreg(
  formula = surv.formula,
  data = COHORT.scaled.demissed[-test.set, ],
  dist = "exponential"
)

# Run a bootstrap on the model to find uncertainties
fit.exp.boot <-
  boot(
    formula = surv.formula,
    data = COHORT.scaled.demissed[-test.set, ],
    statistic = bootstrapFitSurvreg,
    R = bootstraps,
    parallel = 'multicore',
    ncpus = n.threads,
    test.data = COHORT.scaled.demissed[test.set, ]
  )

# Save the fit, because it might've taken a while!
saveRDS(fit.exp.boot, model.filename)

# Unpackage the uncertainties from the bootstrapped data
fit.exp.boot.ests <- bootStats(fit.exp.boot, uncertainty = '95ci')

# Write the raw bootstrap estimates to a file
write.csv(
  fit.exp.boot.ests, paste0(output.filename.base, '-bootstrap-coeffs.csv')
)

# Save bootstrapped performance values
varsToTable(
  data.frame(
    model = 'cox',
    imputation = FALSE,
    discretised = FALSE,
    c.index = fit.exp.boot.ests['c.index', 'val'],
    c.index.lower = fit.exp.boot.ests['c.index', 'lower'],
    c.index.upper = fit.exp.boot.ests['c.index', 'upper'],
    calibration.score = fit.exp.boot.ests['calibration.score', 'val'],
    calibration.score.lower = fit.exp.boot.ests['calibration.score', 'lower'],
    calibration.score.upper = fit.exp.boot.ests['calibration.score', 'upper']
  ),
  performance.file,
  index.cols = c('model', 'imputation', 'discretised')
)

#' ## Performance
#'
#' ### C-index
#'
#' Having fitted the Cox model, how did we do? The C-indices were calculated
#' as part of the bootstrapping, so we just need to take a look at those...
#'
#' The C-index is **`r round(fit.exp.boot.ests['c.index', 'val'], 3)`
#' (`r round(fit.exp.boot.ests['c.index', 'lower'], 3)` -
#' `r round(fit.exp.boot.ests['c.index', 'upper'], 3)`)** on the held-out test
#' set.
#'
#' ### Calibration
#'
#' The bootstrapped calibration score is
#' **`r round(fit.exp.boot.ests['calibration.score', 'val'], 3)`
#' (`r round(fit.exp.boot.ests['calibration.score', 'lower'], 3)` -
#' `r round(fit.exp.boot.ests['calibration.score', 'upper'], 3)`)**.
#'
#' Let's draw a representative curve from the unbootstrapped fit... (It would
#' be better to draw all the curves from the bootstrap fit to get an idea of
#' variability, but I've not implemented this yet.)
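#'
#' For orientation, one common way to build a calibration comparison of this
#' kind is to bin patients by predicted risk and compare each bin's mean
#' predicted risk with the observed event fraction. This sketch is only an
#' assumption about the general technique, not the actual implementation of
#' `calibrationTable`, and it ignores censoring for simplicity:

#+ calibration_sketch
# Hypothetical decile-based calibration table: predicted.risk is a vector of
# predicted event probabilities, observed.event a 0/1 vector of outcomes
calibrationSketch <- function(predicted.risk, observed.event, bins = 10) {
  # Assign each patient to a quantile bin of predicted risk
  bin <- cut(
    predicted.risk,
    breaks = quantile(predicted.risk, probs = seq(0, 1, 1 / bins)),
    include.lowest = TRUE, labels = FALSE
  )
  # Mean predicted risk vs. observed event fraction, per bin
  data.frame(
    risk.predicted = tapply(predicted.risk, bin, mean),
    risk.observed  = tapply(observed.event, bin, mean)
  )
}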
#'
#+ calibration_plot

calibration.table <-
  calibrationTable(fit.exp, COHORT.scaled.demissed[test.set, ])

calibrationPlot(calibration.table)

#'
#' ## Coefficients
#'
#' As well as getting comparable C-indices, it's also worth checking how the
#' risk coefficients we calculate compare to those found in the original
#' paper. Let's compare...

# Load CSV of values from paper
old.coefficients <- read.csv(old.coefficients.filename)

# Get coefficients from this fit
new.coefficients <-
  bootStats(fit.exp.boot, uncertainty = '95ci', transform = negExp)
names(new.coefficients) <- c('our_value', 'our_lower', 'our_upper')
new.coefficients$quantity.level <- rownames(new.coefficients)

# Create a data frame comparing them
compare.coefficients <- merge(old.coefficients, new.coefficients)

# Kludge because age:genderWomen is the pure interaction term, not the risk
# for a woman per unit of advancing spline-transformed age
compare.coefficients[
  compare.coefficients$quantity.level == 'age:genderWomen', 'our_value'
] <-
  compare.coefficients[
    compare.coefficients$quantity.level == 'age:genderWomen', 'our_value'
  ] *
  compare.coefficients[
    compare.coefficients$quantity.level == 'age', 'our_value'
  ]

# Save CSV of results
write.csv(compare.coefficients, new.coefficients.filename)

# Plot a graph by which to judge success
ggplot(compare.coefficients, aes(x = their_value, y = our_value)) +
  geom_abline(intercept = 0, slope = 1) +
  geom_hline(yintercept = 1, colour = 'grey') +
  geom_vline(xintercept = 1, colour = 'grey') +
  geom_point() +
  geom_errorbar(aes(ymin = our_lower, ymax = our_upper)) +
  geom_errorbarh(aes(xmin = their_lower, xmax = their_upper)) +
  geom_text_repel(aes(label = long_name)) +
  theme_classic(base_size = 8)

#+ coefficients_table, results='asis'

print(
  xtable(
    data.frame(
      variable =
        paste(
          compare.coefficients$long_name, compare.coefficients$unit, sep = ', '
        ),
      compare.coefficients[c('our_value', 'their_value')]
    ),
    digits = c(0, 0, 3, 3)
  ),
  type = 'html',
  include.rownames = FALSE
)

#' ### Variable importance
#'
#' To establish how important a given variable is in determining outcome (and
#' to compare with other measures, such as variable importance from random
#' forests), it's worth calculating an equivalent measure for a Cox model. The
#' easiest way to do this consistently across techniques is by permutation:
#' randomly permute the values in each variable in turn, and see how much
#' worse the prediction becomes.
#'
#+ cox_variable_importance

cox.var.imp.perm <-
  generalVarImp(
    fit.exp, COHORT.scaled.demissed[test.set, ], model.type = 'survreg'
  )

write.csv(cox.var.imp.perm, cox.var.imp.perm.filename, row.names = FALSE)

#' ## Strange things about gender
#'
#' The only huge outlier in this comparison is gender: according to the paper,
#' being a woman is significantly safer than being a man, whereas we don't
#' find a striking difference between genders.
#'
#' Let's try some simple fits to see if this is explicable.
#'
#' ### Fit based only on gender

print(
  exp(
    -survreg(
      Surv(surv_time, surv_event) ~ gender,
      data = COHORT.scaled[-test.set, ],
      dist = "exponential"
    )$coeff
  )
)

#' According to a model based only on gender, women are at higher risk than
#' men. This suggests that there are indeed confounding factors in this
#' dataset.
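#'
#' A note on the sign convention in these checks: `survreg` fits an
#' accelerated failure time model, and for the exponential distribution the
#' hazard ratio per unit of a covariate is `exp(-coefficient)`, hence the
#' `exp(-...)` above. A minimal self-contained demonstration on simulated data
#' (the rates and sample size are arbitrary assumptions for illustration):

#+ aft_sign_convention_sketch
library(survival)
set.seed(1)
n.sim <- 10000
# Group 1 has double the hazard of group 0
sim.group <- rbinom(n.sim, 1, 0.5)
sim.time <- rexp(n.sim, rate = 0.1 * ifelse(sim.group == 1, 2, 1))
sim.fit <- survreg(
  Surv(sim.time, rep(1, n.sim)) ~ sim.group,
  dist = 'exponential'
)
# exp(-coefficient) recovers the hazard ratio; this should be close to 2
exp(-coef(sim.fit)['sim.group'])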
#'
#' ### Fit based on age and gender

print(
  exp(
    -survreg(
      Surv(surv_time, surv_event) ~ age + gender,
      data = COHORT.scaled[-test.set, ],
      dist = "exponential"
    )$coeff
  )
)

#' Once age is taken into account, it is (obviously) more dangerous to be
#' older, and it is once again safer to be female.

ggplot(COHORT.use, aes(x = age, group = gender, fill = gender)) +
  geom_histogram(alpha = 0.8, position = 'dodge', binwidth = 1)

#' And, as we can see from this histogram, this is explained by the fact that
#' the women in this dataset do indeed tend to be older than the men.

print(
  exp(
    -survreg(
      Surv(surv_time, surv_event) ~ age * gender,
      data = COHORT.scaled[-test.set, ],
      dist = "exponential"
    )$coeff
  )
)

#' Finally, looking at a model where age and gender are allowed to interact,
#' the results are once again a little confusing: being older is worse, which
#' makes sense, but being a woman is again worse. This is then offset by the
#' slightly positive interaction between age and being a woman, meaning that
#' the overall effect of being a slightly older woman is protective
#' (especially in the paper's model, where the age spline grows rapidly with
#' advancing age).
#'
#' It's important to remember that the output of any regression model with
#' many terms gives the strength of a given relationship having already
#' controlled for variation due to the other terms. In this case, even just
#' using two terms can give counter-intuitive results if you try to interpret
#' coefficients in isolation.
#'
#' ## Coefficients for missing values
#'
#' Let's see how the coefficients for missing values compare to the range of
#' risks implied by the range of observed values...

# Value of creatinine which would result in a 25% increased risk of death
creatinine.25pc.risk <- 60 + 30 * log(1.25) /
  log(compare.coefficients[
    compare.coefficients$quantity.level == 'crea_6mo', 'our_value'
  ])

# Equivalent value of creatinine for patients with missing data
creatinine.missing <- 60 + 30 *
  log(compare.coefficients[
    compare.coefficients$quantity.level == 'crea_6mo_missingTRUE', 'our_value'
  ]) /
  log(compare.coefficients[
    compare.coefficients$quantity.level == 'crea_6mo', 'our_value'
  ])

text.y.pos <- 3500

ggplot(
  # Only plot the lowest 95 percentiles of the data due to outliers
  subset(COHORT.use, crea_6mo < quantile(crea_6mo, 0.95, na.rm = TRUE)),
  aes(x = crea_6mo)
) +
  geom_histogram(bins = 30) +
  geom_vline(xintercept = 60) +
  annotate("text", x = 60, y = text.y.pos, angle = 270, hjust = 0, vjust = 1,
           label = "Baseline") +
  geom_vline(xintercept = creatinine.25pc.risk) +
  annotate("text", x = creatinine.25pc.risk, y = text.y.pos, angle = 270,
           hjust = 0, vjust = 1, label = "25% more risk") +
  geom_vline(xintercept = creatinine.missing) +
  annotate("text", x = creatinine.missing, y = text.y.pos, angle = 270,
           hjust = 0, vjust = 1, label = "missing data eqv")

#' So, having a missing creatinine value is slightly safer than having a
#' baseline reading, which is at the low end of observed values in this
#' cohort. Even an extremely high creatinine reading doesn't confer more than
#' 25% additional risk.
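#'
#' The back-calculations above invert the scaling used in `caliberScale`:
#' creatinine enters the model per 30 μmol/L above a 60 μmol/L baseline, so
#' the risk at value x is HR^((x - 60) / 30), and the value giving a relative
#' risk r solves HR^((x - 60) / 30) = r. A small helper making that arithmetic
#' explicit (a hypothetical convenience, mirroring the inline calculations
#' above and the HDL ones below):

#+ risk_equivalent_sketch
# Solve hr.per.unit^((x - baseline) / per) = target.rr for x
riskEquivalentValue <- function(hr.per.unit, target.rr, baseline, per) {
  baseline + per * log(target.rr) / log(hr.per.unit)
}
# e.g. with an assumed HR of 1.1 per 30 μmol/L, the creatinine value
# conferring 25% extra risk would be:
riskEquivalentValue(1.1, 1.25, baseline = 60, per = 30)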

# Value of HDL which would result in a 10% increased risk of death
hdl.10pc.risk <- 1.5 + 0.5 * log(1.1) /
  log(compare.coefficients[
    compare.coefficients$quantity.level == 'hdl_6mo', 'our_value'
  ])

# Equivalent value of HDL for patients with missing data
hdl.missing <- 1.5 + 0.5 *
  log(compare.coefficients[
    compare.coefficients$quantity.level == 'hdl_6mo_missingTRUE', 'our_value'
  ]) /
  log(compare.coefficients[
    compare.coefficients$quantity.level == 'hdl_6mo', 'our_value'
  ])

text.y.pos <- 4000

ggplot(
  # Only plot the lowest 95 percentiles of the data due to outliers
  subset(COHORT.use, hdl_6mo < quantile(hdl_6mo, 0.95, na.rm = TRUE)),
  aes(x = hdl_6mo)
) +
  geom_histogram(bins = 30) +
  geom_vline(xintercept = 1.5) +
  annotate("text", x = 1.5, y = text.y.pos, angle = 270, hjust = 0, vjust = 1,
           label = "Baseline") +
  geom_vline(xintercept = hdl.10pc.risk) +
  annotate("text", x = hdl.10pc.risk, y = text.y.pos, angle = 270, hjust = 0,
           vjust = 1, label = "10% more risk") +
  geom_vline(xintercept = hdl.missing) +
  annotate("text", x = hdl.missing, y = text.y.pos, angle = 270, hjust = 0,
           vjust = 1, label = "missing data eqv")

#' Missing HDL values seem to have the opposite effect: it's substantially
#' more risky to have a blank value here. One possible explanation is that
#' this is such a common blood test that its absence indicates that the
#' patient is not seeking or receiving adequate medical care.
#'
#' ## Systematic analysis of missing values
#'
#' Clearly the danger (or otherwise) of having missing values differs by
#' variable, according to this model. Let's look at all of the variables, to
#' see how the missing values compare, and whether they make sense.
#'
#+ missing_values_vs_ranges

# For continuous variables, get risk values for all patients for violin plots
risk.dist.by.var <- data.frame()
# For categorical variables, get all levels of the factor
risk.cats <- data.frame()
# Quantities not to plot in this graph
exclude.quantities <-
  c(
    'age', # too large a risk range, and no missing values anyway
    'gender', 'diagnosis', # no missing values
    'diabetes' # is converted to diabetes_logical, so causes an error if
               # included
  )

for(quantity in surv.predict) {
  # If we're not excluding this quantity...
  if(!(quantity %in% exclude.quantities)) {
    # If it's numeric...
    if(is.numeric(COHORT.scaled[, quantity])) {
      # Get per-patient risks by raising this quantity's hazard ratio to the
      # power of the scaled values (NOT the values processed for missingness,
      # or there would be a lot of 0s in there)
      risk <-
        new.coefficients$our_value[new.coefficients$quantity.level == quantity] ^
        COHORT.scaled[, quantity]
      # Discard outliers with absurd values
      inliers <- inRange(risk, quantile(risk, c(0.01, 0.99), na.rm = TRUE))
      risk <- risk[inliers]
      risk.dist.by.var <-
        rbind(
          risk.dist.by.var,
          data.frame(quantity, risk)
        )
      risk.cats <-
        rbind(
          risk.cats,
          data.frame(
            quantity,
            new.coefficients[
              new.coefficients$quantity.level ==
                paste0(quantity, '_missingTRUE'),
            ]
          )
        )
    } else if(is.factor(COHORT.scaled[, quantity])) {
      risk.cats <-
        rbind(
          risk.cats,
          data.frame(
            quantity,
            new.coefficients[
              startsWith(as.character(new.coefficients$quantity.level),
                         quantity),
            ]
          )
        )
    } else {
      # We're not going to include logicals in this plot, because the logicals
      # used here cannot, by definition, be missing. There shouldn't be any
      # other kinds of variables; if your code introduces any, handle them
      # here.
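      # Hedged addition following the comment above: logicals are expected
      # and intentionally skipped, but any other unexpected type should fail
      # loudly rather than be silently dropped
      if(!is.logical(COHORT.scaled[, quantity])) {
        stop(paste0("Unhandled variable type for quantity '", quantity, "'"))
      }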
    }
  }
}

# Save the results
write.csv(risk.dist.by.var, paste0(output.filename.base, '-risk-violins.csv'))
write.csv(risk.cats, paste0(output.filename.base, '-risk-cats.csv'))

# Plot the results
ggplot() +
  # First, and therefore at the bottom, draw the reference line at risk = 1
  geom_hline(yintercept = 1) +
  # Then, on top of that, draw the violin plot of the risk from the data
  geom_violin(data = risk.dist.by.var, aes(x = quantity, y = risk)) +
  geom_pointrange(
    data = risk.cats,
    aes(x = quantity, y = our_value, ymin = our_lower, ymax = our_upper),
    position = position_jitter(width = 0.1)
  ) +
  geom_text(
    data = risk.cats,
    aes(
      x = quantity,
      y = our_value,
      label = quantity.level
    )
  )
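#'
#' Finally, as promised in the 'Missing values' section above, here is a
#' minimal sketch of the indicator-variable scheme that `prepCoxMissing`
#' (presumably defined in one of the sourced helper files, and not shown here)
#' implements. The function name and details are a stand-in for illustration,
#' not the helper's actual code:

#+ prep_missing_sketch
# Hypothetical stand-in for prepCoxMissing: factors gain an explicit
# 'missing' level; numerics and logicals gain a <name>_missing indicator
# column, and their missing values are then set to the baseline (0 or FALSE)
# so they have no effect on the fitted survival curve.
prepCoxMissingSketch <- function(df, missing.cols) {
  for(col in missing.cols) {
    missing <- is.na(df[, col])
    if(is.factor(df[, col])) {
      levels(df[, col]) <- c(levels(df[, col]), 'missing')
      df[missing, col] <- 'missing'
    } else {
      df[, paste0(col, '_missing')] <- missing
      df[missing, col] <- if(is.logical(df[, col])) FALSE else 0
    }
  }
  df
}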