#+ knitr_setup, include = FALSE

# Whether to cache the intensive code sections. Set to FALSE to recalculate
# everything afresh.
cacheoption <- FALSE
# Disable lazy caching globally, because it fails for large objects, and all the
# objects we wish to cache are large...
opts_chunk$set(cache.lazy = FALSE)

#' # Cross-validating discretisation of input variables in a survival model
#'
#'

# Input/output file locations (relative to this script's directory)
calibration.filename <- '../../output/survreg-crossvalidation-try5.csv'
caliber.missing.coefficients.filename <-
  '../../output/caliber-replicate-with-missing-survreg-bootstrap-coeffs-1.csv'
comparison.filename <-
  '../../output/caliber-replicate-with-missing-var-imp-try2.csv'
# The first part of the filename for any output
output.filename.base <- '../../output/all-cv-survreg-boot-try5'


# What kind of model to fit to... currently 'survreg' (parametric survival
# model, as used here), 'cph' (Cox model), 'ranger' or 'rfsrc' (two
# implementations of random survival forests).
# (Comment fixed: it previously omitted 'survreg', the value actually set.)
model.type <- 'survreg'

# If surv.predict is defined as a character vector here, the model only uses
# those variables specified, eg c('age') would build a model purely based on
# age. If not specified (ie commented out), it will use the defaults.
# (Comment fixed: it previously said 'surv.vars', but the variable read by the
# downstream script -- and used later in this one -- is surv.predict.)
# surv.predict <- c('age')

#' ## Do the cross-validation
#'
#' The steps for this are common regardless of model type, so run the script to
#' get a cross-validated model to further analyse...
#+ cox_discretised_cv, cache=cacheoption

# Runs the full bootstrap/cross-validation and defines the objects used below
# (surv.model.fit, surv.model.fit.coeffs, surv.model.params.boot, COHORT.use,
# COHORT.optimised, test.set, process.settings, missing.padding, ...).
source('../lib/all-cv-bootstrap.R', chdir = TRUE)

#' # Results
#'
#' ## Performance
#'
#' ### C-index
#'
#' C-index is **`r round(surv.model.fit.coeffs['c.index', 'val'], 3)`
#' (`r round(surv.model.fit.coeffs['c.index', 'lower'], 3)` -
#' `r round(surv.model.fit.coeffs['c.index', 'upper'], 3)`)** on the held-out
#' test set.
#'
#'
#' ### Calibration
#'
#' The bootstrapped calibration score is
#' **`r round(surv.model.fit.coeffs['calibration.score', 'val'], 3)`
#' (`r round(surv.model.fit.coeffs['calibration.score', 'lower'], 3)` -
#' `r round(surv.model.fit.coeffs['calibration.score', 'upper'], 3)`)**.
#'
#' Let's draw a representative curve from the unbootstrapped fit... (It would be
#' better to draw all the curves from the bootstrap fit to get an idea of
#' variability, but I've not implemented this yet.)
#'
#+ calibration_plot

# Predicted vs observed risk on the held-out test set.
# NOTE(review): calibrationTable/calibrationScore/calibrationPlot are project
# helpers defined elsewhere (presumably ../lib); confirm their contract there.
calibration.table <-
  calibrationTable(surv.model.fit, COHORT.optimised[test.set, ])

calibration.score <- calibrationScore(calibration.table)

calibrationPlot(calibration.table)

#'
#' ## Model fit
#'
#+ resulting_fit

print(surv.model.fit)

#' ## Cox coefficients
#'
#+ cox_coefficients_plot


# Save bootstrapped performance values (C-index and calibration score, each
# with its bootstrap confidence bounds) to the shared performance table,
# keyed on (model, imputation, discretised).
varsToTable(
  data.frame(
    model = 'cox',
    imputation = FALSE,
    discretised = TRUE,
    c.index = surv.model.fit.coeffs['c.index', 'val'],
    c.index.lower = surv.model.fit.coeffs['c.index', 'lower'],
    c.index.upper = surv.model.fit.coeffs['c.index', 'upper'],
    calibration.score = surv.model.fit.coeffs['calibration.score', 'val'],
    calibration.score.lower =
      surv.model.fit.coeffs['calibration.score', 'lower'],
    calibration.score.upper =
      surv.model.fit.coeffs['calibration.score', 'upper']
  ),
  performance.file,
  index.cols = c('model', 'imputation', 'discretised')
)

# Unpackage the uncertainties again, this time transformed because survreg
# returns negative values
surv.boot.ests <- bootStatsDf(surv.model.params.boot, transform = `-`)

#' First, plot the factors and logicals as a scatter plot to compare with the
#' continuous Cox model...
# Pull coefficients from model with missing data (fitted and bootstrapped by a
# separate script, saved as CSV)
caliber.missing.coeffs <- read.csv(caliber.missing.coefficients.filename)

# Rename surv.boot.ests ready for merging
names(surv.boot.ests) <-
  c('cox_discrete_value', 'cox_discrete_lower', 'cox_discrete_upper')
surv.boot.ests$quantity.level <- rownames(surv.boot.ests)
# Convert variablemissing to variable_missingTRUE for compatibility with the
# naming convention used in the CALIBER coefficients file
vars.with.missing <- endsWith(surv.boot.ests$quantity.level, 'missing')
surv.boot.ests$quantity.level[vars.with.missing] <-
  paste0(
    substr(
      surv.boot.ests$quantity.level[vars.with.missing],
      1,
      nchar(surv.boot.ests$quantity.level[vars.with.missing]) - nchar('missing')
    ),
    '_missingTRUE'
  )

# Create a data frame comparing them (merge joins on all shared column names,
# which here includes quantity.level)
compare.coefficients <- merge(caliber.missing.coeffs, surv.boot.ests)

# Scatter plot comparing the two sets of coefficients: the y = x line marks
# perfect agreement; error bars show bootstrap uncertainty in both dimensions
ggplot(
  compare.coefficients,
  aes(x = our_value, y = cox_discrete_value, colour = unit == 'missing')
  ) +
  geom_abline(intercept = 0, slope = 1) +
  geom_hline(yintercept = 1, colour = 'grey') +
  geom_vline(xintercept = 1, colour = 'grey') +
  geom_point() +
  geom_errorbar(aes(ymin = cox_discrete_lower, ymax = cox_discrete_upper)) +
  geom_errorbarh(aes(xmin = our_lower, xmax = our_upper)) +
  geom_text_repel(aes(label = long_name)) +
  theme_classic(base_size = 8)

# Unpack variable and level names
cph.coeffs <- cphCoeffs(
  bootStats(surv.model.fit.boot, uncertainty = '95ci', transform = `-`),
  COHORT.optimised, surv.predict, model.type = 'boot.survreg'
)

# We'll need the CALIBER scaling functions for plotting
source('../cox-ph/caliber-scale.R')

# set up list to store the plots
cox.discrete.plots <- list()
# Add dummy columns for x-position of missing values
cph.coeffs$missing.x.pos.cont <- NA
cph.coeffs$missing.x.pos.disc <- NA

for(variable in unique(cph.coeffs$var)) {
  # If it's a continuous variable, get the real centres of the bins
  if(variable %in% process.settings$var) {
    process.i <- which(variable == process.settings$var)

    if(process.settings$method[[process.i]] == 'binByQuantile') {

      variable.quantiles <-
        getQuantiles(
          COHORT.use[, variable],
          process.settings$settings[[process.i]]
        )
      # For those rows which relate to this variable, and whose level isn't
      # missing, put in the appropriate quantile boundaries for plotting
      cph.coeffs$bin.min[cph.coeffs$var == variable &
                           cph.coeffs$level != 'missing'] <-
        variable.quantiles[1:(length(variable.quantiles) - 1)]
      cph.coeffs$bin.max[cph.coeffs$var == variable &
                           cph.coeffs$level != 'missing'] <-
        variable.quantiles[2:length(variable.quantiles)]
      # Make the final bin the 99th percentile
      cph.coeffs$bin.max[cph.coeffs$var == variable &
                           cph.coeffs$level != 'missing'][
                             length(variable.quantiles) - 1] <-
        quantile(COHORT.use[, variable], 0.99, na.rm = TRUE)

      # Add a fake data point at the highest value to finish the graph
      # (duplicates the last non-missing row so geom_step draws a final tread)
      cph.coeffs <-
        rbind(
          cph.coeffs,
          cph.coeffs[cph.coeffs$var == variable &
                       cph.coeffs$level != 'missing', ][
                         length(variable.quantiles) - 1, ]
        )
      # Change it so that bin.min is bin.max from the old one
      cph.coeffs$bin.min[nrow(cph.coeffs)] <-
        cph.coeffs$bin.max[cph.coeffs$var == variable &
                             cph.coeffs$level != 'missing'][
                               length(variable.quantiles) - 1]

      # Work out data range by taking the 1st and 99th percentiles
      # Use the max to provide a max value for the final bin
      # Also use for x-axis limits, unless there are missing values to
      # accommodate on the right-hand edge.
      x.data.range <-
        quantile(COHORT.use[, variable], c(0.01, 0.99), na.rm = TRUE)
      x.axis.limits <- x.data.range

      # Finally, we need to scale this such that the baseline value is equal
      # to the value for the equivalent place in the Cox model, to make the
      # risks comparable...

      # First, we need to find the average value of this variable in the
      # lowest bin (which is always the baseline here)
      baseline.bin <- variable.quantiles[1:2]
      baseline.bin.avg <-
        mean(
          # Take only those values of the variable which are in the range
          COHORT.use[
            inRange(COHORT.use[, variable], baseline.bin, na.false = TRUE),
            variable
          ]
        )
      # Then, scale it with the caliber scaling
      baseline.bin.val <-
        caliberScaleUnits(baseline.bin.avg, variable) *
        caliber.missing.coeffs$our_value[
          caliber.missing.coeffs$quantity == variable
        ]

      # And now, add all the discretised values to that value to make them
      # comparable...
      cph.coeffs[cph.coeffs$var == variable, c('val', 'lower', 'upper')] <-
        cph.coeffs[cph.coeffs$var == variable, c('val', 'lower', 'upper')] -
        baseline.bin.val

      # Now, plot this variable as a stepped line plot using those quantile
      # boundaries; grey steps are the bootstrap confidence bounds
      cox.discrete.plot <-
        ggplot(
          subset(cph.coeffs, var == variable),
          aes(x = bin.min, y = val)
        ) +
        geom_step() +
        geom_step(aes(y = lower), colour = 'grey') +
        geom_step(aes(y = upper), colour = 'grey') +
        ggtitle(variable)

      # If there's a missing value risk, add it
      if(any(cph.coeffs$var == variable & cph.coeffs$level == 'missing')) {
        # Expand the x-axis to squeeze the missing values in
        x.axis.limits[2] <-
          x.axis.limits[2] + diff(x.data.range) * missing.padding
        # Put this missing value a third of the way into the missing area
        cph.coeffs$missing.x.pos.disc[
          cph.coeffs$var == variable &
            cph.coeffs$level == 'missing'] <-
          x.axis.limits[2] + diff(x.data.range) * missing.padding / 3

        # Add the point to the graph (we'll set axis limits later)
        cox.discrete.plot <-
          cox.discrete.plot +
          geom_pointrange(
            data = cph.coeffs[cph.coeffs$var == variable &
                                cph.coeffs$level == 'missing', ],
            aes(
              x = missing.x.pos.disc,
              y = val, ymin = lower,
              ymax = upper
            ),
            colour = 'red'
          )
      }

      # Now, let's add the line from the continuous Cox model. We only need
      # two points because the lines are straight!
      continuous.cox <-
        data.frame(
          var.x.values = x.data.range
        )
      # Scale the x-values
      continuous.cox$var.x.scaled <-
        caliberScaleUnits(continuous.cox$var.x.values, variable)
      # Use the risks to calculate risk per x for central estimate and errors
      # (negated to match the sign convention used for surv.boot.ests above)
      continuous.cox$y <-
        -caliber.missing.coeffs$our_value[
          caliber.missing.coeffs$quantity == variable
        ] * continuous.cox$var.x.scaled
      continuous.cox$upper <-
        -caliber.missing.coeffs$our_upper[
          caliber.missing.coeffs$quantity == variable
        ] * continuous.cox$var.x.scaled
      continuous.cox$lower <-
        -caliber.missing.coeffs$our_lower[
          caliber.missing.coeffs$quantity == variable
        ] * continuous.cox$var.x.scaled

      cox.discrete.plot <-
        cox.discrete.plot +
        geom_line(
          data = continuous.cox,
          aes(x = var.x.values, y = y),
          colour = 'blue'
        ) +
        geom_line(
          data = continuous.cox,
          aes(x = var.x.values, y = upper),
          colour = 'lightblue'
        ) +
        geom_line(
          data = continuous.cox,
          aes(x = var.x.values, y = lower),
          colour = 'lightblue'
        )

      # If there is one, add missing value risk from the continuous model
      if(any(caliber.missing.coeffs$quantity == paste0(variable, '_missing') &
             caliber.missing.coeffs$unit == 'missing')) {
        # Expand the x-axis to squeeze the missing values in
        x.axis.limits[2] <-
          x.axis.limits[2] + diff(x.data.range) * missing.padding
        # Put this missing value 2/3rds of the way into the missing area, so
        # it doesn't overlap the discrete model's missing point at 1/3.
        # BUG FIX: the original assigned the 1/3-of-the-way expression (a
        # copy-paste of the discrete case above, contradicting its own
        # comment) and left the intended 2/3 expression on the next line as a
        # dangling statement whose value was silently discarded.
        # NOTE(review): both missing positions are offsets from the already-
        # expanded x.axis.limits[2]; whether they land inside the final
        # coord_cartesian limits depends on missing.padding (defined
        # elsewhere) -- confirm points aren't clipped.
        cph.coeffs$missing.x.pos.cont[
          cph.coeffs$var == variable &
            cph.coeffs$level == 'missing'] <-
          x.axis.limits[2] + 2 * diff(x.data.range) * missing.padding / 3

        cox.discrete.plot <-
          cox.discrete.plot +
          geom_pointrange(
            data = cph.coeffs[
              cph.coeffs$var == variable &
                cph.coeffs$level == 'missing',
            ],
            aes(
              x = missing.x.pos.cont,
              y = val, ymin = lower, ymax = upper
            ),
            colour = 'blue'
          )
      }

      # Finally, set the x-axis limits; will just be the data range, or data
      # range plus a bit if there are missing values to squeeze in
      cox.discrete.plot <-
        cox.discrete.plot +
        coord_cartesian(xlim = x.axis.limits)

      cox.discrete.plots[[variable]] <- cox.discrete.plot
    }
  }
}

print(cox.discrete.plots)