--- a
+++ b/cox-ph/cox-discretised-imputed.R
@@ -0,0 +1,221 @@
+#+ knitr_setup, include = FALSE
+
+# Whether to cache the intensive code sections. Set to FALSE to recalculate
+# everything afresh.
+cacheoption <- FALSE
+# Disable lazy caching globally, because it fails for large objects, and all
+# the objects we wish to cache are large...
+opts_chunk$set(cache.lazy = FALSE)
+
+#' # Discrete Cox model with imputed data
+#'
+#' The discrete Cox model performs very similarly to the normal Cox model, even
+#' without performing imputation first. Let's try it with imputation, and see
+#' whether that boosts performance.
+#'
+#' We're going to use the same discretisation parameters as were found for the
+#' data with missing values. On the one hand, this is slightly unfair, because
+#' those values might be suboptimal now that the data are imputed; on the
+#' other, it allows a direct comparison. If we cross-validated again, we would
+#' very likely find different parameters (there are far more combinations than
+#' can plausibly be tried), which could make performance better or worse purely
+#' by chance.
+
+# Calibration from cross-validation performed on the data before imputation
+calibration.filename <- '../../output/survreg-crossvalidation-try4.csv'
+# The first part of the filename for any output
+output.filename.base <- '../../output/survreg-discrete-imputed-try1'
+imputed.data.filename <- '../../data/COHORT_complete.rds'
+boot.filename <- paste0(output.filename.base, '-boot.rds')
+
+n.threads <- 20
+bootstraps <- 50
+
+model.type <- 'survreg'
+
+#' ## Discretise data
+#'
+#' First, let's load the results from the calibrations and find the parameters
+#' for discretisation.
+
+source('../lib/shared.R')
+
+continuous.vars <-
+  c(
+    'age', 'total_chol_6mo', 'hdl_6mo', 'pulse_6mo', 'crea_6mo',
+    'total_wbc_6mo', 'haemoglobin_6mo'
+  )
+
+# Read the file containing the calibration results
+cv.performance <- read.csv(calibration.filename)
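+
+#' To make the discretisation concrete: the `'binByQuantile'` method used below
+#' bins each continuous variable at evenly spaced quantiles. The real
+#' implementation is provided by the shared library via `prepData`; the
+#' hypothetical `binByQuantileSketch` below is just a minimal base-R sketch of
+#' the idea, and may differ from the real thing in details such as tie and
+#' missing-value handling.
+
+# Illustration only: bin a continuous variable at evenly spaced quantiles
+binByQuantileSketch <- function(x, probs) {
+  # Quantile breaks; unique() guards against duplicated breaks when many
+  # values are tied
+  breaks <- unique(quantile(x, probs = probs, na.rm = TRUE))
+  # Return a factor with one level per bin
+  cut(x, breaks = breaks, include.lowest = TRUE)
+}
+# e.g. binByQuantileSketch(x, seq(0, 1, length.out = 5)) gives (up to) four
+# roughly equally populated bins
+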
+# Find the best calibration...
+# First, average performance across cross-validation folds
+cv.performance.average <-
+  aggregate(
+    c.index.val ~ calibration,
+    data = cv.performance,
+    mean
+  )
+# Find the calibration with the highest average validation c-index
+best.calibration <-
+  cv.performance.average$calibration[
+    which.max(cv.performance.average$c.index.val)
+  ]
+# And finally, find the first row of that calibration to get the n.bins values
+best.calibration.row1 <-
+  min(which(cv.performance$calibration == best.calibration))
+
+# Get its parameters
+n.bins <-
+  t(
+    cv.performance[best.calibration.row1, continuous.vars]
+  )
+
+# Reset process settings with the base settings
+process.settings <-
+  list(
+    var = c('anonpatid', 'time_death', 'imd_score', 'exclude'),
+    method = c(NA, NA, NA, NA),
+    settings = list(NA, NA, NA, NA)
+  )
+for(j in 1:length(continuous.vars)) {
+  process.settings$var <- c(process.settings$var, continuous.vars[j])
+  process.settings$method <- c(process.settings$method, 'binByQuantile')
+  process.settings$settings <-
+    c(
+      process.settings$settings,
+      list(
+        seq(
+          # Quantiles are, by definition, between 0 and 1
+          0, 1,
+          # Number of bins taken from the best calibration found above
+          length.out = n.bins[j]
+        )
+      )
+    )
+}
+
+#' ## Load imputed data
+#'
+#' Load the data and prepare it with the settings above.
+
+# Load the data from its RDS file
+imputed.data <- readRDS(imputed.data.filename)
+
+# Remove rows with a death time of 0 to avoid fitting errors, and get the
+# survival columns ready
+for(i in 1:length(imputed.data)) {
+  imputed.data[[i]] <- imputed.data[[i]][imputed.data[[i]][, surv.time] > 0, ]
+  # Put in a fake exclude column for the next function (exclusions have already
+  # been removed from the imputed dataset)
+  imputed.data[[i]]$exclude <- FALSE
+  imputed.data[[i]]$imd_score <- as.numeric(imputed.data[[i]]$imd_score)
+  imputed.data[[i]] <-
+    prepData(
+      imputed.data[[i]],
+      cols.keep,
+      process.settings,
+      'time_death', 'endpoint_death', 1,
+      extra.fun = caliberExtraPrep
+    )
+}
+
+# Define the test set
+test.set <- testSetIndices(imputed.data[[1]], random.seed = 78361)
+
+# Do a quick and dirty fit on a single imputed dataset, to draw a calibration
+# curve from later
+time.start <- handyTimer()
+surv.model.fit <-
+  survivalFit(
+    surv.predict,
+    imputed.data[[1]][-test.set, ], # Training set from the first imputation
+    model.type = model.type,
+    n.threads = n.threads
+  )
+time.fit <- handyTimer(time.start)
+
+# Perform bootstrap fitting for every multiply imputed dataset
+surv.fit.boot <- list()
+time.start <- handyTimer()
+for(i in 1:length(imputed.data)) {
+  surv.fit.boot[[i]] <-
+    survivalBootstrap(
+      surv.predict,
+      imputed.data[[i]][-test.set, ],
+      imputed.data[[i]][test.set, ],
+      model.type = model.type,
+      bootstraps = bootstraps,
+      n.threads = n.threads
+    )
+}
+time.boot <- handyTimer(time.start)
+
+# Save the fits, because it might've taken a while!
+saveRDS(surv.fit.boot, boot.filename)
+
+#' The single model was fitted in `r round(time.fit)` seconds, and `r bootstraps`
+#' bootstrap fits were performed on each of `r length(imputed.data)` imputed
+#' datasets in `r round(time.boot)` seconds.
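+
+#' Before looking at the results, the bootstrap estimates need to be combined
+#' across the imputed datasets, which is what `bootMIStats` (from the shared
+#' library) does below. As a rough sketch of the kind of pooling involved, one
+#' could simply merge the bootstrap replicates from all imputations and
+#' summarise them with a median and a 95% percentile interval. The sketch is an
+#' illustration only: it assumes each bootstrap object stores its replicates
+#' for a statistic as a numeric vector, and the real function may well combine
+#' them differently (e.g. something closer to Rubin's rules).
+
+# Illustration only: pool one statistic across a list of bootstrap results,
+# assuming boot.list[[i]][[stat]] is a numeric vector of replicates
+poolBootMISketch <- function(boot.list, stat) {
+  replicates <- unlist(lapply(boot.list, function(b) b[[stat]]))
+  c(
+    val   = median(replicates),
+    lower = unname(quantile(replicates, 0.025)),
+    upper = unname(quantile(replicates, 0.975))
+  )
+}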
+
+# Unpack the uncertainties from the bootstrapped data
+surv.fit.boot.ests <- bootMIStats(surv.fit.boot)
+
+# Save the bootstrapped performance values
+varsToTable(
+  data.frame(
+    model = 'cox',
+    imputation = TRUE,
+    discretised = TRUE,
+    c.index = surv.fit.boot.ests['c.test', 'val'],
+    c.index.lower = surv.fit.boot.ests['c.test', 'lower'],
+    c.index.upper = surv.fit.boot.ests['c.test', 'upper'],
+    calibration.score = surv.fit.boot.ests['calibration.score', 'val'],
+    calibration.score.lower = surv.fit.boot.ests['calibration.score', 'lower'],
+    calibration.score.upper = surv.fit.boot.ests['calibration.score', 'upper']
+  ),
+  performance.file,
+  index.cols = c('model', 'imputation', 'discretised')
+)
+
+#' # Results
+#'
+#' ## Performance
+#'
+#' Having fitted the Cox model, how did we do? The c-indices were calculated as
+#' part of the bootstrapping, so we just need to take a look at those...
+#'
+#' C-indices are **`r round(surv.fit.boot.ests['c.train', 'val'], 3)`
+#' (`r round(surv.fit.boot.ests['c.train', 'lower'], 3)` -
+#' `r round(surv.fit.boot.ests['c.train', 'upper'], 3)`)** on the training set and
+#' **`r round(surv.fit.boot.ests['c.test', 'val'], 3)`
+#' (`r round(surv.fit.boot.ests['c.test', 'lower'], 3)` -
+#' `r round(surv.fit.boot.ests['c.test', 'upper'], 3)`)** on the test set.
+#'
+#' ### Calibration
+#'
+#' The bootstrapped calibration score is
+#' **`r round(surv.fit.boot.ests['calibration.score', 'val'], 3)`
+#' (`r round(surv.fit.boot.ests['calibration.score', 'lower'], 3)` -
+#' `r round(surv.fit.boot.ests['calibration.score', 'upper'], 3)`)**.
+#'
+#' Let's draw a representative curve from the unbootstrapped fit... (It would be
+#' better to draw all the curves from the bootstrapped fits to get an idea of
+#' the variability, but I've not implemented this yet.)
+#'
+#+ calibration_plot
+
+calibration.table <-
+  calibrationTable(surv.model.fit, imputed.data[[1]][test.set, ])
+
+calibration.score <- calibrationScore(calibration.table)
+
+calibrationPlot(calibration.table)
+
+#' The area between the calibration curve and the diagonal is
+#' **`r round(calibration.score[['area']], 3)`** +/-
+#' **`r round(calibration.score[['se']], 3)`**.
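+
+#' For intuition, the calibration 'area' reported above can be thought of as a
+#' trapezium-rule integral of the absolute difference between observed and
+#' predicted risk along the calibration curve. The sketch below illustrates
+#' that idea only; it is not necessarily the exact formula used by
+#' `calibrationScore`, and it assumes the calibration table has columns named
+#' `risk.pred` and `risk.obs` (hypothetical names).
+
+# Illustration only: area between the calibration curve and the diagonal by
+# the trapezium rule, assuming columns risk.pred (x) and risk.obs (y)
+calibrationAreaSketch <- function(tbl) {
+  tbl <- tbl[order(tbl$risk.pred), ]
+  # Vertical distance from the diagonal at each point on the curve
+  dy <- abs(tbl$risk.obs - tbl$risk.pred)
+  # Widths of the intervals between successive predicted risks
+  dx <- diff(tbl$risk.pred)
+  # Trapezium rule: average the distances at each end of an interval, weight
+  # by the interval width, and sum
+  sum(dx * (head(dy, -1) + tail(dy, -1)) / 2)
+}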