--- a +++ b/age-only/age-only.R @@ -0,0 +1,135 @@ +#+ knitr_setup, include = FALSE + +# Whether to cache the intensive code sections. Set to FALSE to recalculate +# everything afresh. +cacheoption <- FALSE +# Disable lazy caching globally, because it fails for large objects, and all the +# objects we wish to cache are large... +opts_chunk$set(cache.lazy = FALSE) + +#' # Performance of an age-only model +#' +#+ setup, message=FALSE + +bootstraps <- 200 + +data.filename <- '../../data/cohort-sanitised.csv' +n.threads <- 16 + +source('../lib/shared.R') +require(rms) + +COHORT.full <- data.frame(fread(data.filename)) + +COHORT.use <- subset(COHORT.full, !exclude) + +n.data <- nrow(COHORT.use) + +# Define indices of test set +test.set <- testSetIndices(COHORT.use, random.seed = 78361) + +COHORT.use <- prepSurvCol(COHORT.use, surv.time, surv.event,surv.event.yes) + +#' ## Concordance index +#' +#' The c-index can be calculated taking age itself as a risk score. Since it's +#' purely rank-based, it doesn't matter that age is nonlinearly related to true +#' risk. +#' +#' We bootstrap this based purely on the test set to make the bootstrap +#' variability commensurate with other models tested on the test set. You can +#' imagine that this model was 'trained' on the training set, even though it's +#' so simple that we actually did no such thing... +#' +#+ c_index + +# Define a trivial function used for bootstrapping which simply returns the +# c-index of a 'model' based purely on age. + +ageOnly <- function(df, indices, df.test) { + # Create a Kaplan-Meier curve from the bootstrap sample + km.by.age <- + survfit( + Surv(surv_time, surv_event) ~ age, + data = df[indices, ], + conf.type = "log-log" + ) + + # Return the calibration score + c( + calibration.score = + calibrationScore( + calibrationTable(km.by.age, df.test) + )$area + ) +} + +# C-index is by definition non-variable because we do not bootstrap or otherwise +# permute the test set, and there's no 'training' phase +age.c.index <- + as.numeric( + survConcordance( + Surv(surv_time, surv_event) ~ age, + COHORT.use[test.set,] + )$concordance + ) + +# Bootstrap to establish variability of calibration +age.only.boot <- + boot( + data = COHORT.use[-test.set,], + statistic = ageOnly, + R = bootstraps, + parallel = 'multicore', + ncpus = n.threads, + df.test = COHORT.use[test.set,] + ) + +age.only.boot.stats <- bootStats(age.only.boot, uncertainty = '95ci') + +#' C-index is +#' **`r round(age.c.index, 3)`** +#' on the held-out test set (not that it really matters, the model isn't +#' 'trained' as such for the discrimination test...it's just oldest patient dies +#' first). +#' +#' +#' ## Calibration +#' +#' +#+ calibration + +km.by.age <- + survfit( + Surv(surv_time, surv_event) ~ age, + data = COHORT.use[-test.set,], + conf.type = "log-log" + ) + +calibration.table <- calibrationTable(km.by.age, COHORT.use[test.set, ]) + +print(calibrationScore(calibration.table)) + +calibrationPlot(calibration.table) + +#' Calibration score is +#' **`r round(age.only.boot.stats['calibration.score', 'val'], 3)` +#' (`r round(age.only.boot.stats['calibration.score', 'lower'], 3)` - +#' `r round(age.only.boot.stats['calibration.score', 'upper'], 3)`)** +#' on the held-out test set. + +varsToTable( + data.frame( + model = 'age', + imputation = FALSE, + discretised = FALSE, + c.index = age.c.index, + c.index.lower = NA, + c.index.upper = NA, + calibration.score = age.only.boot.stats['calibration.score', 'val'], + calibration.score.lower = age.only.boot.stats['calibration.score', 'lower'], + calibration.score.upper = age.only.boot.stats['calibration.score', 'upper'] + ), + performance.file, + index.cols = c('model', 'imputation', 'discretised') +)