MLehealth / Git / [0375db] /age-only/age-only.R

Models:
philipB/
MLehealth
Downloads: 1
[0375db]: / age-only / age-only.R
History
Download this file
136 lines (111 with data), 3.7 kB

#+ knitr_setup, include = FALSE

# Whether to cache the intensive code sections. Set to FALSE to recalculate
# everything afresh.
cacheoption <- FALSE
# Disable lazy caching globally, because it fails for large objects, and all the
# objects we wish to cache are large...
opts_chunk$set(cache.lazy = FALSE)

#' # Performance of an age-only model
#' 
#+ setup, message=FALSE

bootstraps <- 200

data.filename <- '../../data/cohort-sanitised.csv'
n.threads <- 16

source('../lib/shared.R')
require(rms)

COHORT.full <- data.frame(fread(data.filename))

COHORT.use <- subset(COHORT.full, !exclude)

n.data <- nrow(COHORT.use)

# Define indices of test set
test.set <- testSetIndices(COHORT.use, random.seed = 78361)

COHORT.use <- prepSurvCol(COHORT.use, surv.time, surv.event,surv.event.yes)

#' ## Concordance index
#' 
#' The c-index can be calculated taking age itself as a risk score. Since it's
#' purely rank-based, it doesn't matter that age is nonlinearly related to true
#' risk.
#' 
#' We bootstrap this based purely on the test set to make the bootstrap
#' variability commensurate with other models tested on the test set. You can
#' imagine that this model was 'trained' on the training set, even though it's
#' so simple that we actually did no such thing...
#' 
#+ c_index

# Define a trivial function used for bootstrapping which simply returns the
# c-index of a 'model' based purely on age.

ageOnly <- function(df, indices, df.test) {
  # Create a Kaplan-Meier curve from the bootstrap sample
  km.by.age <-
    survfit(
      Surv(surv_time, surv_event) ~ age,
      data = df[indices, ],
      conf.type = "log-log"
    )

  # Return the calibration score
  c(
    calibration.score =
      calibrationScore(
        calibrationTable(km.by.age, df.test)
      )$area
  )
}

# C-index is by definition non-variable because we do not bootstrap or otherwise
# permute the test set, and there's no 'training' phase
age.c.index <- 
  as.numeric(
    survConcordance(
      Surv(surv_time, surv_event) ~ age,
      COHORT.use[test.set,]
    )$concordance
  )

# Bootstrap to establish variability of calibration
age.only.boot <-
  boot(
    data = COHORT.use[-test.set,],
    statistic = ageOnly,
    R = bootstraps,
    parallel = 'multicore',
    ncpus = n.threads,
    df.test =  COHORT.use[test.set,]
  )

age.only.boot.stats <- bootStats(age.only.boot, uncertainty = '95ci')

#' C-index is
#' **`r round(age.c.index, 3)`** 
#' on the held-out test set (not that it really matters, the model isn't
#' 'trained' as such for the discrimination test...it's just oldest patient dies
#' first).
#' 
#' 
#' ## Calibration
#' 
#' 
#+ calibration

km.by.age <-
  survfit(
    Surv(surv_time, surv_event) ~ age,
    data = COHORT.use[-test.set,],
    conf.type = "log-log"
  )

calibration.table <- calibrationTable(km.by.age, COHORT.use[test.set, ])

print(calibrationScore(calibration.table))

calibrationPlot(calibration.table)

#' Calibration score is
#' **`r round(age.only.boot.stats['calibration.score', 'val'], 3)`
#' (`r round(age.only.boot.stats['calibration.score', 'lower'], 3)` -
#' `r round(age.only.boot.stats['calibration.score', 'upper'], 3)`)** 
#' on the held-out test set.

varsToTable(
  data.frame(
    model = 'age',
    imputation = FALSE,
    discretised = FALSE,
    c.index = age.c.index,
    c.index.lower = NA,
    c.index.upper = NA,
    calibration.score = age.only.boot.stats['calibration.score', 'val'],
    calibration.score.lower = age.only.boot.stats['calibration.score', 'lower'],
    calibration.score.upper = age.only.boot.stats['calibration.score', 'upper']
  ),
  performance.file,
  index.cols = c('model', 'imputation', 'discretised')
)