[0375db]: random-forest/rfsrc-cv.R

#+ knitr_setup, include = FALSE
library(knitr)
# Whether to cache the intensive code sections. Set to FALSE to recalculate
# everything afresh.
cacheoption <- FALSE
# Disable lazy caching globally, because it fails for large objects, and all the
# objects we wish to cache are large...
opts_chunk$set(cache.lazy = FALSE)
#' # Cross-validating discretisation of input variables in a survival model
#'
#' Unlike previous attempts at cross-validation, this uses between 10 and 20
#' bins, rather than between 2 and 20, in an attempt to avoid throwing away
#' data.
output.filename.base <- '../../output/rfsrc-cv-nsplit-try3'
data.filename <- '../../data/cohort-sanitised.csv'
# If surv.predict is defined as a character vector here, the model only uses
# the variables specified, eg c('age') would build a model purely based on age.
# If not specified (ie commented out), it will use the defaults.
# surv.predict <- c('age')
#' ## Do the cross-validation
#'
#' The steps for this are common regardless of model type, so run the script to
#' get a cross-validated model to further analyse...
#+ rf_discretised_cv, cache=cacheoption
source('../lib/rfsrc-cv-nsplit-bootstrap.R', chdir = TRUE)
# Save the resulting 'final' model
saveRDS(surv.model.fit, paste0(output.filename.base, '-final-model.rds'))
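#'
#' The sourced script is not reproduced here, but the shape of the search is
#' roughly as follows. This is a minimal sketch, not the script itself:
#' discretise(), and the surv_time and surv_event column names, are
#' illustrative assumptions.
#+ rf_cv_sketch, eval = FALSE
library(randomForestSRC)
# Candidate numbers of bins to discretise the continuous variables into
n.bins <- 10:20
cv.c.index <- sapply(
  n.bins,
  function(bins) {
    # Discretise the continuous variables (discretise() is a placeholder for
    # the project's own helper)
    COHORT.binned <- discretise(COHORT.prep, bins)
    # Fit a random survival forest on the training set; rfsrc reports
    # out-of-bag error as it grows trees
    fit <- rfsrc(
      Surv(surv_time, surv_event) ~ ., COHORT.binned[-test.set, ],
      na.action = 'na.impute'
    )
    # For survival forests, 1 - final OOB error rate is a C-index
    1 - tail(fit$err.rate, 1)
  }
)
# Keep the number of bins with the best out-of-bag discrimination
n.bins[which.max(cv.c.index)]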
#' # Results
#'
#' ## Performance
#'
#' ### Discrimination
#'
#' C-indices are **`r round(surv.model.fit.coeffs['c.train', 'val'], 3)` +/-
#' `r round(surv.model.fit.coeffs['c.train', 'err'], 3)`** on the training set and
#' **`r round(surv.model.fit.coeffs['c.test', 'val'], 3)` +/-
#' `r round(surv.model.fit.coeffs['c.test', 'err'], 3)`** on the held-out test set.
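#'
#' (The C-indices above come from the fitting script; an equivalent
#' concordance can be computed directly with the survival package. A minimal
#' sketch, assuming a vector of predicted risks for the test set, here called
#' risk.predicted, and assumed surv_time/surv_event outcome columns:)
#+ c_index_sketch, eval = FALSE
library(survival)
# reverse = TRUE because a higher predicted risk should mean shorter survival
concordance(
  Surv(surv_time, surv_event) ~ risk.predicted,
  data = COHORT.prep[test.set, ],
  reverse = TRUE
)$concordance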
#'
#' ### Calibration
#'
#' Does the model predict realistic probabilities of an event?
#'
#+ calibration_plot
calibration.table <-
  calibrationTable(
    # Standard calibration options
    surv.model.fit, COHORT.prep[test.set, ],
    # Always need to specify NA imputation for rfsrc
    na.action = 'na.impute'
  )
calibration.score <- calibrationScore(calibration.table)
calibrationPlot(calibration.table, show.censored = TRUE, max.points = 10000)
# Save the calibration data for later plotting
write.csv(
  calibration.table, paste0(output.filename.base, '-calibration-table.csv')
)
#' The area between the calibration curve and the diagonal is
#' **`r round(calibration.score['area'], 3)`** +/-
#' **`r round(calibration.score['se'], 3)`**.
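#'
#' (calibrationScore is one of the project's helpers. Conceptually the
#' statistic is the area between the calibration curve and the y = x diagonal;
#' a minimal sketch of that integral, assuming the calibration table holds
#' predicted risks and observed event fractions in columns named risk and
#' observed:)
#+ calibration_area_sketch, eval = FALSE
calibration.sorted <- calibration.table[order(calibration.table$risk), ]
# Trapezium-rule integral of |observed - predicted| over predicted risk
with(
  calibration.sorted,
  sum(
    diff(risk) *
      (head(abs(observed - risk), -1) + tail(abs(observed - risk), -1)) / 2
  )
)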
#'
#' ## Model fit
#'
#+ resulting_fit
print(surv.model.fit)
#' ## Variable importance
# First, load data from the Cox modelling for comparison
cox.var.imp <- read.csv(comparison.filename)
# Then, get the variable importance from the model just fitted, normalised by
# the largest importance value
var.imp <-
  data.frame(
    var.imp = importance(surv.model.fit) / max(importance(surv.model.fit))
  )
var.imp$quantity <- rownames(var.imp)
var.imp <- merge(var.imp, cox.var.imp)
# Save the results as a CSV
write.csv(var.imp, paste0(output.filename.base, '-var-imp.csv'))
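#'
#' (importance() here relies on the sourced helper scripts; permutation
#' importance for an rfsrc fit can also be obtained directly with
#' randomForestSRC's vimp(). A minimal sketch:)
#+ vimp_sketch, eval = FALSE
library(randomForestSRC)
# Permutation variable importance; $importance is a named numeric vector
rf.vimp <- vimp(surv.model.fit)$importance
# Normalise by the largest value, as above
rf.vimp / max(rf.vimp)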
#' ## Variable importance vs Cox model replication variable importance
print(
  ggplot(var.imp, aes(x = our_range, y = var.imp)) +
    geom_point() +
    geom_text_repel(aes(label = quantity)) +
    # Log both axes: the Cox model's coefficient ranges for linearity, the
    # importance to shrink its range!
    scale_x_log10() +
    scale_y_log10()
)
print(cor(var.imp[, c('var.imp', 'our_range')], method = 'spearman'))
#' ## Variable effects
#'
#+ variable_effects
risk.by.variables <- data.frame()
for (variable in continuous.vars) {
  # Create a partial effect table for this variable
  risk.by.variable <-
    partialEffectTable(
      surv.model.fit, COHORT.prep[-test.set, ], variable,
      na.action = 'na.impute'
    )
  # Get the mean of the normalised risk for every value of the variable
  risk.aggregated <-
    aggregate(
      as.formula(paste0('risk.normalised ~ ', variable)),
      risk.by.variable, mean
    )
  # Work out the limits on the x-axis by taking the 1st and 99th percentiles
  x.axis.limits <-
    quantile(COHORT.full[, variable], c(0.01, 0.99), na.rm = TRUE)
  print(
    ggplot(risk.by.variable, aes_string(x = variable, y = 'risk.normalised')) +
      geom_line(alpha = 0.01, aes(group = id)) +
      geom_line(data = risk.aggregated, colour = 'blue') +
      coord_cartesian(xlim = x.axis.limits)
  )
  # Slight kludge... rename the column which above is given the variable name
  # to just 'val', to allow rbinding. This must happen only after the
  # aggregation and plotting above, which need the column under its original
  # name.
  names(risk.by.variable)[2] <- 'val'
  # Append a column with the variable's name so we can distinguish it in a
  # long data frame
  risk.by.variable$var <- variable
  # Append it to our ongoing big data frame
  risk.by.variables <- rbind(risk.by.variables, risk.by.variable)
  # Save the risks as we go
  write.csv(risk.by.variables, paste0(output.filename.base, '-var-effects.csv'))
}
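#'
#' (partialEffectTable is a project helper; randomForestSRC also ships its own
#' partial-dependence machinery, which can serve as a cross-check. A minimal
#' sketch for a single variable, eg age:)
#+ plot_variable_sketch, eval = FALSE
library(randomForestSRC)
# Built-in partial plot of predicted mortality against one variable
plot.variable(
  surv.model.fit, xvar.names = 'age', partial = TRUE, na.action = 'na.impute'
)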