Diff of /random-forest/rfsrc-cv.R [000000] .. [0375db]

Switch to side-by-side view

--- a
+++ b/random-forest/rfsrc-cv.R
@@ -0,0 +1,149 @@
+#+ knitr_setup, include = FALSE
+
+# Whether to cache the intensive code sections. Set to FALSE to recalculate
+# everything afresh.
+cacheoption <- FALSE
+# Disable lazy caching globally, because it fails for large objects, and all the
+# objects we wish to cache are large...
+opts_chunk$set(cache.lazy = FALSE) # knitr must already be attached when spun
+
+#' # Cross-validating discretisation of input variables in a survival model
+#' 
+#' In contrast to previous attempts at cross-validation, this uses between 10
+#' and 20 bins, not between 2 and 20, in an attempt to avoid throwing away data.
+
+output.filename.base <- '../../output/rfsrc-cv-nsplit-try3' # prefix for every output file below
+data.filename <- '../../data/cohort-sanitised.csv'
+
+# If surv.vars is defined as a character vector here, the model only uses those
+# variables specified, eg c('age') would build a model purely based on age. If
+# not specified (ie commented out), it will use the defaults.
+# surv.predict <- c('age')
+
+#' ## Do the cross-validation
+#' 
+#' The steps for this are common regardless of model type, so run the script to
+#' get a cross-validated model to further analyse...
+#+ rf_discretised_cv, cache=cacheoption
+
+source('../lib/rfsrc-cv-nsplit-bootstrap.R', chdir = TRUE) # NOTE(review): expected to define surv.model.fit, surv.model.fit.coeffs, COHORT.prep, COHORT.full, test.set, continuous.vars and comparison.filename, all used below -- confirm
+
+# Save the resulting 'final' model
+saveRDS(surv.model.fit, paste0(output.filename.base, '-final-model.rds'))
+
+#' # Results
+#' 
+#' 
+#' ## Performance
+#' 
+#' ### Discrimination
+#' 
+#' C-indices are **`r round(surv.model.fit.coeffs['c.train', 'val'], 3)` +/-
+#' `r round(surv.model.fit.coeffs['c.train', 'err'], 3)`** on the training set and
+#' **`r round(surv.model.fit.coeffs['c.test', 'val'], 3)` +/-
+#' `r round(surv.model.fit.coeffs['c.test', 'err'], 3)`** on the held-out test set.
+#' 
+#' ### Calibration
+#' 
+#' Does the model predict realistic probabilities of an event?
+#' 
+#+ calibration_plot
+
+calibration.table <-
+  calibrationTable(
+    # Standard calibration options
+    surv.model.fit, COHORT.prep[test.set,], # evaluate on the held-out test rows
+    # Always need to specify NA imputation for rfsrc
+    na.action = 'na.impute'
+  )
+
+calibration.score <- calibrationScore(calibration.table) # area between calibration curve and diagonal, with SE (reported in prose below)
+
+calibrationPlot(calibration.table, show.censored = TRUE, max.points = 10000)
+
+# Save the calibration data for later plotting
+write.csv(
+  calibration.table, paste0(output.filename.base, '-calibration-table.csv')
+)
+
+#' The area between the calibration curve and the diagonal is 
+#' **`r round(calibration.score['area'], 3)`** +/-
+#' **`r round(calibration.score['se'], 3)`**.
+#' 
+#' ## Model fit
+#' 
+#+ resulting_fit
+
+print(surv.model.fit)
+
+#' ## Variable importance
+
+# First, load data from Cox modelling for comparison
+cox.var.imp <- read.csv(comparison.filename) # NOTE(review): comparison.filename must be defined by the sourced script -- confirm
+
+# Then, get the variable importance from the model just fitted, normalised by its maximum
+var.imp <-
+  data.frame(
+    var.imp = importance(surv.model.fit)/max(importance(surv.model.fit))
+  )
+var.imp$quantity <- rownames(var.imp)
+
+var.imp <- merge(var.imp, cox.var.imp) # merges on shared column(s), presumably 'quantity' -- confirm
+
+# Save the results as a CSV
+write.csv(var.imp, paste0(output.filename.base, '-var-imp.csv')) # fixed: was 'output.filename', which is never defined
+
+#' ## Variable importance vs Cox model replication variable importance
+
+print(
+  ggplot(var.imp, aes(x = our_range, y = var.imp)) +
+    geom_point() +
+    geom_text_repel(aes(label = quantity)) + # geom_text_repel is from the ggrepel package
+    # Log both...old coefficients for linearity, importance to shrink range!
+    scale_x_log10() +
+    scale_y_log10()
+)
+
+print(cor(var.imp[, c('var.imp', 'our_range')], method = 'spearman')) # rank correlation between RF importance and Cox coefficient range
+
+#' ## Variable effects
+#' 
+#+ variable_effects
+
+risk.by.variables <- data.frame() # accumulator for a long data frame over all variables
+
+for(variable in continuous.vars) {
+  # Create a partial effect table for this variable
+  risk.by.variable <-
+    partialEffectTable(
+      surv.model.fit, COHORT.prep[-test.set,], variable, na.action = 'na.impute' # training rows only
+    )
+  # Slight kludge...rename the column which above is given the variable name to
+  # just val, to allow rbinding
+  names(risk.by.variable)[2] <- 'val'
+  # Append a column with the variable's name so we can distinguish this in
+  # a long data frame
+  risk.by.variable$var <- variable
+  # Append it to our ongoing big data frame
+  risk.by.variables <- rbind(risk.by.variables, risk.by.variable) # NB: grows in the loop; acceptable for a handful of variables
+  # Save the risks as we go
+  write.csv(risk.by.variables, paste0(output.filename.base, '-var-effects.csv'))
+  
+  # Get the mean of the normalised risk for every value of the variable
+  risk.aggregated <-
+    aggregate(
+      as.formula(paste0('risk.normalised ~ ', variable)),
+      risk.by.variable, mean
+    )
+  
+  # Work out the limits on the x-axis by taking the 1st and 99th percentiles
+  x.axis.limits <-
+    quantile(COHORT.full[, variable], c(0.01, 0.99), na.rm = TRUE) # NOTE(review): COHORT.full here vs COHORT.prep above -- confirm intended cohort
+  
+  print(
+    ggplot(risk.by.variable, aes_string(x = variable, y = 'risk.normalised')) +
+      geom_line(alpha=0.01, aes(group = id)) + # one faint line per id
+      geom_line(data = risk.aggregated, colour = 'blue') +
+      coord_cartesian(xlim = c(x.axis.limits))
+  )
+}
\ No newline at end of file