[0375db]: random-forest/rfsrc-cv.R

#+ knitr_setup, include = FALSE
library(knitr)
# Whether to cache the intensive code sections. Set to FALSE to recalculate
# everything afresh.
cacheoption <- FALSE
# Disable lazy caching globally, because it fails for large objects, and all the
# objects we wish to cache are large...
opts_chunk$set(cache.lazy = FALSE)
#' # Cross-validating discretisation of input variables in a survival model
#'
#' Unlike previous attempts at cross-validation, this uses between 10 and 20
#' bins, rather than between 2 and 20, in an attempt to avoid throwing away
#' data.
output.filename.base <- '../../output/rfsrc-cv-nsplit-try3'
data.filename <- '../../data/cohort-sanitised.csv'
# If surv.predict is defined as a character vector here, the model only uses
# the variables specified, eg c('age') would build a model purely based on age.
# If not specified (ie commented out), it will use the defaults.
# surv.predict <- c('age')
#' ## Do the cross-validation
#'
#' The steps for this are common regardless of model type, so run the script to
#' get a cross-validated model to further analyse...
#+ rf_discretised_cv, cache=cacheoption
source('../lib/rfsrc-cv-nsplit-bootstrap.R', chdir = TRUE)
# Save the resulting 'final' model
saveRDS(surv.model.fit, paste0(output.filename.base, '-final-model.rds'))
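#'
#' The sourced script is not reproduced here, but the shape of the search is
#' roughly as follows. This is a minimal sketch, not the script itself:
#' discretise(), and the surv_time and surv_event column names, are
#' illustrative assumptions.
#+ rf_cv_sketch, eval = FALSE
library(randomForestSRC)
# Candidate numbers of bins to discretise the continuous variables into
n.bins <- 10:20
cv.c.index <- sapply(
  n.bins,
  function(bins) {
    # Discretise the continuous variables (discretise() is a placeholder for
    # the project's own helper)
    COHORT.binned <- discretise(COHORT.prep, bins)
    # Fit a random survival forest on the training set; rfsrc reports
    # out-of-bag error as it grows trees
    fit <- rfsrc(
      Surv(surv_time, surv_event) ~ ., COHORT.binned[-test.set, ],
      na.action = 'na.impute'
    )
    # For survival forests, 1 - final OOB error rate is a C-index
    1 - tail(fit$err.rate, 1)
  }
)
# Keep the number of bins with the best out-of-bag discrimination
n.bins[which.max(cv.c.index)]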
#' # Results
#'
#' ## Performance
#'
#' ### Discrimination
#'
#' C-indices are **`r round(surv.model.fit.coeffs['c.train', 'val'], 3)` +/-
#' `r round(surv.model.fit.coeffs['c.train', 'err'], 3)`** on the training set and
#' **`r round(surv.model.fit.coeffs['c.test', 'val'], 3)` +/-
#' `r round(surv.model.fit.coeffs['c.test', 'err'], 3)`** on the held-out test set.
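#'
#' (The C-indices above come from the fitting script; an equivalent
#' concordance can be computed directly with the survival package. A minimal
#' sketch, assuming a vector of predicted risks for the test set, here called
#' risk.predicted, and assumed surv_time/surv_event outcome columns:)
#+ c_index_sketch, eval = FALSE
library(survival)
# reverse = TRUE because a higher predicted risk should mean shorter survival
concordance(
  Surv(surv_time, surv_event) ~ risk.predicted,
  data = COHORT.prep[test.set, ],
  reverse = TRUE
)$concordance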
#'
#' ### Calibration
#'
#' Does the model predict realistic probabilities of an event?
#'
#+ calibration_plot
calibration.table <-
  calibrationTable(
    # Standard calibration options
    surv.model.fit, COHORT.prep[test.set, ],
    # Always need to specify NA imputation for rfsrc
    na.action = 'na.impute'
  )
calibration.score <- calibrationScore(calibration.table)
calibrationPlot(calibration.table, show.censored = TRUE, max.points = 10000)
# Save the calibration data for later plotting
write.csv(
  calibration.table, paste0(output.filename.base, '-calibration-table.csv')
)
#' The area between the calibration curve and the diagonal is
#' **`r round(calibration.score['area'], 3)`** +/-
#' **`r round(calibration.score['se'], 3)`**.
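#'
#' (calibrationScore is one of the project's helpers. Conceptually the
#' statistic is the area between the calibration curve and the y = x diagonal;
#' a minimal sketch of that integral, assuming the calibration table holds
#' predicted risks and observed event fractions in columns named risk and
#' observed:)
#+ calibration_area_sketch, eval = FALSE
calibration.sorted <- calibration.table[order(calibration.table$risk), ]
# Trapezium-rule integral of |observed - predicted| over predicted risk
with(
  calibration.sorted,
  sum(
    diff(risk) *
      (head(abs(observed - risk), -1) + tail(abs(observed - risk), -1)) / 2
  )
)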
#'
#' ## Model fit
#'
#+ resulting_fit
print(surv.model.fit)
#' ## Variable importance
# First, load data from the Cox modelling for comparison
cox.var.imp <- read.csv(comparison.filename)
# Then, get the variable importance from the model just fitted, normalised by
# the largest importance value
var.imp <-
  data.frame(
    var.imp = importance(surv.model.fit) / max(importance(surv.model.fit))
  )
var.imp$quantity <- rownames(var.imp)
var.imp <- merge(var.imp, cox.var.imp)
# Save the results as a CSV
write.csv(var.imp, paste0(output.filename.base, '-var-imp.csv'))
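#'
#' (importance() here relies on the sourced helper scripts; permutation
#' importance for an rfsrc fit can also be obtained directly with
#' randomForestSRC's vimp(). A minimal sketch:)
#+ vimp_sketch, eval = FALSE
library(randomForestSRC)
# Permutation variable importance; $importance is a named numeric vector
rf.vimp <- vimp(surv.model.fit)$importance
# Normalise by the largest value, as above
rf.vimp / max(rf.vimp)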
#' ## Variable importance vs Cox model replication variable importance
print(
  ggplot(var.imp, aes(x = our_range, y = var.imp)) +
    geom_point() +
    geom_text_repel(aes(label = quantity)) +
    # Log both axes: the Cox model's coefficient ranges for linearity, the
    # importance to shrink its range!
    scale_x_log10() +
    scale_y_log10()
)
print(cor(var.imp[, c('var.imp', 'our_range')], method = 'spearman'))
#' ## Variable effects
#'
#+ variable_effects
risk.by.variables <- data.frame()
for (variable in continuous.vars) {
  # Create a partial effect table for this variable
  risk.by.variable <-
    partialEffectTable(
      surv.model.fit, COHORT.prep[-test.set, ], variable,
      na.action = 'na.impute'
    )
  # Get the mean of the normalised risk for every value of the variable
  risk.aggregated <-
    aggregate(
      as.formula(paste0('risk.normalised ~ ', variable)),
      risk.by.variable, mean
    )
  # Work out the limits on the x-axis by taking the 1st and 99th percentiles
  x.axis.limits <-
    quantile(COHORT.full[, variable], c(0.01, 0.99), na.rm = TRUE)
  print(
    ggplot(risk.by.variable, aes_string(x = variable, y = 'risk.normalised')) +
      geom_line(alpha = 0.01, aes(group = id)) +
      geom_line(data = risk.aggregated, colour = 'blue') +
      coord_cartesian(xlim = x.axis.limits)
  )
  # Slight kludge... rename the column which above is given the variable name
  # to just 'val', to allow rbinding. This must happen only after the
  # aggregation and plotting above, which need the column under its original
  # name.
  names(risk.by.variable)[2] <- 'val'
  # Append a column with the variable's name so we can distinguish it in a
  # long data frame
  risk.by.variable$var <- variable
  # Append it to our ongoing big data frame
  risk.by.variables <- rbind(risk.by.variables, risk.by.variable)
  # Save the risks as we go
  write.csv(risk.by.variables, paste0(output.filename.base, '-var-effects.csv'))
}
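#'
#' (partialEffectTable is a project helper; randomForestSRC also ships its own
#' partial-dependence machinery, which can serve as a cross-check. A minimal
#' sketch for a single variable, eg age:)
#+ plot_variable_sketch, eval = FALSE
library(randomForestSRC)
# Built-in partial plot of predicted mortality against one variable
plot.variable(
  surv.model.fit, xvar.names = 'age', partial = TRUE, na.action = 'na.impute'
)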