# rfsrc-cv.R (random-forest) -- recovered from diff view [000000]..[0375db]
#+ knitr_setup, include = FALSE

# Should the computationally intensive chunks be cached? Set this to FALSE to
# force everything to be recalculated afresh.
cacheoption <- FALSE
# Lazy caching fails for large objects, and every object we want to cache here
# is large, so disable it globally.
opts_chunk$set(cache.lazy = FALSE)
#' # Cross-validating discretisation of input variables in a survival model
#' 
#' In difference to previous attempts at cross-validation, this uses between 10
#' and 20 bins, not between 2 and 20, in an attempt to avoid throwing away data.

# Base path for all output files written by this script
output.filename.base <- '../../output/rfsrc-cv-nsplit-try3'
# Input cohort data
data.filename <- '../../data/cohort-sanitised.csv'

# If surv.predict is defined as a character vector here, the model only uses
# those variables specified, eg c('age') would build a model purely based on
# age. If not specified (ie commented out), it will use the defaults.
# (Fixed: the comment previously referred to 'surv.vars', but the variable it
# describes is 'surv.predict'.)
# surv.predict <- c('age')
#' ## Do the cross-validation
#' 
#' The steps for this are common regardless of model type, so run the script to
#' get a cross-validated model to further analyse...
#+ rf_discretised_cv, cache=cacheoption

source('../lib/rfsrc-cv-nsplit-bootstrap.R', chdir = TRUE)

# Persist the resulting 'final' model so it can be reloaded without refitting
final.model.filename <- paste0(output.filename.base, '-final-model.rds')
saveRDS(surv.model.fit, final.model.filename)
#' # Results
#' 
#' 
#' ## Performance
#' 
#' ### Discrimination
#' 
#' C-indices are **`r round(surv.model.fit.coeffs['c.train', 'val'], 3)` +/-
#' `r round(surv.model.fit.coeffs['c.train', 'err'], 3)`** on the training set and
#' **`r round(surv.model.fit.coeffs['c.test', 'val'], 3)` +/-
#' `r round(surv.model.fit.coeffs['c.test', 'err'], 3)`** on the held-out test set.
#' 
#' ### Calibration
#' 
#' Does the model predict realistic probabilities of an event?
#' 
#+ calibration_plot

# Build the calibration table on the held-out test set; rfsrc always needs an
# explicit NA-imputation strategy.
calibration.table <- calibrationTable(
  surv.model.fit, COHORT.prep[test.set,],
  na.action = 'na.impute'
)

calibration.score <- calibrationScore(calibration.table)

calibrationPlot(calibration.table, show.censored = TRUE, max.points = 10000)

# Write out the calibration data so it can be replotted later
write.csv(
  calibration.table,
  paste0(output.filename.base, '-calibration-table.csv')
)

#' The area between the calibration curve and the diagonal is 
#' **`r round(calibration.score['area'], 3)`** +/-
#' **`r round(calibration.score['se'], 3)`**.
#' 
#' 
73
#' ## Model fit
74
#' 
75
#+ resulting_fit
76
77
print(surv.model.fit)
78
79
#' ## Variable importance
80
81
# First, load data from Cox modelling for comparison
82
cox.var.imp <- read.csv(comparison.filename)
83
84
# Then, get the variable importance from the model just fitted
85
var.imp <-
86
  data.frame(
87
    var.imp = importance(surv.model.fit)/max(importance(surv.model.fit))
88
  )
89
var.imp$quantity <- rownames(var.imp)
90
91
var.imp <- merge(var.imp, cox.var.imp)
92
93
# Save the results as a CSV
94
write.csv(var.imp, paste0(output.filename, '-var-imp.csv'))
95
#' ## Variable importance vs Cox model replication variable importance

# Scatter the random forest importance against the Cox model's effect range,
# labelling each point with the variable name
var.imp.plot <-
  ggplot(var.imp, aes(x = our_range, y = var.imp)) +
  geom_point() +
  geom_text_repel(aes(label = quantity)) +
  # Log both...old coefficients for linearity, importance to shrink range!
  scale_x_log10() +
  scale_y_log10()
print(var.imp.plot)

# Rank correlation between the two importance measures
print(cor(var.imp[, c('var.imp', 'our_range')], method = 'spearman'))
#' ## Variable effects
#' 
#+ variable_effects

# Long data frame accumulating the partial effects of every continuous variable
risk.by.variables <- data.frame()

for (variable in continuous.vars) {
  # Partial effect of this variable on predicted risk.
  # NOTE(review): this uses COHORT.prep[-test.set, ] (ie the non-test rows),
  # unlike the calibration above which used the test set -- confirm intended.
  risk.by.variable <- partialEffectTable(
    surv.model.fit, COHORT.prep[-test.set,], variable, na.action = 'na.impute'
  )
  # Slight kludge...the second column is named after the variable; rename it to
  # 'val' so successive frames can be rbound together
  names(risk.by.variable)[2] <- 'val'
  # Tag every row with the variable's name so it can be distinguished in the
  # long data frame
  risk.by.variable$var <- variable
  risk.by.variables <- rbind(risk.by.variables, risk.by.variable)
  # Save the accumulated risks after every variable, in case of interruption
  write.csv(risk.by.variables, paste0(output.filename.base, '-var-effects.csv'))
  
  # Mean of the normalised risk at each observed value of the variable
  risk.aggregated <- aggregate(
    as.formula(paste0('risk.normalised ~ ', variable)),
    risk.by.variable, mean
  )
  
  # Limit the x-axis to the 1st-99th percentile range to keep outliers from
  # dominating the plot
  x.axis.limits <-
    quantile(COHORT.full[, variable], c(0.01, 0.99), na.rm = TRUE)
  
  print(
    ggplot(risk.by.variable, aes_string(x = variable, y = 'risk.normalised')) +
      geom_line(alpha = 0.01, aes(group = id)) +
      geom_line(data = risk.aggregated, colour = 'blue') +
      coord_cartesian(xlim = c(x.axis.limits))
  )
}
}