# b/random-forest/rfsrc-cv.R
#+ knitr_setup, include = FALSE

# Whether to cache the intensive code sections. Set to FALSE to recalculate
# everything afresh.
cacheoption <- FALSE
# Lazy caching fails for large objects, and every object we want to cache in
# this script is large, so disable it globally.
opts_chunk$set(cache.lazy = FALSE)
#' # Cross-validating discretisation of input variables in a survival model
#'
#' Unlike previous attempts at cross-validation, this uses between 10 and 20
#' bins, not between 2 and 20, in an attempt to avoid throwing away data.

# Prefix for every output file this script writes
output.filename.base <- '../../output/rfsrc-cv-nsplit-try3'
# Location of the sanitised cohort data
data.filename <- '../../data/cohort-sanitised.csv'

# If surv.predict is defined as a character vector here, the model only uses
# those variables specified, eg c('age') would build a model purely based on
# age. If not specified (ie commented out), it will use the defaults.
# surv.predict <- c('age')
#' ## Do the cross-validation
#'
#' The steps for this are common regardless of model type, so run the script to
#' get a cross-validated model to further analyse...
#+ rf_discretised_cv, cache=cacheoption

# chdir = TRUE so relative paths inside the script resolve against its own
# directory. The script is expected to leave surv.model.fit (used below) in
# this session.
source('../lib/rfsrc-cv-nsplit-bootstrap.R', chdir = TRUE)

# Save the resulting 'final' model
saveRDS(surv.model.fit, paste0(output.filename.base, '-final-model.rds'))
#' # Results
#'
#'
#' ## Performance
#'
#' ### Discrimination
#'
#' C-indices are **`r round(surv.model.fit.coeffs['c.train', 'val'], 3)` +/-
#' `r round(surv.model.fit.coeffs['c.train', 'err'], 3)`** on the training set and
#' **`r round(surv.model.fit.coeffs['c.test', 'val'], 3)` +/-
#' `r round(surv.model.fit.coeffs['c.test', 'err'], 3)`** on the held-out test set.
#'
#' ### Calibration
#'
#' Does the model predict realistic probabilities of an event?
#'
#+ calibration_plot

# Tabulate predicted risk against observed outcomes on the held-out test set.
# rfsrc always needs an explicit NA-imputation action.
calibration.table <- calibrationTable(
  surv.model.fit, COHORT.prep[test.set, ],
  na.action = 'na.impute'
)

# Single summary score of calibration quality, derived from the table
calibration.score <- calibrationScore(calibration.table)

calibrationPlot(calibration.table, show.censored = TRUE, max.points = 10000)

# Keep the calibration data on disk so it can be replotted later
write.csv(
  calibration.table, paste0(output.filename.base, '-calibration-table.csv')
)
#' The area between the calibration curve and the diagonal is
#' **`r round(calibration.score['area'], 3)`** +/-
#' **`r round(calibration.score['se'], 3)`**.
#'
#' ## Model fit
#'
#+ resulting_fit

# Print the fitted model's own summary
print(surv.model.fit)
#' ## Variable importance

# First, load data from Cox modelling for comparison.
# NOTE(review): comparison.filename is not defined in this file -- presumably
# it is set by the sourced cross-validation script; confirm upstream.
cox.var.imp <- read.csv(comparison.filename)

# Then, get the variable importance from the model just fitted, normalised so
# the most important variable scores 1. Compute the importance vector once
# rather than calling importance() twice.
rf.var.imp <- importance(surv.model.fit)
var.imp <-
  data.frame(
    var.imp = rf.var.imp / max(rf.var.imp)
  )
# data.frame() carries the importance vector's names through as row names;
# expose them as an explicit column so the merge below can use them.
var.imp$quantity <- rownames(var.imp)

# Join with the Cox importances on their shared column(s) -- presumably
# 'quantity'; merge() uses all common column names by default.
var.imp <- merge(var.imp, cox.var.imp)

# Save the results as a CSV.
# Bug fix: this previously used the undefined variable output.filename, which
# would error at runtime; output.filename.base is the prefix used throughout.
write.csv(var.imp, paste0(output.filename.base, '-var-imp.csv'))
#' ## Variable importance vs Cox model replication variable importance

# Compare the forest's importances against the Cox coefficient ranges.
# Log both axes...old coefficients for linearity, importance to shrink range!
importance.comparison <-
  ggplot(var.imp, aes(x = our_range, y = var.imp)) +
  geom_point() +
  geom_text_repel(aes(label = quantity)) +
  scale_x_log10() +
  scale_y_log10()
print(importance.comparison)

# Rank (Spearman) correlation between the two importance measures
print(cor(var.imp[, c('var.imp', 'our_range')], method = 'spearman'))
#' ## Variable effects
#'
#+ variable_effects

# Long-format accumulator: one block of rows per continuous variable
risk.by.variables <- data.frame()

for(variable in continuous.vars) {
  # Create a partial effect table for this variable, fitted on the non-test
  # rows; rfsrc always needs an explicit NA-imputation action
  risk.by.variable <-
    partialEffectTable(
      surv.model.fit, COHORT.prep[-test.set,], variable, na.action = 'na.impute'
    )
  # Slight kludge...rename the column which above is given the variable name to
  # just val, to allow rbinding
  names(risk.by.variable)[2] <- 'val'
  # Append a column with the variable's name so we can distinguish this in
  # a long data frame
  risk.by.variable$var <- variable
  # Append it to our ongoing big data frame
  risk.by.variables <- rbind(risk.by.variables, risk.by.variable)
  # Save the risks as we go, so partial results survive an interruption
  write.csv(risk.by.variables, paste0(output.filename.base, '-var-effects.csv'))

  # Get the mean of the normalised risk for every value of the variable
  risk.aggregated <-
    aggregate(
      as.formula(paste0('risk.normalised ~ ', variable)),
      risk.by.variable, mean
    )

  # Work out the limits on the x-axis by taking the 1st and 99th percentiles,
  # so the plot is not dominated by extreme values
  x.axis.limits <-
    quantile(COHORT.full[, variable], c(0.01, 0.99), na.rm = TRUE)

  # Faint line per id, with the aggregated mean effect overlaid in blue
  print(
    ggplot(risk.by.variable, aes_string(x = variable, y = 'risk.normalised')) +
    geom_line(alpha=0.01, aes(group = id)) +
    geom_line(data = risk.aggregated, colour = 'blue') +
    coord_cartesian(xlim = c(x.axis.limits))
  )
}