# random-forest/rf-imputed.R

#+ knitr_setup, include = FALSE

# Whether to cache the intensive code sections. Set to FALSE to recalculate
# everything afresh.
cacheoption <- FALSE
# Disable lazy caching globally, because it fails for large objects, and all the
# objects we wish to cache are large...
opts_chunk$set(cache.lazy = FALSE)

#' # Random survival forests on pre-imputed data
#'
#' The imputation algorithm used for the Cox modelling 'cheats' in a couple of
#' different ways. Firstly, the imputation model is fitted on the whole dataset,
#' rather than being trained on a training set and then applied to the test set.
#' Secondly, causality is violated in that future measurements, and even the
#' outcome variable (i.e. whether a patient died), are included in the model.
#' The rationale for this is that you want the best and most complete dataset
#' possible, and if doctors go on to use this in clinical practice they will
#' simply take the additional required measurements when calculating a
#' patient's risk score, rather than trying to do the modelling on incomplete
#' data.
#'
#' However, this could allow the imputation to 'pass back' useful data to the
#' Cox model, and thus artificially inflate its performance statistics.
#'
#' It is non-trivial to work around this, not least because the imputation
#' package we used does not expose its model, so it cannot be trained on a
#' subset of the data. A quick and dirty way to check whether this may be an
#' issue, therefore, is to train a random forest on the imputed dataset and see
#' whether it can outperform the Cox model. So, let's try it...

#+ user_variables, message=FALSE

imputed.data.filename <- '../../data/COHORT_complete.rds'
endpoint <- 'death.imputed'

n.trees <- 500
n.split <- 10
n.threads <- 40
# Splitting rule for the survival forest; assumed to be 'logrank', the
# randomForestSRC default for survival data
split.rule <- 'logrank'

bootstraps <- 50

output.filename.base <- '../../output/rf-imputed-try1'
boot.filename <- paste0(output.filename.base, '-boot.rds')

# What to do with missing data
continuous.vars <-
  c(
    'age', 'total_chol_6mo', 'hdl_6mo', 'pulse_6mo', 'crea_6mo',
    'total_wbc_6mo', 'haemoglobin_6mo'
  )
untransformed.vars <- c('anonpatid', 'surv_time', 'imd_score', 'exclude')

source('../lib/shared.R')
require(ggrepel)

#' ## Read imputed data

# Load the data from its RDS file
imputed.data <- readRDS(imputed.data.filename)

# Remove rows with death time of 0 to avoid fitting errors, and get the
# survival columns ready
for(i in 1:length(imputed.data)) {
  imputed.data[[i]] <- imputed.data[[i]][imputed.data[[i]][, surv.time] > 0, ]
  # Put in a fake exclude column for the next function (exclusions are already
  # excluded in the imputed dataset)
  imputed.data[[i]]$exclude <- FALSE
  imputed.data[[i]]$imd_score <- as.numeric(imputed.data[[i]]$imd_score)
  imputed.data[[i]] <-
    caliberExtraPrep(
      prepSurvCol(
        imputed.data[[i]], 'time_death', 'endpoint_death', 1
      )
    )
}

# Define test set
test.set <- testSetIndices(imputed.data[[1]], random.seed = 78361)

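#' `testSetIndices` is defined in `shared.R`, so its exact behaviour isn't
#' shown here; a minimal sketch of a deterministic split of this kind, assuming
#' a hypothetical test fraction of one third, might look like the following
#' (illustrative only, not the project's actual implementation).
#+ test_set_sketch, eval = FALSE

testSetIndicesSketch <- function(df, random.seed, test.fraction = 1/3) {
  # Fix the RNG so the same patients end up in the test set on every run
  set.seed(random.seed)
  sample(nrow(df), round(nrow(df) * test.fraction))
}
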
#' ## Fit random forest model

# Do a quick and dirty fit on a single imputed dataset, to draw calibration
# curve from
time.start <- handyTimer()
surv.model.fit <-
  survivalFit(
    surv.predict,
    imputed.data[[1]][-test.set, ],
    model.type = 'rfsrc',
    n.trees = n.trees,
    split.rule = split.rule,
    n.threads = n.threads,
    nsplit = n.split
  )
time.fit <- handyTimer(time.start)

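#' `survivalFit` is a wrapper defined in `shared.R`. For reference, a direct
#' call to `randomForestSRC::rfsrc` with the same settings would look roughly
#' like the sketch below, assuming the prepared data contain `surv_time` and
#' `surv_event` columns and that `surv.predict` is the character vector of
#' predictor names set up by `shared.R`.
#+ rfsrc_sketch, eval = FALSE

library(randomForestSRC)

# randomForestSRC parallelises via options rather than a function argument
options(rf.cores = n.threads)

rf.sketch <-
  rfsrc(
    # Build the survival formula from the predictor names
    as.formula(
      paste('Surv(surv_time, surv_event) ~', paste(surv.predict, collapse = ' + '))
    ),
    data = imputed.data[[1]][-test.set, ],
    ntree = n.trees,
    nsplit = n.split,
    splitrule = split.rule
  )
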
fit.rf.boot <- list()

# Perform bootstrap fitting for every multiply imputed dataset
time.start <- handyTimer()
for(i in 1:length(imputed.data)) {
  fit.rf.boot[[i]] <-
    survivalBootstrap(
      surv.predict,
      imputed.data[[i]][-test.set, ],
      imputed.data[[i]][test.set, ],
      model.type = 'rfsrc',
      bootstraps = bootstraps,
      n.threads = n.threads
    )
}
time.boot <- handyTimer(time.start)

# Save the fits, because it might've taken a while!
saveRDS(fit.rf.boot, boot.filename)

#' Model fitted in `r round(time.fit)` seconds, and `r bootstraps` fits
#' performed on `r length(imputed.data)` imputed datasets in
#' `r round(time.boot)` seconds.

# Unpackage the uncertainties from the bootstrapped data
fit.rf.boot.ests <- bootMIStats(fit.rf.boot)

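#' `bootMIStats` is another `shared.R` helper. One common way to pool bootstrap
#' replicates across multiply imputed datasets, and the kind of summary assumed
#' by the `val`/`lower`/`upper` columns used below, is to stack all the
#' replicates and take the median and the 2.5th and 97.5th percentiles; a
#' sketch of that approach (not necessarily what `bootMIStats` does internally)
#' is shown here.
#+ pooling_sketch, eval = FALSE

poolBootstraps <- function(replicates) {
  # replicates: numeric vector of one statistic (e.g. the test-set C-index)
  # gathered from every bootstrap replicate of every imputed dataset
  c(
    val   = median(replicates),
    lower = quantile(replicates, 0.025, names = FALSE),
    upper = quantile(replicates, 0.975, names = FALSE)
  )
}
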
# Save bootstrapped performance values
varsToTable(
  data.frame(
    model = 'rfsrc',
    imputation = TRUE,
    discretised = FALSE,
    c.index = fit.rf.boot.ests['c.test', 'val'],
    c.index.lower = fit.rf.boot.ests['c.test', 'lower'],
    c.index.upper = fit.rf.boot.ests['c.test', 'upper'],
    calibration.score = fit.rf.boot.ests['calibration.score', 'val'],
    calibration.score.lower = fit.rf.boot.ests['calibration.score', 'lower'],
    calibration.score.upper = fit.rf.boot.ests['calibration.score', 'upper']
  ),
  performance.file,
  index.cols = c('model', 'imputation', 'discretised')
)

#' ## Performance
#'
#' Having fitted the random forest, how did we do? The c-indices were
#' calculated as part of the bootstrapping, so we just need to take a look at
#' those...
#'
#' C-indices are **`r round(fit.rf.boot.ests['c.train', 'val'], 3)`
#' (`r round(fit.rf.boot.ests['c.train', 'lower'], 3)` -
#' `r round(fit.rf.boot.ests['c.train', 'upper'], 3)`)** on the training set and
#' **`r round(fit.rf.boot.ests['c.test', 'val'], 3)`
#' (`r round(fit.rf.boot.ests['c.test', 'lower'], 3)` -
#' `r round(fit.rf.boot.ests['c.test', 'upper'], 3)`)** on the test set.
#'
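#' The C-indices above come out of the bootstrap helpers, but they can also be
#' checked directly. A minimal sketch, assuming `surv.model.fit` behaves like a
#' plain `rfsrc` object (so `predict()$predicted` is a mortality-style risk
#' score) and that the prepared data contain `surv_time` and `surv_event`
#' columns:
#+ c_index_sketch, eval = FALSE

library(survival)

test.data <- imputed.data[[1]][test.set, ]
risk <- predict(surv.model.fit, test.data)$predicted

# reverse = TRUE because a larger risk score should mean a shorter survival time
concordance(
  Surv(test.data$surv_time, test.data$surv_event) ~ risk,
  reverse = TRUE
)
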
#' ### Calibration
#'
#' The bootstrapped calibration score is
#' **`r round(fit.rf.boot.ests['calibration.score', 'val'], 3)`
#' (`r round(fit.rf.boot.ests['calibration.score', 'lower'], 3)` -
#' `r round(fit.rf.boot.ests['calibration.score', 'upper'], 3)`)**.
#'
#' Let's draw a representative curve from the unbootstrapped fit... (It would be
#' better to draw all the curves from the bootstrap fit to get an idea of
#' variability, but I've not implemented this yet.)
#'
#+ calibration_plot

# Use the first imputed dataset, to match the dataset the representative fit
# was trained on
calibration.table <-
  calibrationTable(surv.model.fit, imputed.data[[1]][test.set, ])

calibration.score <- calibrationScore(calibration.table)

calibrationPlot(calibration.table)

#' The area between the calibration curve and the diagonal is
#' **`r round(calibration.score[['area']], 3)`** +/-
#' **`r round(calibration.score[['se']], 3)`**.
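
#' For reference, this area between the calibration curve and the diagonal can
#' be approximated by trapezoidal integration of the absolute difference
#' between observed and predicted risk. A sketch, assuming hypothetical
#' `risk.predicted` and `risk.observed` columns (the real column names live in
#' `calibrationTable`'s implementation in `shared.R`):
#+ calibration_area_sketch, eval = FALSE

calibrationAreaSketch <- function(predicted, observed) {
  # Order by predicted risk so the curve is traversed left to right
  ord <- order(predicted)
  x <- predicted[ord]
  y <- abs(observed[ord] - predicted[ord])
  # Trapezoidal rule for the area between the calibration curve and the
  # diagonal (where observed == predicted)
  sum(diff(x) * (head(y, -1) + tail(y, -1)) / 2)
}

calibrationAreaSketch(
  calibration.table$risk.predicted,
  calibration.table$risk.observed
)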