|
a |
|
b/lib/all-cv-bootstrap.R |
|
|
# Number of bootstrap resamples used for every model type
bootstraps <- 200

# Number of trees; only meaningful for random forest models
n.trees <- 500

# These two settings only apply when model.type is 'ranger'
split.rule <- 'logrank'
n.threads <- 10

# Cross-validation settings
input.n.bins <- 10:20     # candidate bin counts for discretisation
cv.n.folds <- 3           # folds per calibration
n.calibrations <- 1000    # random calibrations to try
# Size of the full dataset to use; NA means all of it. Further rows may still
# be excluded during data preparation.
n.data <- NA
|
|
14 |
|
|
|
# Continuous input variables; each will be discretised by quantile binning
# with a per-variable number of bins chosen during calibration
continuous.vars <- c(
  'age', 'total_chol_6mo', 'hdl_6mo', 'pulse_6mo', 'crea_6mo',
  'total_wbc_6mo', 'haemoglobin_6mo'
)
|
|
20 |
|
|
|
# Load shared helpers (prepData, survivalFit, cIndex, sample.df, etc.)
source('shared.R')
# Use library(), not require(): library() errors immediately if the package is
# missing, whereas require() merely returns FALSE and lets the script fail
# later with a confusing error.
library(ggrepel)
|
|
23 |
|
|
|
# Load the data and convert to data frame to make column-selecting code in
# prepData simpler
COHORT.full <- data.frame(fread(data.filename))

# If n.data was specified, work with a random subset of that size; otherwise
# use all the data. Either way, drop the full cohort afterwards to free
# memory. (The original duplicated rm(COHORT.full) in both branches.)
COHORT.use <-
  if (!is.na(n.data)) {
    sample.df(COHORT.full, n.data)
  } else {
    COHORT.full
  }
rm(COHORT.full)
|
|
38 |
|
|
|
# A quick "null" preparation of the data to establish its true length: some
# rows may be excluded during preparation, so n.data is only known afterwards.
COHORT.prep <- prepData(
  COHORT.use,
  cols.keep, discretise.settings, surv.time, surv.event,
  surv.event.yes, extra.fun = caliberExtraPrep, n.keep = n.data
)
n.data <- nrow(COHORT.prep)

# Indices of the held-out test set (fixed seed so the split is reproducible)
test.set <- testSetIndices(COHORT.prep, random.seed = 78361)
|
|
51 |
|
|
|
# If we've not already done a calibration, then do one; otherwise load the
# cached results from disk.
if (!file.exists(calibration.filename)) {
  # We can parallelise the calibrations with foreach, so set that up
  initParallel(n.threads)

  # Run calibrations in parallel. Each iteration returns one data frame of
  # per-fold statistics; .combine = 'rbind' stacks them into cv.performance.
  # (The original also pre-assigned cv.performance <- data.frame(), which was
  # dead code: the foreach result overwrites it.)
  cv.performance <-
    foreach(i = seq_len(n.calibrations), .combine = 'rbind') %dopar% {
      cat('Calibration', i, '...\n')

      # Reset process settings to the base settings
      process.settings <-
        list(
          var      = c('anonpatid', 'time_death', 'imd_score', 'exclude'),
          method   = c(NA, NA, NA, NA),
          settings = list(NA, NA, NA, NA)
        )
      # Generate random numbers of bins (for n bins, you need n + 1 breaks)
      n.bins <- sample(input.n.bins, length(continuous.vars), replace = TRUE) + 1
      names(n.bins) <- continuous.vars
      # Set every continuous variable to bin by quantile with its randomly
      # chosen number of breaks
      for (j in seq_along(continuous.vars)) {
        process.settings$var    <- c(process.settings$var, continuous.vars[j])
        process.settings$method <- c(process.settings$method, 'binByQuantile')
        process.settings$settings <-
          c(
            process.settings$settings,
            # Quantiles lie between 0 and 1; n.bins[j] evenly spaced breaks
            list(seq(0, 1, length.out = n.bins[j]))
          )
      }

      # Prep the data with these settings; data for cross-validation excludes
      # the test set
      COHORT.cv <-
        prepData(
          COHORT.use[-test.set, ],
          cols.keep,
          process.settings,
          surv.time, surv.event,
          surv.event.yes,
          extra.fun = caliberExtraPrep
        )

      # Get folds for cross-validation
      cv.folds <- cvFolds(nrow(COHORT.cv), cv.n.folds)

      # Collect per-fold statistics in a preallocated list rather than growing
      # a data frame with rbind inside the loop
      fold.stats <- vector('list', cv.n.folds)

      for (j in seq_len(cv.n.folds)) {
        time.start <- handyTimer()
        # Fit the model to the training folds
        surv.model.fit <-
          survivalFit(
            surv.predict,
            COHORT.cv[-cv.folds[[j]], ],
            model.type = model.type,
            n.trees = n.trees,
            split.rule = split.rule
            # n.threads not used because this is run in parallel
          )
        time.learn <- handyTimer(time.start)

        time.start <- handyTimer()
        # C-indices for the training and validation folds
        c.index.train <-
          cIndex(
            surv.model.fit, COHORT.cv[-cv.folds[[j]], ], model.type = model.type
          )
        c.index.val <-
          cIndex(
            surv.model.fit, COHORT.cv[cv.folds[[j]], ], model.type = model.type
          )
        time.predict <- handyTimer(time.start)

        # Record the statistics obtained from this fold
        fold.stats[[j]] <-
          data.frame(
            calibration = i,
            cv.fold = j,
            as.list(n.bins),
            c.index.train,
            c.index.val,
            time.learn,
            time.predict
          )
      } # End cross-validation loop (j)

      # rbind the performance by fold; this is the value returned to foreach
      do.call(rbind, fold.stats)
    } # End calibration loop (i)

  # Save output at end of calibration
  write.csv(cv.performance, calibration.filename)
} else { # If we did previously calibrate, load it
  cv.performance <- read.csv(calibration.filename)
}
|
|
165 |
|
|
|
# Find the best calibration...
# First, average validation C-index across the cross-validation folds of each
# calibration
cv.performance.average <-
  aggregate(c.index.val ~ calibration, data = cv.performance, FUN = mean)

# Then pick the calibration whose average validation C-index is highest
best.calibration <- with(
  cv.performance.average,
  calibration[which.max(c.index.val)]
)

# Finally, locate the first row of that calibration, which carries the n.bins
# values used
best.calibration.row1 <-
  min(which(cv.performance$calibration == best.calibration))

# Extract its per-variable bin counts as a column matrix
n.bins <- t(cv.performance[best.calibration.row1, continuous.vars])
|
|
188 |
|
|
|
# Prepare the data with the winning calibration's settings...

# Reset process settings to the base settings
process.settings <-
  list(
    var      = c('anonpatid', 'time_death', 'imd_score', 'exclude'),
    method   = c(NA, NA, NA, NA),
    settings = list(NA, NA, NA, NA)
  )
# Set every continuous variable to bin by quantile, this time using the
# optimal number of breaks found during calibration (n.bins), not a random one
for (j in seq_along(continuous.vars)) {
  process.settings$var    <- c(process.settings$var, continuous.vars[j])
  process.settings$method <- c(process.settings$method, 'binByQuantile')
  process.settings$settings <-
    c(
      process.settings$settings,
      # Quantiles lie between 0 and 1; n.bins[j] evenly spaced breaks
      list(seq(0, 1, length.out = n.bins[j]))
    )
}
|
|
214 |
|
|
|
# Prep the full cohort with the optimal binning settings.
# NB: unlike the cross-validation prep above, this passes ALL of COHORT.use —
# the test set is NOT excluded here; test rows are only excluded later, when
# the final model is fitted. (The original comment wrongly claimed the test
# set was excluded.)
COHORT.optimised <-
  prepData(
    COHORT.use,
    cols.keep,
    process.settings,
    surv.time, surv.event,
    surv.event.yes,
    extra.fun = caliberExtraPrep
  )
|
|
226 |
|
|
|
#' ## Fit the final model
#'
#' This may take some time, so we'll cache it if possible...

#+ fit_final_model

# Fit to the whole training set, i.e. everything except the test rows
surv.model.fit <-
  survivalFit(
    surv.predict,
    COHORT.optimised[-test.set, ],
    model.type = model.type,
    n.trees = n.trees,
    split.rule = split.rule,
    n.threads = n.threads
  )
|
|
243 |
|
|
|
# Set up a parallel backend for the bootstrap loop; keep the cluster handle so
# it can be stopped afterwards
cl <- initParallel(n.threads, backend = 'doParallel')

# Bootstrap the model fit: each iteration refits on a resampled training set
# and evaluates on the fixed test set, giving distributions of coefficients
# and performance statistics.
surv.model.params.boot <-
  foreach(
    i = seq_len(bootstraps),
    .combine = rbind,
    .packages = c('survival'),
    .verbose = TRUE
  ) %dopar% {

    # Bootstrap-sampled training set: same size as the training set, drawn
    # with replacement
    COHORT.boot <-
      sample.df(
        COHORT.optimised[-test.set, ],
        nrow(COHORT.optimised[-test.set, ]),
        replace = TRUE
      )

    surv.model.fit.i <-
      survivalFit(
        surv.predict,
        COHORT.boot,
        model.type = model.type,
        n.trees = n.trees,
        split.rule = split.rule,
        # 1 thread, because we're parallelising the bootstrapping
        n.threads = 1
      )

    # Work out quantities of interest on the held-out test set
    #var.imp.vector <- bootstrapVarImp(surv.model.fit.i, COHORT.boot)
    c.index <- cIndex(surv.model.fit.i, COHORT.optimised[test.set, ])
    calibration.score <-
      calibrationScoreWrapper(surv.model.fit.i, COHORT.optimised[test.set, ])

    # One row per bootstrap: index, model coefficients, test-set performance
    data.frame(
      i,
      t(coef(surv.model.fit.i)),
      #t(var.imp.vector),
      c.index,
      calibration.score
    )
  }
|
|
287 |
|
|
|
# Save the bootstrapped parameters
write.csv(
  surv.model.params.boot,
  paste0(output.filename.base, '-surv-boot.csv')
)

# Tidy up by removing the cluster
stopCluster(cl)

# Summarise the bootstrapped coefficient distributions
surv.model.fit.coeffs <- bootStatsDf(surv.model.params.boot)