|
# lib/rfsrc-cv-nsplit-bootstrap.R

bootstraps <- 20
split.rule <- 'logrank'
n.threads <- 16

# Cross-validation variables
ns.splits <- 0:20
ns.trees <- c(500, 1000, 2000)
ns.imputations <- 1:3
cv.n.folds <- 3
n.data <- NA # NA = use the full dataset; further rows may be excluded in prep

calibration.filename <- paste0(output.filename.base, '-calibration.csv')

continuous.vars <-
  c(
    'age', 'total_chol_6mo', 'hdl_6mo', 'pulse_6mo', 'crea_6mo',
    'total_wbc_6mo', 'haemoglobin_6mo'
  )

untransformed.vars <- c('anonpatid', 'surv_time', 'imd_score', 'exclude')

source('../lib/shared.R')
require(ggrepel)

# Load the data and convert to data frame to make column-selecting code in
# prepData simpler
COHORT.full <- data.frame(fread(data.filename))

# If n.data was specified...
if(!is.na(n.data)){
  # Take a subset n.data in size
  COHORT.use <- sample.df(COHORT.full, n.data)
  rm(COHORT.full)
} else {
  # Use all the data
  COHORT.use <- COHORT.full
  rm(COHORT.full)
}

# We now need a quick null preparation of the data to get its length (some rows
# may be excluded during preparation)
COHORT.prep <-
  prepData(
    COHORT.use,
    cols.keep, discretise.settings, surv.time, surv.event,
    surv.event.yes, extra.fun = caliberExtraPrep, n.keep = n.data
  )
n.data <- nrow(COHORT.prep)

# Define indices of test set
test.set <- testSetIndices(COHORT.prep, random.seed = 78361)
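
# Note: testSetIndices is a project helper defined outside this file (assumed
# to come from ../lib/shared.R); from its use here it returns row indices for a
# held-out test set, and the fixed random.seed makes that split reproducible.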
|
|
# Process settings: leave every variable untransformed (method and settings
# are all NA)
process.settings <-
  list(
    var = c(untransformed.vars, continuous.vars),
    method = rep(NA, length(untransformed.vars) + length(continuous.vars)),
    settings = rep(NA, length(untransformed.vars) + length(continuous.vars))
  )

# If we've not already done a calibration, then do one
if(!file.exists(calibration.filename)) {
  # Create an empty data frame to aggregate stats per fold
  cv.performance <- data.frame()

  # Items to cross-validate over
  cv.vars <- expand.grid(ns.splits, ns.trees, ns.imputations)
  names(cv.vars) <- c('n.splits', 'n.trees', 'n.imputations')
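
  # Note: expand.grid gives every combination of the candidate values above:
  # 21 nsplit values x 3 forest sizes x 3 imputation counts = 189 calibrations,
  # each assessed with cv.n.folds-fold cross-validation below.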
|
|
  # Prep the data (since we're not cross-validating on data prep, this can be
  # done once before the loop)
  COHORT.cv <-
    prepData(
      # Data for cross-validation excludes test set
      COHORT.use[-test.set, ],
      cols.keep,
      process.settings,
      surv.time, surv.event,
      surv.event.yes,
      extra.fun = caliberExtraPrep
    )

  # Finally, add missing flag columns, but leave the missing data intact because
  # rfsrc can do on-the-fly imputation
  COHORT.cv <- prepCoxMissing(COHORT.cv, missingReplace = NA)

  # Add on those column names we just created
  surv.predict <-
    c(surv.predict, names(COHORT.cv)[grepl('_missing', names(COHORT.cv))])

  # Run cross-validations. No need to parallelise the loop because rfsrc is
  # itself parallelised
  for(i in 1:nrow(cv.vars)) {
    cat(
      'Calibration', i, '...\n'
    )

    # Get folds for cross-validation
    cv.folds <- cvFolds(nrow(COHORT.cv), cv.n.folds)
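
    # Note: cvFolds is assumed to be a project helper that splits the row
    # indices into cv.n.folds groups; below, cv.folds[[j]] is held out for
    # validation while the remaining rows are used for training. New folds are
    # drawn for each calibration i.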
|
|
    cv.fold.performance <- data.frame()

    for(j in 1:cv.n.folds) {
      time.start <- handyTimer()
      # Fit model to the training set
      surv.model.fit <-
        survivalFit(
          surv.predict,
          COHORT.cv[-cv.folds[[j]],],
          model.type = 'rfsrc',
          n.trees = cv.vars$n.trees[i],
          split.rule = split.rule,
          n.threads = n.threads,
          nsplit = cv.vars$n.splits[i],
          nimpute = cv.vars$n.imputations[i],
          na.action = 'na.impute'
        )
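      # Note on the arguments above: survivalFit is a project wrapper, and with
      # model.type = 'rfsrc' these settings presumably pass through to
      # randomForestSRC::rfsrc(), where nsplit is the number of random split
      # points tried per variable (0 = deterministic splitting on all values),
      # nimpute is the number of iterations of the internal missing-data
      # imputation, and na.action = 'na.impute' imputes missing values while
      # growing the forest.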
|
|
      time.learn <- handyTimer(time.start)

      time.start <- handyTimer()
      # Get C-indices for training and validation sets
      c.index.train <-
        cIndex(
          surv.model.fit, COHORT.cv[-cv.folds[[j]],],
          na.action = 'na.impute'
        )
      c.index.val <-
        cIndex(
          surv.model.fit, COHORT.cv[cv.folds[[j]],],
          na.action = 'na.impute'
        )
      time.predict <- handyTimer(time.start)

      # Append the stats we've obtained from this fold
      cv.fold.performance <-
        rbind(
          cv.fold.performance,
          data.frame(
            calibration = i,
            cv.fold = j,
            n.trees = cv.vars$n.trees[i],
            n.splits = cv.vars$n.splits[i],
            n.imputations = cv.vars$n.imputations[i],
            c.index.train,
            c.index.val,
            time.learn,
            time.predict
          )
        )

    } # End cross-validation loop (j)

    # rbind the performance by fold
    cv.performance <-
      rbind(
        cv.performance,
        cv.fold.performance
      )

    # Save output at the end of each loop
    write.csv(cv.performance, calibration.filename)

  } # End calibration loop (i)

} else { # If we did previously calibrate, load it
  cv.performance <- read.csv(calibration.filename)
}

# Find the best calibration...
# First, average performance across cross-validation folds
cv.performance.average <-
  aggregate(
    c.index.val ~ calibration,
    data = cv.performance,
    mean
  )
# Find the highest value
best.calibration <-
  cv.performance.average$calibration[
    which.max(cv.performance.average$c.index.val)
  ]
# And finally, find the first row of that calibration to get the n.trees,
# n.splits and n.imputations values
best.calibration.row1 <-
  min(which(cv.performance$calibration == best.calibration))
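
# Illustrative sketch, not part of the original pipeline: ggrepel is loaded
# above but not used elsewhere in this file, so one plausible use is a quick
# labelled plot of the calibration results. Wrapped in if(FALSE) so it never
# runs as part of the analysis; it only assumes the cv.performance.average
# columns created above.
if(FALSE) {
  require(ggplot2)
  print(
    ggplot(cv.performance.average, aes(x = calibration, y = c.index.val)) +
      geom_point() +
      ggrepel::geom_text_repel(aes(label = calibration))
  )
}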
|
|
# Prep the data to fit and test with
COHORT.prep <-
  prepData(
    # This time, prepare the full dataset; training and test sets are selected
    # by indexing with test.set below
    COHORT.use,
    cols.keep,
    process.settings,
    surv.time, surv.event,
    surv.event.yes,
    extra.fun = caliberExtraPrep
  )

# Finally, add missing flag columns, but leave the missing data intact because
# rfsrc can do on-the-fly imputation
COHORT.prep <- prepCoxMissing(COHORT.prep, missingReplace = NA)

# Add on those column names we just created
surv.predict <-
  c(surv.predict, names(COHORT.prep)[grepl('_missing', names(COHORT.prep))])

#' ## Fit the final model
#'
#' This may take some time, so we'll cache it if possible...

#+ fit_final_model
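
# Note: lines beginning #' and #+ are knitr::spin markup; #' lines become prose
# and #+ lines set chunk options when this script is rendered to a report, and
# plain R treats them as ordinary comments.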
|
|
surv.model.fit <-
  survivalFit(
    surv.predict,
    COHORT.prep[-test.set,],
    model.type = 'rfsrc',
    n.trees = cv.performance[best.calibration.row1, 'n.trees'],
    split.rule = split.rule,
    n.threads = n.threads,
    nsplit = cv.performance[best.calibration.row1, 'n.splits'],
    nimpute = cv.performance[best.calibration.row1, 'n.imputations'],
    na.action = 'na.impute'
  )
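
# Note: survivalFitBoot and bootStatsDf below are project helpers (not defined
# in this file). From the arguments and how the result is used, survivalFitBoot
# appears to refit the model on bootstrap resamples of the training set,
# evaluate on the test set, and write per-bootstrap statistics to the
# '-boot-all.csv' file; bootStatsDf then summarises them into point estimates
# with lower/upper bounds.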
|
|
surv.model.params.boot <-
  survivalFitBoot(
    surv.predict,
    COHORT.prep[-test.set,], # Training set
    COHORT.prep[test.set,],  # Test set
    model.type = 'rfsrc',
    n.threads = n.threads,
    bootstraps = bootstraps,
    n.trees = cv.performance[best.calibration.row1, 'n.trees'],
    split.rule = split.rule,
    nsplit = cv.performance[best.calibration.row1, 'n.splits'],
    nimpute = cv.performance[best.calibration.row1, 'n.imputations'],
    na.action = 'na.impute',
    filename = paste0(output.filename.base, '-boot-all.csv')
  )

# Summarise the bootstrapped statistics (C-index, calibration score) across
# bootstraps
surv.model.fit.coeffs <- bootStatsDf(surv.model.params.boot)

# Save them to the all-models comparison table
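# Note: varsToTable is another project helper (assumed); it appears to insert
# or update this model's row in performance.file, keyed by the index.cols, so
# the rfsrc results can be compared with the project's other models.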
|
|
varsToTable(
  data.frame(
    model = 'rfsrc',
    imputation = FALSE,
    discretised = FALSE,
    c.index = surv.model.fit.coeffs['c.index', 'val'],
    c.index.lower = surv.model.fit.coeffs['c.index', 'lower'],
    c.index.upper = surv.model.fit.coeffs['c.index', 'upper'],
    calibration.score = surv.model.fit.coeffs['calibration.score', 'val'],
    calibration.score.lower =
      surv.model.fit.coeffs['calibration.score', 'lower'],
    calibration.score.upper =
      surv.model.fit.coeffs['calibration.score', 'upper']
  ),
  performance.file,
  index.cols = c('model', 'imputation', 'discretised')
)

write.csv(
  surv.model.fit.coeffs,
  paste0(output.filename.base, '-boot-summary.csv')
)