# All model types are bootstrapped this many times
bootstraps <- 200
# n.trees is only relevant for random forest models
n.trees <- 500
# The following two variables are only relevant if model.type is 'ranger'
split.rule <- 'logrank'
n.threads <- 10
# Cross-validation variables
input.n.bins <- 10:20
cv.n.folds <- 3
n.calibrations <- 1000
n.data <- NA # Rows to draw from the full dataset; NA means use all. Further rows may be excluded in prep.
continuous.vars <-
c(
'age', 'total_chol_6mo', 'hdl_6mo', 'pulse_6mo', 'crea_6mo',
'total_wbc_6mo', 'haemoglobin_6mo'
)
source('shared.R')
library(ggrepel)
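# NB: helper functions used below (prepData, sample.df, testSetIndices,
# cvFolds, handyTimer, survivalFit, cIndex, calibrationScoreWrapper,
# initParallel, bootStatsDf) and settings such as data.filename, cols.keep,
# model.type and the surv.* variables are assumed to be defined in shared.R or
# a calling script; packages like data.table (for fread) and foreach are
# presumably loaded there too.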
# Load the data and convert to data frame to make column-selecting code in
# prepData simpler
COHORT.full <- data.frame(fread(data.filename))
# If n.data was specified, take a random subset of that size
if(!is.na(n.data)){
COHORT.use <- sample.df(COHORT.full, n.data)
} else {
# Otherwise, use all the data
COHORT.use <- COHORT.full
}
# Either way, the full cohort is no longer needed
rm(COHORT.full)
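# sample.df is defined in shared.R; a minimal sketch of the behaviour assumed
# here (random row sampling, optionally with replacement):
#   sample.df <- function(df, n, replace = FALSE) {
#     df[sample(nrow(df), n, replace = replace), ]
#   }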
# Do a quick null preparation of the data to establish its final length (some
# rows may be excluded during preparation)
COHORT.prep <-
prepData(
COHORT.use,
cols.keep, discretise.settings, surv.time, surv.event,
surv.event.yes, extra.fun = caliberExtraPrep, n.keep = n.data
)
n.data <- nrow(COHORT.prep)
# Define indices of test set
test.set <- testSetIndices(COHORT.prep, random.seed = 78361)
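# testSetIndices (shared.R) is assumed to return a reproducible random sample
# of row indices to hold out, along the lines of (test.fraction is a
# hypothetical parameter):
#   testSetIndices <- function(df, test.fraction = 1/3, random.seed = NA) {
#     if(!is.na(random.seed)) set.seed(random.seed)
#     sample(nrow(df), round(test.fraction * nrow(df)))
#   }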
# If we've not already done a calibration, then do one
if(!file.exists(calibration.filename)) {
# We can parallelise this bit with foreach, so set that up
initParallel(n.threads)
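# initParallel (shared.R) is assumed to register a parallel backend for
# foreach's %dopar% and return the cluster, roughly:
#   initParallel <- function(n.threads, backend = 'doParallel') {
#     cl <- parallel::makeCluster(n.threads)
#     doParallel::registerDoParallel(cl)
#     invisible(cl)
#   }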
# Run the calibration cross-validations in parallel
cv.performance <-
foreach(i = 1:n.calibrations, .combine = 'rbind') %dopar% {
cat('Calibration', i, '...\n')
# Reset process settings to the base settings
process.settings <-
list(
var = c('anonpatid', 'time_death', 'imd_score', 'exclude'),
method = c(NA, NA, NA, NA),
settings = list(NA, NA, NA, NA)
)
# Generate some random numbers of bins (and for n bins, you need n + 1 breaks)
n.bins <- sample(input.n.bins, length(continuous.vars), replace = TRUE) + 1
names(n.bins) <- continuous.vars
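# Worked example: with input.n.bins = 10:20, a variable which draws 15 bins
# gets n.bins[j] = 16, so seq(0, 1, length.out = 16) below yields 16 quantile
# breaks (0, 1/15, 2/15, ..., 1), i.e. 15 bins.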
# Go through each continuous variable, setting it to bin by quantile with its
# randomly drawn number of bins
for(j in 1:length(continuous.vars)) {
process.settings$var <- c(process.settings$var, continuous.vars[j])
process.settings$method <- c(process.settings$method, 'binByQuantile')
process.settings$settings <-
c(
process.settings$settings,
list(
seq(
# Quantiles run from 0 to 1
0, 1,
# n.bins[j] breaks (drawn at random above) give n.bins[j] - 1 bins
length.out = n.bins[j]
)
)
)
}
# prep the data given the variables provided
COHORT.cv <-
prepData(
# Data for cross-validation excludes test set
COHORT.use[-test.set, ],
cols.keep,
process.settings,
surv.time, surv.event,
surv.event.yes,
extra.fun = caliberExtraPrep
)
# Get folds for cross-validation
cv.folds <- cvFolds(nrow(COHORT.cv), cv.n.folds)
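# cvFolds (shared.R) is assumed to partition the row indices 1:n into
# cv.n.folds random index vectors, roughly:
#   cvFolds <- function(n, k) split(sample(n), rep(1:k, length.out = n))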
cv.fold.performance <- data.frame()
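# handyTimer (shared.R) is assumed to measure wall-clock seconds: called with
# no arguments it returns a start time, and with a start time it returns the
# elapsed difference, roughly:
#   handyTimer <- function(t0 = NULL) {
#     if(is.null(t0)) proc.time()['elapsed'] else proc.time()['elapsed'] - t0
#   }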
for(j in 1:cv.n.folds) {
time.start <- handyTimer()
# Fit model to the training set
surv.model.fit <-
survivalFit(
surv.predict,
COHORT.cv[-cv.folds[[j]],],
model.type = model.type,
n.trees = n.trees,
split.rule = split.rule
# n.threads not used because this is run in parallel
)
time.learn <- handyTimer(time.start)
time.start <- handyTimer()
# Get C-indices for training and validation sets
c.index.train <-
cIndex(
surv.model.fit, COHORT.cv[-cv.folds[[j]],], model.type = model.type
)
c.index.val <-
cIndex(
surv.model.fit, COHORT.cv[cv.folds[[j]],], model.type = model.type
)
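# cIndex (shared.R) presumably returns the concordance index (Harrell's C) of
# the model's risk predictions on the supplied data.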
time.predict <- handyTimer(time.start)
# Append the stats we've obtained from this fold
cv.fold.performance <-
rbind(
cv.fold.performance,
data.frame(
calibration = i,
cv.fold = j,
as.list(n.bins),
c.index.train,
c.index.val,
time.learn,
time.predict
)
)
} # End cross-validation loop (j)
# Return this calibration's per-fold performance for foreach to rbind
cv.fold.performance
} # End calibration loop (i)
# Save output at end of calibration
write.csv(cv.performance, calibration.filename)
} else { # If we did previously calibrate, load it
cv.performance <- read.csv(calibration.filename)
}
# Find the best calibration...
# First, average performance across cross-validation folds
cv.performance.average <-
aggregate(
c.index.val ~ calibration,
data = cv.performance,
mean
)
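# (The formula c.index.val ~ calibration yields one row per calibration,
# containing the mean validation C-index across its cv.n.folds folds.)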
# Find the calibration with the highest mean validation C-index
best.calibration <-
cv.performance.average$calibration[
which.max(cv.performance.average$c.index.val)
]
# And finally, find the first row of that calibration to get the n.bins values
best.calibration.row1 <-
min(which(cv.performance$calibration == best.calibration))
# Get its parameters
n.bins <-
t(
cv.performance[best.calibration.row1, continuous.vars]
)
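# (t() turns the selected row into a one-column matrix with the variable names
# as rownames, so n.bins[j] below indexes by position; note that the stored
# values are break counts, i.e. bins + 1, as saved during calibration.)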
# Prepare the data with those settings...
# Reset process settings to the base settings
process.settings <-
list(
var = c('anonpatid', 'time_death', 'imd_score', 'exclude'),
method = c(NA, NA, NA, NA),
settings = list(NA, NA, NA, NA)
)
for(j in 1:length(continuous.vars)) {
process.settings$var <- c(process.settings$var, continuous.vars[j])
process.settings$method <- c(process.settings$method, 'binByQuantile')
process.settings$settings <-
c(
process.settings$settings,
list(
seq(
# Quantiles run from 0 to 1
0, 1,
# Use the number of breaks from the best-performing calibration
length.out = n.bins[j]
)
)
)
}
# prep the data given the variables provided
COHORT.optimised <-
prepData(
# Use the full dataset here; the test set is excluded later, at model fitting
COHORT.use,
cols.keep,
process.settings,
surv.time, surv.event,
surv.event.yes,
extra.fun = caliberExtraPrep
)
#' ## Fit the final model
#'
#' This may take some time, so we'll cache it if possible...
#+ fit_final_model
# Fit to whole training set
surv.model.fit <-
survivalFit(
surv.predict,
COHORT.optimised[-test.set,], # Training set
model.type = model.type,
n.trees = n.trees,
split.rule = split.rule,
n.threads = n.threads
)
cl <- initParallel(n.threads, backend = 'doParallel')
surv.model.params.boot <-
foreach(
i = 1:bootstraps,
.combine = rbind,
.packages = c('survival'),
.verbose = TRUE
) %dopar% {
# Bootstrap-sampled training set
COHORT.boot <-
sample.df(
COHORT.optimised[-test.set,],
nrow(COHORT.optimised[-test.set,]),
replace = TRUE
)
surv.model.fit.i <-
survivalFit(
surv.predict,
COHORT.boot,
model.type = model.type,
n.trees = n.trees,
split.rule = split.rule,
# 1 thread, because we're parallelising the bootstrapping
n.threads = 1
)
# Work out other quantities of interest
#var.imp.vector <- bootstrapVarImp(surv.model.fit.i, COHORT.boot)
c.index <- cIndex(surv.model.fit.i, COHORT.optimised[test.set, ])
calibration.score <-
calibrationScoreWrapper(surv.model.fit.i, COHORT.optimised[test.set, ])
data.frame(
i,
t(coef(surv.model.fit.i)),
#t(var.imp.vector),
c.index,
calibration.score
)
}
# Save the bootstrapped parameter estimates
write.csv(surv.model.params.boot, paste0(output.filename.base, '-surv-boot.csv'))
# Tidy up by removing the cluster
stopCluster(cl)
surv.model.fit.coeffs <- bootStatsDf(surv.model.params.boot)
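# bootStatsDf (shared.R) is assumed to summarise the bootstrap replicates into
# per-column point estimates with bootstrap confidence intervals.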