bootstraps <- 20        # Number of bootstrap resamples for the final model
split.rule <- 'logrank' # Splitting rule passed through to rfsrc
n.threads <- 16         # Number of parallel threads for rfsrc
# Cross-validation variables
ns.splits <- 0:20              # Candidate nsplit values (0 = deterministic splitting in rfsrc)
ns.trees <- c(500, 1000, 2000) # Candidate numbers of trees
ns.imputations <- 1:3          # Candidate numbers of imputation iterations
cv.n.folds <- 3                # Number of cross-validation folds
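# For reference, report how many forests the calibration below will fit: the
# full settings grid times the number of cross-validation folds
cat(
'Calibration will fit',
length(ns.splits) * length(ns.trees) * length(ns.imputations) * cv.n.folds,
'forests in total\n'
)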
n.data <- NA # NA = use the full dataset; some rows may still be excluded in prep
calibration.filename <- paste0(output.filename.base, '-calibration.csv')
continuous.vars <-
c(
'age', 'total_chol_6mo', 'hdl_6mo', 'pulse_6mo', 'crea_6mo',
'total_wbc_6mo', 'haemoglobin_6mo'
)
untransformed.vars <- c('anonpatid', 'surv_time', 'imd_score', 'exclude')
source('../lib/shared.R')
require(ggrepel)
# Load the data and convert to data frame to make column-selecting code in
# prepData simpler
COHORT.full <- data.frame(fread(data.filename))
# If n.data was specified, take a random subset of that size; otherwise use
# all the data
if(!is.na(n.data)){
COHORT.use <- sample.df(COHORT.full, n.data)
} else {
COHORT.use <- COHORT.full
}
rm(COHORT.full)
# We now need a quick null preparation of the data to get its length (some rows
# may be excluded during preparation)
COHORT.prep <-
prepData(
COHORT.use,
cols.keep, discretise.settings, surv.time, surv.event,
surv.event.yes, extra.fun = caliberExtraPrep, n.keep = n.data
)
n.data <- nrow(COHORT.prep)
# Define indices of test set
test.set <- testSetIndices(COHORT.prep, random.seed = 78361)
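# testSetIndices comes from ../lib/shared.R; the fixed random.seed makes the
# train/test split reproducible between runs. A minimal sketch of the idea
# (illustrative only; the real helper may differ, e.g. in the test fraction):
#   set.seed(random.seed); test.set <- sample(nrow(df), round(nrow(df) / 3))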
# Process settings: leave every variable untransformed (method = NA for each
# column, so prepData applies no discretisation or other transformation)
process.settings <-
list(
var = c(untransformed.vars, continuous.vars),
method = rep(NA, length(untransformed.vars) + length(continuous.vars)),
settings = rep(NA, length(untransformed.vars) + length(continuous.vars))
)
# If we've not already done a calibration, then do one
if(!file.exists(calibration.filename)) {
# Create an empty data frame to aggregate stats per fold
cv.performance <- data.frame()
# Items to cross-validate over
cv.vars <- expand.grid(ns.splits, ns.trees, ns.imputations)
names(cv.vars) <- c('n.splits', 'n.trees', 'n.imputations')
# Prep the data (we're not cross-validating over data preparation, so this can
# be done once, before the loop)
COHORT.cv <-
prepData(
# Data for cross-validation excludes test set
COHORT.use[-test.set, ],
cols.keep,
process.settings,
surv.time, surv.event,
surv.event.yes,
extra.fun = caliberExtraPrep
)
# Finally, add missing flag columns, but leave the missing data intact because
# rfsrc can do on-the-fly imputation
COHORT.cv <- prepCoxMissing(COHORT.cv, missingReplace = NA)
# Add on those column names we just created
surv.predict <-
union(surv.predict, names(COHORT.cv)[grepl('_missing', names(COHORT.cv))])
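# The flag columns are assumed to be logical indicators, one per variable with
# missing values; a minimal sketch of the idea (prepCoxMissing in
# ../lib/shared.R is the canonical implementation):
#   for(v in continuous.vars)
#     COHORT.cv[[paste0(v, '_missing')]] <- is.na(COHORT.cv[[v]])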
# Run cross-validations. No need to parallelise this loop because rfsrc is
# itself parallelised
for(i in 1:nrow(cv.vars)) {
cat('Calibration', i, 'of', nrow(cv.vars), '...\n')
# Get folds for cross-validation
cv.folds <- cvFolds(nrow(COHORT.cv), cv.n.folds)
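# cvFolds (from ../lib/shared.R) is assumed to return a list of cv.n.folds
# disjoint index vectors covering every row; a minimal sketch of the idea:
#   split(sample(nrow(COHORT.cv)), rep(1:cv.n.folds, length.out = nrow(COHORT.cv)))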
cv.fold.performance <- data.frame()
for(j in 1:cv.n.folds) {
time.start <- handyTimer()
# Fit model to the training set
surv.model.fit <-
survivalFit(
surv.predict,
COHORT.cv[-cv.folds[[j]],],
model.type = 'rfsrc',
n.trees = cv.vars$n.trees[i],
split.rule = split.rule,
n.threads = n.threads,
nsplit = cv.vars$n.splits[i],
nimpute = cv.vars$n.imputations[i],
na.action = 'na.impute'
)
time.learn <- handyTimer(time.start)
time.start <- handyTimer()
# Get C-indices for training and validation sets
c.index.train <-
cIndex(
surv.model.fit, COHORT.cv[-cv.folds[[j]],],
na.action = 'na.impute'
)
c.index.val <-
cIndex(
surv.model.fit, COHORT.cv[cv.folds[[j]],],
na.action = 'na.impute'
)
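# (The C-index is the proportion of comparable patient pairs for which the
# model assigns the higher risk to the patient who has the event first; 0.5
# is chance performance and 1 is perfect discrimination.)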
time.predict <- handyTimer(time.start)
# Append the stats we've obtained from this fold
cv.fold.performance <-
rbind(
cv.fold.performance,
data.frame(
calibration = i,
cv.fold = j,
n.trees = cv.vars$n.trees[i],
n.splits = cv.vars$n.splits[i],
n.imputations = cv.vars$n.imputations[i],
c.index.train,
c.index.val,
time.learn,
time.predict
)
)
} # End cross-validation loop (j)
# Append this calibration's per-fold performance to the running total
cv.performance <-
rbind(
cv.performance,
cv.fold.performance
)
# Save the cumulative output at the end of each calibration
write.csv(cv.performance, calibration.filename, row.names = FALSE)
} # End calibration loop (i)
} else { # If we did previously calibrate, load it
cv.performance <- read.csv(calibration.filename)
}
# Find the best calibration...
# First, average performance across cross-validation folds
cv.performance.average <-
aggregate(
c.index.val ~ calibration,
data = cv.performance,
mean
)
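# (The formula c.index.val ~ calibration gives one row per calibration,
# containing the mean validation C-index across its cv.n.folds folds)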
# Find the highest value
best.calibration <-
cv.performance.average$calibration[
which.max(cv.performance.average$c.index.val)
]
# And finally, find the first row of that calibration to get its
# hyperparameter values (n.trees, n.splits and n.imputations)
best.calibration.row1 <-
min(which(cv.performance$calibration == best.calibration))
# Prep the data to fit and test with
COHORT.prep <-
prepData(
# All the data this time; the test set is excluded at fitting time below
COHORT.use,
cols.keep,
process.settings,
surv.time, surv.event,
surv.event.yes,
extra.fun = caliberExtraPrep
)
# Finally, add missing flag columns, but leave the missing data intact because
# rfsrc can do on-the-fly imputation
COHORT.prep <- prepCoxMissing(COHORT.prep, missingReplace = NA)
# Add on those column names we just created (union avoids duplicating them if
# the calibration branch above already appended them to surv.predict)
surv.predict <-
union(surv.predict, names(COHORT.prep)[grepl('_missing', names(COHORT.prep))])
#' ## Fit the final model
#'
#' This may take some time, so we'll cache it if possible...
#+ fit_final_model
surv.model.fit <-
survivalFit(
surv.predict,
COHORT.prep[-test.set,],
model.type = 'rfsrc',
n.trees = cv.performance[best.calibration.row1, 'n.trees'],
split.rule = split.rule,
n.threads = n.threads,
nsplit = cv.performance[best.calibration.row1, 'n.splits'],
nimpute = cv.performance[best.calibration.row1, 'n.imputations'],
na.action = 'na.impute'
)
surv.model.params.boot <-
survivalFitBoot(
surv.predict,
COHORT.prep[-test.set,], # Training set
COHORT.prep[test.set,], # Test set
model.type = 'rfsrc',
n.threads = n.threads,
bootstraps = bootstraps,
n.trees = cv.performance[best.calibration.row1, 'n.trees'],
split.rule = split.rule,
nsplit = cv.performance[best.calibration.row1, 'n.splits'],
nimpute = cv.performance[best.calibration.row1, 'n.imputations'],
na.action = 'na.impute',
filename = paste0(output.filename.base, '-boot-all.csv')
)
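# survivalFitBoot is assumed to refit the model on `bootstraps` resamples of
# the training set, score each refit on the held-out test set, and write the
# per-bootstrap statistics to the '-boot-all.csv' file specified above.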
# Summarise the bootstrapped statistics (C-index, calibration score) as point
# estimates with confidence intervals
surv.model.fit.coeffs <- bootStatsDf(surv.model.params.boot)
# Save them to the all-models comparison table
varsToTable(
data.frame(
model = 'rfsrc',
imputation = FALSE,
discretised = FALSE,
c.index = surv.model.fit.coeffs['c.index', 'val'],
c.index.lower = surv.model.fit.coeffs['c.index', 'lower'],
c.index.upper = surv.model.fit.coeffs['c.index', 'upper'],
calibration.score = surv.model.fit.coeffs['calibration.score', 'val'],
calibration.score.lower =
surv.model.fit.coeffs['calibration.score', 'lower'],
calibration.score.upper =
surv.model.fit.coeffs['calibration.score', 'upper']
),
performance.file,
index.cols = c('model', 'imputation', 'discretised')
)
write.csv(
surv.model.fit.coeffs,
paste0(output.filename.base, '-boot-summary.csv')
)