#+ knitr_setup, include = FALSE
# Whether to cache the intensive code sections. Set to FALSE to recalculate
# everything afresh.
cacheoption <- TRUE
# Disable lazy caching globally, because it fails for large objects, and all the
# objects we wish to cache are large...
opts_chunk$set(cache.lazy = FALSE)
#' # Variable selection in data-driven health records
#'
#' Having extracted around 600 variables which occur most frequently in patient
#' records, let's try to narrow these down using the methodology of varSelRF,
#' with a couple of custom additions: potentially averaging multiple variable
#' importance calculations at the outset, and cross-validation to choose the
#' best number of variables at the end.
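#'
#' As a rough illustration only (this is not the project's `getVarNums()`
#' helper, whose exact schedule I haven't checked), a varSelRF-style sequence
#' of candidate variable counts could be produced by repeatedly dropping
#' `vars.drop.frac` of the remaining variables:
#'
#' ```r
#' varNumSchedule <- function(n.vars, drop.frac = 0.2, min.vars = 2) {
#'   sizes <- n.vars
#'   while (tail(sizes, 1) > min.vars) {
#'     sizes <- c(sizes, max(min.vars, floor(tail(sizes, 1) * (1 - drop.frac))))
#'   }
#'   unique(sizes)
#' }
#' varNumSchedule(600)  # 600, 480, 384, 307, ..., 3, 2
#' ```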
#'
#' ## User variables
#'
#+ user_variables
output.filename.base <- '../../output/rf-bigdata-try12-varselrf2'
nsplit <- 20
n.trees.initial <- 500
n.forests.initial <- 5
n.trees.cv <- 500
n.trees.final <- 2000
split.rule <- 'logrank'
n.imputations <- 3
cv.n.folds <- 3
vars.drop.frac <- 0.2 # Fraction of variables to drop at each iteration
bootstraps <- 200
n.data <- NA # Number of rows to sample after exclusions in prep; NA = use all
n.threads <- 20
#' ## Data set-up
#'
#+ data_setup
data.filename.big <- '../../data/cohort-datadriven-02.csv'
surv.predict.old <- c('age', 'smokstatus', 'imd_score', 'gender')
untransformed.vars <- c('time_death', 'endpoint_death', 'exclude')
source('../lib/shared.R')
require(xtable)
# Define these after shared.R or they will be overwritten!
exclude.vars <-
c(
# Entity type 4 is smoking status, which we already have
"clinical.values.4_data1", "clinical.values.4_data5",
"clinical.values.4_data6",
    # Entity 13 data2 is the patient's weight centile; not a single one is
    # entered, but missing values come out as 0 rather than NA, so the
    # algorithm, which looks for NAs, thinks it's a useful column
"clinical.values.13_data2",
    # Entities 148 and 149 are to do with death certification. I'm not sure how
    # they made it into the dataset, but since all the data points here look
    # back in time, they're all NA, which causes rfsrc to fail.
"clinical.values.148_data1", "clinical.values.148_data2",
"clinical.values.148_data3", "clinical.values.148_data4",
"clinical.values.148_data5",
"clinical.values.149_data1", "clinical.values.149_data2"
)
COHORT <- fread(data.filename.big)
l2df <- function(x) {
  # Turn a list of named vectors into a data frame so we can compare variable
  # importances across iterations of the initial forests
  var.names <- unique(unlist(lapply(x, names)))
  df <-
    data.frame(
      matrix(
        unlist(lapply(x, FUN = function(v) { v[var.names] })),
        ncol = length(x), byrow = FALSE
      )
    )
  rownames(df) <- var.names
  df
}
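# For illustration (not run): given list(c(a = 1, b = 2), c(b = 3, c = 4)),
# l2df returns a data frame with one row per variable and one column per list
# element, with NA wherever a variable is absent from that element:
#      X1 X2
#   a   1 NA
#   b   2  3
#   c  NA  4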
bigdata.prefixes <-
c(
'hes.icd.',
'hes.opcs.',
'tests.enttype.',
'clinical.history.',
'clinical.values.',
'bnf.'
)
bigdata.columns <-
colnames(COHORT)[
which(
      # Does it start with one of the big-data column prefixes?
startsWithAny(names(COHORT), bigdata.prefixes) &
# And it's not one of the columns we want to exclude?
!(colnames(COHORT) %in% exclude.vars)
)
]
COHORT.bigdata <-
COHORT[, c(
untransformed.vars, surv.predict.old, bigdata.columns
),
with = FALSE
]
# Deal appropriately with missing data
# Most of the variables are number of days since the first record of that type
time.based.vars <-
names(COHORT.bigdata)[
startsWithAny(
names(COHORT.bigdata),
c('hes.icd.', 'hes.opcs.', 'clinical.history.')
)
]
# We're treating these as logicals, so we want non-NA values to be TRUE,
# i.e. there is something in the history
for (j in time.based.vars) {
set(COHORT.bigdata, j = j, value = !is.na(COHORT.bigdata[[j]]))
}
# Again, taking this as a logical, set any non-NA value to TRUE.
prescriptions.vars <- names(COHORT.bigdata)[startsWith(names(COHORT.bigdata), 'bnf.')]
for (j in prescriptions.vars) {
set(COHORT.bigdata, j = j, value = !is.na(COHORT.bigdata[[j]]))
}
# This leaves tests and clinical.values, which are test results and should be
# imputed.
# Manually fix clinical values items...
#
# "clinical.values.1_data1" "clinical.values.1_data2"
# These are just blood pressure values...fine to impute
#
# "clinical.values.13_data1" "clinical.values.13_data3"
# These are weight and BMI...also fine to impute
#
# Entity 5 is alcohol consumption status, 1 = Yes, 2 = No, 3 = Ex, so should be
# a factor, and NA can be a factor level
COHORT.bigdata$clinical.values.5_data1 <-
factorNAfix(factor(COHORT.bigdata$clinical.values.5_data1), NAval = 'missing')
# Both gender and smokstatus are factors...fix that
COHORT.bigdata$gender <- factor(COHORT.bigdata$gender)
COHORT.bigdata$smokstatus <-
factorNAfix(factor(COHORT.bigdata$smokstatus), NAval = 'missing')
# Exclude invalid patients
COHORT.bigdata <- COHORT.bigdata[!COHORT.bigdata$exclude]
COHORT.bigdata$exclude <- NULL
COHORT.bigdata <-
prepSurvCol(data.frame(COHORT.bigdata), 'time_death', 'endpoint_death', 'Death')
# If n.data was specified, trim the data table down to size
if(!is.na(n.data)) {
COHORT.bigdata <- sample.df(COHORT.bigdata, n.data)
}
# Define test set
test.set <- testSetIndices(COHORT.bigdata, random.seed = 78361)
# Start by predicting survival with all the variables provided
surv.predict <- c(surv.predict.old, bigdata.columns)
# Set up a csv file to store calibration data, or retrieve previous data
calibration.filename <- paste0(output.filename.base, '-varselcalibration.csv')
#' ## Run random forest calibration
#'
#' If there's not already a calibration file, we run the varSelRF-style
#' methodology:
#'
#' 1. Fit several forests to the full set of variables on the training set,
#'    and average their variable importances.
#' 2. Cross-validate as the number of most important variables kept is
#'    progressively reduced.
#'
#' (If there is already a calibration file, just load the previous work.)
#'
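#' The per-fold splits below come from the `cvFolds()` helper (presumably from
#' `../lib/shared.R`; I haven't checked its implementation). A minimal sketch
#' of the behaviour assumed here, returning a list of held-out row indices for
#' each of k folds, would be:
#'
#' ```r
#' cvFoldsSketch <- function(n.rows, k) {
#'   # Randomly assign each row to one of k roughly equal folds...
#'   fold.id <- sample(rep(1:k, length.out = n.rows))
#'   # ...and return, per fold, the row indices held out for validation
#'   lapply(1:k, function(i) which(fold.id == i))
#' }
#' ```
#'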
#+ rf_var_sel_calibration
# If we've not already done a calibration, then do one
if(!file.exists(calibration.filename)) {
# Start by fitting several forests to all the variables
vimps.initial <- list()
time.fit <- c()
time.vimp <- c()
# If we haven't already made the initial forests...
if(!file.exists(
paste0(output.filename.base,'-varimp-', n.forests.initial,'.rds'))
) {
for(i in 1:n.forests.initial) {
time.start <- handyTimer()
surv.model.fit.full <-
survivalFit(
surv.predict,
COHORT.bigdata[-test.set,],
model.type = 'rfsrc',
n.trees = n.trees.initial,
split.rule = split.rule,
n.threads = n.threads,
nimpute = 3,
nsplit = nsplit,
na.action = 'na.impute'
)
time.fit <- c(time.fit, handyTimer(time.start))
# Save the model
saveRDS(
surv.model.fit.full,
paste0(output.filename.base,'-initialmodel-', i,'.rds')
)
# Calculate variable importance
time.start <- handyTimer()
var.imp <- vimp(surv.model.fit.full, importance = 'permute.ensemble')
time.vimp <- c(time.vimp, handyTimer(time.start))
# Save it
saveRDS(var.imp, paste0(output.filename.base, '-varimp-', i,'.rds'))
# Make a vector of variable importances and append to the list
vimps.initial[[i]] <- sort(var.imp$importance, decreasing = TRUE)
}
    cat('Total initial vimp time = ', sum(time.fit) + sum(time.vimp), '\n')
    cat('Average fit time = ', mean(time.fit), '\n')
    cat('Average vimp time = ', mean(time.vimp), '\n')
} else {
# If we already made the initial forests, just load them
for(i in 1:n.forests.initial) {
var.imp <- readRDS(paste0(output.filename.base, '-varimp-', i,'.rds'))
vimps.initial[[i]] <- sort(var.imp$importance, decreasing = TRUE)
}
}
  # Convert the vimps.initial list to a data frame: one row per variable, one
  # column per initial forest
vimps.initial <- l2df(vimps.initial)
  # Average each row (variable) across the forests to get a single importance
var.importances <- sort(apply(vimps.initial, 1, mean), decreasing = TRUE)
# Save the result
saveRDS(var.importances, paste0(output.filename.base, '-varimp.rds'))
# Create an empty data frame to aggregate stats per fold
cv.performance <- data.frame()
# Cross-validate over number of variables to try
cv.vars <- getVarNums(length(var.importances))
COHORT.cv <- COHORT.bigdata[-test.set, ]
  # Run cross-validations. No need to parallelise explicitly because rfsrc is
  # already parallelised
for(i in 1:length(cv.vars)) {
# Get the subset of most important variables to use
surv.predict.partial <- names(var.importances)[1:cv.vars[i]]
# Get folds for cross-validation
cv.folds <- cvFolds(nrow(COHORT.cv), cv.n.folds)
cv.fold.performance <- data.frame()
for(j in 1:cv.n.folds) {
time.start <- handyTimer()
# Fit model to the training set
surv.model.fit <-
survivalFit(
surv.predict.partial,
COHORT.cv[-cv.folds[[j]],],
model.type = 'rfsrc',
n.trees = n.trees.cv,
split.rule = split.rule,
n.threads = n.threads,
nsplit = nsplit,
nimpute = n.imputations,
na.action = 'na.impute'
)
time.learn <- handyTimer(time.start)
time.start <- handyTimer()
# Get C-index on validation set
c.index.val <-
cIndex(
surv.model.fit, COHORT.cv[cv.folds[[j]],],
na.action = 'na.impute'
)
time.c.index <- handyTimer(time.start)
time.start <- handyTimer()
      # Get calibration score on the validation set
calibration.score <-
calibrationScore(
calibrationTable(
surv.model.fit, COHORT.cv[cv.folds[[j]],], na.action = 'na.impute'
)
)
time.calibration <- handyTimer(time.start)
# Append the stats we've obtained from this fold
cv.fold.performance <-
rbind(
cv.fold.performance,
data.frame(
calibration = i,
cv.fold = j,
n.vars = cv.vars[i],
c.index.val,
calibration.score,
time.learn,
time.c.index,
time.calibration
)
)
} # End cross-validation loop (j)
# rbind the performance by fold
cv.performance <-
rbind(
cv.performance,
cv.fold.performance
)
# Save output at the end of each loop
write.csv(cv.performance, calibration.filename)
} # End calibration loop (i)
} else {
cv.performance <- read.csv(calibration.filename)
var.importances <- readRDS(paste0(output.filename.base, '-varimp.rds'))
}
#' ## Find the best model from the calibrations
#'
#' ### Plot model performance
#'
#+ model_performance
# Find the best calibration...
# First, average performance across cross-validation folds
cv.performance.average <-
aggregate(
c.index.val ~ n.vars,
data = cv.performance,
mean
)
cv.calibration.average <-
aggregate(
area ~ n.vars,
data = cv.performance,
mean
)
ggplot(cv.performance.average, aes(x = n.vars, y = c.index.val)) +
geom_line() +
geom_point(data = cv.performance) +
ggtitle(label = 'C-index by n.vars')
ggplot(cv.calibration.average, aes(x = n.vars, y = area)) +
geom_line() +
geom_point(data = cv.performance) +
ggtitle(label = 'Calibration performance by n.vars')
# Find the number of variables giving the highest average C-index
n.vars <-
cv.performance.average$n.vars[
which.max(cv.performance.average$c.index.val)
]
# Use that many of the most important variables for the final model
surv.predict.partial <- names(var.importances)[1:n.vars]
#' ## Best model
#'
#' The best model contained `r n.vars` variables. Let's see what those were...
#'
#+ variables_used
vars.df <-
data.frame(
vars = surv.predict.partial
)
vars.df$descriptions <- lookUpDescriptions(surv.predict.partial)
vars.df$vimp <- var.importances[1:n.vars]/max(var.importances)
missingness <- sapply(COHORT, percentMissing)
vars.df$missingness <- missingness[names(var.importances)[1:n.vars]]
#+ variables_table, results='asis'
print(
xtable(vars.df),
type = 'html',
include.rownames = FALSE
)
#' ## Perform the final fit
#'
#' Having found the best number of variables by cross-validation, let's perform
#' the final fit with the full training set and `r n.trees.final` trees.
#'
#+ final_fit
time.start <- handyTimer()
surv.model.fit.final <-
survivalFit(
surv.predict.partial,
COHORT.bigdata[-test.set,],
model.type = 'rfsrc',
n.trees = n.trees.final,
split.rule = split.rule,
n.threads = n.threads,
nimpute = 3,
nsplit = nsplit,
na.action = 'na.impute'
)
time.fit.final <- handyTimer(time.start)
saveRDS(surv.model.fit.final, paste0(output.filename.base, '-finalmodel.rds'))
#' Final model of `r n.trees.final` trees fitted in `r round(time.fit.final)`
#' seconds!
#'
#' Also bootstrap this final fitting stage. A fully proper bootstrap would
#' iterate over the whole model-building process including variable selection,
#' but that would be prohibitive in terms of computational time.
#'
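#' The statistic passed to `boot()` below is the project's
#' `bootstrapFitRfsrc()` (presumably defined in `../lib/shared.R`). I haven't
#' verified its implementation; a minimal sketch of the kind of function
#' `boot()` expects here, refitting the forest on each resample and returning
#' the quantities we want intervals for, might look like this (names and
#' details are assumptions):
#'
#' ```r
#' bootstrapFitRfsrcSketch <- function(data, indices, formula, n.trees,
#'                                     test.data, ...) {
#'   # Refit the forest on this bootstrap resample of the training data
#'   fit <- randomForestSRC::rfsrc(formula, data[indices, ], ntree = n.trees, ...)
#'   # Return the statistics whose uncertainty we want, as a named vector
#'   c(
#'     c.train = cIndex(fit, data[indices, ], na.action = 'na.impute'),
#'     c.test = cIndex(fit, test.data, na.action = 'na.impute'),
#'     calibration.score =
#'       calibrationScore(
#'         calibrationTable(fit, test.data, na.action = 'na.impute')
#'       )[['area']]
#'   )
#' }
#' ```
#'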
#+ bootstrap_final
surv.model.fit.boot <-
survivalBootstrap(
surv.predict.partial,
COHORT.bigdata[-test.set,], # Training set
COHORT.bigdata[test.set,], # Test set
model.type = 'rfsrc',
n.trees = n.trees.final,
split.rule = split.rule,
n.threads = n.threads,
nimpute = 3,
nsplit = nsplit,
na.action = 'na.impute',
bootstraps = bootstraps
)
COHORT.bigdata$surv_time_round <- round_any(COHORT.bigdata$surv_time, tod.round)
# Create a survival formula with the provided variable names...
surv.formula <-
formula(
paste0(
# Survival object made in-formula
'Surv(surv_time_round, surv_event) ~ ',
# Predictor variables then make up the other side
paste(surv.predict.partial, collapse = '+')
)
)
# randomForestSRC, if installed correctly, controls threading via the rf.cores
# option (or the RF_CORES environment variable)
options(rf.cores = n.threads)
surv.model.fit.boot <-
boot(
formula = surv.formula,
data = COHORT.bigdata[-test.set, ],
statistic = bootstrapFitRfsrc,
R = bootstraps,
    parallel = 'no',
    ncpus = 1, # boot's own parallelism is off; rfsrc already runs in parallel
    n.trees = n.trees.final,
    test.data = COHORT.bigdata[test.set, ],
    # boot() requires extra arguments to be passed by name rather than through
    # ..., so the fits will fail unless these three are explicitly specified
    # in this call.
    nimpute = n.imputations,
    nsplit = nsplit,
    na.action = 'na.impute'
)
# Summarise the bootstrapped statistics with confidence intervals
surv.model.fit.coeffs <- bootStats(surv.model.fit.boot, uncertainty = '95ci')
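#' `bootStats()` with `uncertainty = '95ci'` is assumed to summarise each
#' bootstrapped statistic as its value on the original data plus a percentile
#' 95% confidence interval; a minimal sketch of that summary (the real helper,
#' presumably in `../lib/shared.R`, may differ) is:
#'
#' ```r
#' bootStatsSketch <- function(boot.out) {
#'   data.frame(
#'     val = boot.out$t0,
#'     lower = apply(boot.out$t, 2, quantile, probs = 0.025, na.rm = TRUE),
#'     upper = apply(boot.out$t, 2, quantile, probs = 0.975, na.rm = TRUE),
#'     row.names = names(boot.out$t0)
#'   )
#' }
#' ```
#'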
#' ## Performance
#'
#' ### C-index
#'
#' C-indices are **`r round(surv.model.fit.coeffs['c.train', 'val'], 3)`
#' (`r round(surv.model.fit.coeffs['c.train', 'lower'], 3)` -
#' `r round(surv.model.fit.coeffs['c.train', 'upper'], 3)`)**
#' on the training set and
#' **`r round(surv.model.fit.coeffs['c.test', 'val'], 3)`
#' (`r round(surv.model.fit.coeffs['c.test', 'lower'], 3)` -
#' `r round(surv.model.fit.coeffs['c.test', 'upper'], 3)`)** on the test set.
#'
#'
#' ### Calibration
#'
#' The bootstrapped calibration score is
#' **`r round(surv.model.fit.coeffs['calibration.score', 'val'], 3)`
#' (`r round(surv.model.fit.coeffs['calibration.score', 'lower'], 3)` -
#' `r round(surv.model.fit.coeffs['calibration.score', 'upper'], 3)`)**.
#'
#' Let's draw a representative curve from the unbootstrapped fit... (It would be
#' better to draw all the curves from the bootstrap fit to get an idea of
#' variability, but I've not implemented this yet.)
#'
#+ calibration_plot
calibration.table <-
calibrationTable(
# Standard calibration options
surv.model.fit.final, COHORT.bigdata[test.set,],
# Always need to specify NA imputation for rfsrc
na.action = 'na.impute'
)
calibration.score <- calibrationScore(calibration.table)
calibrationPlot(calibration.table)
#' The area between the calibration curve and the diagonal is
#' **`r round(calibration.score[['area']], 3)`** +/-
#' **`r round(calibration.score[['se']], 3)`**.
#'
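#' For reference, one plausible way to arrive at an area figure like the one
#' quoted above (the project's `calibrationScore()` may well compute it
#' differently) is to integrate the absolute distance between the calibration
#' curve and the diagonal with the trapezium rule:
#'
#' ```r
#' areaBetweenCurveAndDiagonal <- function(risk.predicted, risk.observed) {
#'   # Order points along the predicted-risk axis
#'   o <- order(risk.predicted)
#'   x <- risk.predicted[o]
#'   # Vertical distance between the calibration curve and the diagonal
#'   y <- abs(risk.observed[o] - x)
#'   # Trapezium rule to integrate that distance over predicted risk
#'   sum(diff(x) * (head(y, -1) + tail(y, -1)) / 2)
#' }
#' ```
#'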
#+ save_results
# Save performance results
varsToTable(
data.frame(
model = 'rf-varselrf2',
imputation = FALSE,
discretised = FALSE,
c.index = surv.model.fit.coeffs['c.train', 'val'],
c.index.lower = surv.model.fit.coeffs['c.train', 'lower'],
c.index.upper = surv.model.fit.coeffs['c.train', 'upper'],
calibration.score = surv.model.fit.coeffs['calibration.score', 'val'],
calibration.score.lower =
surv.model.fit.coeffs['calibration.score', 'lower'],
calibration.score.upper =
surv.model.fit.coeffs['calibration.score', 'upper']
),
performance.file,
index.cols = c('model', 'imputation', 'discretised')
)