#+ knitr_setup, include = FALSE
# Whether to cache the intensive code sections. Set to FALSE to recalculate
# everything afresh.
cacheoption <- TRUE
# Disable lazy caching globally, because it fails for large objects, and all the
# objects we wish to cache are large...
opts_chunk$set(cache.lazy = FALSE)
#' # Variable selection in data-driven health records
#'
#' Having extracted around 600 variables which occur most frequently in patient
#' records, let's try to narrow these down using a methodology based on
#' varSelRF combined with survival modelling. We'll quantify the predictive
#' ability of each variable by the p-value of a logrank test on the survival
#' curves of different categories within that variable, then iteratively
#' discard the least predictive variables, cross-validating to find the number
#' which gives optimum performance.
#'
#' ## User variables
#'
#+ user_variables
output.filename.base <- '../../output/rf-bigdata-varsellogrank-02'
nsplit <- 20
n.trees.cv <- 500
n.trees.final <- 500
split.rule <- 'logrank'
n.imputations <- 3
cv.n.folds <- 3
vars.drop.frac <- 0.2 # Fraction of variables to drop at each iteration
bootstraps <- 20
n.data <- NA # Number of rows to use; NA means all data (after exclusions in prep)
n.threads <- 40
#' ## Data set-up
#'
#+ data_setup
data.filename.big <- '../../data/cohort-datadriven-02.csv'
surv.predict.old <- c('age', 'smokstatus', 'imd_score', 'gender')
untransformed.vars <- c('time_death', 'endpoint_death', 'exclude')
source('../lib/shared.R')
require(xtable)
# Define these after shared.R or they will be overwritten!
exclude.vars <-
c(
# Entity type 4 is smoking status, which we already have
"clinical.values.4_data1", "clinical.values.4_data5",
"clinical.values.4_data6",
    # Entity 13 data2 is the patient's weight centile; not a single value is
    # entered, but the blanks come out as 0 rather than NA, so the algorithm,
    # which looks for NAs, thinks it's a useful column
"clinical.values.13_data2",
    # Entities 148 and 149 are to do with death certification. I'm not sure how
    # they made it into the dataset but, since all the datapoints here look
    # back in time, they're all NA. This causes rfsrc to fail.
"clinical.values.148_data1", "clinical.values.148_data2",
"clinical.values.148_data3", "clinical.values.148_data4",
"clinical.values.148_data5",
"clinical.values.149_data1", "clinical.values.149_data2"
)
COHORT <- fread(data.filename.big)
bigdata.prefixes <-
c(
'hes.icd.',
'hes.opcs.',
'tests.enttype.',
'clinical.history.',
'clinical.values.',
'bnf.'
)
bigdata.columns <-
colnames(COHORT)[
which(
      # Does it start with one of the big data prefixes?
      startsWithAny(colnames(COHORT), bigdata.prefixes) &
      # ...and is it not one of the columns we want to exclude?
      !(colnames(COHORT) %in% exclude.vars)
)
]
COHORT.bigdata <-
COHORT[, c(
untransformed.vars, surv.predict.old, bigdata.columns
),
with = FALSE
]
# Get the missingness before we start removing missing values
missingness <- sort(sapply(COHORT.bigdata, percentMissing))
# Remove the 'untransformed.vars' above, which are the survival columns plus
# the exclude flag
missingness <- missingness[!(names(missingness) %in% untransformed.vars)]
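#' (`percentMissing` is defined in shared.R; presumably, and this is an
#' assumption rather than its actual definition, it simply returns the
#' percentage of NA entries in a column.)
#+ percent_missing_sketch, eval = FALSE
# Hypothetical sketch of percentMissing
percentMissingSketch <- function(x) 100 * mean(is.na(x))
#+ missing_data_handling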
# Deal appropriately with missing data
# Most of the variables are number of days since the first record of that type
time.based.vars <-
names(COHORT.bigdata)[
startsWithAny(
names(COHORT.bigdata),
c('hes.icd.', 'hes.opcs.', 'clinical.history.')
)
]
# We're treating these as logicals, so we want non-NA values to be TRUE, i.e.
# there is something in the history
for (j in time.based.vars) {
set(COHORT.bigdata, j = j, value = !is.na(COHORT.bigdata[[j]]))
}
# Again, taking this as a logical, set any non-NA value to TRUE.
prescriptions.vars <-
  names(COHORT.bigdata)[startsWith(names(COHORT.bigdata), 'bnf.')]
for (j in prescriptions.vars) {
set(COHORT.bigdata, j = j, value = !is.na(COHORT.bigdata[[j]]))
}
# This leaves tests and clinical.values, which are test results and should be
# imputed.
# Manually fix clinical values items...
#
# "clinical.values.1_data1" "clinical.values.1_data2"
# These are just blood pressure values...fine to impute
#
# "clinical.values.13_data1" "clinical.values.13_data3"
# These are weight and BMI...also fine to impute
#
# Entity 5 is alcohol consumption status, 1 = Yes, 2 = No, 3 = Ex, so should be
# a factor, and NA can be a factor level
COHORT.bigdata$clinical.values.5_data1 <-
factorNAfix(factor(COHORT.bigdata$clinical.values.5_data1), NAval = 'missing')
# Both gender and smokstatus are factors...fix that
COHORT.bigdata$gender <- factor(COHORT.bigdata$gender)
COHORT.bigdata$smokstatus <-
factorNAfix(factor(COHORT.bigdata$smokstatus), NAval = 'missing')
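#' (`factorNAfix` also comes from shared.R. The sketch below is an assumption
#' about its behaviour, namely turning NA into an explicit factor level so that
#' missingness can act as a category in its own right; it is not the actual
#' definition.)
#+ factor_na_fix_sketch, eval = FALSE
# Hypothetical sketch of factorNAfix
factorNAfixSketch <- function(f, NAval = 'missing') {
  # Add the NA placeholder as a level, then recode the NAs to it
  levels(f) <- c(levels(f), NAval)
  f[is.na(f)] <- NAval
  f
}
#+ data_setup_continued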
# Exclude invalid patients
COHORT.bigdata <- COHORT.bigdata[!COHORT.bigdata$exclude]
COHORT.bigdata$exclude <- NULL
COHORT.bigdata <-
prepSurvCol(data.frame(COHORT.bigdata), 'time_death', 'endpoint_death', 'Death')
# If n.data was specified, trim the data table down to size
if(!is.na(n.data)) {
COHORT.bigdata <- sample.df(COHORT.bigdata, n.data)
}
# Define test set
test.set <- testSetIndices(COHORT.bigdata, random.seed = 78361)
# Start by predicting survival with all the variables provided
surv.predict <- c(surv.predict.old, bigdata.columns)
# Set up a csv file to store calibration data, or retrieve previous data
calibration.filename <- paste0(output.filename.base, '-varselcalibration.csv')
varLogrankTest <- function(df, var) {
# If there's only one category, this is a single-valued variable so you can't
# do a logrank test on different values of it...
if(length(unique(NArm(df[, var]))) == 1) {
return(NA)
}
  # If it's a logical, make an extra column for consistency with later code
  if(is.logical(df[, var])) {
    df$groups <- factor(ifelse(df[, var], 'A', 'B'))
  # If it's numeric, split it into quartile groups
  } else if(is.numeric(df[, var])) {
# First, discard all rows where the value is missing
df <- df[!is.na(df[, var]), ]
    # Then, assign quartile groups; rightmost.closed = TRUE puts the maxima in
    # the top group rather than in a group of their own
    df$groups <-
      factor(
        findInterval(
          df[, var],
          quantile(df[, var], probs = c(0, 0.25, 0.5, 0.75, 1)),
          rightmost.closed = TRUE
        )
      )
} else {
# Otherwise, it's a factor, so leave it as-is
df$groups <- df[, var]
}
  # Perform a logrank test on the data
  lr.test <-
    survdiff(
      Surv(surv_time, surv_event) ~ groups,
      df
    )
  # Return the p-value: the logrank statistic is asymptotically chi-squared
  # with (number of groups - 1) degrees of freedom
  pchisq(lr.test$chisq, length(lr.test$n) - 1, lower.tail = FALSE)
}
# Don't include the outcome columns in the list of variables to check
vars.to.check <-
names(COHORT.bigdata)[!(names(COHORT.bigdata) %in% c('surv_time', 'surv_event'))]
var.logrank.p <-
sapply(
X = vars.to.check, FUN = varLogrankTest,
df = COHORT.bigdata[-test.set, ]
)
# Sort in ascending order: small p-values indicate strongly differing survival
# curves, so the most predictive variables come first
var.logrank.p <- sort(var.logrank.p)
#' ## Run random forest calibration
#'
#' If there's not already a calibration file, we run the variable selection
#' methodology:
#'
#' 1. Rank the variables by the p-values of their logrank tests, as calculated
#'    above.
#' 2. Cross-validate as the number of most important (lowest p-value) variables
#'    kept is reduced.
#'
#' (If there is already a calibration file, just load the previous work.)
#'
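#' Two helpers from shared.R do the bookkeeping here: `getVarNums` generates
#' the decreasing sequence of variable counts to try, and `cvFolds` splits the
#' training rows into folds. Their actual definitions live in shared.R; the
#' sketches below are only assumptions about their behaviour, for orientation.
#'
#+ helper_sketches, eval = FALSE
# Hypothetical sketch of getVarNums: a roughly geometric sequence of variable
# counts, dropping vars.drop.frac of the remainder each iteration,
# e.g. 600, 480, 384, ...
getVarNumsSketch <- function(n.total, drop.frac = vars.drop.frac, min = 1) {
  var.nums <- n.total
  while(tail(var.nums, 1) > min) {
    var.nums <-
      c(var.nums, max(min, floor(tail(var.nums, 1) * (1 - drop.frac))))
  }
  unique(var.nums)
}
# Hypothetical sketch of cvFolds: split n row indices into k random folds
cvFoldsSketch <- function(n, k) {
  split(sample(n), rep_len(1:k, n))
}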
#+ rf_var_sel_calibration
# If we've not already done a calibration, then do one
if(!file.exists(calibration.filename)) {
# Create an empty data frame to aggregate stats per fold
cv.performance <- data.frame()
# Cross-validate over number of variables to try
cv.vars <-
getVarNums(
length(var.logrank.p),
      # No point going lower than the number of variables whose p-values are
      # exactly 0: they're tied, so sorting puts them in alphabetical order,
      # which is meaningless below this point!
      min = sum(var.logrank.p == 0)
)
COHORT.cv <- COHORT.bigdata[-test.set, ]
  # Run cross-validations. No need to parallelise because rfsrc is parallelised
  for(i in seq_along(cv.vars)) {
# Get the subset of most important variables to use
surv.predict.partial <- names(var.logrank.p)[1:cv.vars[i]]
# Get folds for cross-validation
cv.folds <- cvFolds(nrow(COHORT.cv), cv.n.folds)
cv.fold.performance <- data.frame()
for(j in 1:cv.n.folds) {
time.start <- handyTimer()
# Fit model to the training set
surv.model.fit <-
survivalFit(
surv.predict.partial,
COHORT.cv[-cv.folds[[j]],],
model.type = 'rfsrc',
n.trees = n.trees.cv,
split.rule = split.rule,
n.threads = n.threads,
nsplit = nsplit,
nimpute = n.imputations,
na.action = 'na.impute'
)
time.learn <- handyTimer(time.start)
time.start <- handyTimer()
      # Get the C-index on the validation set
c.index.val <-
cIndex(
surv.model.fit, COHORT.cv[cv.folds[[j]],],
na.action = 'na.impute'
)
time.c.index <- handyTimer(time.start)
time.start <- handyTimer()
      # Get the calibration score on the validation set
calibration.score <-
calibrationScore(
calibrationTable(
surv.model.fit, COHORT.cv[cv.folds[[j]],], na.action = 'na.impute'
)
)
time.calibration <- handyTimer(time.start)
# Append the stats we've obtained from this fold
cv.fold.performance <-
rbind(
cv.fold.performance,
data.frame(
calibration = i,
cv.fold = j,
n.vars = cv.vars[i],
c.index.val,
calibration.score,
time.learn,
time.c.index,
time.calibration
)
)
} # End cross-validation loop (j)
# rbind the performance by fold
cv.performance <-
rbind(
cv.performance,
cv.fold.performance
)
# Save output at the end of each loop
write.csv(cv.performance, calibration.filename)
} # End calibration loop (i)
} else {
cv.performance <- read.csv(calibration.filename)
}
#' ## Find the best model from the calibrations
#'
#' ### Plot model performance
#'
#+ model_performance
# Find the best calibration...
# First, average performance across cross-validation folds
cv.performance.average <-
aggregate(
c.index.val ~ n.vars,
data = cv.performance,
mean
)
cv.calibration.average <-
aggregate(
area ~ n.vars,
data = cv.performance,
mean
)
ggplot(cv.performance.average, aes(x = n.vars, y = c.index.val)) +
geom_line() +
geom_point(data = cv.performance) +
ggtitle(label = 'C-index by n.vars')
ggplot(cv.calibration.average, aes(x = n.vars, y = area)) +
geom_line() +
geom_point(data = cv.performance) +
ggtitle(label = 'Calibration performance by n.vars')
# Find the number of variables giving the highest average C-index
n.vars <-
cv.performance.average$n.vars[
which.max(cv.performance.average$c.index.val)
]
# Use the best-performing number of variables for the final fit
surv.predict.partial <- names(var.logrank.p)[1:n.vars]
#' ## Best model
#'
#' The best model contained `r n.vars` variables. Let's see what those were...
#'
#+ variables_used
vars.df <-
data.frame(
vars = surv.predict.partial
)
vars.df$descriptions <- lookUpDescriptions(surv.predict.partial)
vars.df$missingness <- missingness[surv.predict.partial]
#+ variables_table, results='asis'
print(
xtable(vars.df),
type = 'html',
include.rownames = FALSE
)
#' ## Perform the final fit
#'
#' Having found the best number of variables by cross-validation, let's perform
#' the final fit with the full training set and `r n.trees.final` trees.
#'
#+ final_fit
time.start <- handyTimer()
surv.model.fit.final <-
survivalFit(
surv.predict.partial,
COHORT.bigdata[-test.set,],
model.type = 'rfsrc',
n.trees = n.trees.final,
split.rule = split.rule,
n.threads = n.threads,
    nimpute = n.imputations,
nsplit = nsplit,
na.action = 'na.impute'
)
time.fit.final <- handyTimer(time.start)
saveRDS(surv.model.fit.final, paste0(output.filename.base, '-finalmodel.rds'))
#' Final model of `r n.trees.final` trees fitted in `r round(time.fit.final)`
#' seconds!
#'
#' Also bootstrap this final fitting stage. A fully proper bootstrap would
#' iterate over the whole model-building process, including variable selection,
#' but that would be computationally prohibitive; the sketch below shows what
#' it would involve, and only the final-stage bootstrap is actually run.
#'
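#+ full_bootstrap_sketch, eval = FALSE
# Not run: hypothetical sketch of a full pipeline bootstrap, repeating the
# logrank ranking and cross-validated variable selection on each resample.
# Names ending .boot are illustrative only, not defined elsewhere.
for(b in 1:bootstraps) {
  # Resample the training set with replacement
  COHORT.train <- COHORT.bigdata[-test.set, ]
  COHORT.boot <- COHORT.train[sample(nrow(COHORT.train), replace = TRUE), ]
  # 1. Re-rank variables by logrank p-value on the resampled data
  var.logrank.p.boot <-
    sort(sapply(X = vars.to.check, FUN = varLogrankTest, df = COHORT.boot))
  # 2. Re-run the cross-validation over numbers of variables kept, as above,
  #    then refit with the winning subset and evaluate on the test set...
}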
#+ bootstrap_final
time.start <- handyTimer()
surv.model.params.boot <-
survivalFitBoot(
surv.predict.partial,
COHORT.bigdata[-test.set,], # Training set
COHORT.bigdata[test.set,], # Test set
model.type = 'rfsrc',
n.threads = n.threads,
bootstraps = bootstraps,
filename = paste0(output.filename.base, '-boot-all.csv'),
na.action = 'na.impute'
)
time.fit.boot <- handyTimer(time.start)
#' `r bootstraps` bootstraps completed in `r round(time.fit.boot)` seconds!
# Get coefficients and variable importances from bootstrap fits
surv.model.fit.coeffs <- bootStatsDf(surv.model.params.boot)
# Save performance results
varsToTable(
data.frame(
model = 'rf-varsellr',
imputation = FALSE,
discretised = FALSE,
c.index = surv.model.fit.coeffs['c.index', 'val'],
c.index.lower = surv.model.fit.coeffs['c.index', 'lower'],
c.index.upper = surv.model.fit.coeffs['c.index', 'upper'],
calibration.score = surv.model.fit.coeffs['calibration.score', 'val'],
calibration.score.lower =
surv.model.fit.coeffs['calibration.score', 'lower'],
calibration.score.upper =
surv.model.fit.coeffs['calibration.score', 'upper']
),
performance.file,
index.cols = c('model', 'imputation', 'discretised')
)
write.csv(
surv.model.fit.coeffs,
paste0(output.filename.base, '-boot-summary.csv')
)
#' ## Performance
#'
#' ### C-index
#'
#' C-indices are **`r round(surv.model.fit.coeffs['c.train', 'val'], 3)`
#' (`r round(surv.model.fit.coeffs['c.train', 'lower'], 3)` -
#' `r round(surv.model.fit.coeffs['c.train', 'upper'], 3)`)**
#' on the training set and
#' **`r round(surv.model.fit.coeffs['c.test', 'val'], 3)`
#' (`r round(surv.model.fit.coeffs['c.test', 'lower'], 3)` -
#' `r round(surv.model.fit.coeffs['c.test', 'upper'], 3)`)** on the test set.
#'
#' ### Calibration
#'
#' The bootstrapped calibration score is
#' **`r round(surv.model.fit.coeffs['calibration.score', 'val'], 3)`
#' (`r round(surv.model.fit.coeffs['calibration.score', 'lower'], 3)` -
#' `r round(surv.model.fit.coeffs['calibration.score', 'upper'], 3)`)**.
#'
#' Let's draw a representative curve from the unbootstrapped fit... (It would be
#' better to draw all the curves from the bootstrap fit to get an idea of
#' variability, but I've not implemented this yet.)
#'
#+ calibration_plot
calibration.table <-
calibrationTable(
# Standard calibration options
surv.model.fit.final, COHORT.bigdata[test.set,],
# Always need to specify NA imputation for rfsrc
na.action = 'na.impute'
)
calibration.score <- calibrationScore(calibration.table)
calibrationPlot(calibration.table)
#' The area between the calibration curve and the diagonal is
#' **`r round(calibration.score, 3)`**.
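#'
#' (`calibrationTable` and `calibrationScore` come from shared.R. For
#' orientation, the sketch below shows one way such an area could be computed,
#' by trapezium-rule integration of the absolute difference between the
#' calibration curve and the diagonal; the column names `risk` and `observed`
#' are hypothetical, not shared.R's actual interface.)
#'
#+ calibration_area_sketch, eval = FALSE
# Hypothetical sketch: integrate |observed - predicted| along the predicted
# risk axis with the trapezium rule
calibrationAreaSketch <- function(predicted, observed) {
  o <- order(predicted)
  x <- predicted[o]
  y <- abs(observed[o] - x)
  sum(diff(x) * (head(y, -1) + tail(y, -1)) / 2)
}
# e.g. calibrationAreaSketch(calibration.table$risk, calibration.table$observed)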