#+ knitr_setup, include = FALSE
# Whether to cache the intensive code sections. Set to FALSE to recalculate
# everything afresh.
cacheoption <- TRUE
# Disable lazy caching globally, because it fails for large objects, and all the
# objects we wish to cache are large...
opts_chunk$set(cache.lazy = FALSE)
#' # Modelling with age
#'
#' It seems that, no matter what I do, the C-index of a model, random forest or
#' otherwise, is about 0.78. I decided to try some simpler models, initially
#' based purely on age, which is clearly the biggest factor in this dataset.
#' (And, I assume, in most datasets where a reasonably broad range of ages is
#' present.)
#'
#' The sanity check works: giving the model more data does indeed result in a
#' better fit. Beyond that, though, I was surprised by just how good the
#' performance can be when age alone is considered!
#+ user_variables, message=FALSE
data.filename <- '../../data/cohort-sanitised.csv'
n.trees <- 500
# Number of threads used by the multithreaded model fits below
n.threads <- 8
continuous.vars <-
c(
'age', 'total_chol_6mo', 'hdl_6mo', 'pulse_6mo', 'crea_6mo',
'total_wbc_6mo', 'haemoglobin_6mo'
)
untransformed.vars <- c('anonpatid', 'time_death', 'imd_score', 'exclude')
source('../lib/shared.R')
require(ggrepel)
#' ## Load and prepare data
#+ load_and_prepare_data
# Load the data and convert to data frame to make column-selecting code in
# prepData simpler
COHORT.full <- data.frame(fread(data.filename))
# Define process settings: no transformation for the untransformed variables,
# and missingToBig for the continuous ones...
process.settings <-
list(
var = c(untransformed.vars, continuous.vars),
method =
c(
rep(NA, length(untransformed.vars)),
rep('missingToBig', length(continuous.vars))
),
settings = rep(NA, length(untransformed.vars) + length(continuous.vars))
)
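# Note: missingToBig is implemented in the shared library. My understanding
# (an assumption, not the shared-library code) is that it replaces NAs in a
# continuous variable with a value far above the observed range, so that tree
# splits can send 'missing' down a branch of its own. A minimal sketch of that
# idea, for illustration only:
missingToBigSketch <- function(x, multiple = 100) {
  # Replace missing values with a number well above the largest observed value
  x[is.na(x)] <- max(x, na.rm = TRUE) * multiple
  x
}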
COHORT.prep <-
prepData(
# Prepare the full dataset; the test set is excluded later, at fitting time
COHORT.full,
cols.keep,
process.settings,
surv.time, surv.event,
surv.event.yes,
extra.fun = caliberExtraPrep
)
n.data <- nrow(COHORT.prep)
# Define indices of test set
test.set <- sample(1:n.data, (1/3)*n.data)
#' ## Models
#'
#' ### The normal model
#'
#' All the variables, as in the vector `surv.predict`.
#+ normal_model, cache=cacheoption
# Fit random forest
surv.model.fit <-
survivalFit(
surv.predict,
COHORT.prep[-test.set,],
model.type = 'rfsrc',
n.trees = n.trees,
split.rule = 'logrank',
n.threads = 7,
nsplit = 20
)
print(surv.model.fit)
# Get C-index (using the same model.type as the fit above)
c.index.test <-
  cIndex(surv.model.fit, COHORT.prep[test.set, ], model.type = 'rfsrc')
#' The C-index on the held-out test set is **`r round(c.index.test, 3)`**.
#'
#' ### Just age
#'
#' What if all we had to go on was age?
#+ just_age_model, cache=cacheoption
# Fit random forest
surv.model.fit <-
survivalFit(
c('age'),
COHORT.prep[-test.set,],
model.type = 'ranger',
n.trees = n.trees,
split.rule = 'logrank',
n.threads = 8,
respect.unordered.factors = 'partition'
)
print(surv.model.fit)
# Get C-index
c.index.test <-
cIndex(surv.model.fit, COHORT.prep[test.set, ], model.type = 'ranger')
#' The C-index on the held-out test set is **`r round(c.index.test, 3)`**.
#'
#' ### No model, literally just age
#'
#' What if we constructed the C-index based purely on patients' ages?
#+ just_age_cindex
c.index.age <-
as.numeric(
survConcordance(
Surv(time_death, surv_event) ~ age,
COHORT.prep
)$concordance
)
#' The C-index on the whole dataset based purely on age is
#' **`r round(c.index.age, 3)`**. That's most of our predictive accuracy right
#' there! Reassuringly, it's also essentially equal to the value obtained from
#' the random forest model based purely on age...
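#'
#' To make the pairwise interpretation of that number concrete, the same
#' quantity can be approximated by brute force: take pairs of patients where we
#' know who died first (ie the earlier of the two follow-up times ends in an
#' observed death), and count how often the older of the pair is the one who
#' died first. This is only an illustrative sketch on a random subsample
#' (comparing every pair in the full cohort would be slow), it assumes the same
#' `time_death`, `surv_event` and `age` columns used above, and it ignores the
#' tied-time corrections that `survConcordance` applies, so it will only
#' approximately reproduce the value above.
#+ just_age_cindex_by_hand
pairs.n <- 1000
pairs.i <- sample(1:nrow(COHORT.prep), pairs.n)
pair.time  <- COHORT.prep$time_death[pairs.i]
pair.event <- COHORT.prep$surv_event[pairs.i] == 1
pair.age   <- COHORT.prep$age[pairs.i]
# Element [i, j] of each matrix refers to the ordered pair (i, j); the pair is
# usable if i's follow-up ended in an observed death before j's follow-up ended
comparable <- outer(pair.time, pair.time, '<') & pair.event
# Concordant if the patient who died first (i) is the older of the two
concordant <- comparable & outer(pair.age, pair.age, '>')
tied.age   <- comparable & outer(pair.age, pair.age, '==')
# Concordant pairs score 1 and age ties score 1/2, as in the usual C-index
(sum(concordant) + 0.5 * sum(tied.age)) / sum(comparable)
#'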
#' ### Age and gender
#'
#' OK, age and gender.
#+ age_gender_model, cache=cacheoption
# Fit random forest
surv.model.fit <-
survivalFit(
c('age', 'gender'),
COHORT.prep[-test.set,],
model.type = 'ranger',
n.trees = n.trees,
split.rule = 'logrank',
n.threads = 8,
respect.unordered.factors = 'partition'
)
print(surv.model.fit)
# Get C-index
c.index.test <-
cIndex(surv.model.fit, COHORT.prep[test.set, ], model.type = 'ranger')
#' The C-index on the held-out test set is **`r round(c.index.test, 3)`**.
#'
#' ### Age, gender and history of liver disease
#'
#' Let's add a third variable. In the replication of the Cox model with missing
#' data included, liver disease was the most predictive factor after age, so
#' it's a reasonable next variable to add.
#+ age_gender_liver_model, cache=cacheoption
# Fit random forest
surv.model.fit <-
survivalFit(
c('age', 'gender', 'hx_liver'),
COHORT.prep[-test.set,],
model.type = 'ranger',
n.trees = n.trees,
split.rule = 'logrank',
n.threads = 8,
respect.unordered.factors = 'partition'
)
print(surv.model.fit)
# Get C-index
c.index.test <-
cIndex(surv.model.fit, COHORT.prep[test.set, ], model.type = 'ranger')
#' The C-index on the held-out test set is **`r round(c.index.test, 3)`**.
#'
#' ### Age, gender and heart failure
#'
#' A different third variable: heart failure, the second most important variable
#' (after age) from random forest modelling.
#+ age_gender_hf_model, cache=cacheoption
# Fit random forest
surv.model.fit <-
survivalFit(
c('age', 'gender', 'heart_failure'),
COHORT.prep[-test.set,],
model.type = 'ranger',
n.trees = n.trees,
split.rule = 'logrank',
n.threads = 8,
respect.unordered.factors = 'partition'
)
print(surv.model.fit)
# Get C-index
c.index.test <-
cIndex(surv.model.fit, COHORT.prep[test.set, ], model.type = 'ranger')
#' The C-index on the held-out test set is **`r round(c.index.test, 3)`**.
#'
#' ### Just gender
#'
#' Just gender, as a sanity check.
#+ just_gender_model, cache=cacheoption
# Fit random forest
surv.model.fit <-
survivalFit(
c('gender'),
COHORT.prep[-test.set,],
model.type = 'ranger',
n.trees = n.trees,
split.rule = 'logrank',
n.threads = 8,
respect.unordered.factors = 'partition'
)
print(surv.model.fit)
# Get C-index
c.index.test <-
cIndex(surv.model.fit, COHORT.prep[test.set, ], model.type = 'ranger')
#' The C-index on the held-out test set is **`r round(c.index.test, 3)`**.
#'
#' ### Everything except age
#'
#' How do we do if we use all the variables _except_ age?
#+ no_age_model, cache=cacheoption
# Fit random forest
surv.model.fit <-
survivalFit(
surv.predict[surv.predict != 'age'],
COHORT.prep[-test.set,],
model.type = 'ranger',
n.trees = n.trees,
split.rule = 'logrank',
n.threads = 8,
respect.unordered.factors = 'partition'
)
print(surv.model.fit)
# Get C-index
c.index.test.all.not.age <-
cIndex(surv.model.fit, COHORT.prep[test.set, ], model.type = 'ranger')
#' The C-index on the held-out test set is
#' **`r round(c.index.test.all.not.age, 3)`**.
#'
#' ### Predicting age
#'
#' So why does the model which doesn't include age do so well? Clearly the other
#' variables allow you to predict age with reasonable accuracy... So let's try
#' just that as a final test.
#+ predict_age, cache=cacheoption
options(rf.cores = n.threads)
age.model <-
rfsrc(
formula(
paste0(
# Predicting just the age
'age ~ ',
# Predictor variables then make up the other side
paste(surv.predict[!(surv.predict %in% c('age', 'most_deprived'))], collapse = '+')
)
),
    COHORT.prep[-test.set, ],
ntree = n.trees,
splitrule = 'mse',
na.action = 'na.impute',
nimpute = 3
)
# Impute missing values at prediction time too, so every test-set row gets a
# predicted age
age.predictions <-
  predict(age.model, COHORT.prep[test.set, ], na.action = 'na.impute')
# predict.rfsrc stores its predictions in the $predicted element
age.cor <- cor(age.predictions$predicted, COHORT.prep[test.set, 'age'])
to.plot <-
  data.frame(
    age = COHORT.prep[test.set, 'age'],
    predicted = age.predictions$predicted
  )
ggplot(sample.df(to.plot, 10000), aes(x = age, y = predicted)) +
geom_point(alpha = 0.2)
#' It doesn't look that great, but there is some correlation...
#' r^2 = `r round(age.cor^2, 3)`, which is unremarkable but OK. A more relevant
#' measure would be the pure-age C-index, ie if I gave you a pair of patients,
#' how often could you predict which of them was older?
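# survConcordance treats the right-hand side of the formula as a risk score:
# 'agreement' means the larger value goes with the shorter 'survival time',
# which here is the lower age. Higher predicted age goes with higher actual
# age, so the raw concordance comes out below 0.5, and flipping it with 1 -
# gives the fraction of pairs where the patient with the higher predicted age
# really is the older one.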
c.index.age.on.age <-
1 - as.numeric(
survConcordance(
age ~ predicted,
to.plot
)$concordance
)
#' This comes out as **`r round(c.index.age.on.age, 3)`**, as compared to
#' **`r round(c.index.test.all.not.age, 3)`**, which was the C-index for the
#' survival model based on all other variables.
#'
#' This makes sense. The maths of C-indices is a bit tricky (how much two
#' variables add to one another depends in large part on how correlated they
#' are, and the combination is probably somewhat non-linear anyway), but
#' clearly a significant fraction of the all-except-age model's predictive
#' power comes from its ability to infer age from the remaining variables, and
#' then use _that_ (implicitly) to predict time to death.
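#'
#' As a rough check on that claim (a sketch rather than part of the original
#' analysis: it assumes the `time_death` and `surv_event` columns used above,
#' and that `age.predictions$predicted` holds one predicted age per test-set
#' row), we can use the predicted ages from the age-prediction forest directly
#' as a risk score and compute their C-index on the test set:
#+ c_index_from_predicted_age
c.index.predicted.age <-
  as.numeric(
    survConcordance(
      Surv(time_death, surv_event) ~ age.predictions$predicted,
      COHORT.prep[test.set, ]
    )$concordance
  )
#' If most of the all-except-age model's performance really does come from
#' inferring age, this value, **`r round(c.index.predicted.age, 3)`**, should
#' sit reasonably close to the **`r round(c.index.test.all.not.age, 3)`**
#' reported above.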
#'
#' The mechanism for this could be that people are likely to
#' get more diseases as they get older, so if you have a, b _and_ c you're
#' likely to be older. Second-order predictivity may occur if particular
#' combinations of disorders or test results are common in certain age groups.
#'
#' ## Conclusion
#'
#' So, in conclusion, the sanity check has worked: giving the
#' random forest model more data to work with improves its performance. Age
#' alone is less predictive, adding gender makes it slightly more predictive,
#' and so on.
#'
#' Further, though, a huge amount of the model's predictive power arises from
#' just a patient's age. Not only is age alone a good predictor, but the
#' reasonable performance of the model built on all factors except age is
#' explained in part by those factors' ability to act as a proxy for age.