cox.disc.filename <- '../../output/all-cv-survreg-boot-try5-surv-model.rds'
caliber.missing.coefficients.filename <-
'../../output/caliber-replicate-with-missing-survreg-6-linear-age-coeffs-3.csv'
rf.filename <- '../../output/rfsrc-cv-nsplit-try3-var-effects.csv'
source('../lib/shared.R')
requirePlus('cowplot')
# Amount of padding at the right-hand side to make space for missing values
missing.padding <- 0.05
continuous.vars <-
c(
'age', 'total_chol_6mo', 'hdl_6mo', 'pulse_6mo', 'crea_6mo',
'total_wbc_6mo', 'haemoglobin_6mo'
)
# Load in the discretised Cox model for plotting
surv.model.fit.boot <- readRDS(cox.disc.filename)
# Pull coefficients from model with missing data
caliber.missing.coeffs <- read.csv(caliber.missing.coefficients.filename)
# Log them to get them on the same scale as discrete model
caliber.missing.coeffs$our_value <- -log(caliber.missing.coeffs$our_value)
caliber.missing.coeffs$our_lower <- -log(caliber.missing.coeffs$our_lower)
caliber.missing.coeffs$our_upper <- -log(caliber.missing.coeffs$our_upper)
# Load the data
COHORT.use <- data.frame(fread(data.filename))
# Open the calibration to find the best binning scheme
calibration.filename <- '../../output/survreg-crossvalidation-try5.csv'
cv.performance <- read.csv(calibration.filename)
# Find the best calibration...
# First, average performance across cross-validation folds
cv.performance.average <-
aggregate(
c.index.val ~ calibration,
data = cv.performance,
mean
)
# Find the highest value
best.calibration <-
cv.performance.average$calibration[
which.max(cv.performance.average$c.index.val)
]
# And finally, find the first row of that calibration to get the n.bins values
best.calibration.row1 <-
min(which(cv.performance$calibration == best.calibration))
# Get its parameters
n.bins <-
t(
cv.performance[best.calibration.row1, continuous.vars]
)
# Prepare the data with those settings...
# Reset process settings with the base setings
process.settings <-
list(
var = c('anonpatid', 'time_death', 'imd_score', 'exclude'),
method = c(NA, NA, NA, NA),
settings = list(NA, NA, NA, NA)
)
for(j in 1:length(continuous.vars)) {
process.settings$var <- c(process.settings$var, continuous.vars[j])
process.settings$method <- c(process.settings$method, 'binByQuantile')
process.settings$settings <-
c(
process.settings$settings,
list(
seq(
# Quantiles are obviously between 0 and 1
0, 1,
# Choose a random number of bins (and for n bins, you need n + 1 breaks)
length.out = n.bins[j]
)
)
)
}
# prep the data given the variables provided
COHORT.optimised <-
prepData(
# Data for cross-validation excludes test set
COHORT.use,
cols.keep,
process.settings,
surv.time, surv.event,
surv.event.yes,
extra.fun = caliberExtraPrep
)
# Unpack variable and level names
cph.coeffs <- cphCoeffs(
bootStats(surv.model.fit.boot, uncertainty = '95ci', transform = `-`),
COHORT.optimised, surv.predict, model.type = 'boot.survreg'
)
# We'll need the CALIBER scaling functions for plotting
source('../cox-ph/caliber-scale.R')
# set up list to store the plots
cox.discrete.plots <- list()
# Add dummy columns for x-position of missing values
caliber.missing.coeffs$missing.x.pos.cont <- NA
cph.coeffs$missing.x.pos.disc <- NA
for(variable in unique(cph.coeffs$var)) {
# If it's a continuous variable, get the real centres of the bins
if(variable %in% process.settings$var) {
process.i <- which(variable == process.settings$var)
if(process.settings$method[[process.i]] == 'binByQuantile') {
variable.quantiles <-
getQuantiles(
COHORT.use[, variable],
process.settings$settings[[process.i]]
)
# For those rows which relate to this variable, and whose level isn't
# missing, put in the appropriate quantile boundaries for plotting
cph.coeffs$bin.min[cph.coeffs$var == variable &
cph.coeffs$level != 'missing'] <-
variable.quantiles[1:(length(variable.quantiles) - 1)]
cph.coeffs$bin.max[cph.coeffs$var == variable &
cph.coeffs$level != 'missing'] <-
variable.quantiles[2:length(variable.quantiles)]
# Make the final bin the 99th percentile
cph.coeffs$bin.max[cph.coeffs$var == variable &
cph.coeffs$level != 'missing'][
length(variable.quantiles) - 1] <-
quantile(COHORT.use[, variable], 0.99, na.rm = TRUE)
# Add a fake data point at the highest value to finish the graph
cph.coeffs <-
rbind(
cph.coeffs,
cph.coeffs[cph.coeffs$var == variable &
cph.coeffs$level != 'missing', ][
length(variable.quantiles) - 1, ]
)
# Change it so that bin.min is bin.max from the old one
cph.coeffs$bin.min[nrow(cph.coeffs)] <-
cph.coeffs$bin.max[cph.coeffs$var == variable &
cph.coeffs$level != 'missing'][
length(variable.quantiles) - 1]
# Work out data range by taking the 1st and 99th percentiles
# Use the max to provide a max value for the final bin
# Also use for x-axis limits, unless there are missing values to
# accommodate on the right-hand edge.
x.data.range <-
quantile(COHORT.use[, variable], c(0.01, 0.99), na.rm = TRUE)
x.axis.limits <- x.data.range
# Finally, we need to scale this such that the baseline value is equal
# to the value for the equivalent place in the Cox model, to make the
# risks comparable...
# First, we need to find the average value of this variable in the lowest
# bin (which is always the baseline here)
baseline.bin <- variable.quantiles[1:2]
baseline.bin.avg <-
mean(
# Take only those values of the variable which are in the range
COHORT.use[
inRange(COHORT.use[, variable], baseline.bin, na.false = TRUE),
variable
]
)
# Then, scale it with the caliber scaling
baseline.bin.val <-
caliberScaleUnits(baseline.bin.avg, variable) *
caliber.missing.coeffs$our_value[
caliber.missing.coeffs$quantity == variable
]
# And now, add all the discretised values to that value to make them
# comparable...
cph.coeffs[cph.coeffs$var == variable, c('val', 'lower', 'upper')] <-
cph.coeffs[cph.coeffs$var == variable, c('val', 'lower', 'upper')] -
baseline.bin.val
# Now, plot this variable as a stepped line plot using those quantile
# boundaries
cox.discrete.plot <-
ggplot(
subset(cph.coeffs, var == variable),
aes(x = bin.min, y = val)
) +
geom_step() +
geom_step(aes(y = lower), colour = 'grey') +
geom_step(aes(y = upper), colour = 'grey') +
labs(x = variable, y = 'Bx')
# If there's a missing value risk, add it
if(any(cph.coeffs$var == variable & cph.coeffs$level == 'missing')) {
# Expand the x-axis to squeeze the missing values in
x.axis.limits[2] <-
x.axis.limits[2] + diff(x.data.range) * missing.padding
# Put this missing value a third of the way into the missing area
cph.coeffs$missing.x.pos.disc[
cph.coeffs$var == variable &
cph.coeffs$level == 'missing'] <-
x.axis.limits[2] + diff(x.data.range) * missing.padding / 3
# Add the point to the graph (we'll set axis limits later)
cox.discrete.plot <-
cox.discrete.plot +
geom_pointrange(
data = cph.coeffs[cph.coeffs$var == variable &
cph.coeffs$level == 'missing', ],
aes(
x = missing.x.pos.disc,
y = val, ymin = lower,
ymax = upper
),
colour = 'red'
)
}
# Now, let's add the line from the continuous Cox model. We only need two
# points because the lines are straight!
continuous.cox <-
data.frame(
var.x.values = x.data.range
)
# Scale the x-values
continuous.cox$var.x.scaled <-
caliberScaleUnits(continuous.cox$var.x.values, variable)
# Use the risks to calculate risk per x for central estimate and errors
continuous.cox$y <-
-caliber.missing.coeffs$our_value[
caliber.missing.coeffs$quantity == variable
] * continuous.cox$var.x.scaled
continuous.cox$upper <-
-caliber.missing.coeffs$our_upper[
caliber.missing.coeffs$quantity == variable
] * continuous.cox$var.x.scaled
continuous.cox$lower <-
-caliber.missing.coeffs$our_lower[
caliber.missing.coeffs$quantity == variable
] * continuous.cox$var.x.scaled
cox.discrete.plot <-
cox.discrete.plot +
geom_line(
data = continuous.cox,
aes(x = var.x.values, y = y),
colour = 'blue'
) +
geom_line(
data = continuous.cox,
aes(x = var.x.values, y = upper),
colour = 'lightblue'
) +
geom_line(
data = continuous.cox,
aes(x = var.x.values, y = lower),
colour = 'lightblue'
)
# If there is one, add missing value risk from the continuous model
if(any(caliber.missing.coeffs$quantity == paste0(variable, '_missing') &
caliber.missing.coeffs$unit == 'missing')) {
# Put this missing value 2/3rds of the way into the missing area
caliber.missing.coeffs$missing.x.pos.cont[
caliber.missing.coeffs$quantity == paste0(variable, '_missing') &
caliber.missing.coeffs$unit == 'missing'] <-
x.axis.limits[2] + 2 * diff(x.data.range) * missing.padding / 3
cox.discrete.plot <-
cox.discrete.plot +
geom_pointrange(
data = caliber.missing.coeffs[
caliber.missing.coeffs$quantity == paste0(variable, '_missing') &
caliber.missing.coeffs$unit == 'missing',
],
aes(
x = missing.x.pos.cont,
y = our_value, ymin = our_lower, ymax = our_upper
),
colour = 'blue'
)
}
# Finally, set the x-axis limits; will just be the data range, or data
# range plus a bit if there are missing values to squeeze in
cox.discrete.plot <-
cox.discrete.plot +
coord_cartesian(xlim = x.axis.limits) +
theme(axis.title.y = element_blank()) +
theme(plot.margin = unit(c(0.2, 0.1, 0.2, 0.1), "cm"))
cox.discrete.plots[[variable]] <- cox.discrete.plot
}
}
}
# Load the random forest variable effects file
risk.by.variables <- read.csv(rf.filename)
rf.vareff.plots <- list()
for(variable in unique(risk.by.variables$var)) {
# Get the mean of the normalised risk for every value of the variable
risk.aggregated <-
aggregate(
as.formula(paste0('risk.normalised ~ val')),
subset(risk.by.variables, var == variable), median
)
# work out the limits on the axes by taking the 1st and 99th percentiles
x.axis.limits <-
quantile(COHORT.use[, variable], c(0.01, 0.99), na.rm = TRUE)
y.axis.limits <-
quantile(subset(risk.by.variables, var == variable)$risk.normalised, c(0.05, 0.95), na.rm = TRUE)
# If there's a missing value risk in the graph above, expand the axes so they
# match
if(any(cph.coeffs$var == variable & cph.coeffs$level == 'missing')) {
x.axis.limits[2] <-
x.axis.limits[2] + diff(x.data.range) * missing.padding
}
rf.vareff.plots[[variable]] <-
ggplot(
subset(risk.by.variables, var == variable),
aes(x = val, y = log(risk.normalised))
) +
geom_line(alpha=0.003, aes(group = id)) +
geom_line(data = risk.aggregated, colour = 'blue') +
coord_cartesian(xlim = x.axis.limits, ylim = log(y.axis.limits)) +
labs(x = variable) +
theme(
plot.margin = unit(c(0.2, 0.1, 0.2, 0.1), "cm"),
axis.title.y = element_blank()
)
}
plot_grid(
cox.discrete.plots[['age']],
cox.discrete.plots[['haemoglobin_6mo']],
cox.discrete.plots[['total_wbc_6mo']],
cox.discrete.plots[['crea_6mo']],
rf.vareff.plots[['age']],
rf.vareff.plots[['haemoglobin_6mo']],
rf.vareff.plots[['total_wbc_6mo']],
rf.vareff.plots[['crea_6mo']],
labels = c('A', rep('', 3), 'B', rep('', 3)),
align = "v", ncol = 4
)
ggsave(
'../../output/variable-effects.pdf',
width = 16,
height = 10,
units = 'cm',
useDingbats = FALSE
)