MLehealth / Git / [0375db] /overview/variable-effects.R

Models:
philipB/
MLehealth
Downloads: 1
[0375db]: / overview / variable-effects.R
History
Download this file
366 lines (323 with data), 13.0 kB

cox.disc.filename <- '../../output/all-cv-survreg-boot-try5-surv-model.rds'
caliber.missing.coefficients.filename <-
  '../../output/caliber-replicate-with-missing-survreg-6-linear-age-coeffs-3.csv'
rf.filename <- '../../output/rfsrc-cv-nsplit-try3-var-effects.csv'

source('../lib/shared.R')
requirePlus('cowplot')

# Amount of padding at the right-hand side to make space for missing values
missing.padding <- 0.05

continuous.vars <-
  c(
    'age', 'total_chol_6mo', 'hdl_6mo', 'pulse_6mo', 'crea_6mo',
    'total_wbc_6mo', 'haemoglobin_6mo'
  )

# Load in the discretised Cox model for plotting
surv.model.fit.boot <- readRDS(cox.disc.filename)

# Pull coefficients from model with missing data
caliber.missing.coeffs <- read.csv(caliber.missing.coefficients.filename)
# Log them to get them on the same scale as discrete model
caliber.missing.coeffs$our_value <- -log(caliber.missing.coeffs$our_value)
caliber.missing.coeffs$our_lower <- -log(caliber.missing.coeffs$our_lower)
caliber.missing.coeffs$our_upper <- -log(caliber.missing.coeffs$our_upper)

# Load the data
COHORT.use <- data.frame(fread(data.filename))

# Open the calibration to find the best binning scheme
calibration.filename <- '../../output/survreg-crossvalidation-try5.csv'
cv.performance <- read.csv(calibration.filename)

# Find the best calibration...
# First, average performance across cross-validation folds
cv.performance.average <-
  aggregate(
    c.index.val ~ calibration,
    data = cv.performance,
    mean
  )
# Find the highest value
best.calibration <-
  cv.performance.average$calibration[
    which.max(cv.performance.average$c.index.val)
    ]
# And finally, find the first row of that calibration to get the n.bins values
best.calibration.row1 <-
  min(which(cv.performance$calibration == best.calibration))

# Get its parameters
n.bins <-
  t(
    cv.performance[best.calibration.row1, continuous.vars]
  )

# Prepare the data with those settings...

# Reset process settings with the base setings
process.settings <-
  list(
    var        = c('anonpatid', 'time_death', 'imd_score', 'exclude'),
    method     = c(NA, NA, NA, NA),
    settings   = list(NA, NA, NA, NA)
  )
for(j in 1:length(continuous.vars)) {
  process.settings$var <- c(process.settings$var, continuous.vars[j])
  process.settings$method <- c(process.settings$method, 'binByQuantile')
  process.settings$settings <-
    c(
      process.settings$settings,
      list(
        seq(
          # Quantiles are obviously between 0 and 1
          0, 1,
          # Choose a random number of bins (and for n bins, you need n + 1 breaks)
          length.out = n.bins[j]
        )
      )
    )
}

# prep the data given the variables provided
COHORT.optimised <-
  prepData(
    # Data for cross-validation excludes test set
    COHORT.use,
    cols.keep,
    process.settings,
    surv.time, surv.event,
    surv.event.yes,
    extra.fun = caliberExtraPrep
  )

# Unpack variable and level names
cph.coeffs <- cphCoeffs(
  bootStats(surv.model.fit.boot, uncertainty = '95ci', transform = `-`),
  COHORT.optimised, surv.predict, model.type = 'boot.survreg'
)

# We'll need the CALIBER scaling functions for plotting
source('../cox-ph/caliber-scale.R')

# set up list to store the plots
cox.discrete.plots <- list()
# Add dummy columns for x-position of missing values
caliber.missing.coeffs$missing.x.pos.cont <- NA
cph.coeffs$missing.x.pos.disc <- NA

for(variable in unique(cph.coeffs$var)) {
  # If it's a continuous variable, get the real centres of the bins
  if(variable %in% process.settings$var) {
    process.i <- which(variable == process.settings$var)
    
    if(process.settings$method[[process.i]] == 'binByQuantile') {
      
      variable.quantiles <-
        getQuantiles(
          COHORT.use[, variable],
          process.settings$settings[[process.i]]
        )
      # For those rows which relate to this variable, and whose level isn't
      # missing, put in the appropriate quantile boundaries for plotting
      cph.coeffs$bin.min[cph.coeffs$var == variable & 
                           cph.coeffs$level != 'missing'] <-
        variable.quantiles[1:(length(variable.quantiles) - 1)]
      cph.coeffs$bin.max[cph.coeffs$var == variable & 
                           cph.coeffs$level != 'missing'] <-
        variable.quantiles[2:length(variable.quantiles)]
      # Make the final bin the 99th percentile
      cph.coeffs$bin.max[cph.coeffs$var == variable & 
                           cph.coeffs$level != 'missing'][
                             length(variable.quantiles) - 1] <-
        quantile(COHORT.use[, variable], 0.99, na.rm = TRUE)
      
      # Add a fake data point at the highest value to finish the graph
      cph.coeffs <-
        rbind(
          cph.coeffs,
          cph.coeffs[cph.coeffs$var == variable & 
                       cph.coeffs$level != 'missing', ][
                         length(variable.quantiles) - 1, ]
        )
      # Change it so that bin.min is bin.max from the old one
      cph.coeffs$bin.min[nrow(cph.coeffs)] <-
        cph.coeffs$bin.max[cph.coeffs$var == variable & 
                             cph.coeffs$level != 'missing'][
                               length(variable.quantiles) - 1]
      
      # Work out data range by taking the 1st and 99th percentiles
      # Use the max to provide a max value for the final bin
      # Also use for x-axis limits, unless there are missing values to
      # accommodate on the right-hand edge.
      x.data.range <-
        quantile(COHORT.use[, variable], c(0.01, 0.99), na.rm = TRUE)
      x.axis.limits <- x.data.range
      
      
      # Finally, we need to scale this such that the baseline value is equal
      # to the value for the equivalent place in the Cox model, to make the
      # risks comparable...
      
      # First, we need to find the average value of this variable in the lowest
      # bin (which is always the baseline here)
      baseline.bin <- variable.quantiles[1:2]
      baseline.bin.avg <- 
        mean(
          # Take only those values of the variable which are in the range
          COHORT.use[
            inRange(COHORT.use[, variable], baseline.bin, na.false = TRUE),
            variable
            ]
        )
      # Then, scale it with the caliber scaling
      baseline.bin.val <-
        caliberScaleUnits(baseline.bin.avg, variable) * 
        caliber.missing.coeffs$our_value[
          caliber.missing.coeffs$quantity == variable
          ]
      
      # And now, add all the discretised values to that value to make them
      # comparable...
      cph.coeffs[cph.coeffs$var == variable, c('val', 'lower', 'upper')] <-
        cph.coeffs[cph.coeffs$var == variable, c('val', 'lower', 'upper')] -
        baseline.bin.val
      
      # Now, plot this variable as a stepped line plot using those quantile
      # boundaries
      cox.discrete.plot <-
        ggplot(
          subset(cph.coeffs, var == variable),
          aes(x = bin.min, y = val)
        ) +   
        geom_step() +
        geom_step(aes(y = lower), colour = 'grey') +
        geom_step(aes(y = upper), colour = 'grey') +
        labs(x = variable, y = 'Bx')
      
      # If there's a missing value risk, add it
      if(any(cph.coeffs$var == variable & cph.coeffs$level == 'missing')) {
        # Expand the x-axis to squeeze the missing values in
        x.axis.limits[2] <- 
          x.axis.limits[2] + diff(x.data.range) * missing.padding
        # Put this missing value a third of the way into the missing area
        cph.coeffs$missing.x.pos.disc[
          cph.coeffs$var == variable &
          cph.coeffs$level == 'missing'] <-
          x.axis.limits[2] + diff(x.data.range) * missing.padding / 3

        # Add the point to the graph (we'll set axis limits later)
        cox.discrete.plot <-
          cox.discrete.plot +
          geom_pointrange(
            data = cph.coeffs[cph.coeffs$var == variable & 
                                cph.coeffs$level == 'missing', ],
            aes(
              x = missing.x.pos.disc,
              y = val, ymin = lower,
              ymax = upper
            ),
            colour = 'red'
          )
      }
      
      # Now, let's add the line from the continuous Cox model. We only need two
      # points because the lines are straight!
      continuous.cox <-
        data.frame(
          var.x.values = x.data.range
        )
      # Scale the x-values
      continuous.cox$var.x.scaled <-
        caliberScaleUnits(continuous.cox$var.x.values, variable)
      # Use the risks to calculate risk per x for central estimate and errors
      continuous.cox$y <-
        -caliber.missing.coeffs$our_value[
          caliber.missing.coeffs$quantity == variable
          ] * continuous.cox$var.x.scaled
      continuous.cox$upper <-
        -caliber.missing.coeffs$our_upper[
          caliber.missing.coeffs$quantity == variable
          ] * continuous.cox$var.x.scaled
      continuous.cox$lower <-
        -caliber.missing.coeffs$our_lower[
          caliber.missing.coeffs$quantity == variable
          ] * continuous.cox$var.x.scaled
      
      cox.discrete.plot <-
        cox.discrete.plot +
        geom_line(
          data = continuous.cox,
          aes(x = var.x.values, y = y),
          colour = 'blue'
        ) +
        geom_line(
          data = continuous.cox,
          aes(x = var.x.values, y = upper),
          colour = 'lightblue'
        ) +
        geom_line(
          data = continuous.cox,
          aes(x = var.x.values, y = lower),
          colour = 'lightblue'
        )
      
      # If there is one, add missing value risk from the continuous model
      if(any(caliber.missing.coeffs$quantity == paste0(variable, '_missing') &
             caliber.missing.coeffs$unit == 'missing')) {
        
        # Put this missing value 2/3rds of the way into the missing area
        caliber.missing.coeffs$missing.x.pos.cont[
          caliber.missing.coeffs$quantity == paste0(variable, '_missing') &
            caliber.missing.coeffs$unit == 'missing'] <-
                x.axis.limits[2] + 2 * diff(x.data.range) * missing.padding / 3
        
        cox.discrete.plot <-
          cox.discrete.plot +
          geom_pointrange(
            data = caliber.missing.coeffs[
              caliber.missing.coeffs$quantity == paste0(variable, '_missing') &
                caliber.missing.coeffs$unit == 'missing',
              ],
            aes(
              x = missing.x.pos.cont,
              y = our_value, ymin = our_lower, ymax = our_upper
            ),
            colour = 'blue'
          )
      }
      
      # Finally, set the x-axis limits; will just be the data range, or data
      # range plus a bit if there are missing values to squeeze in
      cox.discrete.plot <-
        cox.discrete.plot +
        coord_cartesian(xlim = x.axis.limits) +
        theme(axis.title.y = element_blank()) +
        theme(plot.margin = unit(c(0.2, 0.1, 0.2, 0.1), "cm"))
      
      cox.discrete.plots[[variable]] <- cox.discrete.plot
    }
  }
}

# Load the random forest variable effects file
risk.by.variables <- read.csv(rf.filename)
rf.vareff.plots <- list()

for(variable in unique(risk.by.variables$var)) {
  # Get the mean of the normalised risk for every value of the variable
  risk.aggregated <-
    aggregate(
      as.formula(paste0('risk.normalised ~ val')),
      subset(risk.by.variables, var == variable), median
    )
  
  # work out the limits on the axes by taking the 1st and 99th percentiles
  x.axis.limits <-
    quantile(COHORT.use[, variable], c(0.01, 0.99), na.rm = TRUE)
  y.axis.limits <-
    quantile(subset(risk.by.variables, var == variable)$risk.normalised, c(0.05, 0.95), na.rm = TRUE)
  
  # If there's a missing value risk in the graph above, expand the axes so they
  # match
  if(any(cph.coeffs$var == variable & cph.coeffs$level == 'missing')) {
    x.axis.limits[2] <- 
      x.axis.limits[2] + diff(x.data.range) * missing.padding
  }
  
  rf.vareff.plots[[variable]] <-
    ggplot(
      subset(risk.by.variables, var == variable), 
      aes(x = val, y = log(risk.normalised))
    ) +
    geom_line(alpha=0.003, aes(group = id)) +
    geom_line(data = risk.aggregated, colour = 'blue') +
    coord_cartesian(xlim = x.axis.limits, ylim = log(y.axis.limits)) +
    labs(x = variable) +
    theme(
      plot.margin = unit(c(0.2, 0.1, 0.2, 0.1), "cm"),
      axis.title.y = element_blank()
    )
}


plot_grid(
  cox.discrete.plots[['age']],
  cox.discrete.plots[['haemoglobin_6mo']],
  cox.discrete.plots[['total_wbc_6mo']],
  cox.discrete.plots[['crea_6mo']],
  rf.vareff.plots[['age']],
  rf.vareff.plots[['haemoglobin_6mo']],
  rf.vareff.plots[['total_wbc_6mo']],
  rf.vareff.plots[['crea_6mo']],
  labels = c('A', rep('', 3), 'B', rep('', 3)),
  align = "v", ncol = 4
)

ggsave(
  '../../output/variable-effects.pdf',
  width = 16,
  height = 10,
  units = 'cm',
  useDingbats = FALSE
)