--- a/overview/variable-effects.R
+++ b/overview/variable-effects.R
@@ -0,0 +1,365 @@
+cox.disc.filename <- '../../output/all-cv-survreg-boot-try5-surv-model.rds'
+caliber.missing.coefficients.filename <-
+  '../../output/caliber-replicate-with-missing-survreg-6-linear-age-coeffs-3.csv'
+rf.filename <- '../../output/rfsrc-cv-nsplit-try3-var-effects.csv'
+
+source('../lib/shared.R')
+requirePlus('cowplot')
+
+# Amount of padding at the right-hand side to make space for missing values
+missing.padding <- 0.05
+
+continuous.vars <-
+  c(
+    'age', 'total_chol_6mo', 'hdl_6mo', 'pulse_6mo', 'crea_6mo',
+    'total_wbc_6mo', 'haemoglobin_6mo'
+  )
+
+# Load in the discretised Cox model for plotting
+surv.model.fit.boot <- readRDS(cox.disc.filename)
+
+# Pull coefficients from the model with missing data
+caliber.missing.coeffs <- read.csv(caliber.missing.coefficients.filename)
+# Take the negative log to put them on the same scale as the discretised model
+caliber.missing.coeffs$our_value <- -log(caliber.missing.coeffs$our_value)
+caliber.missing.coeffs$our_lower <- -log(caliber.missing.coeffs$our_lower)
+caliber.missing.coeffs$our_upper <- -log(caliber.missing.coeffs$our_upper)
+
+# Load the data
+COHORT.use <- data.frame(fread(data.filename))
+
+# Open the cross-validation results to find the best binning scheme
+calibration.filename <- '../../output/survreg-crossvalidation-try5.csv'
+cv.performance <- read.csv(calibration.filename)
+
+# Find the best calibration...
+# First, average performance across cross-validation folds
+cv.performance.average <-
+  aggregate(
+    c.index.val ~ calibration,
+    data = cv.performance,
+    mean
+  )
+# Find the highest value
+best.calibration <-
+  cv.performance.average$calibration[
+    which.max(cv.performance.average$c.index.val)
+  ]
+# And finally, find the first row of that calibration to get the n.bins values
+best.calibration.row1 <-
+  min(which(cv.performance$calibration == best.calibration))
+
+# Get its parameters
+n.bins <-
+  t(
+    cv.performance[best.calibration.row1, continuous.vars]
+  )
+
+# Prepare the data with those settings...
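+# Aside: the 'binByQuantile' method used below cuts a variable at the supplied
+# quantile probabilities (its implementation lives in the sourced library
+# code). A minimal base-R sketch of the same idea -- assumed equivalent and
+# purely illustrative; the illustration.* objects are not used anywhere below:
+illustration.breaks <-
+  unique(quantile(COHORT.use$age, probs = seq(0, 1, length.out = 5), na.rm = TRUE))
+illustration.bins <-
+  cut(COHORT.use$age, breaks = illustration.breaks, include.lowest = TRUE)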
+
+# Reset process settings with the base settings
+process.settings <-
+  list(
+    var = c('anonpatid', 'time_death', 'imd_score', 'exclude'),
+    method = c(NA, NA, NA, NA),
+    settings = list(NA, NA, NA, NA)
+  )
+for(j in 1:length(continuous.vars)) {
+  process.settings$var <- c(process.settings$var, continuous.vars[j])
+  process.settings$method <- c(process.settings$method, 'binByQuantile')
+  process.settings$settings <-
+    c(
+      process.settings$settings,
+      list(
+        seq(
+          # Quantiles are between 0 and 1
+          0, 1,
+          # Use the number of bins from the best calibration (for n bins, you
+          # need n + 1 breaks)
+          length.out = n.bins[j]
+        )
+      )
+    )
+}
+
+# Prep the data with the variables provided
+COHORT.optimised <-
+  prepData(
+    # Data for cross-validation excludes test set
+    COHORT.use,
+    cols.keep,
+    process.settings,
+    surv.time, surv.event,
+    surv.event.yes,
+    extra.fun = caliberExtraPrep
+  )
+
+# Unpack variable and level names
+cph.coeffs <- cphCoeffs(
+  bootStats(surv.model.fit.boot, uncertainty = '95ci', transform = `-`),
+  COHORT.optimised, surv.predict, model.type = 'boot.survreg'
+)
+
+# We'll need the CALIBER scaling functions for plotting
+source('../cox-ph/caliber-scale.R')
+
+# Set up a list to store the plots
+cox.discrete.plots <- list()
+# Add dummy columns for the x-position of missing values
+caliber.missing.coeffs$missing.x.pos.cont <- NA
+cph.coeffs$missing.x.pos.disc <- NA
+
+for(variable in unique(cph.coeffs$var)) {
+  # If it's a continuous variable, get the real boundaries of the bins
+  if(variable %in% process.settings$var) {
+    process.i <- which(variable == process.settings$var)
+
+    if(process.settings$method[[process.i]] == 'binByQuantile') {
+
+      variable.quantiles <-
+        getQuantiles(
+          COHORT.use[, variable],
+          process.settings$settings[[process.i]]
+        )
+      # For those rows which relate to this variable, and whose level isn't
+      # missing, put in the appropriate quantile boundaries for plotting
+      cph.coeffs$bin.min[cph.coeffs$var == variable &
+                           cph.coeffs$level != 'missing'] <-
+        variable.quantiles[1:(length(variable.quantiles) - 1)]
+      cph.coeffs$bin.max[cph.coeffs$var == variable &
+                           cph.coeffs$level != 'missing'] <-
+        variable.quantiles[2:length(variable.quantiles)]
+      # Cap the final bin at the 99th percentile
+      cph.coeffs$bin.max[cph.coeffs$var == variable &
+                           cph.coeffs$level != 'missing'][
+                             length(variable.quantiles) - 1] <-
+        quantile(COHORT.use[, variable], 0.99, na.rm = TRUE)
+
+      # Add a fake data point at the highest value to finish the graph
+      cph.coeffs <-
+        rbind(
+          cph.coeffs,
+          cph.coeffs[cph.coeffs$var == variable &
+                       cph.coeffs$level != 'missing', ][
+                         length(variable.quantiles) - 1, ]
+        )
+      # Change it so that its bin.min is the bin.max of the old final bin
+      cph.coeffs$bin.min[nrow(cph.coeffs)] <-
+        cph.coeffs$bin.max[cph.coeffs$var == variable &
+                             cph.coeffs$level != 'missing'][
+                               length(variable.quantiles) - 1]
+
+      # Work out the data range by taking the 1st and 99th percentiles. The max
+      # provides a maximum value for the final bin, and the range doubles as
+      # the x-axis limits, unless there are missing values to accommodate on
+      # the right-hand edge.
+      x.data.range <-
+        quantile(COHORT.use[, variable], c(0.01, 0.99), na.rm = TRUE)
+      x.axis.limits <- x.data.range
+
+      # Finally, we need to scale this such that the baseline value is equal
+      # to the value for the equivalent place in the Cox model, to make the
+      # risks comparable...
+
+      # First, we need to find the average value of this variable in the lowest
+      # bin (which is always the baseline here)
+      baseline.bin <- variable.quantiles[1:2]
+      baseline.bin.avg <-
+        mean(
+          # Take only those values of the variable which are in the range
+          COHORT.use[
+            inRange(COHORT.use[, variable], baseline.bin, na.false = TRUE),
+            variable
+          ]
+        )
+      # Then, scale it with the CALIBER scaling
+      baseline.bin.val <-
+        caliberScaleUnits(baseline.bin.avg, variable) *
+        caliber.missing.coeffs$our_value[
+          caliber.missing.coeffs$quantity == variable
+        ]
+
+      # And now, offset all the discretised values by that amount to make them
+      # comparable...
+      cph.coeffs[cph.coeffs$var == variable, c('val', 'lower', 'upper')] <-
+        cph.coeffs[cph.coeffs$var == variable, c('val', 'lower', 'upper')] -
+        baseline.bin.val
+
+      # Now, plot this variable as a stepped line plot using those quantile
+      # boundaries
+      cox.discrete.plot <-
+        ggplot(
+          subset(cph.coeffs, var == variable),
+          aes(x = bin.min, y = val)
+        ) +
+        geom_step() +
+        geom_step(aes(y = lower), colour = 'grey') +
+        geom_step(aes(y = upper), colour = 'grey') +
+        labs(x = variable, y = 'Bx')
+
+      # If there's a missing value risk, add it
+      if(any(cph.coeffs$var == variable & cph.coeffs$level == 'missing')) {
+        # Expand the x-axis to squeeze the missing values in
+        x.axis.limits[2] <-
+          x.axis.limits[2] + diff(x.data.range) * missing.padding
+        # Put this missing value a third of the way into the missing area
+        cph.coeffs$missing.x.pos.disc[
+          cph.coeffs$var == variable &
+            cph.coeffs$level == 'missing'] <-
+          x.axis.limits[2] + diff(x.data.range) * missing.padding / 3
+
+        # Add the point to the graph (we'll set the axis limits later)
+        cox.discrete.plot <-
+          cox.discrete.plot +
+          geom_pointrange(
+            data = cph.coeffs[cph.coeffs$var == variable &
+                                cph.coeffs$level == 'missing', ],
+            aes(
+              x = missing.x.pos.disc,
+              y = val, ymin = lower,
+              ymax = upper
+            ),
+            colour = 'red'
+          )
+      }
+
+      # Now, let's add the line from the continuous Cox model. We only need two
+      # points because the lines are straight!
+      continuous.cox <-
+        data.frame(
+          var.x.values = x.data.range
+        )
+      # Scale the x-values
+      continuous.cox$var.x.scaled <-
+        caliberScaleUnits(continuous.cox$var.x.values, variable)
+      # Use the coefficients to calculate the risk at each x-value for the
+      # central estimate and the error bounds
+      continuous.cox$y <-
+        -caliber.missing.coeffs$our_value[
+          caliber.missing.coeffs$quantity == variable
+        ] * continuous.cox$var.x.scaled
+      continuous.cox$upper <-
+        -caliber.missing.coeffs$our_upper[
+          caliber.missing.coeffs$quantity == variable
+        ] * continuous.cox$var.x.scaled
+      continuous.cox$lower <-
+        -caliber.missing.coeffs$our_lower[
+          caliber.missing.coeffs$quantity == variable
+        ] * continuous.cox$var.x.scaled
+
+      cox.discrete.plot <-
+        cox.discrete.plot +
+        geom_line(
+          data = continuous.cox,
+          aes(x = var.x.values, y = y),
+          colour = 'blue'
+        ) +
+        geom_line(
+          data = continuous.cox,
+          aes(x = var.x.values, y = upper),
+          colour = 'lightblue'
+        ) +
+        geom_line(
+          data = continuous.cox,
+          aes(x = var.x.values, y = lower),
+          colour = 'lightblue'
+        )
+
+      # If there is one, add the missing value risk from the continuous model
+      if(any(caliber.missing.coeffs$quantity == paste0(variable, '_missing') &
+             caliber.missing.coeffs$unit == 'missing')) {
+
+        # Put this missing value two-thirds of the way into the missing area
+        caliber.missing.coeffs$missing.x.pos.cont[
+          caliber.missing.coeffs$quantity == paste0(variable, '_missing') &
+            caliber.missing.coeffs$unit == 'missing'] <-
+          x.axis.limits[2] + 2 * diff(x.data.range) * missing.padding / 3
+
+        cox.discrete.plot <-
+          cox.discrete.plot +
+          geom_pointrange(
+            data = caliber.missing.coeffs[
+              caliber.missing.coeffs$quantity == paste0(variable, '_missing') &
+                caliber.missing.coeffs$unit == 'missing',
+            ],
+            aes(
+              x = missing.x.pos.cont,
+              y = our_value, ymin = our_lower, ymax = our_upper
+            ),
+            colour = 'blue'
+          )
+      }
+
+      # Finally, set the x-axis limits; these are just the data range, or the
+      # data range plus a bit if there are missing values to squeeze in
+      cox.discrete.plot <-
+        cox.discrete.plot +
+        coord_cartesian(xlim = x.axis.limits) +
+        theme(axis.title.y = element_blank()) +
+        theme(plot.margin = unit(c(0.2, 0.1, 0.2, 0.1), "cm"))
+
+      cox.discrete.plots[[variable]] <- cox.discrete.plot
+    }
+  }
+}
+
+# Load the random forest variable effects file
+risk.by.variables <- read.csv(rf.filename)
+rf.vareff.plots <- list()
+
+for(variable in unique(risk.by.variables$var)) {
+  # Get the median of the normalised risk for every value of the variable
+  risk.aggregated <-
+    aggregate(
+      risk.normalised ~ val,
+      subset(risk.by.variables, var == variable), median
+    )
+
+  # Work out the axis limits: the 1st and 99th percentiles for x, and the 5th
+  # and 95th percentiles for y
+  x.axis.limits <-
+    quantile(COHORT.use[, variable], c(0.01, 0.99), na.rm = TRUE)
+  y.axis.limits <-
+    quantile(
+      subset(risk.by.variables, var == variable)$risk.normalised,
+      c(0.05, 0.95), na.rm = TRUE
+    )
+
+  # If there's a missing value risk in the graph above, expand the x-axis by
+  # the same relative padding so the panels match
+  if(any(cph.coeffs$var == variable & cph.coeffs$level == 'missing')) {
+    x.axis.limits[2] <-
+      x.axis.limits[2] + diff(x.axis.limits) * missing.padding
+  }
+
+  rf.vareff.plots[[variable]] <-
+    ggplot(
+      subset(risk.by.variables, var == variable),
+      aes(x = val, y = log(risk.normalised))
+    ) +
+    geom_line(aes(group = id), alpha = 0.003) +
+    geom_line(data = risk.aggregated, colour = 'blue') +
+    coord_cartesian(xlim = x.axis.limits, ylim = log(y.axis.limits)) +
+    labs(x = variable) +
+    theme(
+      plot.margin = unit(c(0.2, 0.1, 0.2, 0.1), "cm"),
+      axis.title.y = element_blank()
+    )
+}
+
+# Assemble the grid: discretised Cox model effects (A) on the top row, random
+# forest effects (B) below
+plot_grid(
+  cox.discrete.plots[['age']],
+  cox.discrete.plots[['haemoglobin_6mo']],
+  cox.discrete.plots[['total_wbc_6mo']],
+  cox.discrete.plots[['crea_6mo']],
+  rf.vareff.plots[['age']],
+  rf.vareff.plots[['haemoglobin_6mo']],
+  rf.vareff.plots[['total_wbc_6mo']],
+  rf.vareff.plots[['crea_6mo']],
+  labels = c('A', rep('', 3), 'B', rep('', 3)),
+  align = "v", ncol = 4
+)
+
+ggsave(
+  '../../output/variable-effects.pdf',
+  width = 16,
+  height = 10,
+  units = 'cm',
+  useDingbats = FALSE
+)
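+# Note on saving: ggsave() defaults to plot = last_plot(), which is typically
+# only set when a ggplot is printed, so if this script is source()d
+# non-interactively the grid above may not be the plot that gets saved. A
+# minimal, more explicit sketch (the name variable.effect.grid is illustrative
+# and not part of the original pipeline) would be:
+#
+#   variable.effect.grid <- plot_grid(
+#     cox.discrete.plots[['age']], cox.discrete.plots[['haemoglobin_6mo']],
+#     cox.discrete.plots[['total_wbc_6mo']], cox.discrete.plots[['crea_6mo']],
+#     rf.vareff.plots[['age']], rf.vareff.plots[['haemoglobin_6mo']],
+#     rf.vareff.plots[['total_wbc_6mo']], rf.vareff.plots[['crea_6mo']],
+#     labels = c('A', rep('', 3), 'B', rep('', 3)), align = "v", ncol = 4
+#   )
+#   ggsave(
+#     '../../output/variable-effects.pdf', variable.effect.grid,
+#     width = 16, height = 10, units = 'cm', useDingbats = FALSE
+#   )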