Diff of /cox-ph/cox-discretised.R [000000] .. [0375db]

--- a
+++ b/cox-ph/cox-discretised.R
@@ -0,0 +1,355 @@
+#+ knitr_setup, include = FALSE
+
+# Whether to cache the intensive code sections. Set to FALSE to recalculate
+# everything afresh.
+cacheoption <- FALSE
+# Disable lazy caching globally, because it fails for large objects, and all the
+# objects we wish to cache are large...
+opts_chunk$set(cache.lazy = FALSE)
+
+#' # Cross-validating discretisation of input variables in a survival model
+#' 
+#' 
+
+calibration.filename <- '../../output/survreg-crossvalidation-try5.csv'
+caliber.missing.coefficients.filename <-
+  '../../output/caliber-replicate-with-missing-survreg-bootstrap-coeffs-1.csv'
+comparison.filename <-
+  '../../output/caliber-replicate-with-missing-var-imp-try2.csv'
+# The first part of the filename for any output
+output.filename.base <- '../../output/all-cv-survreg-boot-try5'
+
+
+# What kind of model to fit: 'cph' (Cox model), 'survreg' (parametric survival
+# regression), or 'ranger'/'rfsrc' (two implementations of random survival
+# forests)
+model.type <- 'survreg'
+
+# If surv.predict is defined as a character vector here, the model only uses
+# the variables specified, eg c('age') would build a model purely based on age.
+# If not specified (ie left commented out), the default variables are used.
+# surv.predict <- c('age')
+
+#' ## Do the cross-validation
+#' 
+#' The steps for this are common regardless of model type, so run the script to
+#' get a cross-validated model to further analyse...
+#+ cox_discretised_cv, cache=cacheoption
+
+source('../lib/all-cv-bootstrap.R', chdir = TRUE)
+
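+#' As a quick orientation for what that script produces, here is a minimal,
+#' purely illustrative sketch of drawing a held-out test set (the real
+#' splitting, cross-validation and bootstrapping logic lives in
+#' all-cv-bootstrap.R, sourced above, so this chunk is not evaluated):
+#+ test_set_sketch, eval=FALSE
+
+# Hypothetical illustration only; the names mirror those used later in this
+# script (COHORT.optimised, test.set), but the actual assignments come from
+# ../lib/all-cv-bootstrap.R
+test.fraction <- 1/3
+test.set <-
+  sample(nrow(COHORT.optimised), round(test.fraction * nrow(COHORT.optimised)))
+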
+#' # Results
+#' 
+#' ## Performance
+#' 
+#' ### C-index
+#' 
+#' C-index is **`r round(surv.model.fit.coeffs['c.index', 'val'], 3)`
+#' (`r round(surv.model.fit.coeffs['c.index', 'lower'], 3)` - 
+#' `r round(surv.model.fit.coeffs['c.index', 'upper'], 3)`)** on the held-out
+#' test set.
+#' 
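+#' As a self-contained illustration of what a bootstrapped C-index interval
+#' means (simulated data and the standard survival package only; the pipeline's
+#' own procedure in all-cv-bootstrap.R may differ in detail):
+#+ c_index_sketch, eval=FALSE
+
+library(survival)
+set.seed(1)
+sim <- data.frame(age = rnorm(500))
+sim$time <- rexp(500, rate = exp(0.03 * sim$age))
+sim$status <- 1
+# Refit on bootstrap resamples, collecting the concordance each time
+boot.c <- replicate(200, {
+  i <- sample(nrow(sim), replace = TRUE)
+  fit <- coxph(Surv(time, status) ~ age, data = sim[i, ])
+  summary(fit)$concordance[1]
+})
+# 95% bootstrap interval for the C-index
+quantile(boot.c, c(0.025, 0.975))
+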
+#'
+#' ### Calibration
+#' 
+#' The bootstrapped calibration score is
+#' **`r round(surv.model.fit.coeffs['calibration.score', 'val'], 3)`
+#' (`r round(surv.model.fit.coeffs['calibration.score', 'lower'], 3)` - 
+#' `r round(surv.model.fit.coeffs['calibration.score', 'upper'], 3)`)**.
+#' 
+#' Let's draw a representative curve from the unbootstrapped fit... (It would be
+#' better to draw all the curves from the bootstrap fit to get an idea of
+#' variability, but I've not implemented this yet.)
+#' 
+#+ calibration_plot
+
+calibration.table <-
+  calibrationTable(surv.model.fit, COHORT.optimised[test.set, ])
+
+calibration.score <- calibrationScore(calibration.table)
+
+calibrationPlot(calibration.table)
+
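+#' (calibrationTable and calibrationScore are defined elsewhere in this
+#' repository. The underlying idea is to compare predicted risk with observed
+#' risk across groups of patients; a rough, self-contained sketch of that idea
+#' on simulated data, not the pipeline's actual implementation:)
+#+ calibration_sketch, eval=FALSE
+
+library(survival)
+set.seed(2)
+sim <- data.frame(x = rnorm(1000))
+sim$time <- rexp(1000, rate = 0.1 * exp(0.5 * sim$x))
+sim$status <- 1  # no censoring, so observed risk is a simple proportion
+fit <- coxph(Surv(time, status) ~ x, data = sim)
+horizon <- 5
+# Predicted risk of an event by the time horizon, per patient
+pred.risk <-
+  1 - as.numeric(summary(survfit(fit, newdata = sim), times = horizon)$surv)
+# Bin patients by predicted risk and compare mean predicted vs observed risk
+risk.bin <-
+  cut(pred.risk, quantile(pred.risk, 0:10 / 10), include.lowest = TRUE)
+plot(
+  tapply(pred.risk, risk.bin, mean),
+  tapply(sim$time <= horizon, risk.bin, mean),
+  xlab = 'mean predicted risk', ylab = 'observed risk'
+)
+abline(0, 1)
+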
+#' 
+#' ## Model fit
+#'
+#+ resulting_fit
+
+print(surv.model.fit)
+
+#' ## Cox coefficients
+#'
+#+ cox_coefficients_plot
+
+
+# Save bootstrapped performance values
+varsToTable(
+  data.frame(
+    model = 'cox',
+    imputation = FALSE,
+    discretised = TRUE,
+    c.index = surv.model.fit.coeffs['c.index', 'val'],
+    c.index.lower = surv.model.fit.coeffs['c.index', 'lower'],
+    c.index.upper = surv.model.fit.coeffs['c.index', 'upper'],
+    calibration.score = surv.model.fit.coeffs['calibration.score', 'val'],
+    calibration.score.lower =
+      surv.model.fit.coeffs['calibration.score', 'lower'],
+    calibration.score.upper =
+      surv.model.fit.coeffs['calibration.score', 'upper']
+  ),
+  performance.file,
+  index.cols = c('model', 'imputation', 'discretised')
+)
+
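+#' varsToTable is defined elsewhere in this codebase (it is not a base R
+#' function); conceptually it appears to upsert one row of results into the
+#' shared performance CSV, keyed on the index columns. A hypothetical sketch of
+#' that behaviour, not the actual implementation:
+#+ vars_to_table_sketch, eval=FALSE
+
+varsToTableSketch <- function(df, filename, index.cols) {
+  # Build a per-row key from the index columns
+  rowKey <- function(d) do.call(paste, c(d[index.cols], sep = '|'))
+  if(file.exists(filename)) {
+    existing <- read.csv(filename, stringsAsFactors = FALSE)
+    # Drop any previous row for the same model/imputation/discretisation combo
+    existing <- existing[rowKey(existing) != rowKey(df), ]
+    df <- rbind(existing, df)
+  }
+  write.csv(df, filename, row.names = FALSE)
+}
+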
+# Unpack the uncertainties again, this time negating them because survreg
+# coefficients have the opposite sign to the equivalent Cox coefficients
+surv.boot.ests <- bootStatsDf(surv.model.params.boot, transform = `-`)
+
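+#' (Why a simple negation works: with an exponential parametrisation, survreg's
+#' accelerated-failure-time coefficients are the negatives of the Cox
+#' log-hazard coefficients; for a Weibull model they would also need dividing
+#' by the scale. A quick check on simulated data, assuming the exponential
+#' case:)
+#+ survreg_sign_sketch, eval=FALSE
+
+library(survival)
+set.seed(3)
+sim <- data.frame(x = rnorm(2000))
+sim$time <- rexp(2000, rate = exp(0.5 * sim$x))
+sim$status <- 1
+# Both of these should be close to the true log-hazard coefficient of 0.5
+coef(coxph(Surv(time, status) ~ x, data = sim))['x']
+-coef(survreg(Surv(time, status) ~ x, data = sim, dist = 'exponential'))['x']
+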
+#' First, plot the factors and logicals as a scatter plot to compare with the
+#' continuous Cox model...
+
+# Pull coefficients from model with missing data
+caliber.missing.coeffs <- read.csv(caliber.missing.coefficients.filename)
+
+# Rename surv.boot.ests ready for merging
+names(surv.boot.ests) <-
+  c('cox_discrete_value', 'cox_discrete_lower', 'cox_discrete_upper')
+surv.boot.ests$quantity.level <- rownames(surv.boot.ests)
+# Convert names of the form variablemissing to variable_missingTRUE, for
+# compatibility with the coefficient names in the continuous model's output
+vars.with.missing <- endsWith(surv.boot.ests$quantity.level, 'missing')
+surv.boot.ests$quantity.level[vars.with.missing] <-
+  paste0(
+    substr(
+      surv.boot.ests$quantity.level[vars.with.missing],
+      1,
+      nchar(surv.boot.ests$quantity.level[vars.with.missing]) - nchar('missing')
+    ),
+    '_missingTRUE'
+  )
+
+# Create a data frame comparing them
+compare.coefficients <- merge(caliber.missing.coeffs, surv.boot.ests)
+
+ggplot(
+    compare.coefficients,
+    aes(x = our_value, y = cox_discrete_value, colour = unit == 'missing')
+  ) +
+  geom_abline(intercept = 0, slope = 1) +
+  geom_hline(yintercept = 1, colour = 'grey') +
+  geom_vline(xintercept = 1, colour = 'grey') +
+  geom_point() +
+  geom_errorbar(aes(ymin = cox_discrete_lower, ymax = cox_discrete_upper)) +
+  geom_errorbarh(aes(xmin = our_lower, xmax = our_upper)) +
+  geom_text_repel(aes(label = long_name)) +
+  theme_classic(base_size = 8)
+
+# Unpack variable and level names
+cph.coeffs <- cphCoeffs(
+  bootStats(surv.model.fit.boot, uncertainty = '95ci', transform = `-`),
+  COHORT.optimised, surv.predict, model.type = 'boot.survreg'
+)
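+# (cph.coeffs is expected to contain one row per variable level, including the
+# columns var, level, val, lower and upper, all of which are used in the
+# plotting loop below)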
+
+# We'll need the CALIBER scaling functions for plotting
+source('../cox-ph/caliber-scale.R')
+
+# Set up a list to store the plots
+cox.discrete.plots <- list()
+# Add dummy columns for x-position of missing values
+cph.coeffs$missing.x.pos.cont <- NA
+cph.coeffs$missing.x.pos.disc <- NA
+
+for(variable in unique(cph.coeffs$var)) {
+  # If it's a continuous variable, get the real centres of the bins
+  if(variable %in% process.settings$var) {
+    process.i <- which(variable == process.settings$var)
+    
+    if(process.settings$method[[process.i]] == 'binByQuantile') {
+      
+      variable.quantiles <-
+        getQuantiles(
+          COHORT.use[, variable],
+          process.settings$settings[[process.i]]
+        )
+      # For those rows which relate to this variable, and whose level isn't
+      # missing, put in the appropriate quantile boundaries for plotting
+      cph.coeffs$bin.min[cph.coeffs$var == variable & 
+                           cph.coeffs$level != 'missing'] <-
+        variable.quantiles[1:(length(variable.quantiles) - 1)]
+      cph.coeffs$bin.max[cph.coeffs$var == variable & 
+                           cph.coeffs$level != 'missing'] <-
+        variable.quantiles[2:length(variable.quantiles)]
+      # Make the final bin the 99th percentile
+      cph.coeffs$bin.max[cph.coeffs$var == variable & 
+                           cph.coeffs$level != 'missing'][
+                             length(variable.quantiles) - 1] <-
+        quantile(COHORT.use[, variable], 0.99, na.rm = TRUE)
+      
+      # Add a fake data point at the highest value to finish the graph
+      cph.coeffs <-
+        rbind(
+          cph.coeffs,
+          cph.coeffs[cph.coeffs$var == variable & 
+                       cph.coeffs$level != 'missing', ][
+                         length(variable.quantiles) - 1, ]
+        )
+      # Change it so that bin.min is bin.max from the old one
+      cph.coeffs$bin.min[nrow(cph.coeffs)] <-
+        cph.coeffs$bin.max[cph.coeffs$var == variable & 
+                             cph.coeffs$level != 'missing'][
+                               length(variable.quantiles) - 1]
+      
+      # Work out the data range as the 1st to 99th percentiles (the 99th
+      # percentile is what capped the final bin above). This range also gives
+      # the x-axis limits, unless there are missing values to accommodate on
+      # the right-hand edge.
+      x.data.range <-
+        quantile(COHORT.use[, variable], c(0.01, 0.99), na.rm = TRUE)
+      x.axis.limits <- x.data.range
+      
+      
+      # Finally, we need to scale this such that the baseline value is equal
+      # to the value for the equivalent place in the Cox model, to make the
+      # risks comparable...
+      
+      # First, we need to find the average value of this variable in the lowest
+      # bin (which is always the baseline here)
+      baseline.bin <- variable.quantiles[1:2]
+      baseline.bin.avg <- 
+        mean(
+          # Take only those values of the variable which are in the range
+          COHORT.use[
+            inRange(COHORT.use[, variable], baseline.bin, na.false = TRUE),
+            variable
+            ]
+        )
+      # Then, scale it with the caliber scaling
+      baseline.bin.val <-
+        caliberScaleUnits(baseline.bin.avg, variable) * 
+        caliber.missing.coeffs$our_value[
+          caliber.missing.coeffs$quantity == variable
+          ]
+      
+      # And now, add all the discretised values to that value to make them
+      # comparable...
+      cph.coeffs[cph.coeffs$var == variable, c('val', 'lower', 'upper')] <-
+        cph.coeffs[cph.coeffs$var == variable, c('val', 'lower', 'upper')] -
+        baseline.bin.val
+      
+      # Now, plot this variable as a stepped line plot using those quantile
+      # boundaries
+      cox.discrete.plot <-
+        ggplot(
+          subset(cph.coeffs, var == variable),
+          aes(x = bin.min, y = val)
+        ) +   
+        geom_step() +
+        geom_step(aes(y = lower), colour = 'grey') +
+        geom_step(aes(y = upper), colour = 'grey') +
+        ggtitle(variable)
+      
+      # If there's a missing value risk, add it
+      if(any(cph.coeffs$var == variable & cph.coeffs$level == 'missing')) {
+        # Expand the x-axis to squeeze the missing values in
+        x.axis.limits[2] <- 
+          x.axis.limits[2] + diff(x.data.range) * missing.padding
+        # Put this missing value a third of the way into the missing area
+        cph.coeffs$missing.x.pos.disc[
+          cph.coeffs$var == variable &
+            cph.coeffs$level == 'missing'] <-
+          x.axis.limits[2] + diff(x.data.range) * missing.padding / 3
+        
+        # Add the point to the graph (we'll set axis limits later)
+        cox.discrete.plot <-
+          cox.discrete.plot +
+          geom_pointrange(
+            data = cph.coeffs[cph.coeffs$var == variable & 
+                                cph.coeffs$level == 'missing', ],
+            aes(
+              x = missing.x.pos.disc,
+              y = val, ymin = lower,
+              ymax = upper
+            ),
+            colour = 'red'
+          )
+      }
+      
+      # Now, let's add the line from the continuous Cox model. We only need two
+      # points because the lines are straight!
+      continuous.cox <-
+        data.frame(
+          var.x.values = x.data.range
+        )
+      # Scale the x-values
+      continuous.cox$var.x.scaled <-
+        caliberScaleUnits(continuous.cox$var.x.values, variable)
+      # Use the risks to calculate risk per x for central estimate and errors
+      continuous.cox$y <-
+        -caliber.missing.coeffs$our_value[
+          caliber.missing.coeffs$quantity == variable
+          ] * continuous.cox$var.x.scaled
+      continuous.cox$upper <-
+        -caliber.missing.coeffs$our_upper[
+          caliber.missing.coeffs$quantity == variable
+          ] * continuous.cox$var.x.scaled
+      continuous.cox$lower <-
+        -caliber.missing.coeffs$our_lower[
+          caliber.missing.coeffs$quantity == variable
+          ] * continuous.cox$var.x.scaled
+      
+      cox.discrete.plot <-
+        cox.discrete.plot +
+        geom_line(
+          data = continuous.cox,
+          aes(x = var.x.values, y = y),
+          colour = 'blue'
+        ) +
+        geom_line(
+          data = continuous.cox,
+          aes(x = var.x.values, y = upper),
+          colour = 'lightblue'
+        ) +
+        geom_line(
+          data = continuous.cox,
+          aes(x = var.x.values, y = lower),
+          colour = 'lightblue'
+        )
+      
+      # If there is one, add missing value risk from the continuous model
+      if(any(caliber.missing.coeffs$quantity == paste0(variable, '_missing') &
+             caliber.missing.coeffs$unit == 'missing')) {
+        # Expand the x-axis to squeeze the missing values in
+        x.axis.limits[2] <- 
+          x.axis.limits[2] + diff(x.data.range) * missing.padding
+        # Put this missing value 2/3rds of the way into the missing area, so
+        # that it doesn't overlap the discretised model's missing-value point
+        cph.coeffs$missing.x.pos.cont[
+          cph.coeffs$var == variable &
+            cph.coeffs$level == 'missing'] <-
+          x.axis.limits[2] + 2 * diff(x.data.range) * missing.padding / 3
+        
+        cox.discrete.plot <-
+          cox.discrete.plot +
+          geom_pointrange(
+            data = cph.coeffs[
+              cph.coeffs$var == variable &
+                cph.coeffs$level == 'missing',
+              ],
+            aes(
+              x = missing.x.pos.cont,
+              y = val, ymin = lower, ymax = upper
+            ),
+            colour = 'blue'
+          )
+      }
+      
+      # Finally, set the x-axis limits; will just be the data range, or data
+      # range plus a bit if there are missing values to squeeze in
+      cox.discrete.plot <-
+        cox.discrete.plot +
+        coord_cartesian(xlim = x.axis.limits)
+      
+      cox.discrete.plots[[variable]] <- cox.discrete.plot
+    }
+  }
+}
+
+print(cox.discrete.plots)
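+
+#' If a single combined figure is preferred to printing the plots one by one,
+#' something like gridExtra could be used (illustrative only, and assumes the
+#' gridExtra package is available):
+#+ combined_plot_sketch, eval=FALSE
+
+gridExtra::grid.arrange(grobs = cox.discrete.plots, ncol = 2)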