--- a/lib/all-cv-bootstrap.R
+++ b/lib/all-cv-bootstrap.R
@@ -0,0 +1,294 @@
+# All model types are bootstrapped this many times
+bootstraps <- 200
+# n.trees is (obviously) only relevant for random forests
+n.trees <- 500
+# The following two variables are only relevant if the model.type is 'ranger'
+split.rule <- 'logrank'
+n.threads <- 10
+
+# Cross-validation variables
+input.n.bins <- 10:20
+cv.n.folds <- 3
+n.calibrations <- 1000
+n.data <- NA # NA = use the full dataset; further rows may be excluded in prep
+
+continuous.vars <-
+  c(
+    'age', 'total_chol_6mo', 'hdl_6mo', 'pulse_6mo', 'crea_6mo',
+    'total_wbc_6mo', 'haemoglobin_6mo'
+  )
+
+source('shared.R')
+require(ggrepel)
+
+# Load the data and convert to data frame to make column-selecting code in
+# prepData simpler
+COHORT.full <- data.frame(fread(data.filename))
+
+# If n.data was specified...
+if(!is.na(n.data)){
+  # ...take a subset n.data rows in size
+  COHORT.use <- sample.df(COHORT.full, n.data)
+  rm(COHORT.full)
+} else {
+  # Otherwise, use all the data
+  COHORT.use <- COHORT.full
+  rm(COHORT.full)
+}
+
+# We now need a quick null preparation of the data to get its length (some rows
+# may be excluded during preparation)
+COHORT.prep <-
+  prepData(
+    COHORT.use,
+    cols.keep, discretise.settings, surv.time, surv.event,
+    surv.event.yes, extra.fun = caliberExtraPrep, n.keep = n.data
+  )
+n.data <- nrow(COHORT.prep)
+
+# Define indices of the test set
+test.set <- testSetIndices(COHORT.prep, random.seed = 78361)
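+
+# The calibration below discretises each continuous variable by quantile.
+# binByQuantile itself is defined in shared.R; the sketch below is an
+# assumption about the kind of binning it performs, not its actual
+# implementation: n + 1 quantile breaks yield n bins.
+binByQuantileSketch <- function(x, quantile.breaks) {
+  # Translate quantile positions (0 to 1) into cut points on the scale of x,
+  # dropping duplicates in case of heavily tied data
+  cut.points <- unique(quantile(x, probs = quantile.breaks, na.rm = TRUE))
+  # include.lowest ensures the minimum value falls into the first bin
+  cut(x, breaks = cut.points, include.lowest = TRUE)
+}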
+
+# If we've not already done a calibration, then do one
+if(!file.exists(calibration.filename)) {
+  # Create an empty data frame to aggregate stats per fold
+  cv.performance <- data.frame()
+
+  # We can parallelise this bit with foreach, so set that up
+  initParallel(n.threads)
+
+  # Run cross-validations in parallel
+  cv.performance <-
+    foreach(i = 1:n.calibrations, .combine = 'rbind') %dopar% {
+      cat(
+        'Calibration', i, '...\n'
+      )
+
+      # Reset process settings with the base settings
+      process.settings <-
+        list(
+          var = c('anonpatid', 'time_death', 'imd_score', 'exclude'),
+          method = c(NA, NA, NA, NA),
+          settings = list(NA, NA, NA, NA)
+        )
+      # Generate random numbers of bins (for n bins, you need n + 1 breaks,
+      # hence the + 1)
+      n.bins <- sample(input.n.bins, length(continuous.vars), replace = TRUE) + 1
+      names(n.bins) <- continuous.vars
+      # Go through each variable, setting it to bin by quantile with a random
+      # number of bins
+      for(j in 1:length(continuous.vars)) {
+        process.settings$var <- c(process.settings$var, continuous.vars[j])
+        process.settings$method <- c(process.settings$method, 'binByQuantile')
+        process.settings$settings <-
+          c(
+            process.settings$settings,
+            list(
+              seq(
+                # Quantiles are obviously between 0 and 1
+                0, 1,
+                # n.bins[j] already counts breaks, ie bins + 1
+                length.out = n.bins[j]
+              )
+            )
+          )
+      }
+
+      # Prep the data given the variables provided
+      COHORT.cv <-
+        prepData(
+          # Data for cross-validation excludes the test set
+          COHORT.use[-test.set, ],
+          cols.keep,
+          process.settings,
+          surv.time, surv.event,
+          surv.event.yes,
+          extra.fun = caliberExtraPrep
+        )
+
+      # Get folds for cross-validation
+      cv.folds <- cvFolds(nrow(COHORT.cv), cv.n.folds)
+
+      cv.fold.performance <- data.frame()
+
+      for(j in 1:cv.n.folds) {
+        time.start <- handyTimer()
+        # Fit the model to the training set
+        surv.model.fit <-
+          survivalFit(
+            surv.predict,
+            COHORT.cv[-cv.folds[[j]], ],
+            model.type = model.type,
+            n.trees = n.trees,
+            split.rule = split.rule
+            # n.threads not used because this is already run in parallel
+          )
+        time.learn <- handyTimer(time.start)
+
+        time.start <- handyTimer()
+        # Get C-indices for the training and validation sets
+        c.index.train <-
+          cIndex(
+            surv.model.fit, COHORT.cv[-cv.folds[[j]], ], model.type = model.type
+          )
+        c.index.val <-
+          cIndex(
+            surv.model.fit, COHORT.cv[cv.folds[[j]], ], model.type = model.type
+          )
+        time.predict <- handyTimer(time.start)
+
+        # Append the stats we've obtained from this fold
+        cv.fold.performance <-
+          rbind(
+            cv.fold.performance,
+            data.frame(
+              calibration = i,
+              cv.fold = j,
+              as.list(n.bins),
+              c.index.train,
+              c.index.val,
+              time.learn,
+              time.predict
+            )
+          )
+
+      } # End cross-validation loop (j)
+
+      # Return the per-fold performance; foreach rbinds it across calibrations
+      cv.fold.performance
+    } # End calibration loop (i)
+
+  # Save output at the end of the calibration
+  write.csv(cv.performance, calibration.filename)
+
+} else { # If we did previously calibrate, load it
+  cv.performance <- read.csv(calibration.filename)
+}
+
+# Find the best calibration...
+# First, average performance across cross-validation folds
+cv.performance.average <-
+  aggregate(
+    c.index.val ~ calibration,
+    data = cv.performance,
+    mean
+  )
+# Find the highest value
+best.calibration <-
+  cv.performance.average$calibration[
+    which.max(cv.performance.average$c.index.val)
+  ]
+# And finally, find the first row of that calibration to get the n.bins values
+best.calibration.row1 <-
+  min(which(cv.performance$calibration == best.calibration))
+
+# Get its parameters
+n.bins <-
+  t(
+    cv.performance[best.calibration.row1, continuous.vars]
+  )
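+# (Each row of cv.performance is one calibration/fold pair: the number of
+# breaks per continuous variable, C-indices on the training and validation
+# folds, and the learning and prediction timings. The continuous-variable
+# columns of the best calibration's first row therefore hold the winning
+# numbers of breaks.)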
+
+# Prepare the data with those settings...
+
+# Reset process settings with the base settings
+process.settings <-
+  list(
+    var = c('anonpatid', 'time_death', 'imd_score', 'exclude'),
+    method = c(NA, NA, NA, NA),
+    settings = list(NA, NA, NA, NA)
+  )
+for(j in 1:length(continuous.vars)) {
+  process.settings$var <- c(process.settings$var, continuous.vars[j])
+  process.settings$method <- c(process.settings$method, 'binByQuantile')
+  process.settings$settings <-
+    c(
+      process.settings$settings,
+      list(
+        seq(
+          # Quantiles are obviously between 0 and 1
+          0, 1,
+          # Use the optimal number of breaks found by the calibration above
+          length.out = n.bins[j]
+        )
+      )
+    )
+}
+
+# Prep the data given the variables provided
+COHORT.optimised <-
+  prepData(
+    # Prepare the full dataset; the test set is excluded at fitting time
+    COHORT.use,
+    cols.keep,
+    process.settings,
+    surv.time, surv.event,
+    surv.event.yes,
+    extra.fun = caliberExtraPrep
+  )
+
+#' ## Fit the final model
+#'
+#' This may take some time, so we'll cache it if possible...
+
+#+ fit_final_model
+
+# Fit to the whole training set
+surv.model.fit <-
+  survivalFit(
+    surv.predict,
+    COHORT.optimised[-test.set, ], # Training set
+    model.type = model.type,
+    n.trees = n.trees,
+    split.rule = split.rule,
+    n.threads = n.threads
+  )
+
+cl <- initParallel(n.threads, backend = 'doParallel')
+
+surv.model.params.boot <-
+  foreach(
+    i = 1:bootstraps,
+    .combine = rbind,
+    .packages = c('survival'),
+    .verbose = TRUE
+  ) %dopar% {
+
+    # Bootstrap-sampled training set
+    COHORT.boot <-
+      sample.df(
+        COHORT.optimised[-test.set, ],
+        nrow(COHORT.optimised[-test.set, ]),
+        replace = TRUE
+      )
+
+    surv.model.fit.i <-
+      survivalFit(
+        surv.predict,
+        COHORT.boot,
+        model.type = model.type,
+        n.trees = n.trees,
+        split.rule = split.rule,
+        # 1 thread, because we're parallelising the bootstrapping
+        n.threads = 1
+      )
+
+    # Work out other quantities of interest
+    #var.imp.vector <- bootstrapVarImp(surv.model.fit.i, COHORT.boot)
+    c.index <- cIndex(surv.model.fit.i, COHORT.optimised[test.set, ])
+    calibration.score <-
+      calibrationScoreWrapper(surv.model.fit.i, COHORT.optimised[test.set, ])
+
+    data.frame(
+      i,
+      t(coef(surv.model.fit.i)),
+      #t(var.imp.vector),
+      c.index,
+      calibration.score
+    )
+  }
+
+# Save the bootstrapped parameter estimates
+write.csv(surv.model.params.boot, paste0(output.filename.base, '-surv-boot.csv'))
+
+# Tidy up by removing the cluster
+stopCluster(cl)
+
+# Summarise the bootstrap replicates per coefficient
+surv.model.fit.coeffs <- bootStatsDf(surv.model.params.boot)
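+
+# bootStatsDf is defined in shared.R. The sketch below is an assumption about
+# the kind of summary it produces, not its actual implementation: reduce each
+# bootstrapped column to a point estimate with percentile confidence limits.
+bootStatsDfSketch <- function(boot.df) {
+  data.frame(
+    val   = sapply(boot.df, mean),
+    lower = sapply(boot.df, quantile, probs = 0.025),
+    upper = sapply(boot.df, quantile, probs = 0.975)
+  )
+}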