|
# lib/rfsrc-cv-nsplit-bootstrap.R

bootstraps <- 20
split.rule <- 'logrank'
n.threads <- 16

# Cross-validation variables
ns.splits <- 0:20
ns.trees <- c(500, 1000, 2000)
ns.imputations <- 1:3
cv.n.folds <- 3
n.data <- NA # NA = use the full dataset; further rows may be excluded in prep

calibration.filename <- paste0(output.filename.base, '-calibration.csv')

continuous.vars <-
  c(
    'age', 'total_chol_6mo', 'hdl_6mo', 'pulse_6mo', 'crea_6mo',
    'total_wbc_6mo', 'haemoglobin_6mo'
  )

untransformed.vars <- c('anonpatid', 'surv_time', 'imd_score', 'exclude')

source('../lib/shared.R')
require(ggrepel)

# Load the data and convert to data frame to make column-selecting code in
# prepData simpler
COHORT.full <- data.frame(fread(data.filename))

# If n.data was specified...
if(!is.na(n.data)){
  # Take a subset n.data in size
  COHORT.use <- sample.df(COHORT.full, n.data)
  rm(COHORT.full)
} else {
  # Use all the data
  COHORT.use <- COHORT.full
  rm(COHORT.full)
}

# We now need a quick null preparation of the data to get its length (some rows
# may be excluded during preparation)
COHORT.prep <-
  prepData(
    COHORT.use,
    cols.keep, discretise.settings, surv.time, surv.event,
    surv.event.yes, extra.fun = caliberExtraPrep, n.keep = n.data
  )
n.data <- nrow(COHORT.prep)

# Define indices of test set
test.set <- testSetIndices(COHORT.prep, random.seed = 78361)
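
# Note: testSetIndices is a project helper defined outside this file (assumed
# to come from ../lib/shared.R); from its use here it returns row indices for a
# held-out test set, and the fixed random.seed makes that split reproducible.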
|
|
# Process settings: leave every variable untransformed (method and settings
# are all NA)
process.settings <-
  list(
    var = c(untransformed.vars, continuous.vars),
    method = rep(NA, length(untransformed.vars) + length(continuous.vars)),
    settings = rep(NA, length(untransformed.vars) + length(continuous.vars))
  )

# If we've not already done a calibration, then do one
if(!file.exists(calibration.filename)) {
  # Create an empty data frame to aggregate stats per fold
  cv.performance <- data.frame()

  # Items to cross-validate over
  cv.vars <- expand.grid(ns.splits, ns.trees, ns.imputations)
  names(cv.vars) <- c('n.splits', 'n.trees', 'n.imputations')
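
  # Note: expand.grid gives every combination of the candidate values above:
  # 21 nsplit values x 3 forest sizes x 3 imputation counts = 189 calibrations,
  # each assessed with cv.n.folds-fold cross-validation below.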
|
|
  # Prep the data (since we're not cross-validating on data prep, this can be
  # done once before the loop)
  COHORT.cv <-
    prepData(
      # Data for cross-validation excludes test set
      COHORT.use[-test.set, ],
      cols.keep,
      process.settings,
      surv.time, surv.event,
      surv.event.yes,
      extra.fun = caliberExtraPrep
    )

  # Finally, add missing flag columns, but leave the missing data intact because
  # rfsrc can do on-the-fly imputation
  COHORT.cv <- prepCoxMissing(COHORT.cv, missingReplace = NA)

  # Add on those column names we just created
  surv.predict <-
    c(surv.predict, names(COHORT.cv)[grepl('_missing', names(COHORT.cv))])

  # Run cross-validations. No need to parallelise the loop because rfsrc is
  # itself parallelised
  for(i in 1:nrow(cv.vars)) {
    cat(
      'Calibration', i, '...\n'
    )

    # Get folds for cross-validation
    cv.folds <- cvFolds(nrow(COHORT.cv), cv.n.folds)
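
    # Note: cvFolds is assumed to be a project helper that splits the row
    # indices into cv.n.folds groups; below, cv.folds[[j]] is held out for
    # validation while the remaining rows are used for training. New folds are
    # drawn for each calibration i.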
|
|
    cv.fold.performance <- data.frame()

    for(j in 1:cv.n.folds) {
      time.start <- handyTimer()
      # Fit model to the training set
      surv.model.fit <-
        survivalFit(
          surv.predict,
          COHORT.cv[-cv.folds[[j]],],
          model.type = 'rfsrc',
          n.trees = cv.vars$n.trees[i],
          split.rule = split.rule,
          n.threads = n.threads,
          nsplit = cv.vars$n.splits[i],
          nimpute = cv.vars$n.imputations[i],
          na.action = 'na.impute'
        )
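      # Note on the arguments above: survivalFit is a project wrapper, and with
      # model.type = 'rfsrc' these settings presumably pass through to
      # randomForestSRC::rfsrc(), where nsplit is the number of random split
      # points tried per variable (0 = deterministic splitting on all values),
      # nimpute is the number of iterations of the internal missing-data
      # imputation, and na.action = 'na.impute' imputes missing values while
      # growing the forest.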
|
|
      time.learn <- handyTimer(time.start)

      time.start <- handyTimer()
      # Get C-indices for training and validation sets
      c.index.train <-
        cIndex(
          surv.model.fit, COHORT.cv[-cv.folds[[j]],],
          na.action = 'na.impute'
        )
      c.index.val <-
        cIndex(
          surv.model.fit, COHORT.cv[cv.folds[[j]],],
          na.action = 'na.impute'
        )
      time.predict <- handyTimer(time.start)

      # Append the stats we've obtained from this fold
      cv.fold.performance <-
        rbind(
          cv.fold.performance,
          data.frame(
            calibration = i,
            cv.fold = j,
            n.trees = cv.vars$n.trees[i],
            n.splits = cv.vars$n.splits[i],
            n.imputations = cv.vars$n.imputations[i],
            c.index.train,
            c.index.val,
            time.learn,
            time.predict
          )
        )

    } # End cross-validation loop (j)

    # rbind the performance by fold
    cv.performance <-
      rbind(
        cv.performance,
        cv.fold.performance
      )

    # Save output at the end of each loop
    write.csv(cv.performance, calibration.filename)

  } # End calibration loop (i)

} else { # If we did previously calibrate, load it
  cv.performance <- read.csv(calibration.filename)
}

# Find the best calibration...
# First, average performance across cross-validation folds
cv.performance.average <-
  aggregate(
    c.index.val ~ calibration,
    data = cv.performance,
    mean
  )
# Find the highest value
best.calibration <-
  cv.performance.average$calibration[
    which.max(cv.performance.average$c.index.val)
  ]
# And finally, find the first row of that calibration to get the n.trees,
# n.splits and n.imputations values
best.calibration.row1 <-
  min(which(cv.performance$calibration == best.calibration))
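
# Illustrative sketch, not part of the original pipeline: ggrepel is loaded
# above but not used elsewhere in this file, so one plausible use is a quick
# labelled plot of the calibration results. Wrapped in if(FALSE) so it never
# runs as part of the analysis; it only assumes the cv.performance.average
# columns created above.
if(FALSE) {
  require(ggplot2)
  print(
    ggplot(cv.performance.average, aes(x = calibration, y = c.index.val)) +
      geom_point() +
      ggrepel::geom_text_repel(aes(label = calibration))
  )
}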
|
|
# Prep the data to fit and test with
COHORT.prep <-
  prepData(
    # This time, prepare the full dataset; training and test sets are selected
    # by indexing with test.set below
    COHORT.use,
    cols.keep,
    process.settings,
    surv.time, surv.event,
    surv.event.yes,
    extra.fun = caliberExtraPrep
  )

# Finally, add missing flag columns, but leave the missing data intact because
# rfsrc can do on-the-fly imputation
COHORT.prep <- prepCoxMissing(COHORT.prep, missingReplace = NA)

# Add on those column names we just created
surv.predict <-
  c(surv.predict, names(COHORT.prep)[grepl('_missing', names(COHORT.prep))])

#' ## Fit the final model
#'
#' This may take some time, so we'll cache it if possible...

#+ fit_final_model
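
# Note: lines beginning #' and #+ are knitr::spin markup; #' lines become prose
# and #+ lines set chunk options when this script is rendered to a report, and
# plain R treats them as ordinary comments.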
|
|
surv.model.fit <-
  survivalFit(
    surv.predict,
    COHORT.prep[-test.set,],
    model.type = 'rfsrc',
    n.trees = cv.performance[best.calibration.row1, 'n.trees'],
    split.rule = split.rule,
    n.threads = n.threads,
    nsplit = cv.performance[best.calibration.row1, 'n.splits'],
    nimpute = cv.performance[best.calibration.row1, 'n.imputations'],
    na.action = 'na.impute'
  )
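
# Note: survivalFitBoot and bootStatsDf below are project helpers (not defined
# in this file). From the arguments and how the result is used, survivalFitBoot
# appears to refit the model on bootstrap resamples of the training set,
# evaluate on the test set, and write per-bootstrap statistics to the
# '-boot-all.csv' file; bootStatsDf then summarises them into point estimates
# with lower/upper bounds.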
|
|
surv.model.params.boot <-
  survivalFitBoot(
    surv.predict,
    COHORT.prep[-test.set,], # Training set
    COHORT.prep[test.set,],  # Test set
    model.type = 'rfsrc',
    n.threads = n.threads,
    bootstraps = bootstraps,
    n.trees = cv.performance[best.calibration.row1, 'n.trees'],
    split.rule = split.rule,
    nsplit = cv.performance[best.calibration.row1, 'n.splits'],
    nimpute = cv.performance[best.calibration.row1, 'n.imputations'],
    na.action = 'na.impute',
    filename = paste0(output.filename.base, '-boot-all.csv')
  )

# Summarise the bootstrapped statistics (C-index, calibration score) across
# bootstraps
surv.model.fit.coeffs <- bootStatsDf(surv.model.params.boot)

# Save them to the all-models comparison table
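# Note: varsToTable is another project helper (assumed); it appears to insert
# or update this model's row in performance.file, keyed by the index.cols, so
# the rfsrc results can be compared with the project's other models.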
|
|
varsToTable(
  data.frame(
    model = 'rfsrc',
    imputation = FALSE,
    discretised = FALSE,
    c.index = surv.model.fit.coeffs['c.index', 'val'],
    c.index.lower = surv.model.fit.coeffs['c.index', 'lower'],
    c.index.upper = surv.model.fit.coeffs['c.index', 'upper'],
    calibration.score = surv.model.fit.coeffs['calibration.score', 'val'],
    calibration.score.lower =
      surv.model.fit.coeffs['calibration.score', 'lower'],
    calibration.score.upper =
      surv.model.fit.coeffs['calibration.score', 'upper']
  ),
  performance.file,
  index.cols = c('model', 'imputation', 'discretised')
)

write.csv(
  surv.model.fit.coeffs,
  paste0(output.filename.base, '-boot-summary.csv')
)