lib/rfsrc-cv-mtry-nsplit-logical.R

bootstraps <- 3
split.rule <- 'logrank'
n.threads <- 20
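# 'logrank' is the standard log-rank splitting rule for survival forests in
# randomForestSRC; bootstraps and n.threads are passed through to the model
# fitting and bootstrap wrappers used below.
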
# Cross-validation variables
ns.splits <- c(0, 5, 10, 15, 20, 30)
ms.try <- c(50, 100, 200, 300, 400)
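# ns.splits are candidate values for rfsrc's nsplit argument (the number of
# randomly chosen split points per candidate variable; 0 means deterministic
# splitting over all possible split points), and ms.try are candidates for
# mtry (the number of variables tried at each split). expand.grid below
# crosses them into 6 x 5 = 30 calibrations to try.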
n.trees.cv <- 500
n.imputations <- 3
cv.n.folds <- 3
n.trees.final <- 2000
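# Note that cross-validation uses fewer trees (n.trees.cv) than the final
# model (n.trees.final), which keeps the grid search relatively quick.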
n.data <- NA # Use the full dataset; further rows may be excluded in prep

calibration.filename <- paste0(output.filename.base, '-calibration.csv')

# If we've not already done a calibration, then do one
if(!file.exists(calibration.filename)) {
  # Create an empty data frame to aggregate stats per fold
  cv.performance <- data.frame()

  # Items to cross-validate over
  cv.vars <- expand.grid(ns.splits, ms.try)
  names(cv.vars) <- c('n.splits', 'm.try')

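  # Cross-validate on everything except the held-out test set; the test set
  # is only used for the final bootstrap evaluation below.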
  COHORT.cv <- COHORT.bigdata[-test.set, ]

  # Run cross-validations. No need to parallelise because rfsrc is parallelised
  for(i in 1:nrow(cv.vars)) {
    cat(
      'Calibration', i, '...\n'
    )

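    # NB: cvFolds() is not defined in this file; judging by the cv.folds[[j]]
    # indexing below, it is assumed to return a list of cv.n.folds row-index
    # vectors. A minimal stand-in with that shape (illustrative only, not used
    # here) would be something like:
    #   cvFoldsSketch <- function(n, k) split(sample(n), rep_len(seq_len(k), n))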
    # Get folds for cross-validation
    cv.folds <- cvFolds(nrow(COHORT.cv), cv.n.folds)

    cv.fold.performance <- data.frame()

    for(j in 1:cv.n.folds) {
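      # handyTimer() (another project helper) appears to return a start time
      # when called with no arguments, and the elapsed time when passed that
      # start time back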
      time.start <- handyTimer()
      # Fit model to the training set
      surv.model.fit <-
        survivalFit(
          surv.predict,
          COHORT.cv[-cv.folds[[j]],],
          model.type = 'rfsrc',
          n.trees = n.trees.cv,
          split.rule = split.rule,
          n.threads = n.threads,
          nsplit = cv.vars$n.splits[i],
          nimpute = n.imputations,
          na.action = 'na.impute',
          mtry = cv.vars$m.try[i]
        )
      time.learn <- handyTimer(time.start)

      time.start <- handyTimer()
      # Get C-index on validation set
      c.index.val <-
        cIndex(
          surv.model.fit, COHORT.cv[cv.folds[[j]],],
          na.action = 'na.impute'
        )
      time.c.index <- handyTimer(time.start)

      time.start <- handyTimer()
      # Get calibration score on validation set
      calibration.score <-
        calibrationScore(
          calibrationTable(
            surv.model.fit, COHORT.cv[cv.folds[[j]],], na.action = 'na.impute'
          )
        )
      time.calibration <- handyTimer(time.start)

      # Append the stats we've obtained from this fold
      cv.fold.performance <-
        rbind(
          cv.fold.performance,
          data.frame(
            calibration = i,
            cv.fold = j,
            n.splits = cv.vars$n.splits[i],
            m.try = cv.vars$m.try[i],
            c.index.val,
            time.learn,
            time.c.index,
            time.calibration
          )
        )

    } # End cross-validation loop (j)

    # rbind the performance by fold
    cv.performance <-
      rbind(
        cv.performance,
        cv.fold.performance
      )

    # Save output at the end of each loop
    write.csv(cv.performance, calibration.filename)

  } # End calibration loop (i)

} else { # If we did previously calibrate, load it
  cv.performance <- read.csv(calibration.filename)
}

# Find the best calibration...
# First, average performance across cross-validation folds
cv.performance.average <-
  aggregate(
    c.index.val ~ calibration,
    data = cv.performance,
    mean
  )
# Find the highest value
best.calibration <-
  cv.performance.average$calibration[
    which.max(cv.performance.average$c.index.val)
  ]
# And finally, find the first row of that calibration to get the n.splits and
# m.try values
best.calibration.row1 <-
  min(which(cv.performance$calibration == best.calibration))

#' ## Fit the final model
#'
#' This may take some time, so we'll cache it if possible...

#+ fit_final_model

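# Refit on the full training set (everything except the held-out test set)
# using the best nsplit/mtry combination found above; importance = 'permute'
# requests permutation-based variable importance from rfsrc.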
surv.model.fit <-
  survivalFit(
    surv.predict,
    COHORT.bigdata[-test.set,],
    model.type = 'rfsrc',
    n.trees = n.trees.final,
    split.rule = split.rule,
    n.threads = n.threads,
    nimpute = n.imputations,
    nsplit = cv.performance[best.calibration.row1, 'n.splits'],
    mtry = cv.performance[best.calibration.row1, 'm.try'],
    na.action = 'na.impute',
    importance = 'permute'
  )

# Save the fit object
saveRDS(
  surv.model.fit,
  paste0(output.filename.base, '-surv-model.rds')
)

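# Bootstrap the model over the training/test split so we can put confidence
# intervals on the C-index and calibration score (see bootStats below).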
surv.model.fit.boot <-
  survivalBootstrap(
    surv.predict,
    COHORT.bigdata[-test.set,], # Training set
    COHORT.bigdata[test.set,],  # Test set
    model.type = 'rfsrc',
    n.trees = n.trees.final,
    split.rule = split.rule,
    n.threads = n.threads,
    nimpute = n.imputations,
    nsplit = cv.performance[best.calibration.row1, 'n.splits'],
    mtry = cv.performance[best.calibration.row1, 'm.try'],
    na.action = 'na.impute',
    bootstraps = bootstraps
  )

# Save the fit object
saveRDS(
  surv.model.fit.boot,
  paste0(output.filename.base, '-surv-model-bootstraps.rds')
)

# Get C-indices and calibration scores for training and test sets, with 95% CIs
surv.model.fit.coeffs <- bootStats(surv.model.fit.boot, uncertainty = '95ci')

# Save them to the all-models comparison table
varsToTable(
  data.frame(
    model = 'rfbigdata',
    imputation = FALSE,
    discretised = FALSE,
    c.index = surv.model.fit.coeffs['c.test', 'val'],
    c.index.lower = surv.model.fit.coeffs['c.test', 'lower'],
    c.index.upper = surv.model.fit.coeffs['c.test', 'upper'],
    calibration.score = surv.model.fit.coeffs['calibration.score', 'val'],
    calibration.score.lower =
      surv.model.fit.coeffs['calibration.score', 'lower'],
    calibration.score.upper =
      surv.model.fit.coeffs['calibration.score', 'upper']
  ),
  performance.file,
  index.cols = c('model', 'imputation', 'discretised')
)