MLehealth / Git / [0375db] /lib/rfsrc-cv-mtry-nsplit-logical.R

Models:
philipB/
MLehealth
Downloads: 1
[0375db]: / lib / rfsrc-cv-mtry-nsplit-logical.R
History
Download this file
198 lines (169 with data), 5.5 kB

bootstraps <- 3
split.rule <- 'logrank'
n.threads <- 20

# Cross-validation variables
ns.splits <- c(0, 5, 10, 15, 20, 30)
ms.try <- c(50, 100, 200, 300, 400)
n.trees.cv  <- 500
n.imputations <- 3
cv.n.folds <- 3
n.trees.final <- 2000
n.data <- NA # This is of full dataset...further rows may be excluded in prep

calibration.filename <- paste0(output.filename.base, '-calibration.csv')

# If we've not already done a calibration, then do one
if(!file.exists(calibration.filename)) {
  # Create an empty data frame to aggregate stats per fold
  cv.performance <- data.frame()
  
  # Items to cross-validate over
  cv.vars <- expand.grid(ns.splits, ms.try)
  names(cv.vars) <- c('n.splits', 'm.try')
  
  COHORT.cv <- COHORT.bigdata[-test.set, ]
  
  # Run crossvalidations. No need to parallelise because rfsrc is parallelised
  for(i in 1:nrow(cv.vars)) {
    cat(
      'Calibration', i, '...\n'
    )
    
    # Get folds for cross-validation
    cv.folds <- cvFolds(nrow(COHORT.cv), cv.n.folds)
    
    cv.fold.performance <- data.frame()
    
    for(j in 1:cv.n.folds) {
      time.start <- handyTimer()
      # Fit model to the training set
      surv.model.fit <-
        survivalFit(
          surv.predict,
          COHORT.cv[-cv.folds[[j]],],
          model.type = 'rfsrc',
          n.trees = n.trees.cv,
          split.rule = split.rule,
          n.threads = n.threads,
          nsplit = cv.vars$n.splits[i],
          nimpute = n.imputations,
          na.action = 'na.impute',
          mtry = cv.vars$m.try[i]
        )
      time.learn <- handyTimer(time.start)
      
      time.start <- handyTimer()
      # Get C-index on validation set
      c.index.val <-
        cIndex(
          surv.model.fit, COHORT.cv[cv.folds[[j]],],
          na.action = 'na.impute'
        )
      time.c.index <- handyTimer(time.start)
      
      time.start <- handyTimer()
      # Get C-index on validation set
      calibration.score <-
        calibrationScore(
          calibrationTable(
            surv.model.fit, COHORT.cv[cv.folds[[j]],], na.action = 'na.impute'
          )
        )
      time.calibration <- handyTimer(time.start)
      
      # Append the stats we've obtained from this fold
      cv.fold.performance <-
        rbind(
          cv.fold.performance,
          data.frame(
            calibration = i,
            cv.fold = j,
            n.splits = cv.vars$n.splits[i],
            m.try = cv.vars$m.try[i],
            c.index.val,
            time.learn,
            time.c.index,
            time.calibration
          )
        )
      
    } # End cross-validation loop (j)
    
    
    # rbind the performance by fold
    cv.performance <-
      rbind(
        cv.performance,
        cv.fold.performance
      )
    
    # Save output at the end of each loop
    write.csv(cv.performance, calibration.filename)
    
  } # End calibration loop (i)
  


} else { # If we did previously calibrate, load it
  cv.performance <- read.csv(calibration.filename)
}

# Find the best calibration...
# First, average performance across cross-validation folds
cv.performance.average <-
  aggregate(
    c.index.val ~ calibration,
    data = cv.performance,
    mean
  )
# Find the highest value
best.calibration <-
  cv.performance.average$calibration[
    which.max(cv.performance.average$c.index.val)
  ]
# And finally, find the first row of that calibration to get the n.bins values
best.calibration.row1 <-
  min(which(cv.performance$calibration == best.calibration))

#' ## Fit the final model
#' 
#' This may take some time, so we'll cache it if possible...

#+ fit_final_model

surv.model.fit <-
  survivalFit(
    surv.predict,
    COHORT.bigdata[-test.set,],
    model.type = 'rfsrc',
    n.trees = n.trees.final,
    split.rule = split.rule,
    n.threads = n.threads,
    nimpute = n.imputations,
    nsplit = cv.performance[best.calibration.row1, 'n.splits'],
    mtry = cv.performance[best.calibration.row1, 'm.try'],
    na.action = 'na.impute',
    importance = 'permute'
  )

# Save the fit object
saveRDS(
  surv.model.fit,
  paste0(output.filename.base, '-surv-model.rds')
)

surv.model.fit.boot <-
  survivalBootstrap(
    surv.predict,
    COHORT.bigdata[-test.set,], # Training set
    COHORT.bigdata[test.set,],  # Test set
    model.type = 'rfsrc',
    n.trees = n.trees.final,
    split.rule = split.rule,
    n.threads = n.threads,
    nimpute = n.imputations,
    nsplit = cv.performance[best.calibration.row1, 'n.splits'],
    mtry = cv.performance[best.calibration.row1, 'm.try'],
    na.action = 'na.impute',
    bootstraps = bootstraps
  )

# Save the fit object
saveRDS(
  surv.model.fit.boot,
  paste0(output.filename.base, '-surv-model-bootstraps.rds')
)

# Get C-indices for training and test sets
surv.model.fit.coeffs <- bootStats(surv.model.fit.boot, uncertainty = '95ci')

# Save them to the all-models comparison table
varsToTable(
  data.frame(
    model = 'rfbigdata',
    imputation = FALSE,
    discretised = FALSE,
    c.index = surv.model.fit.coeffs['c.test', 'val'],
    c.index.lower = surv.model.fit.coeffs['c.test', 'lower'],
    c.index.upper = surv.model.fit.coeffs['c.test', 'upper'],
    calibration.score = surv.model.fit.coeffs['calibration.score', 'val'],
    calibration.score.lower =
      surv.model.fit.coeffs['calibration.score', 'lower'],
    calibration.score.upper =
      surv.model.fit.coeffs['calibration.score', 'upper']
  ),
  performance.file,
  index.cols = c('model', 'imputation', 'discretised')
)