--- a
+++ b/random-forest/rf-varselrf-eqv.R
@@ -0,0 +1,553 @@
+#+ knitr_setup, include = FALSE
+
+# Whether to cache the intensive code sections. Set to FALSE to recalculate
+# everything afresh.
+cacheoption <- TRUE
+# Disable lazy caching globally, because it fails for large objects, and all
+# the objects we wish to cache are large...
+knitr::opts_chunk$set(cache.lazy = FALSE)
+
+#' # Variable selection in data-driven health records
+#'
+#' Having extracted around 600 variables which occur most frequently in patient
+#' records, let's try to narrow these down using the methodology of varSelRF,
+#' with a couple of custom additions: potentially multiple variable-importance
+#' calculations at the outset, and cross-validation to choose the best number
+#' of variables at the end.
+#'
+#' ## User variables
+#'
+#+ user_variables
+
+output.filename.base <- '../../output/rf-bigdata-try12-varselrf2'
+
+nsplit <- 20
+n.trees.initial <- 500
+n.forests.initial <- 5
+n.trees.cv <- 500
+n.trees.final <- 2000
+split.rule <- 'logrank'
+n.imputations <- 3
+cv.n.folds <- 3
+vars.drop.frac <- 0.2 # Fraction of variables to drop at each iteration
+bootstraps <- 200
+
+n.data <- NA # This is after any variables have been excluded in prep
+
+n.threads <- 20
+
+#' ## Data set-up
+#'
+#+ data_setup
+
+data.filename.big <- '../../data/cohort-datadriven-02.csv'
+
+surv.predict.old <- c('age', 'smokstatus', 'imd_score', 'gender')
+untransformed.vars <- c('time_death', 'endpoint_death', 'exclude')
+
+source('../lib/shared.R')
+require(xtable)
+
+# Define these after shared.R or they will be overwritten!
+exclude.vars <-
+  c(
+    # Entity type 4 is smoking status, which we already have
+    "clinical.values.4_data1", "clinical.values.4_data5",
+    "clinical.values.4_data6",
+    # Entity 13 data2 is the patient's weight centile, and not a single one is
+    # entered, but they come out as 0 so the algorithm, looking for NAs, thinks
+    # it's a useful column
+    "clinical.values.13_data2",
+    # Entities 148 and 149 are to do with death certification. I'm not sure how
+    # they made it into the dataset, but since all the datapoints here are
+    # looking back in time, they're all NA. This causes rfsrc to fail.
+    "clinical.values.148_data1", "clinical.values.148_data2",
+    "clinical.values.148_data3", "clinical.values.148_data4",
+    "clinical.values.148_data5",
+    "clinical.values.149_data1", "clinical.values.149_data2"
+  )
+
+COHORT <- fread(data.filename.big)
+
+l2df <- function(x) {
+  # Simple function to turn a list into a data frame, so we can compare
+  # variable importances across iterations of the initial forests
+  rownames <- unique(unlist(lapply(x, names)))
+  df <-
+    data.frame(
+      matrix(
+        unlist(lapply(x, FUN = function(xi) { xi[rownames] })),
+        ncol = length(x), byrow = FALSE
+      )
+    )
+  rownames(df) <- rownames
+  df
+}
+
+bigdata.prefixes <-
+  c(
+    'hes.icd.',
+    'hes.opcs.',
+    'tests.enttype.',
+    'clinical.history.',
+    'clinical.values.',
+    'bnf.'
+  )
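+
+# startsWithAny() comes from shared.R, which isn't reproduced here. A minimal
+# sketch of the behaviour assumed below (TRUE wherever a string starts with
+# any one of the given prefixes), defined only as a fallback in case shared.R
+# didn't provide it:
+if (!exists('startsWithAny')) {
+  startsWithAny <- function(x, prefixes) {
+    # OR together the per-prefix matches from base R's startsWith()
+    Reduce(`|`, lapply(prefixes, function(prefix) startsWith(x, prefix)))
+  }
+}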
+
+bigdata.columns <-
+  colnames(COHORT)[
+    which(
+      # Does it start with one of the data column prefixes?
+      startsWithAny(names(COHORT), bigdata.prefixes) &
+        # And it's not one of the columns we want to exclude?
+        !(colnames(COHORT) %in% exclude.vars)
+    )
+  ]
+
+COHORT.bigdata <-
+  COHORT[, c(
+      untransformed.vars, surv.predict.old, bigdata.columns
+    ),
+    with = FALSE
+  ]
+
+# Deal appropriately with missing data
+# Most of the variables are number of days since the first record of that type
+time.based.vars <-
+  names(COHORT.bigdata)[
+    startsWithAny(
+      names(COHORT.bigdata),
+      c('hes.icd.', 'hes.opcs.', 'clinical.history.')
+    )
+  ]
+# We're dealing with these as logicals, so we want non-NA values to be TRUE,
+# ie there is something in the history
+for (j in time.based.vars) {
+  set(COHORT.bigdata, j = j, value = !is.na(COHORT.bigdata[[j]]))
+}
+
+# Again, taking these as logicals, set any non-NA value to TRUE.
+prescriptions.vars <-
+  names(COHORT.bigdata)[startsWith(names(COHORT.bigdata), 'bnf.')]
+for (j in prescriptions.vars) {
+  set(COHORT.bigdata, j = j, value = !is.na(COHORT.bigdata[[j]]))
+}
+
+# This leaves tests and clinical.values, which are test results and should be
+# imputed.
+
+# Manually fix clinical values items...
+#
+# "clinical.values.1_data1"  "clinical.values.1_data2"
+# These are just blood pressure values...fine to impute
+#
+# "clinical.values.13_data1" "clinical.values.13_data3"
+# These are weight and BMI...also fine to impute
+#
+# Entity 5 is alcohol consumption status, 1 = Yes, 2 = No, 3 = Ex, so it should
+# be a factor, and NA can be a factor level
+COHORT.bigdata$clinical.values.5_data1 <-
+  factorNAfix(factor(COHORT.bigdata$clinical.values.5_data1), NAval = 'missing')
+
+# Both gender and smokstatus are factors...fix that
+COHORT.bigdata$gender <- factor(COHORT.bigdata$gender)
+COHORT.bigdata$smokstatus <-
+  factorNAfix(factor(COHORT.bigdata$smokstatus), NAval = 'missing')
+
+# Exclude invalid patients
+COHORT.bigdata <- COHORT.bigdata[!COHORT.bigdata$exclude]
+COHORT.bigdata$exclude <- NULL
+
+COHORT.bigdata <-
+  prepSurvCol(
+    data.frame(COHORT.bigdata), 'time_death', 'endpoint_death', 'Death'
+  )
+
+# If n.data was specified, trim the data table down to size
+if(!is.na(n.data)) {
+  COHORT.bigdata <- sample.df(COHORT.bigdata, n.data)
+}
+
+# Define test set
+test.set <- testSetIndices(COHORT.bigdata, random.seed = 78361)
+
+# Start by predicting survival with all the variables provided
+surv.predict <- c(surv.predict.old, bigdata.columns)
+
+# Set up a csv file to store calibration data, or retrieve previous data
+calibration.filename <- paste0(output.filename.base, '-varselcalibration.csv')
+
+#' ## Run random forest calibration
+#'
+#' If there's not already a calibration file, we run the varSelRF methodology:
+#'
+#' 1. Fit several large forests to the whole training set to obtain averaged
+#'    variable importances.
+#' 2. Cross-validate as the number of most important variables retained is
+#'    reduced (see the sketch of the size schedule below).
+#'
+#' (If there is already a calibration file, we just load the previous work.)
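+#'
+#' The schedule of model sizes comes from `getVarNums()` in shared.R, which
+#' isn't reproduced here. A minimal sketch of the assumed behaviour, dropping
+#' a fraction of the variables at each step in the style of varSelRF
+#' (`getVarNumsSketch` is a hypothetical stand-in, presumably driven by the
+#' `vars.drop.frac` user variable; it is not the function the pipeline
+#' actually calls):
+#'
+#+ get_var_nums_sketch
+
+getVarNumsSketch <- function(n.vars, drop.frac = 0.2, min.vars = 2) {
+  # Start with all the variables, then repeatedly drop a fraction of them,
+  # giving a descending grid of model sizes to cross-validate over
+  sizes <- n.vars
+  while (tail(sizes, 1) > min.vars) {
+    sizes <- c(sizes, max(min.vars, floor(tail(sizes, 1) * (1 - drop.frac))))
+  }
+  sizes
+}
+# For example, getVarNumsSketch(600) starts 600, 480, 384, 307, ... and ends
+# at 2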
+
+#+ rf_var_sel_calibration
+
+# If we've not already done a calibration, then do one
+if(!file.exists(calibration.filename)) {
+  # Start by fitting several forests to all the variables
+  vimps.initial <- list()
+  time.fit <- c()
+  time.vimp <- c()
+
+  # If we haven't already made the initial forests...
+  if(!file.exists(
+      paste0(output.filename.base, '-varimp-', n.forests.initial, '.rds')
+    )
+  ) {
+    for(i in 1:n.forests.initial) {
+      time.start <- handyTimer()
+      surv.model.fit.full <-
+        survivalFit(
+          surv.predict,
+          COHORT.bigdata[-test.set,],
+          model.type = 'rfsrc',
+          n.trees = n.trees.initial,
+          split.rule = split.rule,
+          n.threads = n.threads,
+          nimpute = n.imputations,
+          nsplit = nsplit,
+          na.action = 'na.impute'
+        )
+      time.fit <- c(time.fit, handyTimer(time.start))
+
+      # Save the model
+      saveRDS(
+        surv.model.fit.full,
+        paste0(output.filename.base, '-initialmodel-', i, '.rds')
+      )
+
+      # Calculate variable importance
+      time.start <- handyTimer()
+      var.imp <- vimp(surv.model.fit.full, importance = 'permute.ensemble')
+      time.vimp <- c(time.vimp, handyTimer(time.start))
+      # Save it
+      saveRDS(var.imp, paste0(output.filename.base, '-varimp-', i, '.rds'))
+
+      # Make a vector of variable importances and append it to the list
+      vimps.initial[[i]] <- sort(var.imp$importance, decreasing = TRUE)
+    }
+
+    cat('Total initial vimp time = ', sum(time.fit) + sum(time.vimp), '\n')
+    cat('Average fit time = ', mean(time.fit), '\n')
+    cat('Average vimp time = ', mean(time.vimp), '\n')
+
+  } else {
+    # If we already made the initial forests, just load their importances
+    for(i in 1:n.forests.initial) {
+      var.imp <- readRDS(paste0(output.filename.base, '-varimp-', i, '.rds'))
+      vimps.initial[[i]] <- sort(var.imp$importance, decreasing = TRUE)
+    }
+  }
+
+  # Convert the vimps.initial list to a data frame, one column per forest
+  vimps.initial <- l2df(vimps.initial)
+
+  # Take averages across rows to give the variable importances to use
+  var.importances <- sort(apply(vimps.initial, 1, mean), decreasing = TRUE)
+
+  # Save the result
+  saveRDS(var.importances, paste0(output.filename.base, '-varimp.rds'))
+
+  # Create an empty data frame to aggregate stats per fold
+  cv.performance <- data.frame()
+  # Numbers of variables to try in the cross-validation
+  cv.vars <- getVarNums(length(var.importances))
+  COHORT.cv <- COHORT.bigdata[-test.set, ]
+  # Run cross-validations. No need to parallelise at this level, because rfsrc
+  # is itself parallelised.
+  for(i in 1:length(cv.vars)) {
+    # Get the subset of most important variables to use
+    surv.predict.partial <- names(var.importances)[1:cv.vars[i]]
+
+    # Get folds for cross-validation
+    cv.folds <- cvFolds(nrow(COHORT.cv), cv.n.folds)
+
+    cv.fold.performance <- data.frame()
+
+    for(j in 1:cv.n.folds) {
+      time.start <- handyTimer()
+      # Fit the model to the training set
+      surv.model.fit <-
+        survivalFit(
+          surv.predict.partial,
+          COHORT.cv[-cv.folds[[j]],],
+          model.type = 'rfsrc',
+          n.trees = n.trees.cv,
+          split.rule = split.rule,
+          n.threads = n.threads,
+          nsplit = nsplit,
+          nimpute = n.imputations,
+          na.action = 'na.impute'
+        )
+      time.learn <- handyTimer(time.start)
+
+      time.start <- handyTimer()
+      # Get the C-index on the validation set
+      c.index.val <-
+        cIndex(
+          surv.model.fit, COHORT.cv[cv.folds[[j]],],
+          na.action = 'na.impute'
+        )
+      time.c.index <- handyTimer(time.start)
+
+      time.start <- handyTimer()
+      # Get the calibration score on the validation set
+      calibration.score <-
+        calibrationScore(
+          calibrationTable(
+            surv.model.fit, COHORT.cv[cv.folds[[j]],], na.action = 'na.impute'
+          )
+        )
+      time.calibration <- handyTimer(time.start)
+
+      # Append the stats we've obtained from this fold
+      cv.fold.performance <-
+        rbind(
+          cv.fold.performance,
+          data.frame(
+            calibration = i,
+            cv.fold = j,
+            n.vars = cv.vars[i],
+            c.index.val,
+            calibration.score,
+            time.learn,
+            time.c.index,
+            time.calibration
+          )
+        )
+
+    } # End cross-validation loop (j)
+
+    # rbind the performance by fold
+    cv.performance <-
+      rbind(
+        cv.performance,
+        cv.fold.performance
+      )
+
+    # Save output at the end of each loop
+    write.csv(cv.performance, calibration.filename)
+
+  } # End calibration loop (i)
+} else {
+  cv.performance <- read.csv(calibration.filename)
+  var.importances <- readRDS(paste0(output.filename.base, '-varimp.rds'))
+}
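+
+#' `cvFolds()` above also comes from shared.R. A minimal sketch of the assumed
+#' behaviour, returning a random partition of the row indices into `k` roughly
+#' equal folds (`cvFoldsSketch` is a hypothetical stand-in for illustration):
+#'
+#+ cv_folds_sketch
+
+cvFoldsSketch <- function(n, k) {
+  # Shuffle the row indices, then deal them out into k roughly equal groups
+  split(sample(n), rep(1:k, length.out = n))
+}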
+
+#' ## Find the best model from the calibrations
+#'
+#' ### Plot model performance
+#'
+#+ model_performance
+
+# Find the best calibration...
+# First, average performance across cross-validation folds
+cv.performance.average <-
+  aggregate(
+    c.index.val ~ n.vars,
+    data = cv.performance,
+    mean
+  )
+
+cv.calibration.average <-
+  aggregate(
+    area ~ n.vars,
+    data = cv.performance,
+    mean
+  )
+
+ggplot(cv.performance.average, aes(x = n.vars, y = c.index.val)) +
+  geom_line() +
+  geom_point(data = cv.performance) +
+  ggtitle(label = 'C-index by n.vars')
+
+ggplot(cv.calibration.average, aes(x = n.vars, y = area)) +
+  geom_line() +
+  geom_point(data = cv.performance) +
+  ggtitle(label = 'Calibration performance by n.vars')
+
+# Find the number of variables with the highest average C-index
+n.vars <-
+  cv.performance.average$n.vars[
+    which.max(cv.performance.average$c.index.val)
+  ]
+
+# Select that many of the most important variables for the final model
+surv.predict.partial <- names(var.importances)[1:n.vars]
+
+#' ## Best model
+#'
+#' The best model contained `r n.vars` variables. Let's see what those were...
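+#'
+#' The table below relies on `lookUpDescriptions()` and `percentMissing()`
+#' from shared.R. A minimal sketch of what `percentMissing()` is assumed to do
+#' (`percentMissingSketch` is a hypothetical stand-in; the real helper may
+#' differ, eg by returning a fraction rather than a percentage):
+#'
+#+ percent_missing_sketch
+
+percentMissingSketch <- function(x) {
+  # Percentage of values in the vector which are missing
+  100 * mean(is.na(x))
+}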
+#'
+#+ variables_used
+
+vars.df <-
+  data.frame(
+    vars = surv.predict.partial
+  )
+
+vars.df$descriptions <- lookUpDescriptions(surv.predict.partial)
+
+vars.df$vimp <- var.importances[1:n.vars] / max(var.importances)
+
+missingness <- sapply(COHORT, percentMissing)
+
+vars.df$missingness <- missingness[names(var.importances)[1:n.vars]]
+
+#+ variables_table, results='asis'
+
+print(
+  xtable(vars.df),
+  type = 'html',
+  include.rownames = FALSE
+)
+
+#' ## Perform the final fit
+#'
+#' Having found the best number of variables by cross-validation, let's perform
+#' the final fit with the full training set and `r n.trees.final` trees.
+#'
+#+ final_fit
+
+time.start <- handyTimer()
+surv.model.fit.final <-
+  survivalFit(
+    surv.predict.partial,
+    COHORT.bigdata[-test.set,],
+    model.type = 'rfsrc',
+    n.trees = n.trees.final,
+    split.rule = split.rule,
+    n.threads = n.threads,
+    nimpute = n.imputations,
+    nsplit = nsplit,
+    na.action = 'na.impute'
+  )
+time.fit.final <- handyTimer(time.start)
+
+saveRDS(surv.model.fit.final, paste0(output.filename.base, '-finalmodel.rds'))
+
+#' Final model of `r n.trees.final` trees fitted in `r round(time.fit.final)`
+#' seconds!
+#'
+#' Also bootstrap this final fitting stage. A fully proper bootstrap would
+#' iterate over the whole model-building process, including variable selection,
+#' but that would be prohibitively expensive in computational time.
+#'
+#+ bootstrap_final
+
+COHORT.bigdata$surv_time_round <-
+  round_any(COHORT.bigdata$surv_time, tod.round)
+
+# Create a survival formula with the provided variable names...
+surv.formula <-
+  formula(
+    paste0(
+      # Survival object made in-formula
+      'Surv(surv_time_round, surv_event) ~ ',
+      # Predictor variables then make up the other side
+      paste(surv.predict.partial, collapse = '+')
+    )
+  )
+
+# rfsrc, if installed with OpenMP support, controls threading via the rf.cores
+# option, so set that here
+options(rf.cores = n.threads)
+
+surv.model.fit.boot <-
+  boot(
+    formula = surv.formula,
+    data = COHORT.bigdata[-test.set, ],
+    statistic = bootstrapFitRfsrc,
+    R = bootstraps,
+    parallel = 'no',
+    ncpus = 1, # no parallelism at the boot level; rfsrc is already parallel
+    n.trees = n.trees.final,
+    test.data = COHORT.bigdata[test.set, ],
+    # boot() passes extra arguments to the statistic by name, so we can't
+    # forward them with ... here; n.trees, nimpute and nsplit must therefore
+    # be specified explicitly in this call.
+    nimpute = n.imputations,
+    nsplit = nsplit,
+    na.action = 'na.impute'
+  )
+
+# Get performance statistics, with uncertainties, from the bootstrap fits
+surv.model.fit.coeffs <- bootStats(surv.model.fit.boot, uncertainty = '95ci')
+
+#' ## Performance
+#'
+#' ### C-index
+#'
+#' C-indices are **`r round(surv.model.fit.coeffs['c.train', 'val'], 3)`
+#' (`r round(surv.model.fit.coeffs['c.train', 'lower'], 3)` -
+#' `r round(surv.model.fit.coeffs['c.train', 'upper'], 3)`)**
+#' on the training set and
+#' **`r round(surv.model.fit.coeffs['c.test', 'val'], 3)`
+#' (`r round(surv.model.fit.coeffs['c.test', 'lower'], 3)` -
+#' `r round(surv.model.fit.coeffs['c.test', 'upper'], 3)`)** on the test set.
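+#'
+#' These intervals come from `bootStats()`, another shared.R helper. A minimal
+#' sketch of the assumed behaviour, taking point estimates from the original
+#' fit and 95% percentile confidence intervals from the bootstrap replicates
+#' (`bootStatsSketch` is a hypothetical stand-in for illustration, assuming a
+#' named statistic vector):
+#'
+#+ boot_stats_sketch
+
+bootStatsSketch <- function(boot.obj) {
+  # boot.obj$t0 holds the statistics from the original data; boot.obj$t holds
+  # one row per bootstrap replicate and one column per statistic
+  data.frame(
+    val   = boot.obj$t0,
+    lower = apply(boot.obj$t, 2, quantile, probs = 0.025, na.rm = TRUE),
+    upper = apply(boot.obj$t, 2, quantile, probs = 0.975, na.rm = TRUE),
+    row.names = names(boot.obj$t0)
+  )
+}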
+#'
+#' ### Calibration
+#'
+#' The bootstrapped calibration score is
+#' **`r round(surv.model.fit.coeffs['calibration.score', 'val'], 3)`
+#' (`r round(surv.model.fit.coeffs['calibration.score', 'lower'], 3)` -
+#' `r round(surv.model.fit.coeffs['calibration.score', 'upper'], 3)`)**.
+#'
+#' Let's draw a representative curve from the unbootstrapped fit... (It would
+#' be better to draw all the curves from the bootstrap fit to get an idea of
+#' variability, but I've not implemented this yet.)
+#'
+#+ calibration_plot
+
+calibration.table <-
+  calibrationTable(
+    # Standard calibration options
+    surv.model.fit.final, COHORT.bigdata[test.set,],
+    # Always need to specify NA imputation for rfsrc
+    na.action = 'na.impute'
+  )
+
+calibration.score <- calibrationScore(calibration.table)
+
+calibrationPlot(calibration.table)
+
+#' The area between the calibration curve and the diagonal is
+#' **`r round(calibration.score[['area']], 3)`** +/-
+#' **`r round(calibration.score[['se']], 3)`**.
+#'
+#+ save_results
+
+# Save performance results
+varsToTable(
+  data.frame(
+    model = 'rf-varselrf2',
+    imputation = FALSE,
+    discretised = FALSE,
+    c.index = surv.model.fit.coeffs['c.train', 'val'],
+    c.index.lower = surv.model.fit.coeffs['c.train', 'lower'],
+    c.index.upper = surv.model.fit.coeffs['c.train', 'upper'],
+    calibration.score = surv.model.fit.coeffs['calibration.score', 'val'],
+    calibration.score.lower =
+      surv.model.fit.coeffs['calibration.score', 'lower'],
+    calibration.score.upper =
+      surv.model.fit.coeffs['calibration.score', 'upper']
+  ),
+  performance.file,
+  index.cols = c('model', 'imputation', 'discretised')
+)
\ No newline at end of file