MLehealth / Git / [0375db] /random-forest/rf-classification.R

Models:
philipB/
MLehealth
Downloads: 1
[0375db]: / random-forest / rf-classification.R
History
Download this file
113 lines (96 with data), 3.5 kB

#+ knitr_setup, include = FALSE

# Whether to cache the intensive code sections. Set to FALSE to recalculate
# everything afresh.
cacheoption <- FALSE
# Disable lazy caching globally, because it fails for large objects, and all the
# objects we wish to cache are large...
opts_chunk$set(cache.lazy = FALSE)

#' # Cross-validating discretisation of input variables in a survival model
#' 
#' In difference to previous attempts at cross-validation, this uses between 10
#' and 20 bins, not between 2 and 20, in an attempt to avoid throwing away data.

# The first part of the filename for any output
output.filename.base <- '../../output/rfsrc-classification-try1'

risk.time <- 5

n.data <- NA
split.rule <- 'logrank'
n.trees <- 2000
n.threads <- 19

continuous.vars <-
  c(
    'age', 'total_chol_6mo', 'hdl_6mo', 'pulse_6mo', 'crea_6mo',
    'total_wbc_6mo', 'haemoglobin_6mo'
  )

untransformed.vars <- c('anonpatid', 'surv_time', 'imd_score', 'exclude')

# If surv.vars is defined as a character vector here, the model only uses those
# variables specified, eg c('age') would build a model purely based on age. If
# not specified (ie commented out), it will use the defaults.
# surv.predict <- c('age')

#' ## Fit the model
#' 
#' Now, let's fit the model, but without cross-validating the number of factor
#' levels! The issue is that, if we're allowing factor levels to be grouped into
#' two branches arbitrarily, there are 2^n - 2 combinations, which rapidly
#' becomes a huge number. Thus, cross-validating, especially with large numbers
#' of factor levels, is very impractical.
#' 
#' We'll also leave age as a pure number: we know that it's both a very
#' significant variable, and also that it makes sense to treat it as though it's
#' ordered, because risk should increase monotonically with it.
#' 
#+ rf_discretised, cache=cacheoption

source('../lib/shared.R')

# Load the data and convert to data frame to make column-selecting code in
# prepData simpler
COHORT.full <- data.frame(fread(data.filename))

# If n.data was specified...
if(!is.na(n.data)){
  # Take a subset n.data in size
  COHORT.use <- sample.df(COHORT.full, n.data)
  rm(COHORT.full)
} else {
  # Use all the data
  COHORT.use <- COHORT.full
  rm(COHORT.full)
}

# Process settings: don't touch anything!!
process.settings <-
  list(
    var       = c(untransformed.vars, continuous.vars),
    method    = rep(NA, length(untransformed.vars) + length(continuous.vars)),
    settings  = rep(NA, length(untransformed.vars) + length(continuous.vars))
  )

COHORT.prep <-
  prepData(
    COHORT.use,
    cols.keep,
    process.settings,
    surv.time, surv.event,
    surv.event.yes,
    extra.fun = caliberExtraPrep
  )
n.data <- nrow(COHORT.prep)

# Define indices of test set
test.set <- sample(1:n.data, (1/3)*n.data)

# Create column for whether or not the patient had an event before risk.time
COHORT.prep$event <- NA
# Event before risk.time
COHORT.prep$event[
  COHORT.prep$surv_event & COHORT.prep$surv_time <= risk.time
  ] <- TRUE
# Event after, whether censorship or not, means no event by risk.time
COHORT.prep$event[COHORT.prep$surv_time > risk.time] <- FALSE
# Otherwise, censored before risk.time, let's remove the row
COHORT.prep <- COHORT.prep[!is.na(COHORT.prep$event), ]

surv.model.fit <-
  rfsrc(
    as.formula(
      paste('event ~', paste(surv.predict, collapse = '+'))
    ),
    COHORT.prep[-test.set,], # Training set
    ntree = n.trees,
    splitrule = 'gini',
    n.threads = n.threads,
    na.action = 'na.impute',
    nimpute = 3
  )