# random-forest/rf-classification.R
#+ knitr_setup, include = FALSE

# Whether to cache the intensive code sections. Set to FALSE to recalculate
# everything afresh.
cacheoption <- FALSE
# Disable lazy caching globally, because it fails for large objects, and all the
# objects we wish to cache are large...
knitr::opts_chunk$set(cache.lazy = FALSE)

#' # Cross-validating discretisation of input variables in a survival model
#' 
#' In contrast to previous attempts at cross-validation, this uses between 10
#' and 20 bins, rather than between 2 and 20, in an attempt to avoid throwing
#' away data.
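#' 
#' As a purely illustrative aside (not part of the analysis below, and only a
#' sketch assuming quantile-based cut points), discretising a continuous
#' variable into, say, 10 bins might look like this:
x.example <- rnorm(1000)
x.binned <- cut(
  x.example,
  breaks = quantile(x.example, probs = seq(0, 1, length.out = 11)),
  include.lowest = TRUE
)
table(x.binned)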

# The first part of the filename for any output
output.filename.base <- '../../output/rfsrc-classification-try1'

risk.time <- 5

n.data <- NA
# Gini splitting, since this script fits a classification (not survival) forest
split.rule <- 'gini'
n.trees <- 2000
n.threads <- 19

continuous.vars <-
  c(
    'age', 'total_chol_6mo', 'hdl_6mo', 'pulse_6mo', 'crea_6mo',
    'total_wbc_6mo', 'haemoglobin_6mo'
  )

untransformed.vars <- c('anonpatid', 'surv_time', 'imd_score', 'exclude')

# If surv.predict is defined as a character vector here, the model only uses
# those variables specified, eg c('age') would build a model purely based on
# age. If not specified (ie commented out), it will use the defaults.
# surv.predict <- c('age')

#' ## Fit the model
#' 
#' Now, let's fit the model, but without cross-validating the number of factor
#' levels! The issue is that, if we allow factor levels to be grouped into two
#' branches arbitrarily, there are 2^(n-1) - 1 distinct ways to split n levels,
#' which rapidly becomes a huge number. Thus, cross-validating, especially with
#' large numbers of factor levels, is very impractical.
#' 
#' We'll also leave age as a pure number: we know that it's both a very
#' significant variable, and also that it makes sense to treat it as though it's
#' ordered, because risk should increase monotonically with it.
#' 
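#' To give a sense of scale, a quick illustrative calculation (not used in the
#' model fit itself):

#+ factor_split_count
# Distinct ways to split n unordered factor levels into two non-empty branches
n.levels <- c(2, 5, 10, 20, 30)
data.frame(n.levels, n.splits = 2^(n.levels - 1) - 1)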
#+ rf_discretised, cache=cacheoption

source('../lib/shared.R')

# Attach the packages used directly in this script, in case ../lib/shared.R
# doesn't already do so
library(data.table)      # fread
library(randomForestSRC) # rfsrc

# Load the data and convert to data frame to make column-selecting code in
# prepData simpler
COHORT.full <- data.frame(fread(data.filename))

# If n.data was specified...
if(!is.na(n.data)){
  # Take a subset n.data in size
  COHORT.use <- sample.df(COHORT.full, n.data)
  rm(COHORT.full)
} else {
  # Use all the data
  COHORT.use <- COHORT.full
  rm(COHORT.full)
}

# Process settings: don't touch anything!! (NA methods/settings mean every
# variable is left as-is)
process.settings <-
  list(
    var       = c(untransformed.vars, continuous.vars),
    method    = rep(NA, length(untransformed.vars) + length(continuous.vars)),
    settings  = rep(NA, length(untransformed.vars) + length(continuous.vars))
  )

COHORT.prep <-
  prepData(
    COHORT.use,
    cols.keep,
    process.settings,
    surv.time, surv.event,
    surv.event.yes,
    extra.fun = caliberExtraPrep
  )
n.data <- nrow(COHORT.prep)

# Define indices of the test set (a random third of the data)
test.set <- sample(1:n.data, round(n.data / 3))

# Create column for whether or not the patient had an event before risk.time
COHORT.prep$event <- NA
# Event before risk.time
COHORT.prep$event[
  COHORT.prep$surv_event & COHORT.prep$surv_time <= risk.time
  ] <- TRUE
# Event after, whether censorship or not, means no event by risk.time
COHORT.prep$event[COHORT.prep$surv_time > risk.time] <- FALSE
# Otherwise, censored before risk.time, let's remove the row
COHORT.prep <- COHORT.prep[!is.na(COHORT.prep$event), ]
# Make the outcome a factor so that rfsrc treats this as a classification task
COHORT.prep$event <- factor(COHORT.prep$event)
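
# A tiny toy illustration (hypothetical values, not used below) of how the three
# cases are handled when dichotomising at risk.time: an early event, survival
# beyond risk.time, and early censoring (left NA, so the row would be dropped)
toy <- data.frame(surv_time = c(3, 7, 2), surv_event = c(TRUE, FALSE, FALSE))
toy$event <- NA
toy$event[toy$surv_event & toy$surv_time <= risk.time] <- TRUE
toy$event[toy$surv_time > risk.time] <- FALSE
toy
rm(toy)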

surv.model.fit <-
  rfsrc(
    as.formula(
      paste('event ~', paste(surv.predict, collapse = '+'))
    ),
    COHORT.prep[-test.set, ], # Training set
    ntree = n.trees,
    splitrule = split.rule,
    n.threads = n.threads,
    na.action = 'na.impute',
    nimpute = 3
  )
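
#' The test set defined above isn't used in this section; as a minimal sketch
#' (the variable name here is illustrative), class-membership probabilities for
#' the held-out third could be obtained like this:

#+ rf_test_predictions
surv.model.fit.test <-
  predict(surv.model.fit, COHORT.prep[test.set, ], na.action = 'na.impute')
# Per-class predicted probabilities for the held-out patients
head(surv.model.fit.test$predicted)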