random-forest/rf-imputed.R

#+ knitr_setup, include = FALSE

# knitr is loaded explicitly so that opts_chunk is available below
library(knitr)

# Whether to cache the intensive code sections. Set to FALSE to recalculate
# everything afresh.
cacheoption <- FALSE
# Disable lazy caching globally, because it fails for large objects, and all the
# objects we wish to cache are large...
opts_chunk$set(cache.lazy = FALSE)
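# Note on usage (an assumption, since no chunk header shown in this file sets
# cache): cacheoption is presumably passed to the cache option of the intensive
# chunks, e.g. a header along the lines of "#+ rf_fit, cache = cacheoption",
# where rf_fit is an illustrative chunk name.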

#' # Random survival forests on pre-imputed data
#' 
#' The imputation algorithm used for the Cox modelling 'cheats' in a couple of
#' different ways. Firstly, the imputation model is fitted on the whole dataset,
#' rather than being trained on a training set and then applied to the test set.
#' Secondly, causality is violated, in that future measurements and even the
#' outcome variable (i.e. whether a patient died) are included in the model. The
#' rationale for this is that you want the best and most complete dataset
#' possible, and if doctors are going to go on and use this in clinical practice
#' they will simply take the additional required measurements when calculating a
#' patient's risk score, rather than trying to do the modelling on incomplete
#' data.
#' 
#' However, this could allow the imputation to 'pass back' useful data to the
#' Cox model, and thus artificially inflate its performance statistics.
#' 
#' It is non-trivial to work around this, not least because the imputation
#' package we used does not expose the model to allow training on a subset of
#' the data. A quick and dirty way to check whether this may be an issue,
#' therefore, is to train a random forest model on the imputed dataset and see
#' whether it can outperform the Cox model. So, let's try it...

#+ user_variables, message=FALSE

imputed.data.filename <- '../../data/COHORT_complete.rds'
endpoint <- 'death.imputed'

n.trees <- 500
n.split <- 10
# Splitting rule for the survival forest, used by survivalFit below; logrank is
# randomForestSRC's default splitting rule for survival outcomes
split.rule <- 'logrank'
n.threads <- 40

bootstraps <- 50

output.filename.base <- '../../output/rf-imputed-try1'
boot.filename <- paste0(output.filename.base, '-boot.rds')

# What to do with missing data
continuous.vars <-
  c(
    'age', 'total_chol_6mo', 'hdl_6mo', 'pulse_6mo', 'crea_6mo',
    'total_wbc_6mo', 'haemoglobin_6mo'
  )
untransformed.vars <- c('anonpatid', 'surv_time', 'imd_score', 'exclude')

source('../lib/shared.R')
require(ggrepel)

#' ## Read imputed data

# Load the data from its RDS file
imputed.data <- readRDS(imputed.data.filename)

# Remove rows with death time of 0 to avoid fitting errors, and get the survival
# columns ready
for(i in 1:length(imputed.data)) {
  imputed.data[[i]] <- imputed.data[[i]][imputed.data[[i]][, surv.time] > 0, ]
  # Put in a fake exclude column for the next function (exclusions are already
  # excluded in the imputed dataset)
  imputed.data[[i]]$exclude <- FALSE
  imputed.data[[i]]$imd_score <- as.numeric(imputed.data[[i]]$imd_score)
  imputed.data[[i]] <-
    caliberExtraPrep(
      prepSurvCol(
        imputed.data[[i]], 'time_death', 'endpoint_death', 1
      )
    )
}

# Define test set
test.set <- testSetIndices(imputed.data[[1]], random.seed = 78361)


#' ## Fit random forest model

# Do a quick and dirty fit on a single imputed dataset, to draw calibration
# curve from
time.start <- handyTimer()
surv.model.fit <-
  survivalFit(
    surv.predict,
    imputed.data[[1]][-test.set,],
    model.type = 'rfsrc',
    n.trees = n.trees,
    split.rule = split.rule,
    n.threads = n.threads,
    nsplit = n.split
  )
time.fit <- handyTimer(time.start)

fit.rf.boot <- list()

# Perform bootstrap fitting for every multiply imputed dataset
time.start <- handyTimer()
for(i in 1:length(imputed.data)) {
  fit.rf.boot[[i]] <-
    survivalBootstrap(
      surv.predict,
      imputed.data[[i]][-test.set, ],
      imputed.data[[i]][test.set, ],
      model.type = 'rfsrc',
      bootstraps = bootstraps,
      n.threads = n.threads
    )
}
time.boot <- handyTimer(time.start)

# Save the fits, because it might've taken a while!
saveRDS(fit.rf.boot, boot.filename)

#' Model fitted in `r round(time.fit)` seconds, and `r bootstraps` bootstrap
#' fits were performed on `r length(imputed.data)` imputed datasets in
#' `r round(time.boot)` seconds.

# Unpackage the uncertainties from the bootstrapped data
fit.rf.boot.ests <- bootMIStats(fit.rf.boot)
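
#' `bootMIStats` is defined in shared.R. As a rough illustrative sketch (not
#' the actual implementation), pooling one statistic across the bootstrap
#' replicates from all the imputed datasets might look something like the
#' hypothetical helper below.

#+ pooling_sketch, eval = FALSE
# Hypothetical helper: given one numeric vector of bootstrap replicates per
# imputed dataset, pool them all and return a point estimate with a 95%
# percentile interval
poolBootstrapReplicates <- function(replicate.list) {
  replicates <- unlist(replicate.list)
  c(
    val   = median(replicates),
    lower = quantile(replicates, 0.025, names = FALSE),
    upper = quantile(replicates, 0.975, names = FALSE)
  )
}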

# Save bootstrapped performance values
varsToTable(
  data.frame(
    model = 'rfsrc',
    imputation = TRUE,
    discretised = FALSE,
    c.index = fit.rf.boot.ests['c.test', 'val'],
    c.index.lower = fit.rf.boot.ests['c.test', 'lower'],
    c.index.upper = fit.rf.boot.ests['c.test', 'upper'],
    calibration.score = fit.rf.boot.ests['calibration.score', 'val'],
    calibration.score.lower = fit.rf.boot.ests['calibration.score', 'lower'],
    calibration.score.upper = fit.rf.boot.ests['calibration.score', 'upper']
  ),
  performance.file,
  index.cols = c('model', 'imputation', 'discretised')
)

#' ## Performance
#' 
#' Having fitted the random forest model, how did we do? The c-indices were
#' calculated as part of the bootstrapping, so we just need to take a look at
#' those...
#' 
#' C-indices are **`r round(fit.rf.boot.ests['c.train', 'val'], 3)`
#' (`r round(fit.rf.boot.ests['c.train', 'lower'], 3)` - 
#' `r round(fit.rf.boot.ests['c.train', 'upper'], 3)`)** on the training set and
#' **`r round(fit.rf.boot.ests['c.test', 'val'], 3)`
#' (`r round(fit.rf.boot.ests['c.test', 'lower'], 3)` - 
#' `r round(fit.rf.boot.ests['c.test', 'upper'], 3)`)** on the test set.
#' 
#' 
#' ### Calibration
#' 
#' The bootstrapped calibration score is
#' **`r round(fit.rf.boot.ests['calibration.score', 'val'], 3)`
#' (`r round(fit.rf.boot.ests['calibration.score', 'lower'], 3)` - 
#' `r round(fit.rf.boot.ests['calibration.score', 'upper'], 3)`)**.
#' 
#' Let's draw a representative curve from the unbootstrapped fit... (It would be
#' better to draw all the curves from the bootstrap fit to get an idea of
#' variability, but I've not implemented this yet; a rough sketch of a cheaper
#' check across the imputed datasets appears at the end of this file.)
#' 
#+ calibration_plot

# Use the first imputed dataset's test set, to match the data the quick fit
# above was trained on (i would otherwise be a stale index left over from the
# bootstrap loop)
calibration.table <-
  calibrationTable(surv.model.fit, imputed.data[[1]][test.set, ])

calibration.score <- calibrationScore(calibration.table)

calibrationPlot(calibration.table)

#' The area between the calibration curve and the diagonal is 
#' **`r round(calibration.score[['area']], 3)`** +/-
#' **`r round(calibration.score[['se']], 3)`**.
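
#' As noted above, plotting every calibration curve from the bootstrap fits
#' isn't implemented. As a cheaper, illustrative check of variability, the
#' sketch below scores the single unbootstrapped fit against the test-set
#' portion of each imputed dataset and looks at the spread of the resulting
#' areas. It's only a rough proxy (it varies the imputation, not the bootstrap
#' resampling), and it simply reuses calibrationTable and calibrationScore as
#' above.

#+ calibration_spread_sketch, eval = FALSE
calibration.areas <-
  sapply(
    1:length(imputed.data),
    function(j) {
      calibrationScore(
        calibrationTable(surv.model.fit, imputed.data[[j]][test.set, ])
      )[['area']]
    }
  )
# Spread of the calibration areas across the imputed datasets
summary(calibration.areas)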