random-forest/rf-varselmiss.R
#+ knitr_setup, include = FALSE

# Whether to cache the intensive code sections. Set to FALSE to recalculate
# everything afresh.
cacheoption <- TRUE
# Disable lazy caching globally, because it fails for large objects, and all the
# objects we wish to cache are large...
opts_chunk$set(cache.lazy = FALSE)

#' # Variable selection in data-driven health records
#' 
#' Having extracted around 600 variables which occur most frequently in patient
#' records, let's try to narrow these down using the methodology of varSelRF.
#' 
#' ## User variables
#' 
#+ user_variables

output.filename.base <- '../../output/rf-bigdata-try11-varselmiss'

nsplit <- 20
n.trees.cv <- 500
n.trees.final <- 500
split.rule <- 'logrank'
n.imputations <- 3
cv.n.folds <- 3
vars.drop.frac <- 0.2 # Fraction of variables to drop at each iteration
bootstraps <- 200

n.data <- NA # Number of rows to use; NA means all (counted after exclusions in prep)

n.threads <- 40

#' ## Data set-up
#' 
#+ data_setup

data.filename.big <- '../../data/cohort-datadriven-02.csv'

surv.predict.old <- c('age', 'smokstatus', 'imd_score', 'gender')
untransformed.vars <- c('time_death', 'endpoint_death', 'exclude')

source('../lib/shared.R')
require(xtable)

# Define these after shared.R or they will be overwritten!
exclude.vars <-
  c(
    # Entity type 4 is smoking status, which we already have
    "clinical.values.4_data1", "clinical.values.4_data5",
    "clinical.values.4_data6",
    # Entity 13 data2 is the patient's weight centile; not a single one is
    # entered, but the empty values come out as 0, so the algorithm, which looks
    # for NAs, thinks it's a useful column
    "clinical.values.13_data2",
    # Entities 148 and 149 are to do with death certification. I'm not sure how
    # they made it into the dataset, but since all the data points here look
    # back in time, they're all NA. This causes rfsrc to fail.
    "clinical.values.148_data1", "clinical.values.148_data2",
    "clinical.values.148_data3", "clinical.values.148_data4",
    "clinical.values.148_data5",
    "clinical.values.149_data1", "clinical.values.149_data2"
  )

COHORT <- fread(data.filename.big)

bigdata.prefixes <-
  c(
    'hes.icd.',
    'hes.opcs.',
    'tests.enttype.',
    'clinical.history.',
    'clinical.values.',
    'bnf.'
  )

bigdata.columns <-
  colnames(COHORT)[
    which(
      # Does it start with one of the data column prefixes?
      startsWithAny(names(COHORT), bigdata.prefixes) &
        # And it's not one of the columns we want to exclude?
        !(colnames(COHORT) %in% exclude.vars)
    )
    ]

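# startsWithAny comes from ../lib/shared.R, which isn't shown here. A minimal
# sketch of what it presumably does (TRUE wherever a string starts with any of
# the supplied prefixes) might look like this hypothetical version:
startsWithAnySketch <- function(x, prefixes) {
  # OR together the per-prefix matches from base R's vectorised startsWith()
  Reduce(`|`, lapply(prefixes, function(p) startsWith(x, p)))
}
# e.g. startsWithAnySketch(c('hes.icd.C34', 'age'), bigdata.prefixes)
# would return TRUE FALSE
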
COHORT.bigdata <-
  COHORT[, c(
    untransformed.vars, surv.predict.old, bigdata.columns
  ),
  with = FALSE
  ]

# Get the missingness before we start removing missing values
missingness <- sort(sapply(COHORT.bigdata, percentMissing))
# Remove values for the 'untransformed.vars' above, which are the survival
# values plus the exclude column
missingness <- missingness[!(names(missingness) %in% untransformed.vars)]

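# percentMissing is defined in ../lib/shared.R (not shown here); a plausible
# one-line equivalent, returning the percentage of values that are NA, is:
percentMissingSketch <- function(x) 100 * sum(is.na(x)) / length(x)
# e.g. sapply(COHORT.bigdata, percentMissingSketch) should mirror the call above
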
# Deal appropriately with missing data
# Most of the variables are the number of days since the first record of that type
time.based.vars <-
  names(COHORT.bigdata)[
    startsWithAny(
      names(COHORT.bigdata),
      c('hes.icd.', 'hes.opcs.', 'clinical.history.')
    )
    ]
# We're dealing with these as logicals, so we want non-NA values to be TRUE,
# i.e. there is something in the history
for (j in time.based.vars) {
  set(COHORT.bigdata, j = j, value = !is.na(COHORT.bigdata[[j]]))
}

# Again, taking these as logicals, set any non-NA value to TRUE.
prescriptions.vars <- names(COHORT.bigdata)[startsWith(names(COHORT.bigdata), 'bnf.')]
for (j in prescriptions.vars) {
  set(COHORT.bigdata, j = j, value = !is.na(COHORT.bigdata[[j]]))
}

# This leaves tests and clinical.values, which are test results and should be
# imputed.

# Manually fix clinical values items...
#
# "clinical.values.1_data1"  "clinical.values.1_data2"
# These are just blood pressure values...fine to impute
#
# "clinical.values.13_data1" "clinical.values.13_data3"
# These are weight and BMI...also fine to impute
#
# Entity 5 is alcohol consumption status, 1 = Yes, 2 = No, 3 = Ex, so it should
# be a factor, and NA can be a factor level
COHORT.bigdata$clinical.values.5_data1 <-
  factorNAfix(factor(COHORT.bigdata$clinical.values.5_data1), NAval = 'missing')

# Both gender and smokstatus are factors...fix that
COHORT.bigdata$gender <- factor(COHORT.bigdata$gender)
COHORT.bigdata$smokstatus <-
  factorNAfix(factor(COHORT.bigdata$smokstatus), NAval = 'missing')

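# factorNAfix also comes from ../lib/shared.R and isn't reproduced here; the
# hypothetical sketch below captures the presumed behaviour: turn NAs into an
# explicit factor level so rfsrc doesn't have to impute them.
factorNAfixSketch <- function(f, NAval = 'missing') {
  f <- addNA(f)                         # add NA as a factor level
  levels(f)[is.na(levels(f))] <- NAval  # rename that level
  f
}
# e.g. factorNAfixSketch(factor(c('1', NA, '3'))) has levels "1" "3" "missing"
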
# Exclude invalid patients
COHORT.bigdata <- COHORT.bigdata[!COHORT.bigdata$exclude]
COHORT.bigdata$exclude <- NULL

COHORT.bigdata <-
  prepSurvCol(data.frame(COHORT.bigdata), 'time_death', 'endpoint_death', 'Death')

# If n.data was specified, trim the data table down to size
if(!is.na(n.data)) {
  COHORT.bigdata <- sample.df(COHORT.bigdata, n.data)
}

# Define test set
test.set <- testSetIndices(COHORT.bigdata, random.seed = 78361)

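# testSetIndices is another ../lib/shared.R helper. A rough sketch, assuming it
# draws a reproducible random holdout (the real split fraction isn't shown
# here, so test.fraction below is a placeholder):
testSetIndicesSketch <- function(df, random.seed, test.fraction = 1/3) {
  set.seed(random.seed)
  sample(seq_len(nrow(df)), size = round(nrow(df) * test.fraction))
}
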
# Start by predicting survival with all the variables provided
surv.predict <- c(surv.predict.old, bigdata.columns)

# Set up a csv file to store calibration data, or retrieve previous data
calibration.filename <- paste0(output.filename.base, '-varselcalibration.csv')

#' ## Run random forest calibration
#' 
#' If there's not already a calibration file, we run a varSelRF-style variable
#' selection, except that the candidate variables are ranked by missingness
#' (least missing first) rather than by forest importance:
#'   1. Rank the variables by their degree of missingness.
#'   2. Cross-validate as the number of variables kept is reduced.
#' 
#' (If there is already a calibration file, just load the previous work.)
#' 
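#' getVarNums (from `../lib/shared.R`, not reproduced here) presumably builds
#' the decreasing schedule of variable counts to cross-validate over, dropping
#' a fraction of the variables at each step as in varSelRF. A hypothetical
#' sketch, assuming it uses `vars.drop.frac` and stops at a handful of
#' variables, might be:
#' 
#+ get_var_nums_sketch, eval = FALSE
getVarNumsSketch <- function(n.total, drop.frac = vars.drop.frac, min.vars = 5) {
  var.nums <- n.total
  # Repeatedly drop drop.frac of the remaining variables
  while (tail(var.nums, 1) > min.vars) {
    var.nums <- c(var.nums, round(tail(var.nums, 1) * (1 - drop.frac)))
  }
  unique(var.nums)
}
# e.g. getVarNumsSketch(600) gives roughly 600, 480, 384, 307, ...
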
#+ rf_var_sel_calibration

# If we've not already done a calibration, then do one
if(!file.exists(calibration.filename)) {
  # Create an empty data frame to aggregate stats per fold
  cv.performance <- data.frame()
  
  # Cross-validate over number of variables to try
  cv.vars <- getVarNums(length(missingness))
  
  COHORT.cv <- COHORT.bigdata[-test.set, ]
  
  # Run cross-validations. No need to parallelise because rfsrc is parallelised
  for(i in 1:length(cv.vars)) {
    # Get the subset of variables with the least missingness to use
    surv.predict.partial <- names(missingness)[1:cv.vars[i]]
    
    # Get folds for cross-validation
    cv.folds <- cvFolds(nrow(COHORT.cv), cv.n.folds)
    
    cv.fold.performance <- data.frame()
    
    for(j in 1:cv.n.folds) {
      time.start <- handyTimer()
      # Fit model to the training set
      surv.model.fit <-
        survivalFit(
          surv.predict.partial,
          COHORT.cv[-cv.folds[[j]],],
          model.type = 'rfsrc',
          n.trees = n.trees.cv,
          split.rule = split.rule,
          n.threads = n.threads,
          nsplit = nsplit,
          nimpute = n.imputations,
          na.action = 'na.impute'
        )
      time.learn <- handyTimer(time.start)
      
      time.start <- handyTimer()
      # Get C-index on the validation set
      c.index.val <-
        cIndex(
          surv.model.fit, COHORT.cv[cv.folds[[j]],],
          na.action = 'na.impute'
        )
      time.c.index <- handyTimer(time.start)
      
      time.start <- handyTimer()
      # Get calibration score on the validation set
      calibration.score <-
        calibrationScore(
          calibrationTable(
            surv.model.fit, COHORT.cv[cv.folds[[j]],], na.action = 'na.impute'
          )
        )
      time.calibration <- handyTimer(time.start)
      
      # Append the stats we've obtained from this fold
      cv.fold.performance <-
        rbind(
          cv.fold.performance,
          data.frame(
            calibration = i,
            cv.fold = j,
            n.vars = cv.vars[i],
            c.index.val,
            calibration.score,
            time.learn,
            time.c.index,
            time.calibration
          )
        )
      
    } # End cross-validation loop (j)
    
    # rbind the performance by fold
    cv.performance <-
      rbind(
        cv.performance,
        cv.fold.performance
      )
    
    # Save output at the end of each loop
    write.csv(cv.performance, calibration.filename)
    
  } # End calibration loop (i)
} else {
  cv.performance <- read.csv(calibration.filename)
}

#' ## Find the best model from the calibrations
#' 
#' ### Plot model performance
#' 
#+ model_performance

# Find the best calibration...
# First, average performance across cross-validation folds
cv.performance.average <-
  aggregate(
    c.index.val ~ n.vars,
    data = cv.performance,
    mean
  )

cv.calibration.average <-
  aggregate(
    area ~ n.vars,
    data = cv.performance,
    mean
  )

ggplot(cv.performance.average, aes(x = n.vars, y = c.index.val)) +
  geom_line() +
  geom_point(data = cv.performance) +
  ggtitle(label = 'C-index by n.vars')

ggplot(cv.calibration.average, aes(x = n.vars, y = area)) +
  geom_line() +
  geom_point(data = cv.performance) +
  ggtitle(label = 'Calibration performance by n.vars')

# Find the number of variables with the highest average C-index
n.vars <-
  cv.performance.average$n.vars[
    which.max(cv.performance.average$c.index.val)
    ]

# Use this many of the least-missing variables for the final fit
surv.predict.partial <- names(missingness)[1:n.vars]

#' ## Best model
#' 
#' The best model contained `r n.vars` variables. Let's see what those were...
#' 
#+ variables_used

vars.df <-
  data.frame(
    vars = surv.predict.partial
  )

vars.df$descriptions <- lookUpDescriptions(surv.predict.partial)

vars.df$missingness <- missingness[1:n.vars]

#+ variables_table, results='asis'

print(
  xtable(vars.df),
  type = 'html',
  include.rownames = FALSE
)

#' ## Perform the final fit
#' 
#' Having found the best number of variables by cross-validation, let's perform
#' the final fit with the full training set and `r n.trees.final` trees.
#' 
#+ final_fit

time.start <- handyTimer()
surv.model.fit.final <-
  survivalFit(
    surv.predict.partial,
    COHORT.bigdata[-test.set,],
    model.type = 'rfsrc',
    n.trees = n.trees.final,
    split.rule = split.rule,
    n.threads = n.threads,
    nimpute = 3,
    nsplit = nsplit,
    na.action = 'na.impute'
  )
time.fit.final <- handyTimer(time.start)

saveRDS(surv.model.fit.final, paste0(output.filename.base, '-finalmodel.rds'))
350
351
#' Final model of `r n.trees.final` trees fitted in `r round(time.fit.final)`
352
#' seconds! 
353
#' 
354
#' Also bootstrap this final fitting stage. A fully proper bootstrap would
355
#' iterate over the whole model-building process including variable selection,
356
#' but that would be prohibitive in terms of computational time.
357
#' 
358
#+ bootstrap_final
359
360
surv.model.fit.boot <-
361
  survivalBootstrap(
362
    surv.predict.partial,
363
    COHORT.bigdata[-test.set,], # Training set
364
    COHORT.bigdata[test.set,],  # Test set
365
    model.type = 'rfsrc',
366
    n.trees = n.trees.final,
367
    split.rule = split.rule,
368
    n.threads = n.threads,
369
    nimpute = 3,
370
    nsplit = nsplit,
371
    na.action = 'na.impute',
372
    bootstraps = bootstraps
373
  )
374
375
# Get coefficients and variable importances from bootstrap fits
376
surv.model.fit.coeffs <- bootStats(surv.model.fit.boot, uncertainty = '95ci')
377
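# bootStats lives in ../lib/shared.R; judging by how its output is used below
# (rows such as 'c.train' and columns 'val', 'lower', 'upper'), it presumably
# summarises the bootstrap replicates with a point estimate and a 95% CI. A
# hypothetical sketch over a plain matrix of replicates (rows = bootstrap
# draws, columns = statistics) might be:
bootStatsSketch <- function(boot.replicates) {
  data.frame(
    val   = apply(boot.replicates, 2, mean),
    lower = apply(boot.replicates, 2, quantile, probs = 0.025),
    upper = apply(boot.replicates, 2, quantile, probs = 0.975)
  )
}
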
#' ## Performance
#' 
#' ### C-index
#' 
#' C-indices are **`r round(surv.model.fit.coeffs['c.train', 'val'], 3)`
#' (`r round(surv.model.fit.coeffs['c.train', 'lower'], 3)` - 
#' `r round(surv.model.fit.coeffs['c.train', 'upper'], 3)`)**
#' on the training set and
#' **`r round(surv.model.fit.coeffs['c.test', 'val'], 3)`
#' (`r round(surv.model.fit.coeffs['c.test', 'lower'], 3)` - 
#' `r round(surv.model.fit.coeffs['c.test', 'upper'], 3)`)** on the test set.
#' 
#' ### Calibration
#' 
#' The bootstrapped calibration score is
#' **`r round(surv.model.fit.coeffs['calibration.score', 'val'], 3)`
#' (`r round(surv.model.fit.coeffs['calibration.score', 'lower'], 3)` - 
#' `r round(surv.model.fit.coeffs['calibration.score', 'upper'], 3)`)**.
#' 
#' Let's draw a representative curve from the unbootstrapped fit... (It would be
#' better to draw all the curves from the bootstrap fit to get an idea of
#' variability, but I've not implemented this yet.)
#' 
#+ calibration_plot

calibration.table <-
  calibrationTable(
    # Standard calibration options
    surv.model.fit.final, COHORT.bigdata[test.set,],
    # Always need to specify NA imputation for rfsrc
    na.action = 'na.impute'
  )

calibration.score <- calibrationScore(calibration.table)

calibrationPlot(calibration.table)

#' The area between the calibration curve and the diagonal is
#' **`r round(calibration.score[['area']], 3)`** +/-
#' **`r round(calibration.score[['se']], 3)`**.
#'
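#' calibrationScore is defined in `../lib/shared.R`, which isn't reproduced
#' here. The area it reports is presumably the integral of the absolute
#' difference between the calibration curve and the diagonal. A rough sketch,
#' assuming hypothetical `risk.predicted` and `risk.observed` columns in the
#' calibration table, might be:
#' 
#+ calibration_area_sketch, eval = FALSE
calibrationAreaSketch <- function(calibration.table) {
  # Order by predicted risk, then trapezium-rule integrate |observed - predicted|
  ct <- calibration.table[order(calibration.table$risk.predicted), ]
  x <- ct$risk.predicted
  y <- abs(ct$risk.observed - ct$risk.predicted)
  sum(diff(x) * (head(y, -1) + tail(y, -1)) / 2)
}
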
#+ save_results

# Save performance results
varsToTable(
  data.frame(
    model = 'rf-varselmiss',
    imputation = FALSE,
    discretised = FALSE,
    c.index = surv.model.fit.coeffs['c.train', 'val'],
    c.index.lower = surv.model.fit.coeffs['c.train', 'lower'],
    c.index.upper = surv.model.fit.coeffs['c.train', 'upper'],
    calibration.score = surv.model.fit.coeffs['calibration.score', 'val'],
    calibration.score.lower =
      surv.model.fit.coeffs['calibration.score', 'lower'],
    calibration.score.upper =
      surv.model.fit.coeffs['calibration.score', 'upper']
  ),
  performance.file,
  index.cols = c('model', 'imputation', 'discretised')
)