Diff of /random-forest/rf-age.R [000000] .. [0375db]

Switch to unified view

a b/random-forest/rf-age.R
1
#+ knitr_setup, include = FALSE
2
3
# Whether to cache the intensive code sections. Set to FALSE to recalculate
4
# everything afresh.
5
cacheoption <- TRUE
6
# Disable lazy caching globally, because it fails for large objects, and all the
7
# objects we wish to cache are large...
8
opts_chunk$set(cache.lazy = FALSE)
9
10
#' # Modelling with age
11
#' 
12
#' It seems that, no matter what I do, the C-index of a model, random forest or
13
#' otherwise, is about 0.78. I decided to try to some simpler models, intially
14
#' based purely on age which is clearly the biggest factor in this dataset.
15
#' (And, I assume, most datasets where a reasonably broad range of ages is
16
#' present.)
17
#' 
18
#' The sanity check works: giving the model more data does indeed result in a
19
#' better fit. However, on top of that, I was surprised by just how good the
20
#' performance can be when age alone is considered!
21
22
#+ user_variables, message=FALSE
23
24
data.filename <- '../../data/cohort-sanitised.csv'
25
26
n.trees <- 500
27
28
continuous.vars <-
29
  c(
30
    'age', 'total_chol_6mo', 'hdl_6mo', 'pulse_6mo', 'crea_6mo',
31
    'total_wbc_6mo', 'haemoglobin_6mo'
32
  )
33
untransformed.vars <- c('anonpatid', 'time_death', 'imd_score', 'exclude')
34
35
source('../lib/shared.R')
36
require(ggrepel)
37
38
#' ## Load and prepare data
39
#+ load_and_prepare_data
40
41
# Load the data and convert to data frame to make column-selecting code in
42
# prepData simpler
43
COHORT.full <- data.frame(fread(data.filename))
44
45
# Define process settings; nothing for those to not transform, and missingToBig
46
# for the continuous ones...
47
process.settings <-
48
  list(
49
    var        = c(untransformed.vars, continuous.vars),
50
    method     =
51
      c(
52
        rep(NA, length(untransformed.vars)),
53
        rep('missingToBig', length(continuous.vars))
54
      ),
55
    settings   = rep(NA, length(untransformed.vars) + length(continuous.vars))
56
  )
57
58
COHORT.prep <-
59
  prepData(
60
    # Data for cross-validation excludes test set
61
    COHORT.full,
62
    cols.keep,
63
    process.settings,
64
    surv.time, surv.event,
65
    surv.event.yes,
66
    extra.fun = caliberExtraPrep
67
  )
68
n.data <- nrow(COHORT.prep)
69
70
# Define indices of test set
71
test.set <- sample(1:n.data, (1/3)*n.data)
72
73
#' ## Models
74
#' 
75
#' ### The normal model
76
#' 
77
#' All the variables, as in the vector `surv.predict`.
78
#+ normal_model, cache=cacheoption
79
80
# Fit random forest
81
surv.model.fit <-
82
  survivalFit(
83
    surv.predict,
84
    COHORT.prep[-test.set,],
85
    model.type = 'rfsrc',
86
    n.trees = n.trees,
87
    split.rule = 'logrank',
88
    n.threads = 7,
89
    nsplit = 20
90
  )
91
92
print(surv.model.fit)
93
94
# Get C-index
95
c.index.test <- 
96
  cIndex(surv.model.fit, COHORT.prep[test.set, ], model.type = 'ranger')
97
98
#' The C-index on the held-out test set is **`r round(c.index.test, 3)`**.
99
100
 
101
#' ### Just age
102
#' 
103
#' What if all we had to go on was age?
104
#+ just_age_model, cache=cacheoption
105
106
# Fit random forest
107
surv.model.fit <-
108
  survivalFit(
109
    c('age'),
110
    COHORT.prep[-test.set,],
111
    model.type = 'ranger',
112
    n.trees = n.trees,
113
    split.rule = 'logrank',
114
    n.threads = 8,
115
    respect.unordered.factors = 'partition'
116
  )
117
118
print(surv.model.fit)
119
120
# Get C-index
121
c.index.test <- 
122
  cIndex(surv.model.fit, COHORT.prep[test.set, ], model.type = 'ranger')
123
124
#' The C-index on the held-out test set is **`r round(c.index.test, 3)`**.
125
126
#' ### No model, literally just age
127
#' 
128
#' What if we constructed the C-index based purely on patients' ages?
129
#+ just_age_cindex
130
131
c.index.age <-
132
  as.numeric(
133
    survConcordance(
134
      Surv(time_death, surv_event) ~ age,
135
      COHORT.prep
136
    )$concordance
137
  )
138
139
#' The C-index on the whole dataset based purely on age is
140
#' **`r round(c.index.test, 3)`**. That's most of our predictive accuracy right
141
#' there! Reassuringly, it's also equal to the value predicted by the random
142
#' forest model based purely on age...
143
144
#' ### Age and gender
145
#' 
146
#' OK, age and gender.
147
#+ age_gender_model, cache=cacheoption
148
149
# Fit random forest
150
surv.model.fit <-
151
  survivalFit(
152
    c('age', 'gender'),
153
    COHORT.prep[-test.set,],
154
    model.type = 'ranger',
155
    n.trees = n.trees,
156
    split.rule = 'logrank',
157
    n.threads = 8,
158
    respect.unordered.factors = 'partition'
159
  )
160
161
print(surv.model.fit)
162
163
# Get C-index
164
c.index.test <- 
165
  cIndex(surv.model.fit, COHORT.prep[test.set, ], model.type = 'ranger')
166
167
#' The C-index on the held-out test set is **`r round(c.index.test, 3)`**.
168
169
170
#' ### Age, gender and history of liver disease
171
#' 
172
#' Let's add a third variable. In the replication of the Cox model with missing
173
#' data included, liver disease was the most predictive factor after age, so
174
#' it's a reasonable next variable to add.
175
176
#+ age_gender_liver_model, cache=cacheoption
177
178
# Fit random forest
179
surv.model.fit <-
180
  survivalFit(
181
    c('age', 'gender', 'hx_liver'),
182
    COHORT.prep[-test.set,],
183
    model.type = 'ranger',
184
    n.trees = n.trees,
185
    split.rule = 'logrank',
186
    n.threads = 8,
187
    respect.unordered.factors = 'partition'
188
  )
189
190
print(surv.model.fit)
191
192
# Get C-index
193
c.index.test <- 
194
  cIndex(surv.model.fit, COHORT.prep[test.set, ], model.type = 'ranger')
195
196
#' The C-index on the held-out test set is **`r round(c.index.test, 3)`**.
197
198
#' ### Age, gender and heart failure
199
#' 
200
#' A different third variable: heart failure, the second most important variable
201
#' (after age) from random forest modelling.
202
203
#+ age_gender_hf_model, cache=cacheoption
204
205
# Fit random forest
206
surv.model.fit <-
207
  survivalFit(
208
    c('age', 'gender', 'heart_failure'),
209
    COHORT.prep[-test.set,],
210
    model.type = 'ranger',
211
    n.trees = n.trees,
212
    split.rule = 'logrank',
213
    n.threads = 8,
214
    respect.unordered.factors = 'partition'
215
  )
216
217
print(surv.model.fit)
218
219
# Get C-index
220
c.index.test <- 
221
  cIndex(surv.model.fit, COHORT.prep[test.set, ], model.type = 'ranger')
222
223
#' The C-index on the held-out test set is **`r round(c.index.test, 3)`**.
224
225
226
#' ### Just gender
227
#' 
228
#' Just gender, as a sanity check.
229
#+ just_gender_model, cache=cacheoption
230
231
# Fit random forest
232
surv.model.fit <-
233
  survivalFit(
234
    c('gender'),
235
    COHORT.prep[-test.set,],
236
    model.type = 'ranger',
237
    n.trees = n.trees,
238
    split.rule = 'logrank',
239
    n.threads = 8,
240
    respect.unordered.factors = 'partition'
241
  )
242
243
print(surv.model.fit)
244
245
# Get C-index
246
c.index.test <- 
247
  cIndex(surv.model.fit, COHORT.prep[test.set, ], model.type = 'ranger')
248
249
#' The C-index on the held-out test set is **`r round(c.index.test, 3)`**.
250
251
#' ### Everything except age
252
#' 
253
#' How do we do if we use all the variables _except_ age?
254
#+ no_age_model, cache=cacheoption
255
256
# Fit random forest
257
surv.model.fit <-
258
  survivalFit(
259
    surv.predict[surv.predict != 'age'],
260
    COHORT.prep[-test.set,],
261
    model.type = 'ranger',
262
    n.trees = n.trees,
263
    split.rule = 'logrank',
264
    n.threads = 8,
265
    respect.unordered.factors = 'partition'
266
  )
267
268
print(surv.model.fit)
269
270
# Get C-index
271
c.index.test.all.not.age <- 
272
  cIndex(surv.model.fit, COHORT.prep[test.set, ], model.type = 'ranger')
273
274
#' The C-index on the held-out test set is
275
#' **`r round(c.index.test.all.not.age, 3)`**.
276
277
278
#' ### Predicting age
279
#' 
280
#' So why does the model which doesn't include age do so well? Clearly the other
281
#' variables allow you to predict age with reasonable accuracy... So let's try
282
#' just that as a final test.
283
#+ predict_age, cache=cacheoption
284
285
options(rf.cores = n.threads)
286
287
age.model <-
288
  rfsrc(
289
    formula(
290
      paste0(
291
        # Predicting just the age
292
        'age ~ ',
293
        # Predictor variables then make up the other side
294
        paste(surv.predict[!(surv.predict %in% c('age', 'most_deprived'))], collapse = '+')
295
      )
296
    ),
297
    COHORT.use[-test.set, ],
298
    ntree = n.trees,
299
    splitrule = 'mse',
300
    na.action = 'na.impute',
301
    nimpute = 3
302
  )
303
304
age.predictions <- predict(age.model, COHORT.prep[test.set, ])
305
306
age.cor <- cor(age.predictions$predictions, COHORT.prep[test.set, 'age'])
307
308
to.plot <-
309
  data.frame(
310
    age = COHORT.prep[test.set, 'age'],
311
    predicted = age.predictions$predictions
312
  )
313
314
ggplot(sample.df(to.plot, 10000), aes(x = age, y = predicted)) +
315
  geom_point(alpha = 0.2)
316
317
#' It doesn't look that great, but there is some correlation...
318
#' r^2 = `r age.cor^2` which is unremarkable, but OK. A more relevant measure
319
#' would be the pure-age C-index, ie if I gave you a pair of patients, how
320
#' often could you predict who was older?
321
322
c.index.age.on.age <-
323
  1 - as.numeric(
324
    survConcordance(
325
       age ~ predicted,
326
       to.plot
327
    )$concordance
328
  )
329
330
#' This comes out as **`r round(c.index.age.on.age, 3)`**, as compared to
331
#' **`r round(c.index.test.all.not.age, 3)`**, which was the C-index for the
332
#' survival model based on all other variables.
333
#' 
334
#' This makes sense. The maths of C-indices is a bit tricky (how much they add
335
#' to one-another depends in large part of how correlated the variables are, as
336
#' well as probably being somewhat non-linear anyway),
337
#' but clearly some significant fraction of the all-except-age model's
338
#' predictive power comes from its ability to infer age from the remaining
339
#' variables, and then use _that_ (implicitly) to predict time to death.
340
#' 
341
#' The mechanism for this could be that people are likely to
342
#' get more diseases as they get older, so if you have a, b _and_ c you're
343
#' likely to be older. Second-order predictivity may occur if particular
344
#' combinations of disorders or test results are common in certain age groups.
345
#'
346
#' ## Conclusion
347
#' 
348
#' So, in conclusion, the sanity check has worked: giving the
349
#' random forest model more data to work with improves its performance. Age
350
#' alone is less predictive, adding gender makes it slightly more predictive,
351
#' and so on.
352
#' 
353
#' Further, though, a huge amount of the model's predictivity arises from just
354
#' a patien's age. Not only is that alone a good predictor, but the
355
#' reasonable performance on the model of all factors except age is explained in
356
#' part by those factors' ability to act as a proxy for age.