|
a |
|
b/random-forest/rf-age.R |
|
|
1 |
#+ knitr_setup, include = FALSE |
|
|
2 |
|
|
|
3 |
# Whether to cache the intensive code sections. Set to FALSE to recalculate |
|
|
4 |
# everything afresh. |
|
|
5 |
cacheoption <- TRUE |
|
|
6 |
# Disable lazy caching globally, because it fails for large objects, and all the |
|
|
7 |
# objects we wish to cache are large... |
|
|
8 |
opts_chunk$set(cache.lazy = FALSE) |
|
|
9 |
|
|
|
10 |
#' # Modelling with age |
|
|
11 |
#' |
|
|
12 |
#' It seems that, no matter what I do, the C-index of a model, random forest or |
|
|
13 |
#' otherwise, is about 0.78. I decided to try to some simpler models, intially |
|
|
14 |
#' based purely on age which is clearly the biggest factor in this dataset. |
|
|
15 |
#' (And, I assume, most datasets where a reasonably broad range of ages is |
|
|
16 |
#' present.) |
|
|
17 |
#' |
|
|
18 |
#' The sanity check works: giving the model more data does indeed result in a |
|
|
19 |
#' better fit. However, on top of that, I was surprised by just how good the |
|
|
20 |
#' performance can be when age alone is considered! |
|
|
21 |
|
|
|
22 |
#+ user_variables, message=FALSE |
|
|
23 |
|
|
|
24 |
data.filename <- '../../data/cohort-sanitised.csv' |
|
|
25 |
|
|
|
26 |
n.trees <- 500 |
|
|
27 |
|
|
|
28 |
continuous.vars <- |
|
|
29 |
c( |
|
|
30 |
'age', 'total_chol_6mo', 'hdl_6mo', 'pulse_6mo', 'crea_6mo', |
|
|
31 |
'total_wbc_6mo', 'haemoglobin_6mo' |
|
|
32 |
) |
|
|
33 |
untransformed.vars <- c('anonpatid', 'time_death', 'imd_score', 'exclude') |
|
|
34 |
|
|
|
35 |
source('../lib/shared.R') |
|
|
36 |
require(ggrepel) |
|
|
37 |
|
|
|
38 |
#' ## Load and prepare data |
|
|
39 |
#+ load_and_prepare_data |
|
|
40 |
|
|
|
41 |
# Load the data and convert to data frame to make column-selecting code in |
|
|
42 |
# prepData simpler |
|
|
43 |
COHORT.full <- data.frame(fread(data.filename)) |
|
|
44 |
|
|
|
45 |
# Define process settings; nothing for those to not transform, and missingToBig |
|
|
46 |
# for the continuous ones... |
|
|
47 |
process.settings <- |
|
|
48 |
list( |
|
|
49 |
var = c(untransformed.vars, continuous.vars), |
|
|
50 |
method = |
|
|
51 |
c( |
|
|
52 |
rep(NA, length(untransformed.vars)), |
|
|
53 |
rep('missingToBig', length(continuous.vars)) |
|
|
54 |
), |
|
|
55 |
settings = rep(NA, length(untransformed.vars) + length(continuous.vars)) |
|
|
56 |
) |
|
|
57 |
|
|
|
58 |
COHORT.prep <- |
|
|
59 |
prepData( |
|
|
60 |
# Data for cross-validation excludes test set |
|
|
61 |
COHORT.full, |
|
|
62 |
cols.keep, |
|
|
63 |
process.settings, |
|
|
64 |
surv.time, surv.event, |
|
|
65 |
surv.event.yes, |
|
|
66 |
extra.fun = caliberExtraPrep |
|
|
67 |
) |
|
|
68 |
n.data <- nrow(COHORT.prep) |
|
|
69 |
|
|
|
70 |
# Define indices of test set |
|
|
71 |
test.set <- sample(1:n.data, (1/3)*n.data) |
|
|
72 |
|
|
|
73 |
#' ## Models |
|
|
74 |
#' |
|
|
75 |
#' ### The normal model |
|
|
76 |
#' |
|
|
77 |
#' All the variables, as in the vector `surv.predict`. |
|
|
78 |
#+ normal_model, cache=cacheoption |
|
|
79 |
|
|
|
80 |
# Fit random forest |
|
|
81 |
surv.model.fit <- |
|
|
82 |
survivalFit( |
|
|
83 |
surv.predict, |
|
|
84 |
COHORT.prep[-test.set,], |
|
|
85 |
model.type = 'rfsrc', |
|
|
86 |
n.trees = n.trees, |
|
|
87 |
split.rule = 'logrank', |
|
|
88 |
n.threads = 7, |
|
|
89 |
nsplit = 20 |
|
|
90 |
) |
|
|
91 |
|
|
|
92 |
print(surv.model.fit) |
|
|
93 |
|
|
|
94 |
# Get C-index |
|
|
95 |
c.index.test <- |
|
|
96 |
cIndex(surv.model.fit, COHORT.prep[test.set, ], model.type = 'ranger') |
|
|
97 |
|
|
|
98 |
#' The C-index on the held-out test set is **`r round(c.index.test, 3)`**. |
|
|
99 |
|
|
|
100 |
|
|
|
101 |
#' ### Just age |
|
|
102 |
#' |
|
|
103 |
#' What if all we had to go on was age? |
|
|
104 |
#+ just_age_model, cache=cacheoption |
|
|
105 |
|
|
|
106 |
# Fit random forest |
|
|
107 |
surv.model.fit <- |
|
|
108 |
survivalFit( |
|
|
109 |
c('age'), |
|
|
110 |
COHORT.prep[-test.set,], |
|
|
111 |
model.type = 'ranger', |
|
|
112 |
n.trees = n.trees, |
|
|
113 |
split.rule = 'logrank', |
|
|
114 |
n.threads = 8, |
|
|
115 |
respect.unordered.factors = 'partition' |
|
|
116 |
) |
|
|
117 |
|
|
|
118 |
print(surv.model.fit) |
|
|
119 |
|
|
|
120 |
# Get C-index |
|
|
121 |
c.index.test <- |
|
|
122 |
cIndex(surv.model.fit, COHORT.prep[test.set, ], model.type = 'ranger') |
|
|
123 |
|
|
|
124 |
#' The C-index on the held-out test set is **`r round(c.index.test, 3)`**. |
|
|
125 |
|
|
|
126 |
#' ### No model, literally just age |
|
|
127 |
#' |
|
|
128 |
#' What if we constructed the C-index based purely on patients' ages? |
|
|
129 |
#+ just_age_cindex |
|
|
130 |
|
|
|
131 |
c.index.age <- |
|
|
132 |
as.numeric( |
|
|
133 |
survConcordance( |
|
|
134 |
Surv(time_death, surv_event) ~ age, |
|
|
135 |
COHORT.prep |
|
|
136 |
)$concordance |
|
|
137 |
) |
|
|
138 |
|
|
|
139 |
#' The C-index on the whole dataset based purely on age is |
|
|
140 |
#' **`r round(c.index.test, 3)`**. That's most of our predictive accuracy right |
|
|
141 |
#' there! Reassuringly, it's also equal to the value predicted by the random |
|
|
142 |
#' forest model based purely on age... |
|
|
143 |
|
|
|
144 |
#' ### Age and gender |
|
|
145 |
#' |
|
|
146 |
#' OK, age and gender. |
|
|
147 |
#+ age_gender_model, cache=cacheoption |
|
|
148 |
|
|
|
149 |
# Fit random forest |
|
|
150 |
surv.model.fit <- |
|
|
151 |
survivalFit( |
|
|
152 |
c('age', 'gender'), |
|
|
153 |
COHORT.prep[-test.set,], |
|
|
154 |
model.type = 'ranger', |
|
|
155 |
n.trees = n.trees, |
|
|
156 |
split.rule = 'logrank', |
|
|
157 |
n.threads = 8, |
|
|
158 |
respect.unordered.factors = 'partition' |
|
|
159 |
) |
|
|
160 |
|
|
|
161 |
print(surv.model.fit) |
|
|
162 |
|
|
|
163 |
# Get C-index |
|
|
164 |
c.index.test <- |
|
|
165 |
cIndex(surv.model.fit, COHORT.prep[test.set, ], model.type = 'ranger') |
|
|
166 |
|
|
|
167 |
#' The C-index on the held-out test set is **`r round(c.index.test, 3)`**. |
|
|
168 |
|
|
|
169 |
|
|
|
170 |
#' ### Age, gender and history of liver disease |
|
|
171 |
#' |
|
|
172 |
#' Let's add a third variable. In the replication of the Cox model with missing |
|
|
173 |
#' data included, liver disease was the most predictive factor after age, so |
|
|
174 |
#' it's a reasonable next variable to add. |
|
|
175 |
|
|
|
176 |
#+ age_gender_liver_model, cache=cacheoption |
|
|
177 |
|
|
|
178 |
# Fit random forest |
|
|
179 |
surv.model.fit <- |
|
|
180 |
survivalFit( |
|
|
181 |
c('age', 'gender', 'hx_liver'), |
|
|
182 |
COHORT.prep[-test.set,], |
|
|
183 |
model.type = 'ranger', |
|
|
184 |
n.trees = n.trees, |
|
|
185 |
split.rule = 'logrank', |
|
|
186 |
n.threads = 8, |
|
|
187 |
respect.unordered.factors = 'partition' |
|
|
188 |
) |
|
|
189 |
|
|
|
190 |
print(surv.model.fit) |
|
|
191 |
|
|
|
192 |
# Get C-index |
|
|
193 |
c.index.test <- |
|
|
194 |
cIndex(surv.model.fit, COHORT.prep[test.set, ], model.type = 'ranger') |
|
|
195 |
|
|
|
196 |
#' The C-index on the held-out test set is **`r round(c.index.test, 3)`**. |
|
|
197 |
|
|
|
198 |
#' ### Age, gender and heart failure |
|
|
199 |
#' |
|
|
200 |
#' A different third variable: heart failure, the second most important variable |
|
|
201 |
#' (after age) from random forest modelling. |
|
|
202 |
|
|
|
203 |
#+ age_gender_hf_model, cache=cacheoption |
|
|
204 |
|
|
|
205 |
# Fit random forest |
|
|
206 |
surv.model.fit <- |
|
|
207 |
survivalFit( |
|
|
208 |
c('age', 'gender', 'heart_failure'), |
|
|
209 |
COHORT.prep[-test.set,], |
|
|
210 |
model.type = 'ranger', |
|
|
211 |
n.trees = n.trees, |
|
|
212 |
split.rule = 'logrank', |
|
|
213 |
n.threads = 8, |
|
|
214 |
respect.unordered.factors = 'partition' |
|
|
215 |
) |
|
|
216 |
|
|
|
217 |
print(surv.model.fit) |
|
|
218 |
|
|
|
219 |
# Get C-index |
|
|
220 |
c.index.test <- |
|
|
221 |
cIndex(surv.model.fit, COHORT.prep[test.set, ], model.type = 'ranger') |
|
|
222 |
|
|
|
223 |
#' The C-index on the held-out test set is **`r round(c.index.test, 3)`**. |
|
|
224 |
|
|
|
225 |
|
|
|
226 |
#' ### Just gender |
|
|
227 |
#' |
|
|
228 |
#' Just gender, as a sanity check. |
|
|
229 |
#+ just_gender_model, cache=cacheoption |
|
|
230 |
|
|
|
231 |
# Fit random forest |
|
|
232 |
surv.model.fit <- |
|
|
233 |
survivalFit( |
|
|
234 |
c('gender'), |
|
|
235 |
COHORT.prep[-test.set,], |
|
|
236 |
model.type = 'ranger', |
|
|
237 |
n.trees = n.trees, |
|
|
238 |
split.rule = 'logrank', |
|
|
239 |
n.threads = 8, |
|
|
240 |
respect.unordered.factors = 'partition' |
|
|
241 |
) |
|
|
242 |
|
|
|
243 |
print(surv.model.fit) |
|
|
244 |
|
|
|
245 |
# Get C-index |
|
|
246 |
c.index.test <- |
|
|
247 |
cIndex(surv.model.fit, COHORT.prep[test.set, ], model.type = 'ranger') |
|
|
248 |
|
|
|
249 |
#' The C-index on the held-out test set is **`r round(c.index.test, 3)`**. |
|
|
250 |
|
|
|
251 |
#' ### Everything except age |
|
|
252 |
#' |
|
|
253 |
#' How do we do if we use all the variables _except_ age? |
|
|
254 |
#+ no_age_model, cache=cacheoption |
|
|
255 |
|
|
|
256 |
# Fit random forest |
|
|
257 |
surv.model.fit <- |
|
|
258 |
survivalFit( |
|
|
259 |
surv.predict[surv.predict != 'age'], |
|
|
260 |
COHORT.prep[-test.set,], |
|
|
261 |
model.type = 'ranger', |
|
|
262 |
n.trees = n.trees, |
|
|
263 |
split.rule = 'logrank', |
|
|
264 |
n.threads = 8, |
|
|
265 |
respect.unordered.factors = 'partition' |
|
|
266 |
) |
|
|
267 |
|
|
|
268 |
print(surv.model.fit) |
|
|
269 |
|
|
|
270 |
# Get C-index |
|
|
271 |
c.index.test.all.not.age <- |
|
|
272 |
cIndex(surv.model.fit, COHORT.prep[test.set, ], model.type = 'ranger') |
|
|
273 |
|
|
|
274 |
#' The C-index on the held-out test set is |
|
|
275 |
#' **`r round(c.index.test.all.not.age, 3)`**. |
|
|
276 |
|
|
|
277 |
|
|
|
278 |
#' ### Predicting age |
|
|
279 |
#' |
|
|
280 |
#' So why does the model which doesn't include age do so well? Clearly the other |
|
|
281 |
#' variables allow you to predict age with reasonable accuracy... So let's try |
|
|
282 |
#' just that as a final test. |
|
|
283 |
#+ predict_age, cache=cacheoption |
|
|
284 |
|
|
|
285 |
options(rf.cores = n.threads) |
|
|
286 |
|
|
|
287 |
age.model <- |
|
|
288 |
rfsrc( |
|
|
289 |
formula( |
|
|
290 |
paste0( |
|
|
291 |
# Predicting just the age |
|
|
292 |
'age ~ ', |
|
|
293 |
# Predictor variables then make up the other side |
|
|
294 |
paste(surv.predict[!(surv.predict %in% c('age', 'most_deprived'))], collapse = '+') |
|
|
295 |
) |
|
|
296 |
), |
|
|
297 |
COHORT.use[-test.set, ], |
|
|
298 |
ntree = n.trees, |
|
|
299 |
splitrule = 'mse', |
|
|
300 |
na.action = 'na.impute', |
|
|
301 |
nimpute = 3 |
|
|
302 |
) |
|
|
303 |
|
|
|
304 |
age.predictions <- predict(age.model, COHORT.prep[test.set, ]) |
|
|
305 |
|
|
|
306 |
age.cor <- cor(age.predictions$predictions, COHORT.prep[test.set, 'age']) |
|
|
307 |
|
|
|
308 |
to.plot <- |
|
|
309 |
data.frame( |
|
|
310 |
age = COHORT.prep[test.set, 'age'], |
|
|
311 |
predicted = age.predictions$predictions |
|
|
312 |
) |
|
|
313 |
|
|
|
314 |
ggplot(sample.df(to.plot, 10000), aes(x = age, y = predicted)) + |
|
|
315 |
geom_point(alpha = 0.2) |
|
|
316 |
|
|
|
317 |
#' It doesn't look that great, but there is some correlation... |
|
|
318 |
#' r^2 = `r age.cor^2` which is unremarkable, but OK. A more relevant measure |
|
|
319 |
#' would be the pure-age C-index, ie if I gave you a pair of patients, how |
|
|
320 |
#' often could you predict who was older? |
|
|
321 |
|
|
|
322 |
c.index.age.on.age <- |
|
|
323 |
1 - as.numeric( |
|
|
324 |
survConcordance( |
|
|
325 |
age ~ predicted, |
|
|
326 |
to.plot |
|
|
327 |
)$concordance |
|
|
328 |
) |
|
|
329 |
|
|
|
330 |
#' This comes out as **`r round(c.index.age.on.age, 3)`**, as compared to |
|
|
331 |
#' **`r round(c.index.test.all.not.age, 3)`**, which was the C-index for the |
|
|
332 |
#' survival model based on all other variables. |
|
|
333 |
#' |
|
|
334 |
#' This makes sense. The maths of C-indices is a bit tricky (how much they add |
|
|
335 |
#' to one-another depends in large part of how correlated the variables are, as |
|
|
336 |
#' well as probably being somewhat non-linear anyway), |
|
|
337 |
#' but clearly some significant fraction of the all-except-age model's |
|
|
338 |
#' predictive power comes from its ability to infer age from the remaining |
|
|
339 |
#' variables, and then use _that_ (implicitly) to predict time to death. |
|
|
340 |
#' |
|
|
341 |
#' The mechanism for this could be that people are likely to |
|
|
342 |
#' get more diseases as they get older, so if you have a, b _and_ c you're |
|
|
343 |
#' likely to be older. Second-order predictivity may occur if particular |
|
|
344 |
#' combinations of disorders or test results are common in certain age groups. |
|
|
345 |
#' |
|
|
346 |
#' ## Conclusion |
|
|
347 |
#' |
|
|
348 |
#' So, in conclusion, the sanity check has worked: giving the |
|
|
349 |
#' random forest model more data to work with improves its performance. Age |
|
|
350 |
#' alone is less predictive, adding gender makes it slightly more predictive, |
|
|
351 |
#' and so on. |
|
|
352 |
#' |
|
|
353 |
#' Further, though, a huge amount of the model's predictivity arises from just |
|
|
354 |
#' a patien's age. Not only is that alone a good predictor, but the |
|
|
355 |
#' reasonable performance on the model of all factors except age is explained in |
|
|
356 |
#' part by those factors' ability to act as a proxy for age. |