|
a |
|
b/cox-ph/cox-discrete-elasticnet.R |
|
|
1 |
#+ knitr_setup, include = FALSE |
|
|
2 |
|
|
|
3 |
# Whether to cache the intensive code sections. Set to FALSE to recalculate |
|
|
4 |
# everything afresh. |
|
|
5 |
cacheoption <- TRUE |
|
|
6 |
# Disable lazy caching globally, because it fails for large objects, and all the |
|
|
7 |
# objects we wish to cache are large... |
|
|
8 |
opts_chunk$set(cache.lazy = FALSE) |
|
|
9 |
|
|
|
10 |
#' # Variable selection in data-driven health records with discretised |
|
|
11 |
#' # Cox models |
|
|
12 |
#' |
|
|
13 |
#' Having extracted around 600 variables which occur most frequently in patient |
|
|
14 |
#' records, let's try to narrow these down using a methodology based on varSelRf |
|
|
15 |
#' combined with survival modelling. We'll find the predictability of variables |
|
|
16 |
#' as defined by the p-value of a logrank test on survival curves of different |
|
|
17 |
#' categories within that variable, and then iteratively throw out unimportant |
|
|
18 |
#' variables, cross-validating for optimum performance. |
|
|
19 |
#' |
|
|
20 |
#' ## User variables |
|
|
21 |
#' |
|
|
22 |
#+ user_variables |
|
|
23 |
|
|
|
24 |
output.filename.base <- '../../output/cox-discrete-elasticnet-08' |
|
|
25 |
|
|
|
26 |
bootstraps <- 100 |
|
|
27 |
bootstrap.filename <- paste0(output.filename.base, '-boot-all.csv') |
|
|
28 |
|
|
|
29 |
n.data <- NA # This is after any variables being excluded in prep |
|
|
30 |
|
|
|
31 |
n.threads <- 16 |
|
|
32 |
|
|
|
33 |
#' ## Data set-up |
|
|
34 |
#' |
|
|
35 |
#+ data_setup |
|
|
36 |
|
|
|
37 |
data.filename.big <- '../../data/cohort-datadriven-02.csv' |
|
|
38 |
|
|
|
39 |
surv.predict.old <- c('age', 'smokstatus', 'imd_score', 'gender') |
|
|
40 |
untransformed.vars <- c('time_death', 'endpoint_death', 'exclude') |
|
|
41 |
|
|
|
42 |
source('../lib/shared.R') |
|
|
43 |
require(xtable) |
|
|
44 |
|
|
|
45 |
# Define these after shared.R or they will be overwritten! |
|
|
46 |
exclude.vars <- |
|
|
47 |
c( |
|
|
48 |
# Entity type 4 is smoking status, which we already have |
|
|
49 |
"clinical.values.4_data1", "clinical.values.4_data5", |
|
|
50 |
"clinical.values.4_data6", |
|
|
51 |
# Entity 13 data2 is the patient's weight centile, and not a single one is |
|
|
52 |
# entered, but they come out as 0 so the algorithm, looking for NAs, thinks |
|
|
53 |
# it's a useful column |
|
|
54 |
"clinical.values.13_data2", |
|
|
55 |
# Entities 148 and 149 are to do with death certification. I'm not sure how |
|
|
56 |
# it made it into the dataset, but since all the datapoints in this are |
|
|
57 |
# looking back in time, they're all NA. This causes rfsrc to fail. |
|
|
58 |
"clinical.values.148_data1", "clinical.values.148_data2", |
|
|
59 |
"clinical.values.148_data3", "clinical.values.148_data4", |
|
|
60 |
"clinical.values.148_data5", |
|
|
61 |
"clinical.values.149_data1", "clinical.values.149_data2", |
|
|
62 |
# These are all the same value except where NA, which causes issues with |
|
|
63 |
# discretisation |
|
|
64 |
"clinical.values.14_data2", "clinical.values.62_data1", |
|
|
65 |
"clinical.values.64_data1", "clinical.values.65_data1", |
|
|
66 |
"clinical.values.65_data1", "clinical.values.67_data1", |
|
|
67 |
"clinical.values.68_data2", "clinical.values.70_data1" |
|
|
68 |
) |
|
|
69 |
|
|
|
70 |
COHORT <- fread(data.filename.big) |
|
|
71 |
|
|
|
72 |
bigdata.prefixes <- |
|
|
73 |
c( |
|
|
74 |
'hes.icd.', |
|
|
75 |
'hes.opcs.', |
|
|
76 |
'tests.enttype.', |
|
|
77 |
'clinical.history.', |
|
|
78 |
'clinical.values.', |
|
|
79 |
'bnf.' |
|
|
80 |
) |
|
|
81 |
|
|
|
82 |
bigdata.columns <- |
|
|
83 |
colnames(COHORT)[ |
|
|
84 |
which( |
|
|
85 |
# Does is start with one of the data column names? |
|
|
86 |
startsWithAny(names(COHORT), bigdata.prefixes) & |
|
|
87 |
# And it's not one of the columns we want to exclude? |
|
|
88 |
!(colnames(COHORT) %in% exclude.vars) |
|
|
89 |
) |
|
|
90 |
] |
|
|
91 |
|
|
|
92 |
COHORT.bigdata <- |
|
|
93 |
COHORT[, c( |
|
|
94 |
untransformed.vars, surv.predict.old, bigdata.columns |
|
|
95 |
), |
|
|
96 |
with = FALSE |
|
|
97 |
] |
|
|
98 |
|
|
|
99 |
# Get the missingness before we start removing missing values |
|
|
100 |
missingness <- sort(sapply(COHORT.bigdata, percentMissing)) |
|
|
101 |
# Remove values for the 'untransformed.vars' above, which are the survival |
|
|
102 |
# values plus exclude column |
|
|
103 |
missingness <- missingness[!(names(missingness) %in% untransformed.vars)] |
|
|
104 |
|
|
|
105 |
# Deal appropriately with missing data |
|
|
106 |
# Most of the variables are number of days since the first record of that type |
|
|
107 |
time.based.vars <- |
|
|
108 |
names(COHORT.bigdata)[ |
|
|
109 |
startsWithAny( |
|
|
110 |
names(COHORT.bigdata), |
|
|
111 |
c('hes.icd.', 'hes.opcs.', 'clinical.history.') |
|
|
112 |
) |
|
|
113 |
] |
|
|
114 |
# We're dealing with this as a logical, so we want non-NA values to be TRUE, |
|
|
115 |
# is there is something in the history |
|
|
116 |
for (j in time.based.vars) { |
|
|
117 |
set(COHORT.bigdata, j = j, value = !is.na(COHORT.bigdata[[j]])) |
|
|
118 |
} |
|
|
119 |
|
|
|
120 |
# Again, taking this as a logical, set any non-NA value to TRUE. |
|
|
121 |
prescriptions.vars <- names(COHORT.bigdata)[startsWith(names(COHORT.bigdata), 'bnf.')] |
|
|
122 |
for (j in prescriptions.vars) { |
|
|
123 |
set(COHORT.bigdata, j = j, value = !is.na(COHORT.bigdata[[j]])) |
|
|
124 |
} |
|
|
125 |
|
|
|
126 |
# This leaves tests and clinical.values, which are test results and should be |
|
|
127 |
# imputed. |
|
|
128 |
|
|
|
129 |
# Manually fix clinical values items... |
|
|
130 |
# |
|
|
131 |
# "clinical.values.1_data1" "clinical.values.1_data2" |
|
|
132 |
# These are just blood pressure values...fine to impute |
|
|
133 |
# |
|
|
134 |
# "clinical.values.13_data1" "clinical.values.13_data3" |
|
|
135 |
# These are weight and BMI...also fine to impute |
|
|
136 |
# |
|
|
137 |
# Entity 5 is alcohol consumption status, 1 = Yes, 2 = No, 3 = Ex, so should be |
|
|
138 |
# a factor, and NA can be a factor level |
|
|
139 |
COHORT.bigdata$clinical.values.5_data1 <- |
|
|
140 |
factorNAfix(factor(COHORT.bigdata$clinical.values.5_data1), NAval = 'missing') |
|
|
141 |
|
|
|
142 |
# Both gender and smokstatus are factors...fix that |
|
|
143 |
COHORT.bigdata$gender <- factor(COHORT.bigdata$gender) |
|
|
144 |
COHORT.bigdata$smokstatus <- |
|
|
145 |
factorNAfix(factor(COHORT.bigdata$smokstatus), NAval = 'missing') |
|
|
146 |
|
|
|
147 |
# Exclude invalid patients |
|
|
148 |
COHORT.bigdata <- COHORT.bigdata[!COHORT.bigdata$exclude] |
|
|
149 |
COHORT.bigdata$exclude <- NULL |
|
|
150 |
|
|
|
151 |
# Remove negative survival times |
|
|
152 |
COHORT.bigdata <- subset(COHORT.bigdata, time_death > 0) |
|
|
153 |
|
|
|
154 |
# Define test set |
|
|
155 |
test.set <- testSetIndices(COHORT.bigdata, random.seed = 78361) |
|
|
156 |
|
|
|
157 |
# If n.data was specified, trim the data table down to size |
|
|
158 |
if(!is.na(n.data)) { |
|
|
159 |
COHORT.bigdata <- sample.df(COHORT.bigdata, n.data) |
|
|
160 |
} |
|
|
161 |
|
|
|
162 |
# Create an appropraite survival column |
|
|
163 |
COHORT.bigdata <- |
|
|
164 |
prepSurvCol( |
|
|
165 |
data.frame(COHORT.bigdata), 'time_death', 'endpoint_death', 'Death' |
|
|
166 |
) |
|
|
167 |
|
|
|
168 |
# Start by predicting survival with all the variables provided |
|
|
169 |
surv.predict <- c(surv.predict.old, bigdata.columns) |
|
|
170 |
|
|
|
171 |
# Set up a csv file to store calibration data, or retrieve previous data |
|
|
172 |
calibration.filename <- paste0(output.filename.base, '-varselcalibration.csv') |
|
|
173 |
|
|
|
174 |
# Create process settings |
|
|
175 |
|
|
|
176 |
# Variables to leave alone, including those whose logrank p-value is NA because |
|
|
177 |
# that means there is only one value in the column and so it can't be discretised |
|
|
178 |
# properly anyway |
|
|
179 |
vars.noprocess <- c('surv_time', 'surv_event') |
|
|
180 |
process.settings <- |
|
|
181 |
list( |
|
|
182 |
var = vars.noprocess, |
|
|
183 |
method = rep(NA, length(vars.noprocess)), |
|
|
184 |
settings = rep(list(NA), length(vars.noprocess)) |
|
|
185 |
) |
|
|
186 |
# Find continuous variables which will need discretising |
|
|
187 |
continuous.vars <- names(COHORT.bigdata)[sapply(COHORT.bigdata, class) %in% c('integer', 'numeric')] |
|
|
188 |
# Remove those variables already explicitly excluded, mainly for those whose |
|
|
189 |
# logrank score was NA |
|
|
190 |
continuous.vars <- continuous.vars[!(continuous.vars %in% process.settings$var)] |
|
|
191 |
process.settings$var <- c(process.settings$var, continuous.vars) |
|
|
192 |
process.settings$method <- |
|
|
193 |
c(process.settings$method, |
|
|
194 |
rep('binByQuantile', length(continuous.vars)) |
|
|
195 |
) |
|
|
196 |
process.settings$settings <- |
|
|
197 |
c( |
|
|
198 |
process.settings$settings, |
|
|
199 |
rep( |
|
|
200 |
list( |
|
|
201 |
seq( |
|
|
202 |
# Quantiles are obviously between 0 and 1 |
|
|
203 |
0, 1, |
|
|
204 |
# All have the same number of bins |
|
|
205 |
length.out = 10 |
|
|
206 |
) |
|
|
207 |
), |
|
|
208 |
length(continuous.vars) |
|
|
209 |
) |
|
|
210 |
) |
|
|
211 |
|
|
|
212 |
|
|
|
213 |
# Need a way to ID in advance those which are going to fail here, ie those where |
|
|
214 |
# there are no quantiles. The |
|
|
215 |
|
|
|
216 |
COHORT.prep <- |
|
|
217 |
prepData( |
|
|
218 |
# Data for cross-validation excludes test set |
|
|
219 |
COHORT.bigdata, |
|
|
220 |
names(COHORT.bigdata), |
|
|
221 |
process.settings, |
|
|
222 |
'surv_time', 'surv_event', |
|
|
223 |
TRUE |
|
|
224 |
) |
|
|
225 |
|
|
|
226 |
# Kludge...remove surv_time.1 and rename surv_event.1 |
|
|
227 |
COHORT.prep$surv_time.1 <- NULL |
|
|
228 |
names(COHORT.prep)[names(COHORT.prep) == 'surv_event.1'] <- 'surv_event' |
|
|
229 |
|
|
|
230 |
COHORT.bin <- convertFactorsToBinaryColumns(COHORT.prep) |
|
|
231 |
# model.matrix renames logicals to varTRUE, so fix that for status |
|
|
232 |
colnames(COHORT.bin)[colnames(COHORT.bin) == 'surv_eventTRUE'] <- 'surv_event' |
|
|
233 |
|
|
|
234 |
# Coxnet code, should you ever decide to go that route |
|
|
235 |
# test <- |
|
|
236 |
# Coxnet( |
|
|
237 |
# data.matrix(COHORT.bin[-test.set, !(colnames(COHORT.bin) %in% c('time', 'status'))]), |
|
|
238 |
# data.matrix(COHORT.bin[-test.set, c('time', 'status')]), |
|
|
239 |
# penalty = 'Enet', |
|
|
240 |
# alpha = 0, |
|
|
241 |
# nlambda = 50, nfolds = 10, maxit = 1e+5 |
|
|
242 |
# ) |
|
|
243 |
|
|
|
244 |
#' ## Elastic net regression |
|
|
245 |
#' |
|
|
246 |
#' Run a loop over alphas running from LASSO to ridge regression, and see which |
|
|
247 |
#' is best after tenfold cross-validation... |
|
|
248 |
#' |
|
|
249 |
#+ elastic_net_full |
|
|
250 |
|
|
|
251 |
require(glmnet) |
|
|
252 |
initParallel(n.threads) |
|
|
253 |
|
|
|
254 |
time.start <- handyTimer() |
|
|
255 |
|
|
|
256 |
alphas <- seq(0, 1, length.out = 101) |
|
|
257 |
mse <- c() |
|
|
258 |
|
|
|
259 |
for(alpha in alphas) { |
|
|
260 |
cv.fit <- |
|
|
261 |
cv.glmnet( |
|
|
262 |
COHORT.bin[-test.set, !(colnames(COHORT.bin) %in% c('surv_time', 'surv_event'))], |
|
|
263 |
Surv(COHORT.bin[-test.set, 'surv_time'], COHORT.bin[-test.set, 'surv_event']), |
|
|
264 |
family = "cox", |
|
|
265 |
maxit = 1000, |
|
|
266 |
alpha = alpha, |
|
|
267 |
parallel = TRUE |
|
|
268 |
) |
|
|
269 |
best.lambda.i <- which(cv.fit$lambda == cv.fit$lambda.min) |
|
|
270 |
mse <- c(mse, cv.fit$cvm[best.lambda.i]) |
|
|
271 |
} |
|
|
272 |
|
|
|
273 |
time.cv <- handyTimer(time.start) |
|
|
274 |
|
|
|
275 |
write.csv( |
|
|
276 |
data.frame( |
|
|
277 |
alphas, mse |
|
|
278 |
), |
|
|
279 |
paste0(output.filename.base, '-alpha-calibration.csv') |
|
|
280 |
) |
|
|
281 |
|
|
|
282 |
#' `r length(alphas)` alpha values tested in `r time.cv` seconds! |
|
|
283 |
|
|
|
284 |
alpha.best <- alphas[which.min(mse)] |
|
|
285 |
|
|
|
286 |
# To avoid saving all the fits, let's just refit the best one |
|
|
287 |
cv.fit <- |
|
|
288 |
cv.glmnet( |
|
|
289 |
COHORT.bin[-test.set, !(colnames(COHORT.bin) %in% c('surv_time', 'surv_event'))], |
|
|
290 |
Surv(COHORT.bin[-test.set, 'surv_time'], COHORT.bin[-test.set, 'surv_event']), |
|
|
291 |
family = "cox", |
|
|
292 |
maxit = 1000, |
|
|
293 |
alpha = alpha.best, |
|
|
294 |
parallel = TRUE |
|
|
295 |
) |
|
|
296 |
|
|
|
297 |
# Save for future use |
|
|
298 |
saveRDS(cv.fit, 'cv.fit.rds') |
|
|
299 |
|
|
|
300 |
#' The best alpha was `r alpha.best`, and the lambda with the lowest mean-square |
|
|
301 |
#' error was `r cv.fit$lambda.min`. We'll be using the strictest lambda which is |
|
|
302 |
#' within 1 se of the minimum, `r cv.fit$lambda.1se`. |
|
|
303 |
#' |
|
|
304 |
#' ## Performance |
|
|
305 |
#' |
|
|
306 |
#' ### C-index |
|
|
307 |
#' |
|
|
308 |
#' Calculate C-index manually. The glmnet interface requiring matrices is |
|
|
309 |
#' sufficiently different to the usual one that I've not spent time integrating |
|
|
310 |
#' it with the rest of the ``handymedical.R`` functions yet. |
|
|
311 |
#' |
|
|
312 |
#+ c_index |
|
|
313 |
|
|
|
314 |
glmnetCIndex <- function(model.fit, dm) { |
|
|
315 |
test.predictions <- |
|
|
316 |
getRisk( |
|
|
317 |
model.fit, |
|
|
318 |
dm[, !(colnames(dm) %in% c('surv_time', 'surv_event'))] |
|
|
319 |
) |
|
|
320 |
|
|
|
321 |
as.numeric( |
|
|
322 |
survConcordance( |
|
|
323 |
as.formula(paste0('Surv(surv_time, surv_event) ~ risk')), |
|
|
324 |
data.frame( |
|
|
325 |
surv_time = dm[, 'surv_time'], |
|
|
326 |
surv_event = dm[, 'surv_event'], |
|
|
327 |
risk = test.predictions |
|
|
328 |
) |
|
|
329 |
)$concordance |
|
|
330 |
) |
|
|
331 |
} |
|
|
332 |
|
|
|
333 |
c.index <- glmnetCIndex(cv.fit, COHORT.bin[test.set, ]) |
|
|
334 |
|
|
|
335 |
#' C-index is `r c.index`. |
|
|
336 |
#' |
|
|
337 |
#' ### Calibration |
|
|
338 |
#' |
|
|
339 |
#' For now, calibration is manual too just to get it working. It's surprisingly |
|
|
340 |
#' hard... A package called `c060` should contain a function `predictProb`, but |
|
|
341 |
#' on loading it, is says function not found. So here is a very manual solution, |
|
|
342 |
#' creating a dummy Cox model using the `survival` package, inspired by |
|
|
343 |
#' [this](https://stat.ethz.ch/pipermail/r-help/2012-May/312029.html). |
|
|
344 |
#' |
|
|
345 |
#+ calibration_plot |
|
|
346 |
|
|
|
347 |
glmnetCalibrationTable <- function(model.fit, dm, test.set, risk.time = 5) { |
|
|
348 |
# Work out risks at risk.time for the special case of a glmnet model |
|
|
349 |
|
|
|
350 |
# Derive baseline hazard from cv.glmnet model, heavily based on the |
|
|
351 |
# glmnet.survcurve and glmnet.basesurv functions in hdnom... |
|
|
352 |
|
|
|
353 |
# Get predictions from the training set, because it's the training set whose |
|
|
354 |
# baseline hazard we need |
|
|
355 |
|
|
|
356 |
# This is the relevant section of glmnet.survcurve from 02-hdnom-nomogram.R: |
|
|
357 |
# lp = as.numeric(predict(object, newx = data.matrix(x), |
|
|
358 |
# s = object$'lambda', type = 'link')) |
|
|
359 |
# lp means linear predictor from predict.glmnet, because type = 'link' |
|
|
360 |
lp <- |
|
|
361 |
as.numeric( |
|
|
362 |
predict( |
|
|
363 |
model.fit, |
|
|
364 |
newx = |
|
|
365 |
data.matrix( |
|
|
366 |
dm[-test.set, !(colnames(dm) %in% c('surv_time', 'surv_event'))] |
|
|
367 |
), |
|
|
368 |
s = model.fit$lambda.1se, |
|
|
369 |
type = 'link' |
|
|
370 |
) |
|
|
371 |
) |
|
|
372 |
# At all unique times in the training set... |
|
|
373 |
t.unique <- |
|
|
374 |
# MUST sort these or the cumulative sum below will go crazy! |
|
|
375 |
sort(unique(dm[-test.set, 'surv_time'][dm[-test.set, 'surv_event'] == 1L])) |
|
|
376 |
|
|
|
377 |
alpha <- c() |
|
|
378 |
for (i in 1:length(t.unique)) { |
|
|
379 |
# ...loop over calculating the fraction of the population which dies at each |
|
|
380 |
# timepoint |
|
|
381 |
alpha[i] <- |
|
|
382 |
sum( |
|
|
383 |
# Training set |
|
|
384 |
dm[-test.set, 'surv_time'][ |
|
|
385 |
dm[-test.set, 'surv_event'] == 1 |
|
|
386 |
] == t.unique[i] |
|
|
387 |
) / |
|
|
388 |
sum( |
|
|
389 |
exp(lp[dm[-test.set, 'surv_time'] >= t.unique[i]]) |
|
|
390 |
) |
|
|
391 |
} |
|
|
392 |
|
|
|
393 |
# Get the cumulative hazard at risk.time by interpolating... |
|
|
394 |
baseline.cumhaz <- |
|
|
395 |
approx( |
|
|
396 |
t.unique, cumsum(alpha), yleft = 0, xout = risk.time, rule = 2 |
|
|
397 |
)$y |
|
|
398 |
|
|
|
399 |
# Get predictions from the test set to modify the baseline hazard with |
|
|
400 |
lp.test <- |
|
|
401 |
as.numeric( |
|
|
402 |
predict( |
|
|
403 |
model.fit, |
|
|
404 |
newx = |
|
|
405 |
data.matrix( |
|
|
406 |
dm[test.set, !(colnames(dm) %in% c('surv_time', 'surv_event'))] |
|
|
407 |
), |
|
|
408 |
s = model.fit$lambda.1se, |
|
|
409 |
type = 'link' |
|
|
410 |
) |
|
|
411 |
) |
|
|
412 |
|
|
|
413 |
# 1 minus to get % dead rather than alive |
|
|
414 |
risks <- 1 - exp(-exp(lp.test) * (baseline.cumhaz)) |
|
|
415 |
|
|
|
416 |
calibration.table <- |
|
|
417 |
data.frame( |
|
|
418 |
surv_event = dm[test.set, 'surv_event'], |
|
|
419 |
surv_time = dm[test.set, 'surv_time'], |
|
|
420 |
risk = risks |
|
|
421 |
) |
|
|
422 |
|
|
|
423 |
# Was there an event? Start with NA, because default is unknown (ie censored) |
|
|
424 |
calibration.table$event <- NA |
|
|
425 |
# Event before risk.time |
|
|
426 |
calibration.table$event[ |
|
|
427 |
calibration.table$surv_event & calibration.table$surv_time <= risk.time |
|
|
428 |
] <- TRUE |
|
|
429 |
# Event after, whether censorship or not, means no event by risk.time |
|
|
430 |
calibration.table$event[calibration.table$surv_time > risk.time] <- FALSE |
|
|
431 |
# Otherwise, censored before risk.time, leave as NA |
|
|
432 |
|
|
|
433 |
# Drop unnecessary columns and return |
|
|
434 |
calibration.table[, c('risk', 'event')] |
|
|
435 |
} |
|
|
436 |
|
|
|
437 |
calibration.table <- glmnetCalibrationTable(cv.fit, COHORT.bin, test.set) |
|
|
438 |
|
|
|
439 |
calibration.score <- calibrationScore(calibration.table) |
|
|
440 |
|
|
|
441 |
calibrationPlot(calibration.table, show.censored = TRUE, max.points = 10000) |
|
|
442 |
|
|
|
443 |
#' Calibration score is `r calibration.score`. |
|
|
444 |
|
|
|
445 |
#' ### Coefficients |
|
|
446 |
#' |
|
|
447 |
#' The elastic net regression generates coefficients for every factor/level |
|
|
448 |
#' combination (for continuous values, this means that every decile gets its own |
|
|
449 |
#' coefficient). The lambda penalty means that some amount of variable selection |
|
|
450 |
#' is performed in this process, penalising too many large coefficients. |
|
|
451 |
#' Depending on the value of alpha, quite a few of the coefficients can be |
|
|
452 |
#' exactly zero. Let's have a look at what we got... |
|
|
453 |
#' |
|
|
454 |
#+ coefficients |
|
|
455 |
|
|
|
456 |
# Make a data frame of the coefficients in decreasing order |
|
|
457 |
cv.fit.coefficients.ordered <- |
|
|
458 |
data.frame( |
|
|
459 |
factorlevel = # Names are ordered in decreasing order of absolute value |
|
|
460 |
factorOrderedLevels( |
|
|
461 |
colnames(COHORT.bin)[ |
|
|
462 |
order(abs(coef(cv.fit, s = "lambda.1se")), decreasing = TRUE) |
|
|
463 |
] |
|
|
464 |
), |
|
|
465 |
val = |
|
|
466 |
coef(cv.fit, s = "lambda.1se")[ |
|
|
467 |
order(abs(coef(cv.fit, s = "lambda.1se")), decreasing = TRUE) |
|
|
468 |
] |
|
|
469 |
) |
|
|
470 |
|
|
|
471 |
# Get the variable names by removing TRUE, (x,y] or missing from the end |
|
|
472 |
cv.fit.coefficients.ordered$var <- |
|
|
473 |
gsub('TRUE', '', cv.fit.coefficients.ordered$factorlevel) |
|
|
474 |
cv.fit.coefficients.ordered$var <- |
|
|
475 |
# Can contain numbers, decimals, e+/- notation and commas separating bounds |
|
|
476 |
gsub('\\([0-9,.e\\+-]+\\]', '', cv.fit.coefficients.ordered$var) |
|
|
477 |
cv.fit.coefficients.ordered$var <- |
|
|
478 |
gsub('missing', '', cv.fit.coefficients.ordered$var) |
|
|
479 |
# Kludgey manual fix for 5_data1 which can take values 1, 2 or 3 and is |
|
|
480 |
# therefore very hard to catch |
|
|
481 |
cv.fit.coefficients.ordered$var <- |
|
|
482 |
gsub( |
|
|
483 |
'clinical.values.5_data1[0-9]', 'clinical.values.5_data1', |
|
|
484 |
cv.fit.coefficients.ordered$var |
|
|
485 |
) |
|
|
486 |
|
|
|
487 |
# And then get human-readable descriptions |
|
|
488 |
cv.fit.coefficients.ordered$desc <- |
|
|
489 |
lookUpDescriptions(cv.fit.coefficients.ordered$var) |
|
|
490 |
|
|
|
491 |
#' #### Top 30 coefficients |
|
|
492 |
#' |
|
|
493 |
#+ coefficients_table, results='asis' |
|
|
494 |
print( |
|
|
495 |
xtable( |
|
|
496 |
cv.fit.coefficients.ordered[1:30, c('desc', 'factorlevel', 'val')], |
|
|
497 |
digits = c(0, 0, 0, 3) |
|
|
498 |
), |
|
|
499 |
type = 'html', |
|
|
500 |
include.rownames = FALSE |
|
|
501 |
) |
|
|
502 |
|
|
|
503 |
#' #### Graph of all coefficient values |
|
|
504 |
#' |
|
|
505 |
#' Nonzero values are red, zero values are blue. |
|
|
506 |
|
|
|
507 |
ggplot(cv.fit.coefficients.ordered, aes(x = factorlevel, y = val, colour = val == 0)) + |
|
|
508 |
geom_point() + |
|
|
509 |
theme( |
|
|
510 |
axis.title.x=element_blank(), axis.text.x=element_blank(), |
|
|
511 |
axis.ticks.x=element_blank() |
|
|
512 |
) |
|
|
513 |
|
|
|
514 |
#' Overall, there are `r sum(cv.fit.coefficients.ordered$val != 0)` nonzero |
|
|
515 |
#' coefficients out of `r nrow(cv.fit.coefficients.ordered)`. In the case of |
|
|
516 |
#' multilevel factors or continuous values, multiple coefficients may result |
|
|
517 |
#' from a single variable in the original data. Correcting for this, there are |
|
|
518 |
#' `r length(unique(cv.fit.coefficients.ordered$var[cv.fit.coefficients.ordered$val != 0]))` |
|
|
519 |
#' unique variables represented out of |
|
|
520 |
#' `r length(unique(cv.fit.coefficients.ordered$var))` total variables. |
|
|
521 |
#' |
|
|
522 |
#' ## Bootstrapping |
|
|
523 |
#' |
|
|
524 |
#' Having got those results for a single run on all the data, now bootstrap to |
|
|
525 |
#' find sample-induced variability in performance statistics. Again, because |
|
|
526 |
#' glmnet requires a matrix rather than a data frame this would require a large |
|
|
527 |
#' amount of code in ``handymedical.R``, so do this manually. |
|
|
528 |
#' |
|
|
529 |
#+ bootstrap_performance |
|
|
530 |
|
|
|
531 |
time.start <- handyTimer() |
|
|
532 |
|
|
|
533 |
# Instantiate a blank data frame |
|
|
534 |
bootstrap.params <- data.frame() |
|
|
535 |
|
|
|
536 |
for(i in 1:bootstraps) { |
|
|
537 |
# Take a bootstrap sample of the training set. We do this with COHORT.prep for |
|
|
538 |
# the variable importance calculations later. |
|
|
539 |
COHORT.prep.boot <- bootstrapSampleDf(COHORT.prep[-test.set, ]) |
|
|
540 |
|
|
|
541 |
# Create a binary matrix for fitting |
|
|
542 |
COHORT.boot <- convertFactorsToBinaryColumns(COHORT.prep.boot) |
|
|
543 |
# model.matrix renames logicals to varTRUE, so fix that for status |
|
|
544 |
colnames(COHORT.boot)[colnames(COHORT.boot) == 'surv_eventTRUE'] <- 'surv_event' |
|
|
545 |
|
|
|
546 |
# Fit, but with alpha fixed on the optimal value |
|
|
547 |
cv.fit.boot <- #readRDS('cv.fit.rds') |
|
|
548 |
cv.glmnet( |
|
|
549 |
COHORT.boot[, !(colnames(COHORT.boot) %in% c('surv_time', 'surv_event'))], |
|
|
550 |
Surv(COHORT.boot[, 'surv_time'], COHORT.boot[, 'surv_event']), |
|
|
551 |
family = "cox", |
|
|
552 |
maxit = 1000, |
|
|
553 |
alpha = alpha.best |
|
|
554 |
) |
|
|
555 |
|
|
|
556 |
c.index.boot <- glmnetCIndex(cv.fit.boot, COHORT.bin[test.set,]) |
|
|
557 |
|
|
|
558 |
calibration.table.boot <- |
|
|
559 |
glmnetCalibrationTable( |
|
|
560 |
cv.fit.boot, rbind(COHORT.boot, COHORT.bin[test.set, ]), |
|
|
561 |
test.set = (nrow(COHORT.boot) + 1):(nrow(COHORT.boot) + nrow(COHORT.bin[test.set, ])) |
|
|
562 |
) |
|
|
563 |
|
|
|
564 |
calibration.boot <- calibrationScore(calibration.table.boot) |
|
|
565 |
|
|
|
566 |
print(calibrationPlot(calibration.table.boot)) |
|
|
567 |
|
|
|
568 |
var.imp.vector <- c() |
|
|
569 |
# Loop over variables to get variable importance |
|
|
570 |
for( |
|
|
571 |
var in |
|
|
572 |
colnames( |
|
|
573 |
COHORT.prep.boot[, !(colnames(COHORT.prep.boot) %in% c('surv_time', 'surv_event'))] |
|
|
574 |
) |
|
|
575 |
) { |
|
|
576 |
# Create a dummy data frame and scramble the column var |
|
|
577 |
COHORT.vimp <- COHORT.prep.boot |
|
|
578 |
COHORT.vimp[, var] <- sample(COHORT.vimp[, var], replace = TRUE) |
|
|
579 |
# Make it into a model matrix for fitting |
|
|
580 |
COHORT.vimp <- convertFactorsToBinaryColumns(COHORT.vimp) |
|
|
581 |
# model.matrix renames logicals to varTRUE, so fix that for status |
|
|
582 |
colnames(COHORT.vimp)[colnames(COHORT.vimp) == 'surv_eventTRUE'] <- 'surv_event' |
|
|
583 |
# Calculate the new C-index |
|
|
584 |
c.index.vimp <- glmnetCIndex(cv.fit.boot, COHORT.vimp) |
|
|
585 |
|
|
|
586 |
# Append the difference between the C-index with scrambling and the original |
|
|
587 |
var.imp.vector <- |
|
|
588 |
c( |
|
|
589 |
var.imp.vector, |
|
|
590 |
c.index.boot - c.index.vimp |
|
|
591 |
) |
|
|
592 |
} |
|
|
593 |
|
|
|
594 |
names(var.imp.vector) <- |
|
|
595 |
paste0( |
|
|
596 |
'vimp.c.index.', |
|
|
597 |
colnames( |
|
|
598 |
COHORT.prep.boot[ |
|
|
599 |
, |
|
|
600 |
!(colnames(COHORT.prep.boot) %in% c('surv_time', 'surv_event')) |
|
|
601 |
] |
|
|
602 |
) |
|
|
603 |
) |
|
|
604 |
|
|
|
605 |
bootstrap.params <- |
|
|
606 |
rbind( |
|
|
607 |
bootstrap.params, |
|
|
608 |
data.frame( |
|
|
609 |
t(var.imp.vector), |
|
|
610 |
c.index = c.index.boot, |
|
|
611 |
calibration.score = calibration.boot |
|
|
612 |
) |
|
|
613 |
) |
|
|
614 |
|
|
|
615 |
# Save the bootstrap parameters for later use |
|
|
616 |
write.csv(bootstrap.params, bootstrap.filename) |
|
|
617 |
} |
|
|
618 |
|
|
|
619 |
time.boot.final <- handyTimer(time.start) |
|
|
620 |
|
|
|
621 |
#' `r bootstraps` bootstrap fits completed in `r time.boot.final` seconds! |
|
|
622 |
|
|
|
623 |
# Get coefficients and variable importances from bootstrap fits |
|
|
624 |
surv.model.fit.coeffs <- bootStatsDf(bootstrap.params) |
|
|
625 |
|
|
|
626 |
print(surv.model.fit.coeffs) |
|
|
627 |
|
|
|
628 |
# Save performance results |
|
|
629 |
varsToTable( |
|
|
630 |
data.frame( |
|
|
631 |
model = 'cox-elnet', |
|
|
632 |
imputation = FALSE, |
|
|
633 |
discretised = TRUE, |
|
|
634 |
c.index = surv.model.fit.coeffs['c.index', 'val'], |
|
|
635 |
c.index.lower = surv.model.fit.coeffs['c.index', 'lower'], |
|
|
636 |
c.index.upper = surv.model.fit.coeffs['c.index', 'upper'], |
|
|
637 |
calibration.score = surv.model.fit.coeffs['calibration.score', 'val'], |
|
|
638 |
calibration.score.lower = |
|
|
639 |
surv.model.fit.coeffs['calibration.score', 'lower'], |
|
|
640 |
calibration.score.upper = |
|
|
641 |
surv.model.fit.coeffs['calibration.score', 'upper'] |
|
|
642 |
), |
|
|
643 |
performance.file, |
|
|
644 |
index.cols = c('model', 'imputation', 'discretised') |
|
|
645 |
) |
|
|
646 |
|
|
|
647 |
#' The bootstrapped C-index is |
|
|
648 |
#' **`r round(surv.model.fit.coeffs['c.index', 'val'], 3)` |
|
|
649 |
#' (`r round(surv.model.fit.coeffs['c.index', 'lower'], 3)` - |
|
|
650 |
#' `r round(surv.model.fit.coeffs['c.index', 'upper'], 3)`)** |
|
|
651 |
#' on the held-out test set. |
|
|
652 |
#' |
|
|
653 |
#' The bootstrapped calibration score is |
|
|
654 |
#' **`r round(surv.model.fit.coeffs['calibration.score', 'val'], 3)` |
|
|
655 |
#' (`r round(surv.model.fit.coeffs['calibration.score', 'lower'], 3)` - |
|
|
656 |
#' `r round(surv.model.fit.coeffs['calibration.score', 'upper'], 3)`)**. |
|
|
657 |
#' |
|
|
658 |
#' ### Variable importances |
|
|
659 |
#' |
|
|
660 |
#' Top 20 most important variables from the most recent bootstrap. (This is |
|
|
661 |
#' obviously indicative but just to plot a quick graph and get an idea.) |
|
|
662 |
#' |
|
|
663 |
#+ bootstrap_var_imp |
|
|
664 |
|
|
|
665 |
boot.var.imp.ordered <- |
|
|
666 |
data.frame( |
|
|
667 |
var = textAfter(names(var.imp.vector), 'vimp.c.index.'), |
|
|
668 |
val = var.imp.vector, |
|
|
669 |
stringsAsFactors = FALSE |
|
|
670 |
) |
|
|
671 |
|
|
|
672 |
boot.var.imp.ordered$desc <- lookUpDescriptions(boot.var.imp.ordered$var) |
|
|
673 |
|
|
|
674 |
ggplot( |
|
|
675 |
boot.var.imp.ordered[order(boot.var.imp.ordered$val[1:20], decreasing = TRUE), ], |
|
|
676 |
aes(x = var, y = val) |
|
|
677 |
) + |
|
|
678 |
geom_bar(stat = 'identity') |