Switch to unified view

a b/cox-ph/cox-discrete-elasticnet.R
1
#+ knitr_setup, include = FALSE
2
3
# Whether to cache the intensive code sections. Set to FALSE to recalculate
4
# everything afresh.
5
cacheoption <- TRUE
6
# Disable lazy caching globally, because it fails for large objects, and all the
7
# objects we wish to cache are large...
8
opts_chunk$set(cache.lazy = FALSE)
9
10
#' # Variable selection in data-driven health records with discretised
11
#' # Cox models
12
#' 
13
#' Having extracted around 600 variables which occur most frequently in patient
14
#' records, let's try to narrow these down using a methodology based on varSelRf
15
#' combined with survival modelling. We'll find the predictability of variables
16
#' as defined by the p-value of a logrank test on survival curves of different
17
#' categories within that variable, and then iteratively throw out unimportant
18
#' variables, cross-validating for optimum performance.
19
#' 
20
#' ## User variables
21
#' 
22
#+ user_variables
23
24
output.filename.base <- '../../output/cox-discrete-elasticnet-08'
25
26
bootstraps <- 100
27
bootstrap.filename <- paste0(output.filename.base, '-boot-all.csv')
28
29
n.data <- NA # This is after any variables being excluded in prep
30
31
n.threads <- 16
32
33
#' ## Data set-up
34
#' 
35
#+ data_setup
36
37
data.filename.big <- '../../data/cohort-datadriven-02.csv'
38
39
surv.predict.old <- c('age', 'smokstatus', 'imd_score', 'gender')
40
untransformed.vars <- c('time_death', 'endpoint_death', 'exclude')
41
42
source('../lib/shared.R')
43
require(xtable)
44
45
# Define these after shared.R or they will be overwritten!
46
exclude.vars <-
47
  c(
48
    # Entity type 4 is smoking status, which we already have
49
    "clinical.values.4_data1", "clinical.values.4_data5",
50
    "clinical.values.4_data6",
51
    # Entity 13 data2 is the patient's weight centile, and not a single one is
52
    # entered, but they come out as 0 so the algorithm, looking for NAs, thinks
53
    # it's a useful column
54
    "clinical.values.13_data2",
55
    # Entities 148 and 149 are to do with death certification. I'm not sure how 
56
    # it made it into the dataset, but since all the datapoints in this are
57
    # looking back in time, they're all NA. This causes rfsrc to fail.
58
    "clinical.values.148_data1", "clinical.values.148_data2",
59
    "clinical.values.148_data3", "clinical.values.148_data4",
60
    "clinical.values.148_data5",
61
    "clinical.values.149_data1", "clinical.values.149_data2",
62
    # These are all the same value except where NA, which causes issues with
63
    # discretisation
64
    "clinical.values.14_data2", "clinical.values.62_data1",
65
    "clinical.values.64_data1", "clinical.values.65_data1",
66
    "clinical.values.65_data1", "clinical.values.67_data1",
67
    "clinical.values.68_data2", "clinical.values.70_data1"
68
  )
69
70
COHORT <- fread(data.filename.big)
71
72
bigdata.prefixes <-
73
  c(
74
    'hes.icd.',
75
    'hes.opcs.',
76
    'tests.enttype.',
77
    'clinical.history.',
78
    'clinical.values.',
79
    'bnf.'
80
  )
81
82
bigdata.columns <-
83
  colnames(COHORT)[
84
    which(
85
      # Does is start with one of the data column names?
86
      startsWithAny(names(COHORT), bigdata.prefixes) &
87
        # And it's not one of the columns we want to exclude?
88
        !(colnames(COHORT) %in% exclude.vars)
89
    )
90
    ]
91
92
COHORT.bigdata <-
93
  COHORT[, c(
94
    untransformed.vars, surv.predict.old, bigdata.columns
95
  ),
96
  with = FALSE
97
  ]
98
99
# Get the missingness before we start removing missing values
100
missingness <- sort(sapply(COHORT.bigdata, percentMissing))
101
# Remove values for the 'untransformed.vars' above, which are the survival
102
# values plus exclude column
103
missingness <- missingness[!(names(missingness) %in% untransformed.vars)]
104
105
# Deal appropriately with missing data
106
# Most of the variables are number of days since the first record of that type
107
time.based.vars <-
108
  names(COHORT.bigdata)[
109
    startsWithAny(
110
      names(COHORT.bigdata),
111
      c('hes.icd.', 'hes.opcs.', 'clinical.history.')
112
    )
113
    ]
114
# We're dealing with this as a logical, so we want non-NA values to be TRUE,
115
# is there is something in the history
116
for (j in time.based.vars) {
117
  set(COHORT.bigdata, j = j, value = !is.na(COHORT.bigdata[[j]]))
118
}
119
120
# Again, taking this as a logical, set any non-NA value to TRUE.
121
prescriptions.vars <- names(COHORT.bigdata)[startsWith(names(COHORT.bigdata), 'bnf.')]
122
for (j in prescriptions.vars) {
123
  set(COHORT.bigdata, j = j, value = !is.na(COHORT.bigdata[[j]]))
124
}
125
126
# This leaves tests and clinical.values, which are test results and should be
127
# imputed.
128
129
# Manually fix clinical values items...
130
#
131
# "clinical.values.1_data1"  "clinical.values.1_data2"
132
# These are just blood pressure values...fine to impute
133
#
134
# "clinical.values.13_data1" "clinical.values.13_data3"
135
# These are weight and BMI...also fine to impute
136
#
137
# Entity 5 is alcohol consumption status, 1 = Yes, 2 = No, 3 = Ex, so should be
138
# a factor, and NA can be a factor level
139
COHORT.bigdata$clinical.values.5_data1 <-
140
  factorNAfix(factor(COHORT.bigdata$clinical.values.5_data1), NAval = 'missing')
141
142
# Both gender and smokstatus are factors...fix that
143
COHORT.bigdata$gender <- factor(COHORT.bigdata$gender)
144
COHORT.bigdata$smokstatus <-
145
  factorNAfix(factor(COHORT.bigdata$smokstatus), NAval = 'missing')
146
147
# Exclude invalid patients
148
COHORT.bigdata <- COHORT.bigdata[!COHORT.bigdata$exclude]
149
COHORT.bigdata$exclude <- NULL
150
151
# Remove negative survival times
152
COHORT.bigdata <- subset(COHORT.bigdata, time_death > 0)
153
154
# Define test set
155
test.set <- testSetIndices(COHORT.bigdata, random.seed = 78361)
156
157
# If n.data was specified, trim the data table down to size
158
if(!is.na(n.data)) {
159
  COHORT.bigdata <- sample.df(COHORT.bigdata, n.data)
160
}
161
162
# Create an appropraite survival column
163
COHORT.bigdata <- 
164
  prepSurvCol(
165
    data.frame(COHORT.bigdata), 'time_death', 'endpoint_death', 'Death'
166
  )
167
168
# Start by predicting survival with all the variables provided
169
surv.predict <- c(surv.predict.old, bigdata.columns)
170
171
# Set up a csv file to store calibration data, or retrieve previous data
172
calibration.filename <- paste0(output.filename.base, '-varselcalibration.csv')
173
174
# Create process settings
175
176
# Variables to leave alone, including those whose logrank p-value is NA because
177
# that means there is only one value in the column and so it can't be discretised
178
# properly anyway
179
vars.noprocess <- c('surv_time', 'surv_event')
180
process.settings <-
181
  list(
182
    var        = vars.noprocess,
183
    method     = rep(NA, length(vars.noprocess)),
184
    settings   = rep(list(NA), length(vars.noprocess))
185
  )
186
# Find continuous variables which will need discretising
187
continuous.vars <- names(COHORT.bigdata)[sapply(COHORT.bigdata, class) %in% c('integer', 'numeric')]
188
# Remove those variables already explicitly excluded, mainly for those whose
189
# logrank score was NA
190
continuous.vars <- continuous.vars[!(continuous.vars %in% process.settings$var)]
191
process.settings$var <- c(process.settings$var, continuous.vars)
192
process.settings$method <-
193
  c(process.settings$method,
194
    rep('binByQuantile', length(continuous.vars))
195
  )
196
process.settings$settings <-
197
  c(
198
    process.settings$settings,
199
    rep(
200
      list(
201
        seq(
202
          # Quantiles are obviously between 0 and 1
203
          0, 1,
204
          # All have the same number of bins
205
          length.out = 10
206
        )
207
      ),
208
      length(continuous.vars)
209
    )
210
  )
211
212
213
# Need a way to ID in advance those which are going to fail here, ie those where
214
# there are no quantiles. The 
215
216
COHORT.prep <-
217
  prepData(
218
    # Data for cross-validation excludes test set
219
    COHORT.bigdata,
220
    names(COHORT.bigdata),
221
    process.settings,
222
    'surv_time', 'surv_event',
223
    TRUE
224
  )
225
226
# Kludge...remove surv_time.1 and rename surv_event.1
227
COHORT.prep$surv_time.1 <- NULL
228
names(COHORT.prep)[names(COHORT.prep) == 'surv_event.1'] <- 'surv_event'
229
230
COHORT.bin <- convertFactorsToBinaryColumns(COHORT.prep)
231
# model.matrix renames logicals to varTRUE, so fix that for status
232
colnames(COHORT.bin)[colnames(COHORT.bin) == 'surv_eventTRUE'] <- 'surv_event'
233
234
# Coxnet code, should you ever decide to go that route
235
# test <-
236
#   Coxnet(
237
#     data.matrix(COHORT.bin[-test.set, !(colnames(COHORT.bin) %in% c('time', 'status'))]),
238
#     data.matrix(COHORT.bin[-test.set, c('time', 'status')]),
239
#     penalty = 'Enet',
240
#     alpha = 0,
241
#     nlambda = 50, nfolds = 10, maxit = 1e+5
242
#   )
243
244
#' ## Elastic net regression
245
#' 
246
#' Run a loop over alphas running from LASSO to ridge regression, and see which
247
#' is best after tenfold cross-validation...
248
#' 
249
#+ elastic_net_full
250
251
require(glmnet)
252
initParallel(n.threads)
253
254
time.start <- handyTimer()
255
256
alphas <- seq(0, 1, length.out = 101)
257
mse <- c()
258
259
for(alpha in alphas) {
260
  cv.fit <-
261
    cv.glmnet(
262
      COHORT.bin[-test.set, !(colnames(COHORT.bin) %in% c('surv_time', 'surv_event'))],
263
      Surv(COHORT.bin[-test.set, 'surv_time'], COHORT.bin[-test.set, 'surv_event']),
264
      family = "cox",
265
      maxit = 1000,
266
      alpha = alpha,
267
      parallel = TRUE
268
    )
269
  best.lambda.i <- which(cv.fit$lambda == cv.fit$lambda.min)
270
  mse <- c(mse, cv.fit$cvm[best.lambda.i])
271
}
272
273
time.cv <- handyTimer(time.start)
274
275
write.csv(
276
  data.frame(
277
    alphas, mse
278
  ),
279
  paste0(output.filename.base, '-alpha-calibration.csv')
280
)
281
282
#' `r length(alphas)` alpha values tested in `r time.cv` seconds!
283
284
alpha.best <- alphas[which.min(mse)]
285
286
# To avoid saving all the fits, let's just refit the best one
287
cv.fit <-
288
  cv.glmnet(
289
    COHORT.bin[-test.set, !(colnames(COHORT.bin) %in% c('surv_time', 'surv_event'))],
290
    Surv(COHORT.bin[-test.set, 'surv_time'], COHORT.bin[-test.set, 'surv_event']),
291
    family = "cox",
292
    maxit = 1000,
293
    alpha = alpha.best,
294
    parallel = TRUE
295
  )
296
297
# Save for future use
298
saveRDS(cv.fit, 'cv.fit.rds')
299
300
#' The best alpha was `r alpha.best`, and the lambda with the lowest mean-square
301
#' error was `r cv.fit$lambda.min`. We'll be using the strictest lambda which is
302
#' within 1 se of the minimum, `r cv.fit$lambda.1se`.
303
#'
304
#' ## Performance
305
#' 
306
#' ### C-index
307
#' 
308
#' Calculate C-index manually. The glmnet interface requiring matrices is
309
#' sufficiently different to the usual one that I've not spent time integrating
310
#' it with the rest of the ``handymedical.R`` functions yet.
311
#' 
312
#+ c_index
313
314
glmnetCIndex <- function(model.fit, dm) {
315
  test.predictions <-
316
    getRisk(
317
      model.fit,
318
      dm[, !(colnames(dm) %in% c('surv_time', 'surv_event'))]
319
    )
320
  
321
  as.numeric(
322
    survConcordance(
323
      as.formula(paste0('Surv(surv_time, surv_event) ~ risk')),
324
      data.frame(
325
        surv_time = dm[, 'surv_time'],
326
        surv_event = dm[, 'surv_event'],
327
        risk = test.predictions
328
      )
329
    )$concordance
330
  )
331
}
332
333
c.index <- glmnetCIndex(cv.fit, COHORT.bin[test.set, ])
334
335
#' C-index is `r c.index`.
336
#'
337
#' ### Calibration
338
#' 
339
#' For now, calibration is manual too just to get it working. It's surprisingly
340
#' hard... A package called `c060` should contain a function `predictProb`, but
341
#' on loading it, is says function not found. So here is a very manual solution,
342
#' creating a dummy Cox model using the `survival` package, inspired by
343
#' [this](https://stat.ethz.ch/pipermail/r-help/2012-May/312029.html).
344
#' 
345
#+ calibration_plot
346
347
glmnetCalibrationTable <- function(model.fit, dm, test.set, risk.time = 5) {
348
  # Work out risks at risk.time for the special case of a glmnet model
349
  
350
  # Derive baseline hazard from cv.glmnet model, heavily based on the
351
  # glmnet.survcurve and glmnet.basesurv functions in hdnom...
352
  
353
  # Get predictions from the training set, because it's the training set whose
354
  # baseline hazard we need
355
  
356
  # This is the relevant section of glmnet.survcurve from 02-hdnom-nomogram.R:
357
  # lp = as.numeric(predict(object, newx = data.matrix(x),
358
  #                         s = object$'lambda', type = 'link'))
359
  # lp means linear predictor from predict.glmnet, because type = 'link'
360
  lp <-
361
    as.numeric(
362
      predict(
363
        model.fit,
364
        newx =
365
          data.matrix(
366
            dm[-test.set, !(colnames(dm) %in% c('surv_time', 'surv_event'))]
367
          ),
368
        s = model.fit$lambda.1se,
369
        type = 'link'
370
      )
371
    )
372
  # At all unique times in the training set...
373
  t.unique <-
374
    # MUST sort these or the cumulative sum below will go crazy!
375
    sort(unique(dm[-test.set, 'surv_time'][dm[-test.set, 'surv_event'] == 1L]))
376
  
377
  alpha <- c()
378
  for (i in 1:length(t.unique)) {
379
    # ...loop over calculating the fraction of the population which dies at each
380
    # timepoint
381
    alpha[i] <-
382
      sum(
383
        # Training set 
384
        dm[-test.set, 'surv_time'][
385
          dm[-test.set, 'surv_event'] == 1
386
        ] == t.unique[i]
387
      ) /
388
      sum(
389
        exp(lp[dm[-test.set, 'surv_time'] >= t.unique[i]])
390
      )
391
  }
392
  
393
  # Get the cumulative hazard at risk.time by interpolating...
394
  baseline.cumhaz <-
395
    approx(
396
      t.unique, cumsum(alpha), yleft = 0, xout = risk.time, rule = 2
397
    )$y
398
  
399
  # Get predictions from the test set to modify the baseline hazard with
400
  lp.test <-
401
    as.numeric(
402
      predict(
403
        model.fit,
404
        newx =
405
          data.matrix(
406
            dm[test.set, !(colnames(dm) %in% c('surv_time', 'surv_event'))]
407
          ),
408
        s = model.fit$lambda.1se,
409
        type = 'link'
410
      )
411
    )
412
  
413
  # 1 minus to get % dead rather than alive
414
  risks <- 1 - exp(-exp(lp.test) * (baseline.cumhaz))
415
 
416
  calibration.table <-
417
    data.frame(
418
      surv_event = dm[test.set, 'surv_event'],
419
      surv_time = dm[test.set, 'surv_time'],
420
      risk = risks
421
    )
422
  
423
  # Was there an event? Start with NA, because default is unknown (ie censored)
424
  calibration.table$event <- NA
425
  # Event before risk.time
426
  calibration.table$event[
427
    calibration.table$surv_event & calibration.table$surv_time <= risk.time
428
    ] <- TRUE
429
  # Event after, whether censorship or not, means no event by risk.time
430
  calibration.table$event[calibration.table$surv_time > risk.time] <- FALSE
431
  # Otherwise, censored before risk.time, leave as NA
432
  
433
  # Drop unnecessary columns and return
434
  calibration.table[, c('risk', 'event')]
435
}
436
437
calibration.table <- glmnetCalibrationTable(cv.fit, COHORT.bin, test.set)
438
439
calibration.score <- calibrationScore(calibration.table)
440
441
calibrationPlot(calibration.table, show.censored = TRUE, max.points = 10000)
442
443
#' Calibration score is `r calibration.score`.
444
445
#' ### Coefficients
446
#' 
447
#' The elastic net regression generates coefficients for every factor/level
448
#' combination (for continuous values, this means that every decile gets its own
449
#' coefficient). The lambda penalty means that some amount of variable selection
450
#' is performed in this process, penalising too many large coefficients.
451
#' Depending on the value of alpha, quite a few of the coefficients can be
452
#' exactly zero. Let's have a look at what we got... 
453
#' 
454
#+ coefficients
455
456
# Make a data frame of the coefficients in decreasing order
457
cv.fit.coefficients.ordered <-
458
  data.frame(
459
    factorlevel = # Names are ordered in decreasing order of absolute value
460
      factorOrderedLevels(
461
        colnames(COHORT.bin)[
462
          order(abs(coef(cv.fit, s = "lambda.1se")), decreasing = TRUE)
463
          ]
464
      ),
465
    val =
466
      coef(cv.fit, s = "lambda.1se")[
467
        order(abs(coef(cv.fit, s = "lambda.1se")), decreasing = TRUE)
468
        ]
469
  )
470
471
# Get the variable names by removing TRUE, (x,y] or missing from the end
472
cv.fit.coefficients.ordered$var <-
473
  gsub('TRUE', '', cv.fit.coefficients.ordered$factorlevel)
474
cv.fit.coefficients.ordered$var <-
475
  # Can contain numbers, decimals, e+/- notation and commas separating bounds
476
  gsub('\\([0-9,.e\\+-]+\\]', '', cv.fit.coefficients.ordered$var)
477
cv.fit.coefficients.ordered$var <-
478
  gsub('missing', '', cv.fit.coefficients.ordered$var)
479
# Kludgey manual fix for 5_data1 which can take values 1, 2 or 3 and is
480
# therefore very hard to catch
481
cv.fit.coefficients.ordered$var <-
482
  gsub(
483
    'clinical.values.5_data1[0-9]', 'clinical.values.5_data1',
484
    cv.fit.coefficients.ordered$var
485
  )
486
487
# And then get human-readable descriptions
488
cv.fit.coefficients.ordered$desc <-
489
  lookUpDescriptions(cv.fit.coefficients.ordered$var)
490
491
#' #### Top 30 coefficients
492
#'
493
#+ coefficients_table, results='asis'
494
print(
495
  xtable(
496
    cv.fit.coefficients.ordered[1:30, c('desc', 'factorlevel', 'val')],
497
    digits = c(0, 0, 0, 3)
498
  ),
499
  type = 'html',
500
  include.rownames = FALSE
501
)
502
503
#' #### Graph of all coefficient values
504
#' 
505
#' Nonzero values are red, zero values are blue.
506
507
ggplot(cv.fit.coefficients.ordered, aes(x = factorlevel, y = val, colour = val == 0)) +
508
  geom_point() +
509
  theme(
510
    axis.title.x=element_blank(), axis.text.x=element_blank(),
511
    axis.ticks.x=element_blank()
512
  )
513
514
#' Overall, there are `r sum(cv.fit.coefficients.ordered$val != 0)` nonzero
515
#' coefficients out of `r nrow(cv.fit.coefficients.ordered)`. In the case of
516
#' multilevel factors or continuous values, multiple coefficients may result
517
#' from a single variable in the original data. Correcting for this, there are
518
#' `r length(unique(cv.fit.coefficients.ordered$var[cv.fit.coefficients.ordered$val != 0]))`
519
#' unique variables represented out of
520
#' `r length(unique(cv.fit.coefficients.ordered$var))` total variables.
521
#' 
522
#' ## Bootstrapping
523
#' 
524
#' Having got those results for a single run on all the data, now bootstrap to
525
#' find sample-induced variability in performance statistics. Again, because
526
#' glmnet requires a matrix rather than a data frame this would require a large
527
#' amount of code in ``handymedical.R``, so do this manually.
528
#'
529
#+ bootstrap_performance
530
531
time.start <- handyTimer()
532
533
# Instantiate a blank data frame
534
bootstrap.params <- data.frame()
535
536
for(i in 1:bootstraps) {
537
  # Take a bootstrap sample of the training set. We do this with COHORT.prep for
538
  # the variable importance calculations later.
539
  COHORT.prep.boot <- bootstrapSampleDf(COHORT.prep[-test.set, ])
540
  
541
  # Create a binary matrix for fitting
542
  COHORT.boot <- convertFactorsToBinaryColumns(COHORT.prep.boot)
543
  # model.matrix renames logicals to varTRUE, so fix that for status
544
  colnames(COHORT.boot)[colnames(COHORT.boot) == 'surv_eventTRUE'] <- 'surv_event'
545
  
546
  # Fit, but with alpha fixed on the optimal value
547
  cv.fit.boot <- #readRDS('cv.fit.rds')
548
    cv.glmnet(
549
      COHORT.boot[, !(colnames(COHORT.boot) %in% c('surv_time', 'surv_event'))],
550
      Surv(COHORT.boot[, 'surv_time'], COHORT.boot[, 'surv_event']),
551
      family = "cox",
552
      maxit = 1000,
553
      alpha = alpha.best
554
    )
555
556
  c.index.boot <- glmnetCIndex(cv.fit.boot, COHORT.bin[test.set,])
557
  
558
  calibration.table.boot <-
559
    glmnetCalibrationTable(
560
      cv.fit.boot, rbind(COHORT.boot, COHORT.bin[test.set, ]),
561
      test.set = (nrow(COHORT.boot) + 1):(nrow(COHORT.boot) + nrow(COHORT.bin[test.set, ]))
562
    )
563
  
564
  calibration.boot <- calibrationScore(calibration.table.boot)
565
  
566
  print(calibrationPlot(calibration.table.boot))
567
  
568
  var.imp.vector <- c()
569
  # Loop over variables to get variable importance
570
  for(
571
    var in
572
    colnames(
573
      COHORT.prep.boot[, !(colnames(COHORT.prep.boot) %in% c('surv_time', 'surv_event'))]
574
    )
575
  ) {
576
    # Create a dummy data frame and scramble the column var
577
    COHORT.vimp <- COHORT.prep.boot
578
    COHORT.vimp[, var] <- sample(COHORT.vimp[, var], replace = TRUE)
579
    # Make it into a model matrix for fitting
580
    COHORT.vimp <- convertFactorsToBinaryColumns(COHORT.vimp)
581
    # model.matrix renames logicals to varTRUE, so fix that for status
582
    colnames(COHORT.vimp)[colnames(COHORT.vimp) == 'surv_eventTRUE'] <- 'surv_event'
583
    # Calculate the new C-index
584
    c.index.vimp <- glmnetCIndex(cv.fit.boot, COHORT.vimp)
585
586
    # Append the difference between the C-index with scrambling and the original
587
    var.imp.vector <-
588
      c(
589
        var.imp.vector,
590
        c.index.boot - c.index.vimp
591
      )
592
  }
593
594
  names(var.imp.vector) <-
595
    paste0(
596
      'vimp.c.index.',
597
      colnames(
598
        COHORT.prep.boot[
599
          ,
600
          !(colnames(COHORT.prep.boot) %in% c('surv_time', 'surv_event'))
601
          ]
602
      )
603
    )
604
  
605
  bootstrap.params <-
606
    rbind(
607
      bootstrap.params,
608
      data.frame(
609
        t(var.imp.vector),
610
        c.index = c.index.boot,
611
        calibration.score = calibration.boot
612
      )
613
    )
614
  
615
  # Save the bootstrap parameters for later use
616
  write.csv(bootstrap.params, bootstrap.filename)
617
}
618
619
time.boot.final <- handyTimer(time.start)
620
621
#' `r bootstraps` bootstrap fits completed in `r time.boot.final` seconds!
622
623
# Get coefficients and variable importances from bootstrap fits
624
surv.model.fit.coeffs <- bootStatsDf(bootstrap.params)
625
626
print(surv.model.fit.coeffs)
627
628
# Save performance results
629
varsToTable(
630
  data.frame(
631
    model = 'cox-elnet',
632
    imputation = FALSE,
633
    discretised = TRUE,
634
    c.index = surv.model.fit.coeffs['c.index', 'val'],
635
    c.index.lower = surv.model.fit.coeffs['c.index', 'lower'],
636
    c.index.upper = surv.model.fit.coeffs['c.index', 'upper'],
637
    calibration.score = surv.model.fit.coeffs['calibration.score', 'val'],
638
    calibration.score.lower =
639
      surv.model.fit.coeffs['calibration.score', 'lower'],
640
    calibration.score.upper =
641
      surv.model.fit.coeffs['calibration.score', 'upper']
642
  ),
643
  performance.file,
644
  index.cols = c('model', 'imputation', 'discretised')
645
)
646
647
#' The bootstrapped C-index is
648
#' **`r round(surv.model.fit.coeffs['c.index', 'val'], 3)`
649
#' (`r round(surv.model.fit.coeffs['c.index', 'lower'], 3)` - 
650
#' `r round(surv.model.fit.coeffs['c.index', 'upper'], 3)`)**
651
#' on the held-out test set.
652
#' 
653
#' The bootstrapped calibration score is
654
#' **`r round(surv.model.fit.coeffs['calibration.score', 'val'], 3)`
655
#' (`r round(surv.model.fit.coeffs['calibration.score', 'lower'], 3)` - 
656
#' `r round(surv.model.fit.coeffs['calibration.score', 'upper'], 3)`)**.
657
#' 
658
#' ### Variable importances
659
#' 
660
#' Top 20 most important variables from the most recent bootstrap. (This is
661
#' obviously indicative but just to plot a quick graph and get an idea.)
662
#' 
663
#+ bootstrap_var_imp
664
665
boot.var.imp.ordered <-
666
  data.frame(
667
    var = textAfter(names(var.imp.vector), 'vimp.c.index.'),
668
    val = var.imp.vector,
669
    stringsAsFactors = FALSE
670
  )
671
672
boot.var.imp.ordered$desc <- lookUpDescriptions(boot.var.imp.ordered$var)
673
674
ggplot(
675
    boot.var.imp.ordered[order(boot.var.imp.ordered$val[1:20], decreasing = TRUE), ],
676
    aes(x = var, y = val)
677
  ) +
678
  geom_bar(stat = 'identity')