Diff of /age-only/age-only.R [000000] .. [0375db]

Switch to unified view

a b/age-only/age-only.R
1
#+ knitr_setup, include = FALSE
2
3
# Whether to cache the intensive code sections. Set to FALSE to recalculate
4
# everything afresh.
5
cacheoption <- FALSE
6
# Disable lazy caching globally, because it fails for large objects, and all the
7
# objects we wish to cache are large...
8
opts_chunk$set(cache.lazy = FALSE)
9
10
#' # Performance of an age-only model
11
#' 
12
#+ setup, message=FALSE
13
14
bootstraps <- 200
15
16
data.filename <- '../../data/cohort-sanitised.csv'
17
n.threads <- 16
18
19
source('../lib/shared.R')
20
require(rms)
21
22
COHORT.full <- data.frame(fread(data.filename))
23
24
COHORT.use <- subset(COHORT.full, !exclude)
25
26
n.data <- nrow(COHORT.use)
27
28
# Define indices of test set
29
test.set <- testSetIndices(COHORT.use, random.seed = 78361)
30
31
COHORT.use <- prepSurvCol(COHORT.use, surv.time, surv.event,surv.event.yes)
32
33
#' ## Concordance index
34
#' 
35
#' The c-index can be calculated taking age itself as a risk score. Since it's
36
#' purely rank-based, it doesn't matter that age is nonlinearly related to true
37
#' risk.
38
#' 
39
#' We bootstrap this based purely on the test set to make the bootstrap
40
#' variability commensurate with other models tested on the test set. You can
41
#' imagine that this model was 'trained' on the training set, even though it's
42
#' so simple that we actually did no such thing...
43
#' 
44
#+ c_index
45
46
# Define a trivial function used for bootstrapping which simply returns the
47
# c-index of a 'model' based purely on age.
48
49
ageOnly <- function(df, indices, df.test) {
50
  # Create a Kaplan-Meier curve from the bootstrap sample
51
  km.by.age <-
52
    survfit(
53
      Surv(surv_time, surv_event) ~ age,
54
      data = df[indices, ],
55
      conf.type = "log-log"
56
    )
57
58
  # Return the calibration score
59
  c(
60
    calibration.score =
61
      calibrationScore(
62
        calibrationTable(km.by.age, df.test)
63
      )$area
64
  )
65
}
66
67
# C-index is by definition non-variable because we do not bootstrap or otherwise
68
# permute the test set, and there's no 'training' phase
69
age.c.index <- 
70
  as.numeric(
71
    survConcordance(
72
      Surv(surv_time, surv_event) ~ age,
73
      COHORT.use[test.set,]
74
    )$concordance
75
  )
76
77
# Bootstrap to establish variability of calibration
78
age.only.boot <-
79
  boot(
80
    data = COHORT.use[-test.set,],
81
    statistic = ageOnly,
82
    R = bootstraps,
83
    parallel = 'multicore',
84
    ncpus = n.threads,
85
    df.test =  COHORT.use[test.set,]
86
  )
87
88
age.only.boot.stats <- bootStats(age.only.boot, uncertainty = '95ci')
89
90
#' C-index is
91
#' **`r round(age.c.index, 3)`** 
92
#' on the held-out test set (not that it really matters, the model isn't
93
#' 'trained' as such for the discrimination test...it's just oldest patient dies
94
#' first).
95
#' 
96
#' 
97
#' ## Calibration
98
#' 
99
#' 
100
#+ calibration
101
102
km.by.age <-
103
  survfit(
104
    Surv(surv_time, surv_event) ~ age,
105
    data = COHORT.use[-test.set,],
106
    conf.type = "log-log"
107
  )
108
109
calibration.table <- calibrationTable(km.by.age, COHORT.use[test.set, ])
110
111
print(calibrationScore(calibration.table))
112
113
calibrationPlot(calibration.table)
114
115
#' Calibration score is
116
#' **`r round(age.only.boot.stats['calibration.score', 'val'], 3)`
117
#' (`r round(age.only.boot.stats['calibration.score', 'lower'], 3)` -
118
#' `r round(age.only.boot.stats['calibration.score', 'upper'], 3)`)** 
119
#' on the held-out test set.
120
121
varsToTable(
122
  data.frame(
123
    model = 'age',
124
    imputation = FALSE,
125
    discretised = FALSE,
126
    c.index = age.c.index,
127
    c.index.lower = NA,
128
    c.index.upper = NA,
129
    calibration.score = age.only.boot.stats['calibration.score', 'val'],
130
    calibration.score.lower = age.only.boot.stats['calibration.score', 'lower'],
131
    calibration.score.upper = age.only.boot.stats['calibration.score', 'upper']
132
  ),
133
  performance.file,
134
  index.cols = c('model', 'imputation', 'discretised')
135
)