|
a |
|
b/age-only/age-only.R |
|
|
1 |
#+ knitr_setup, include = FALSE |
|
|
2 |
|
|
|
3 |
# Whether to cache the intensive code sections. Set to FALSE to recalculate |
|
|
4 |
# everything afresh. |
|
|
5 |
cacheoption <- FALSE |
|
|
6 |
# Disable lazy caching globally, because it fails for large objects, and all the |
|
|
7 |
# objects we wish to cache are large... |
|
|
8 |
opts_chunk$set(cache.lazy = FALSE) |
|
|
9 |
|
|
|
10 |
#' # Performance of an age-only model |
|
|
11 |
#' |
|
|
12 |
#+ setup, message=FALSE |
|
|
13 |
|
|
|
14 |
bootstraps <- 200 |
|
|
15 |
|
|
|
16 |
data.filename <- '../../data/cohort-sanitised.csv' |
|
|
17 |
n.threads <- 16 |
|
|
18 |
|
|
|
19 |
source('../lib/shared.R') |
|
|
20 |
require(rms) |
|
|
21 |
|
|
|
22 |
COHORT.full <- data.frame(fread(data.filename)) |
|
|
23 |
|
|
|
24 |
COHORT.use <- subset(COHORT.full, !exclude) |
|
|
25 |
|
|
|
26 |
n.data <- nrow(COHORT.use) |
|
|
27 |
|
|
|
28 |
# Define indices of test set |
|
|
29 |
test.set <- testSetIndices(COHORT.use, random.seed = 78361) |
|
|
30 |
|
|
|
31 |
COHORT.use <- prepSurvCol(COHORT.use, surv.time, surv.event,surv.event.yes) |
|
|
32 |
|
|
|
33 |
#' ## Concordance index |
|
|
34 |
#' |
|
|
35 |
#' The c-index can be calculated taking age itself as a risk score. Since it's |
|
|
36 |
#' purely rank-based, it doesn't matter that age is nonlinearly related to true |
|
|
37 |
#' risk. |
|
|
38 |
#' |
|
|
39 |
#' We bootstrap this based purely on the test set to make the bootstrap |
|
|
40 |
#' variability commensurate with other models tested on the test set. You can |
|
|
41 |
#' imagine that this model was 'trained' on the training set, even though it's |
|
|
42 |
#' so simple that we actually did no such thing... |
|
|
43 |
#' |
|
|
44 |
#+ c_index |
|
|
45 |
|
|
|
46 |
# Define a trivial function used for bootstrapping which simply returns the |
|
|
47 |
# c-index of a 'model' based purely on age. |
|
|
48 |
|
|
|
49 |
ageOnly <- function(df, indices, df.test) { |
|
|
50 |
# Create a Kaplan-Meier curve from the bootstrap sample |
|
|
51 |
km.by.age <- |
|
|
52 |
survfit( |
|
|
53 |
Surv(surv_time, surv_event) ~ age, |
|
|
54 |
data = df[indices, ], |
|
|
55 |
conf.type = "log-log" |
|
|
56 |
) |
|
|
57 |
|
|
|
58 |
# Return the calibration score |
|
|
59 |
c( |
|
|
60 |
calibration.score = |
|
|
61 |
calibrationScore( |
|
|
62 |
calibrationTable(km.by.age, df.test) |
|
|
63 |
)$area |
|
|
64 |
) |
|
|
65 |
} |
|
|
66 |
|
|
|
67 |
# C-index is by definition non-variable because we do not bootstrap or otherwise |
|
|
68 |
# permute the test set, and there's no 'training' phase |
|
|
69 |
age.c.index <- |
|
|
70 |
as.numeric( |
|
|
71 |
survConcordance( |
|
|
72 |
Surv(surv_time, surv_event) ~ age, |
|
|
73 |
COHORT.use[test.set,] |
|
|
74 |
)$concordance |
|
|
75 |
) |
|
|
76 |
|
|
|
77 |
# Bootstrap to establish variability of calibration |
|
|
78 |
age.only.boot <- |
|
|
79 |
boot( |
|
|
80 |
data = COHORT.use[-test.set,], |
|
|
81 |
statistic = ageOnly, |
|
|
82 |
R = bootstraps, |
|
|
83 |
parallel = 'multicore', |
|
|
84 |
ncpus = n.threads, |
|
|
85 |
df.test = COHORT.use[test.set,] |
|
|
86 |
) |
|
|
87 |
|
|
|
88 |
age.only.boot.stats <- bootStats(age.only.boot, uncertainty = '95ci') |
|
|
89 |
|
|
|
90 |
#' C-index is |
|
|
91 |
#' **`r round(age.c.index, 3)`** |
|
|
92 |
#' on the held-out test set (not that it really matters, the model isn't |
|
|
93 |
#' 'trained' as such for the discrimination test...it's just oldest patient dies |
|
|
94 |
#' first). |
|
|
95 |
#' |
|
|
96 |
#' |
|
|
97 |
#' ## Calibration |
|
|
98 |
#' |
|
|
99 |
#' |
|
|
100 |
#+ calibration |
|
|
101 |
|
|
|
102 |
km.by.age <- |
|
|
103 |
survfit( |
|
|
104 |
Surv(surv_time, surv_event) ~ age, |
|
|
105 |
data = COHORT.use[-test.set,], |
|
|
106 |
conf.type = "log-log" |
|
|
107 |
) |
|
|
108 |
|
|
|
109 |
calibration.table <- calibrationTable(km.by.age, COHORT.use[test.set, ]) |
|
|
110 |
|
|
|
111 |
print(calibrationScore(calibration.table)) |
|
|
112 |
|
|
|
113 |
calibrationPlot(calibration.table) |
|
|
114 |
|
|
|
115 |
#' Calibration score is |
|
|
116 |
#' **`r round(age.only.boot.stats['calibration.score', 'val'], 3)` |
|
|
117 |
#' (`r round(age.only.boot.stats['calibration.score', 'lower'], 3)` - |
|
|
118 |
#' `r round(age.only.boot.stats['calibration.score', 'upper'], 3)`)** |
|
|
119 |
#' on the held-out test set. |
|
|
120 |
|
|
|
121 |
varsToTable( |
|
|
122 |
data.frame( |
|
|
123 |
model = 'age', |
|
|
124 |
imputation = FALSE, |
|
|
125 |
discretised = FALSE, |
|
|
126 |
c.index = age.c.index, |
|
|
127 |
c.index.lower = NA, |
|
|
128 |
c.index.upper = NA, |
|
|
129 |
calibration.score = age.only.boot.stats['calibration.score', 'val'], |
|
|
130 |
calibration.score.lower = age.only.boot.stats['calibration.score', 'lower'], |
|
|
131 |
calibration.score.upper = age.only.boot.stats['calibration.score', 'upper'] |
|
|
132 |
), |
|
|
133 |
performance.file, |
|
|
134 |
index.cols = c('model', 'imputation', 'discretised') |
|
|
135 |
) |