cox-ph/cox-discretised-imputed.R
#+ knitr_setup, include = FALSE

# Whether to cache the intensive code sections. Set to FALSE to recalculate
# everything afresh.
cacheoption <- FALSE
# Disable lazy caching globally, because it fails for large objects, and all the
# objects we wish to cache are large...
opts_chunk$set(cache.lazy = FALSE)
#' # Discrete Cox model with imputed data
#'
#' The discrete Cox model performs very similarly to the normal Cox model, even
#' without performing imputation first. Let's try it with imputation, and see
#' whether performance is boosted.
#'
#' We're going to use the same parameters for discretising as were found for
#' the data with missing values. On the one hand, this is slightly unfair
#' because those values might be suboptimal now that the data are imputed but,
#' on the other, it does allow for direct comparison. If we cross-validated
#' again, we would very likely find different parameters (there are far more
#' combinations than can plausibly be tried), which could make performance
#' better or worse entirely at random.
# Calibration from cross-validation performed on data before imputation
calibration.filename <- '../../output/survreg-crossvalidation-try4.csv'
# The first part of the filename for any output
output.filename.base <- '../../output/survreg-discrete-imputed-try1'
imputed.data.filename <- '../../data/COHORT_complete.rds'
boot.filename <- paste0(output.filename.base, '-boot.rds')

n.threads <- 20
bootstraps <- 50

model.type <- 'survreg'
#' ## Discretise data
#'
#' First, let's load the results from the calibrations and find the parameters
#' for discretisation.

source('../lib/shared.R')

continuous.vars <-
  c(
    'age', 'total_chol_6mo', 'hdl_6mo', 'pulse_6mo', 'crea_6mo',
    'total_wbc_6mo', 'haemoglobin_6mo'
  )
# read file containing calibrations
cv.performance <- read.csv(calibration.filename)

# Find the best calibration...
# First, average performance across cross-validation folds
cv.performance.average <-
  aggregate(
    c.index.val ~ calibration,
    data = cv.performance,
    mean
  )
# Find the highest value
best.calibration <-
  cv.performance.average$calibration[
    which.max(cv.performance.average$c.index.val)
  ]
# And finally, find the first row of that calibration to get the n.bins values
best.calibration.row1 <-
  min(which(cv.performance$calibration == best.calibration))

# Get its parameters
n.bins <-
  t(
    cv.performance[best.calibration.row1, continuous.vars]
  )
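#' To make the selection logic above concrete, here is a small self-contained
#' illustration with made-up values (not the real cross-validation output): each
#' calibration appears once per fold, its validation c-index is averaged across
#' folds, and the calibration with the highest average wins.

# Toy example only; 'toy.cv' mimics the calibration/fold/c.index.val columns
toy.cv <- data.frame(
  calibration = rep(1:3, each = 2),   # three candidate calibrations
  fold        = rep(1:2, times = 3),  # two cross-validation folds
  c.index.val = c(0.74, 0.76, 0.78, 0.77, 0.73, 0.75)
)
toy.average <- aggregate(c.index.val ~ calibration, data = toy.cv, mean)
# Calibration 2 wins, with a mean validation c-index of 0.775
toy.average$calibration[which.max(toy.average$c.index.val)]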
# Reset process settings with the base settings
process.settings <-
  list(
    var = c('anonpatid', 'time_death', 'imd_score', 'exclude'),
    method = c(NA, NA, NA, NA),
    settings = list(NA, NA, NA, NA)
  )
for(j in 1:length(continuous.vars)) {
  process.settings$var <- c(process.settings$var, continuous.vars[j])
  process.settings$method <- c(process.settings$method, 'binByQuantile')
  process.settings$settings <-
    c(
      process.settings$settings,
      list(
        seq(
          # Quantiles are obviously between 0 and 1
          0, 1,
          # Use the number of bins from the best calibration (and for n bins,
          # you need n + 1 breaks)
          length.out = n.bins[j]
        )
      )
    )
}
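#' As a rough illustration of what these settings mean: binByQuantile (from the
#' project's shared library) receives a vector of quantile probabilities, and
#' the variable is then discretised at the corresponding quantile cut points.
#' The sketch below shows the general idea in base R only; it is an assumption
#' about the behaviour, not the library code itself.

# Toy example: 5 quantile probabilities, i.e. 4 bins of roughly equal counts
set.seed(42)
toy.probs <- seq(0, 1, length.out = 5)    # 0.00 0.25 0.50 0.75 1.00
toy.x <- rnorm(1000, mean = 60, sd = 10)  # stand-in for, say, age
toy.breaks <- quantile(toy.x, probs = toy.probs)
toy.binned <- cut(toy.x, breaks = toy.breaks, include.lowest = TRUE)
table(toy.binned)                         # ~250 observations per bin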
#' ## Load imputed data
#'
#' Load the data and prepare it with the settings above.

# Load the data from its RDS file
imputed.data <- readRDS(imputed.data.filename)
# Remove rows with a death time of 0 to avoid fitting errors, and get the
# survival columns ready
for(i in 1:length(imputed.data)) {
  imputed.data[[i]] <- imputed.data[[i]][imputed.data[[i]][, surv.time] > 0, ]
  # Put in a fake exclude column for the next function (exclusions have already
  # been applied in the imputed dataset)
  imputed.data[[i]]$exclude <- FALSE
  imputed.data[[i]]$imd_score <- as.numeric(imputed.data[[i]]$imd_score)
  imputed.data[[i]] <-
    prepData(
      imputed.data[[i]],
      cols.keep,
      process.settings,
      'time_death', 'endpoint_death', 1,
      extra.fun = caliberExtraPrep
    )
}
# Define test set
test.set <- testSetIndices(imputed.data[[1]], random.seed = 78361)
# Do a quick and dirty fit on a single imputed dataset, to draw a calibration
# curve from
time.start <- handyTimer()
surv.model.fit <-
  survivalFit(
    surv.predict,
    imputed.data[[1]][-test.set, ], # Training set from the first imputed dataset
    model.type = model.type,
    n.threads = n.threads
  )
time.fit <- handyTimer(time.start)
# Perform bootstrap fitting for every multiply imputed dataset
surv.fit.boot <- list()
time.start <- handyTimer()
for(i in 1:length(imputed.data)) {
  surv.fit.boot[[i]] <-
    survivalBootstrap(
      surv.predict,
      imputed.data[[i]][-test.set, ],
      imputed.data[[i]][test.set, ],
      model.type = model.type,
      bootstraps = bootstraps,
      n.threads = n.threads
    )
}
time.boot <- handyTimer(time.start)

# Save the fits, because they might've taken a while!
saveRDS(surv.fit.boot, boot.filename)
#' Model fitted in `r round(time.fit)` seconds, and `r bootstraps` fits
#' performed on `r length(imputed.data)` imputed datasets in
#' `r round(time.boot)` seconds.

# Unpackage the uncertainties from the bootstrapped data
surv.fit.boot.ests <- bootMIStats(surv.fit.boot)
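#' bootMIStats comes from the project's shared library and pools the bootstrap
#' replicates across the multiply imputed datasets into point estimates with
#' confidence intervals. Purely as an illustration of one way such pooling can
#' work (an assumption for exposition, not the actual implementation), the
#' sketch below combines replicates of a statistic from several imputations and
#' reports the mean with a 95% percentile interval.

# Toy example: three imputations, 50 bootstrap replicates of a c-index each
set.seed(1)
toy.replicates <- lapply(1:3, function(i) rnorm(50, mean = 0.78, sd = 0.01))
toy.pooled <- unlist(toy.replicates)
c(
  val   = mean(toy.pooled),
  lower = unname(quantile(toy.pooled, 0.025)),
  upper = unname(quantile(toy.pooled, 0.975))
)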
# Save bootstrapped performance values
varsToTable(
  data.frame(
    model = 'cox',
    imputation = TRUE,
    discretised = TRUE,
    c.index = surv.fit.boot.ests['c.test', 'val'],
    c.index.lower = surv.fit.boot.ests['c.test', 'lower'],
    c.index.upper = surv.fit.boot.ests['c.test', 'upper'],
    calibration.score = surv.fit.boot.ests['calibration.score', 'val'],
    calibration.score.lower = surv.fit.boot.ests['calibration.score', 'lower'],
    calibration.score.upper = surv.fit.boot.ests['calibration.score', 'upper']
  ),
  performance.file,
  index.cols = c('model', 'imputation', 'discretised')
)
#' # Results
#'
#' ## Performance
#'
#' Having fitted the Cox model, how did we do? The C-indices were calculated as
#' part of the bootstrapping, so we just need to take a look at those...
#'
#' C-indices are **`r round(surv.fit.boot.ests['c.train', 'val'], 3)`
#' (`r round(surv.fit.boot.ests['c.train', 'lower'], 3)` -
#' `r round(surv.fit.boot.ests['c.train', 'upper'], 3)`)** on the training set and
#' **`r round(surv.fit.boot.ests['c.test', 'val'], 3)`
#' (`r round(surv.fit.boot.ests['c.test', 'lower'], 3)` -
#' `r round(surv.fit.boot.ests['c.test', 'upper'], 3)`)** on the test set.
#'
#'
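#' (For readers less familiar with the metric: the C-index is the probability
#' that, of two comparable patients, the model assigns the higher risk to the
#' one who dies sooner, so 0.5 is chance and 1 is perfect ranking. The snippet
#' below is a toy illustration using the survival package on made-up values; it
#' is not part of the modelling pipeline.)

# Hypothetical survival times, censoring indicators and predicted risks
library(survival)
toy.surv <- data.frame(
  time   = c(5, 8, 2, 10, 4, 7),
  status = c(1, 0, 1, 1, 1, 0),             # 1 = died, 0 = censored
  risk   = c(0.9, 0.3, 0.8, 0.1, 0.6, 0.4)  # model's predicted risk
)
# reverse = TRUE because a higher risk should mean a shorter survival time
concordance(Surv(time, status) ~ risk, data = toy.surv, reverse = TRUE)$concordance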
#' ### Calibration
#'
#' The bootstrapped calibration score is
#' **`r round(surv.fit.boot.ests['calibration.score', 'val'], 3)`
#' (`r round(surv.fit.boot.ests['calibration.score', 'lower'], 3)` -
#' `r round(surv.fit.boot.ests['calibration.score', 'upper'], 3)`)**.
#'
#' Let's draw a representative curve from the unbootstrapped fit... (It would be
#' better to draw all the curves from the bootstrap fit to get an idea of
#' variability, but I've not implemented this yet.)
#'
#+ calibration_plot

calibration.table <-
  calibrationTable(surv.model.fit, imputed.data[[1]][test.set, ])

calibration.score <- calibrationScore(calibration.table)

calibrationPlot(calibration.table)
#' The area between the calibration curve and the diagonal is
#' **`r round(calibration.score[['area']], 3)`** +/-
#' **`r round(calibration.score[['se']], 3)`**.
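#' (The area reported above is produced by calibrationScore from the shared
#' library. As a minimal sketch of the underlying idea, assuming the calibration
#' table gives predicted and observed risks on a common grid, the miscalibration
#' area can be approximated by trapezoidal integration of the absolute gap
#' between the calibration curve and the diagonal, as below; the variable names
#' here are hypothetical and not the pipeline's own.)

# Toy calibration curve: predicted risk vs. observed risk on a grid
toy.predicted <- seq(0, 1, by = 0.1)
toy.observed  <- toy.predicted * 0.9 + 0.03   # a slightly miscalibrated model
toy.gap <- abs(toy.observed - toy.predicted)  # distance from the diagonal
# Trapezoidal rule: mean of adjacent gaps times the step width, summed
toy.area <- sum((head(toy.gap, -1) + tail(toy.gap, -1)) / 2 * diff(toy.predicted))
toy.area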