Diff of /lib/all-cv-bootstrap.R [000000] .. [0375db]

b/lib/all-cv-bootstrap.R
# All model types are bootstrapped this many times
bootstraps <- 200
# n.trees is (obviously) only relevant for random forests
n.trees <- 500
# The following two variables are only relevant if the model.type is 'ranger'
split.rule <- 'logrank'
n.threads <- 10

# Cross-validation variables
input.n.bins <- 10:20
cv.n.folds <- 3
n.calibrations <- 1000
n.data <- NA # Size of the full dataset; further rows may be excluded in prep

continuous.vars <-
  c(
    'age', 'total_chol_6mo', 'hdl_6mo', 'pulse_6mo', 'crea_6mo',
    'total_wbc_6mo', 'haemoglobin_6mo'
  )

source('shared.R')
require(ggrepel)

# Load the data and convert to a data frame to make the column-selecting code
# in prepData simpler
COHORT.full <- data.frame(fread(data.filename))

# If n.data was specified...
if(!is.na(n.data)){
  # Take a subset of n.data rows
  COHORT.use <- sample.df(COHORT.full, n.data)
  rm(COHORT.full)
} else {
  # Use all the data
  COHORT.use <- COHORT.full
  rm(COHORT.full)
}
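
# For reference, sample.df (defined in shared.R) is assumed to behave like a
# simple row sampler; a minimal sketch under that assumption, not the actual
# definition:
if(FALSE) {
  sample.df <- function(df, n, replace = FALSE) {
    df[sample(nrow(df), n, replace = replace), ]
  }
}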

# We now need a quick null preparation of the data to get its length (some rows
# may be excluded during preparation)
COHORT.prep <-
  prepData(
    COHORT.use,
    cols.keep, discretise.settings, surv.time, surv.event,
    surv.event.yes, extra.fun = caliberExtraPrep, n.keep = n.data
  )
n.data <- nrow(COHORT.prep)

# Define indices of test set
test.set <- testSetIndices(COHORT.prep, random.seed = 78361)
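# (testSetIndices, from shared.R, is assumed to return a reproducible random
# subset of row indices to hold out as the test set; fixing random.seed means
# the same patients are held out on every run.)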

# If we've not already done a calibration, then do one
if(!file.exists(calibration.filename)) {
  # Create an empty data frame to aggregate stats per fold
  cv.performance <- data.frame()
  
  # We can parallelise this bit with foreach, so set that up
  initParallel(n.threads)
  
  # Run cross-validations in parallel
  cv.performance <-
    foreach(i = 1:n.calibrations, .combine = 'rbind') %dopar% {
    cat(
      'Calibration', i, '...\n'
    )
    
    # Reset process settings with the base settings
    process.settings <-
      list(
        var        = c('anonpatid', 'time_death', 'imd_score', 'exclude'),
        method     = c(NA, NA, NA, NA),
        settings   = list(NA, NA, NA, NA)
      )
    # Generate some random numbers of bins (for n bins, you need n + 1 breaks)
    n.bins <- sample(input.n.bins, length(continuous.vars), replace = TRUE) + 1
    names(n.bins) <- continuous.vars
    # Go through each variable, setting it to bin by quantile with a random
    # number of bins
    for(j in 1:length(continuous.vars)) {
      process.settings$var <- c(process.settings$var, continuous.vars[j])
      process.settings$method <- c(process.settings$method, 'binByQuantile')
      process.settings$settings <-
        c(
          process.settings$settings,
          list(
            seq(
              # Quantiles are obviously between 0 and 1
              0, 1,
              # n.bins[j] breaks, chosen at random above
              length.out = n.bins[j]
            )
          )
        )
    }
    
    # Prep the data given the settings provided
    COHORT.cv <-
      prepData(
        # Data for cross-validation excludes test set
        COHORT.use[-test.set, ],
        cols.keep,
        process.settings,
        surv.time, surv.event,
        surv.event.yes,
        extra.fun = caliberExtraPrep
      )
    
    # Get folds for cross-validation
    cv.folds <- cvFolds(nrow(COHORT.cv), cv.n.folds)
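    # (cvFolds is assumed to return a list of cv.n.folds disjoint vectors of
    # row indices; below, fold j is held out for validation and the remaining
    # folds are used for training.)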
    
    cv.fold.performance <- data.frame()
    
    for(j in 1:cv.n.folds) {
      time.start <- handyTimer()
      # Fit model to the training set
      surv.model.fit <-
        survivalFit(
          surv.predict,
          COHORT.cv[-cv.folds[[j]],],
          model.type = model.type,
          n.trees = n.trees,
          split.rule = split.rule
          # n.threads not used because this is run in parallel
        )
      time.learn <- handyTimer(time.start)
      
      time.start <- handyTimer()
      # Get C-indices for training and validation sets
      c.index.train <-
        cIndex(
          surv.model.fit, COHORT.cv[-cv.folds[[j]],], model.type = model.type
        )
      c.index.val <-
        cIndex(
          surv.model.fit, COHORT.cv[cv.folds[[j]],], model.type = model.type
        )
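      # (The C-index is the proportion of comparable patient pairs whose
      # predicted risk ordering agrees with their observed survival times;
      # 0.5 is chance, 1 is perfect discrimination.)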
      time.predict <- handyTimer(time.start)
      
      # Append the stats we've obtained from this fold
      cv.fold.performance <-
        rbind(
          cv.fold.performance,
          data.frame(
            calibration = i,
            cv.fold = j,
            as.list(n.bins),
            c.index.train,
            c.index.val,
            time.learn,
            time.predict
          )
        )
      
    } # End cross-validation loop (j)
    
    # Return this calibration's per-fold performance; foreach rbinds these
    # into cv.performance
    cv.fold.performance
  } # End calibration loop (i)
  
  # Save output at end of calibration
  write.csv(cv.performance, calibration.filename)

} else { # If we did previously calibrate, load it
  cv.performance <- read.csv(calibration.filename)
}

# Find the best calibration...
# First, average performance across cross-validation folds
cv.performance.average <-
  aggregate(
    c.index.val ~ calibration,
    data = cv.performance,
    mean
  )
# Find the highest value
best.calibration <-
  cv.performance.average$calibration[
    which.max(cv.performance.average$c.index.val)
  ]
# And finally, find the first row of that calibration to get the n.bins values
best.calibration.row1 <-
  min(which(cv.performance$calibration == best.calibration))

# Get its parameters
n.bins <-
  t(
    cv.performance[best.calibration.row1, continuous.vars]
  )

# Prepare the data with those settings...

# Reset process settings with the base settings
process.settings <-
  list(
    var        = c('anonpatid', 'time_death', 'imd_score', 'exclude'),
    method     = c(NA, NA, NA, NA),
    settings   = list(NA, NA, NA, NA)
  )
for(j in 1:length(continuous.vars)) {
  process.settings$var <- c(process.settings$var, continuous.vars[j])
  process.settings$method <- c(process.settings$method, 'binByQuantile')
  process.settings$settings <-
    c(
      process.settings$settings,
      list(
        seq(
          # Quantiles are obviously between 0 and 1
          0, 1,
          # Use the best number of breaks found during calibration
          length.out = n.bins[j]
        )
      )
    )
}
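
# For illustration, binByQuantile (from shared.R) is assumed to cut a variable
# at the quantiles given by these breaks; a sketch of the idea with 5 breaks,
# i.e. 4 bins:
if(FALSE) {
  breaks <- seq(0, 1, length.out = 5)  # 0.00 0.25 0.50 0.75 1.00
  x <- rnorm(1000)
  x.binned <- cut(x, quantile(x, probs = breaks), include.lowest = TRUE)
  table(x.binned)  # roughly 250 observations per bin
}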

# Prep the data given the settings provided
COHORT.optimised <-
  prepData(
    # Unlike during cross-validation, use the full dataset here; the test set
    # is removed later, when fitting the final model
    COHORT.use,
    cols.keep,
    process.settings,
    surv.time, surv.event,
    surv.event.yes,
    extra.fun = caliberExtraPrep
  )

#' ## Fit the final model
#' 
#' This may take some time, so we'll cache it if possible...

#+ fit_final_model

# Fit to whole training set
surv.model.fit <-
  survivalFit(
    surv.predict,
    COHORT.optimised[-test.set,], # Training set
    model.type = model.type,
    n.trees = n.trees,
    split.rule = split.rule,
    n.threads = n.threads
  )

cl <- initParallel(n.threads, backend = 'doParallel')

surv.model.params.boot <-
  foreach(
    i = 1:bootstraps,
    .combine = rbind,
    .packages = c('survival'),
    .verbose = TRUE
  ) %dopar% {
    
    # Bootstrap-sampled training set
    COHORT.boot <-
      sample.df(
        COHORT.optimised[-test.set,],
        nrow(COHORT.optimised[-test.set,]),
        replace = TRUE
      )
    
    surv.model.fit.i <-
      survivalFit(
        surv.predict,
        COHORT.boot,
        model.type = model.type,
        n.trees = n.trees,
        split.rule = split.rule,
        # 1 thread, because we're parallelising the bootstrapping
        n.threads = 1
      )
    
    # Work out other quantities of interest
    #var.imp.vector <- bootstrapVarImp(surv.model.fit.i, COHORT.boot)
    c.index <- cIndex(surv.model.fit.i, COHORT.optimised[test.set, ])
    calibration.score <-
      calibrationScoreWrapper(surv.model.fit.i, COHORT.optimised[test.set, ])
    
    data.frame(
      i,
      t(coef(surv.model.fit.i)),
      #t(var.imp.vector),
      c.index,
      calibration.score
    )
  }

# Save the bootstrapped parameter estimates
write.csv(surv.model.params.boot, paste0(output.filename.base, '-surv-boot.csv'))

# Tidy up by removing the cluster
stopCluster(cl)

surv.model.fit.coeffs <- bootStatsDf(surv.model.params.boot)
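
# bootStatsDf (from shared.R) is assumed to summarise the bootstrap replicates
# column by column; a minimal sketch of that behaviour, assuming it returns a
# point estimate plus a percentile confidence interval per quantity:
if(FALSE) {
  bootStatsDf.sketch <- function(boot.df, ci = 0.95) {
    data.frame(
      val   = sapply(boot.df, mean, na.rm = TRUE),
      lower = sapply(boot.df, quantile, probs = (1 - ci) / 2, na.rm = TRUE),
      upper = sapply(boot.df, quantile, probs = 1 - (1 - ci) / 2, na.rm = TRUE)
    )
  }
}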