b/lib/rfsrc-cv-nsplit-bootstrap.R

bootstraps <- 20
split.rule <- 'logrank'
n.threads <- 16

# Cross-validation variables
ns.splits <- 0:20
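# If survivalFit passes nsplit straight through to randomForestSRC::rfsrc,
# nsplit = 0 means deterministic splitting and a positive value tries that many
# randomly chosen split points per candidate variable.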
ns.trees  <- c(500, 1000, 2000)
ns.imputations <- 1:3
cv.n.folds <- 3
n.data <- NA # NA = use the full dataset; further rows may be excluded in prep

calibration.filename <- paste0(output.filename.base, '-calibration.csv')

continuous.vars <-
  c(
    'age', 'total_chol_6mo', 'hdl_6mo', 'pulse_6mo', 'crea_6mo',
    'total_wbc_6mo', 'haemoglobin_6mo'
  )

untransformed.vars <- c('anonpatid', 'surv_time', 'imd_score', 'exclude')

source('../lib/shared.R')
require(ggrepel)

# Load the data and convert to data frame to make column-selecting code in
# prepData simpler
COHORT.full <- data.frame(fread(data.filename))

# If n.data was specified...
if(!is.na(n.data)){
  # Take a subset n.data in size
  COHORT.use <- sample.df(COHORT.full, n.data)
  rm(COHORT.full)
} else {
  # Use all the data
  COHORT.use <- COHORT.full
  rm(COHORT.full)
}

# We now need a quick null preparation of the data to get its length (some rows
# may be excluded during preparation)
COHORT.prep <-
  prepData(
    COHORT.use,
    cols.keep, discretise.settings, surv.time, surv.event,
    surv.event.yes, extra.fun = caliberExtraPrep, n.keep = n.data
  )
n.data <- nrow(COHORT.prep)

# Define indices of test set
test.set <- testSetIndices(COHORT.prep, random.seed = 78361)
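# The fixed random.seed should make this train/test split reproducible across
# runs and scripts, assuming testSetIndices uses it to seed the RNG.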

# Process settings: don't touch anything!!
process.settings <-
  list(
    var       = c(untransformed.vars, continuous.vars),
    method    = rep(NA, length(untransformed.vars) + length(continuous.vars)),
    settings  = rep(NA, length(untransformed.vars) + length(continuous.vars))
  )
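# An NA method/settings entry presumably tells prepData to pass that variable
# through untransformed (no discretisation or scaling).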

# If we've not already done a calibration, then do one
if(!file.exists(calibration.filename)) {
  # Create an empty data frame to aggregate stats per fold
  cv.performance <- data.frame()

  # Items to cross-validate over
  cv.vars <- expand.grid(ns.splits, ns.trees, ns.imputations)
  names(cv.vars) <- c('n.splits', 'n.trees', 'n.imputations')
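  # This grid has 21 * 3 * 3 = 189 combinations of n.splits, n.trees and
  # n.imputations, each assessed below with cv.n.folds-fold cross-validation.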

  # Prep the data (since we're not cross-validating on data prep, this can be
  # done once before the loop)
  COHORT.cv <-
    prepData(
      # Data for cross-validation excludes test set
      COHORT.use[-test.set, ],
      cols.keep,
      process.settings,
      surv.time, surv.event,
      surv.event.yes,
      extra.fun = caliberExtraPrep
    )

  # Add missing-value flag columns, but leave the missing data intact because
  # rfsrc can do on-the-fly imputation
  COHORT.cv <- prepCoxMissing(COHORT.cv, missingReplace = NA)

  # Add the column names we just created to the list of predictors
  surv.predict <-
    c(surv.predict, names(COHORT.cv)[grepl('_missing', names(COHORT.cv))])

  # Run the cross-validations. No need to parallelise the loop because rfsrc is
  # itself parallelised
  for(i in 1:nrow(cv.vars)) {
    cat('Calibration', i, '...\n')

    # Get folds for cross-validation
    cv.folds <- cvFolds(nrow(COHORT.cv), cv.n.folds)
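    # Note that the folds are redrawn for every calibration i, so different
    # hyperparameter combinations are scored on different random partitions.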

    cv.fold.performance <- data.frame()

    for(j in 1:cv.n.folds) {
      time.start <- handyTimer()
      # Fit model to the training set
      surv.model.fit <-
        survivalFit(
          surv.predict,
          COHORT.cv[-cv.folds[[j]],],
          model.type = 'rfsrc',
          n.trees = cv.vars$n.trees[i],
          split.rule = split.rule,
          n.threads = n.threads,
          nsplit = cv.vars$n.splits[i],
          nimpute = cv.vars$n.imputations[i],
          na.action = 'na.impute'
        )
      time.learn <- handyTimer(time.start)

      time.start <- handyTimer()
      # Get C-indices for training and validation sets
      c.index.train <-
        cIndex(
          surv.model.fit, COHORT.cv[-cv.folds[[j]],],
          na.action = 'na.impute'
        )
      c.index.val <-
        cIndex(
          surv.model.fit, COHORT.cv[cv.folds[[j]],],
          na.action = 'na.impute'
        )
      time.predict <- handyTimer(time.start)

      # Append the stats we've obtained from this fold
      cv.fold.performance <-
        rbind(
          cv.fold.performance,
          data.frame(
            calibration = i,
            cv.fold = j,
            n.trees = cv.vars$n.trees[i],
            n.splits = cv.vars$n.splits[i],
            n.imputations = cv.vars$n.imputations[i],
            c.index.train,
            c.index.val,
            time.learn,
            time.predict
          )
        )

    } # End cross-validation loop (j)

    # rbind the performance by fold
    cv.performance <-
      rbind(
        cv.performance,
        cv.fold.performance
      )

    # Save output at the end of each loop
    write.csv(cv.performance, calibration.filename)

  } # End calibration loop (i)

} else { # If we did previously calibrate, load it
  cv.performance <- read.csv(calibration.filename)
}
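
# ggrepel is loaded above but not otherwise used in this excerpt; a quick
# diagnostic plot of the calibration grid might look like the sketch below
# (an illustrative addition, not part of the original pipeline; assumes
# ggplot2 is installed).
if(requireNamespace('ggplot2', quietly = TRUE)) {
  plot.df <-
    aggregate(
      c.index.val ~ calibration + n.splits + n.trees,
      data = cv.performance, mean
    )
  print(
    ggplot2::ggplot(
      plot.df,
      ggplot2::aes(x = n.splits, y = c.index.val, colour = factor(n.trees))
    ) +
      ggplot2::geom_point() +
      ggrepel::geom_text_repel(ggplot2::aes(label = calibration))
  )
}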

# Find the best calibration...
# First, average performance across cross-validation folds
cv.performance.average <-
  aggregate(
    c.index.val ~ calibration,
    data = cv.performance,
    mean
  )
# Find the highest value
best.calibration <-
  cv.performance.average$calibration[
    which.max(cv.performance.average$c.index.val)
  ]
# And finally, find the first row of that calibration to get its n.trees,
# n.splits and n.imputations values
best.calibration.row1 <-
  min(which(cv.performance$calibration == best.calibration))

# Prep the data to fit and test with
COHORT.prep <-
  prepData(
    # This time prep the full dataset; the test set is carved off by indexing below
    COHORT.use,
    cols.keep,
    process.settings,
    surv.time, surv.event,
    surv.event.yes,
    extra.fun = caliberExtraPrep
  )

# Finally, add missing flag columns, but leave the missing data intact because
# rfsrc can do on-the-fly imputation
COHORT.prep <- prepCoxMissing(COHORT.prep, missingReplace = NA)

# Add on those column names we just created
surv.predict <-
  c(surv.predict, names(COHORT.prep)[grepl('_missing', names(COHORT.prep))])

#' ## Fit the final model
#'
#' This may take some time, so we'll cache it if possible...

#+ fit_final_model

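# Refit on the whole training set (everything outside test.set) using the
# hyperparameters from the best-performing calibration found above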
surv.model.fit <-
  survivalFit(
    surv.predict,
    COHORT.prep[-test.set,],
    model.type = 'rfsrc',
    n.trees = cv.performance[best.calibration.row1, 'n.trees'],
    split.rule = split.rule,
    n.threads = n.threads,
    nsplit = cv.performance[best.calibration.row1, 'n.splits'],
    nimpute = cv.performance[best.calibration.row1, 'n.imputations'],
    na.action = 'na.impute'
  )

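# Bootstrap to get confidence intervals on the test-set performance statistics;
# per-bootstrap results are presumably cached in the '-boot-all.csv' file given
# as filename below.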
surv.model.params.boot <-
  survivalFitBoot(
    surv.predict,
    COHORT.prep[-test.set,], # Training set
    COHORT.prep[test.set,],  # Test set
    model.type = 'rfsrc',
    n.threads = n.threads,
    bootstraps = bootstraps,
    n.trees = cv.performance[best.calibration.row1, 'n.trees'],
    split.rule = split.rule,
    nsplit = cv.performance[best.calibration.row1, 'n.splits'],
    nimpute = cv.performance[best.calibration.row1, 'n.imputations'],
    na.action = 'na.impute',
    filename = paste0(output.filename.base, '-boot-all.csv')
  )

# Summarise the bootstrapped statistics (C-index and calibration score) as
# point estimates with bootstrap confidence intervals
surv.model.fit.coeffs <- bootStatsDf(surv.model.params.boot)

# Save them to the all-models comparison table
varsToTable(
  data.frame(
    model = 'rfsrc',
    imputation = FALSE,
    discretised = FALSE,
    c.index = surv.model.fit.coeffs['c.index', 'val'],
    c.index.lower = surv.model.fit.coeffs['c.index', 'lower'],
    c.index.upper = surv.model.fit.coeffs['c.index', 'upper'],
    calibration.score = surv.model.fit.coeffs['calibration.score', 'val'],
    calibration.score.lower =
      surv.model.fit.coeffs['calibration.score', 'lower'],
    calibration.score.upper =
      surv.model.fit.coeffs['calibration.score', 'upper']
  ),
  performance.file,
  index.cols = c('model', 'imputation', 'discretised')
)

write.csv(
  surv.model.fit.coeffs,
  paste0(output.filename.base, '-boot-summary.csv')
)