Diff of /lib/handymedical.R [000000] .. [0375db]

source('handy.R')

requirePlus(
  'foreach',
  #'CALIBERdatamanage',
  #'CALIBERcodelists',
  'CALIBERlookups',
  'plyr',
  'dplyr',
  'ggplot2',
  'utils',
  'reshape2',
  'GGally',
  'psych',
  'bnlearn',
  'rms',
  'survival',
  'ranger',
  'randomForestSRC',
  'e1071',
  'data.table',
  'boot',
  install = FALSE
)

readMedicalData <- function(filenames, col.keep, col.class) {
  # read the file(s) into a data frame (data.table = FALSE below)
  df <- foreach(filename = filenames, .combine = 'rbind') %do% {
    fread(
      filename,
      sep = ',',
      select = col.keep,
      #colClasses = col.class,
      data.table = FALSE
    )
  }
  # go through altering the classes of the columns where specified
  for(i in 1:ncol(df)) {
    if(col.class[i] == 'factor') {
      df[, i] <- factor(df[, i])
    } else if(col.class[i] == 'date') {
      df[, i] <- as.Date(df[, i])
    }
  }

  # return the data
  df
}
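
# Illustrative sketch (not run on source) of how readMedicalData might be
# called. The filenames and column specification are hypothetical, not part of
# this library; classes other than 'factor' and 'date' are left as fread
# guessed them.
if(FALSE) {
  cohort <-
    readMedicalData(
      filenames = c('cohort-part1.csv', 'cohort-part2.csv'), # hypothetical
      col.keep  = c('anonpatid', 'gender', 'dob', 'surv_time', 'endpoint'),
      col.class = c('integer', 'factor', 'date', 'numeric', 'factor')
    )
}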

getQuantiles <- function(x, probs, duplicate.discard = TRUE) {
  breaks <- quantile(x, probs, na.rm = TRUE)
  if(duplicate.discard) {
    breaks <- unique(breaks)
  } else if(sum(duplicated(breaks))) {
    stop(
      'Non-unique breaks and discarding of duplicates has been disabled. ',
      'Please choose different quantiles to split at.'
    )
  }
  breaks
}

binByQuantile <- function(x, probs, duplicate.discard = TRUE) {
  # discretises data by binning a vector of values x into quantile-based bins
  # defined by probs
  breaks <- getQuantiles(x, probs, duplicate.discard = duplicate.discard)
  factorNAfix(
    cut(
      x,
      breaks,
      include.lowest = TRUE
    ),
    NAval = 'missing'
  )
}
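
# Minimal sketch (not run on source) of quantile binning: quartiles of a
# skewed variable, with NAs kept as a 'missing' level by factorNAfix (defined
# in handy.R).
if(FALSE) {
  x <- c(rexp(100), NA)
  table(binByQuantile(x, probs = c(0, 0.25, 0.5, 0.75, 1)))
}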

binByAbs <- function(x, breaks) {
  # discretises data by binning given absolute values of breaks, and includes
  # the minimum and maximum values so all data are included
  factorNAfix(
    cut(
      x,
      c(min(x, na.rm = TRUE), breaks, max(x, na.rm = TRUE)),
      include.lowest = TRUE
    ),
    NAval = 'missing'
  )
}

missingToAverage <- function(x) {
  if(is.factor(x)) {
    # If it's a factor, replace with the most common level
    return(NA2val(x, val = modalLevel(x)))

  } else {
    # If it isn't a factor, replace with the median value
    return(NA2val(x, val = median(x, na.rm = TRUE)))
  }
}

missingToBig <- function(x) {
  # Removes missing values and gives them an extreme (high) value

  # Get a value which is definitely far higher than the maximum value, and is
  # easy for a human to spot
  max.x <- max(x, na.rm = TRUE)
  # If the max is less than zero, zero will do
  if(max.x < 0) {
    really.big.value <- 0
  # If the max is zero, then 100 is easy to spot
  } else if(max.x == 0) {
    really.big.value <- 100
  # Finally, if the max value is positive, choose one at least 10x bigger
  } else {
    really.big.value <- 10*10^ceiling(log10(max.x))
  }

  # Set the NA values to that number and return
  NA2val(x, really.big.value)
}
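
# Sketch (not run on source) of the three missingToBig regimes: an all-negative
# vector gets 0, a zero-max vector gets 100, and a positive-max vector gets the
# next power of ten up, times ten (eg max 37 -> 1000).
if(FALSE) {
  missingToBig(c(-3, -1, NA)) # NA -> 0
  missingToBig(c(-2,  0, NA)) # NA -> 100
  missingToBig(c( 5, 37, NA)) # NA -> 1000
}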

missingToZero <- function(x) {
  NA2val(x, val = 0)
}

missingToSample <- function(x) {
  NA2val(x, val = samplePlus(x, replace = TRUE))
}

prepSurvCol <- function(df, col.time, col.event, event.yes) {
  # Rename the survival time column
  names(df)[names(df) == col.time] <- 'surv_time'
  # Create a column denoting whether each event is real or censored
  df$surv_event <- df[, col.event] %in% event.yes
  # Remove the event column so we don't use it as a covariate later
  df[, col.event] <- NULL

  df
}

prepData <- function(
  df, predictors, process.settings,
  # col.event cannot be 'surv_event' or prepSurvCol will break later!
  col.time, col.event, event.yes = NA,
  # Default quantile boundaries for discretisation
  default.quantiles = c(0, 0.1, 0.25, 0.5, 0.75, 0.9, 1),
  extra.fun = NULL, random.seed = NA, NAval = 'missing', n.keep = NA
) {
  # If a random seed was provided, set it
  if(!is.na(random.seed)) set.seed(random.seed)

  # If we only want n.keep of the data, might as well throw it out now to make
  # all the steps from here on faster...
  if(!is.na(n.keep)) {
    # Keep rows at random to avoid bias
    df <- sample.df(df, n.keep)
  } else {
    # If there was no n.keep, we should still randomise the rows for consistency
    df <- sample.df(df, nrow(df))
  }

  # Add time and event columns to predictors to create the full column list
  columns <- c(col.time, col.event, predictors)

  # Only include the columns we actually need, and don't include any which
  # aren't in the data frame because it's possible that some predictors may be
  # calculated later, eg during extra.fun
  df <- df[, columns[columns %in% names(df)]]

  # Go through per predictor and process them
  for(col.name in predictors[predictors %in% names(df)]){
    # If we have a specific way to process this column, let's do it!
    if(col.name %in% process.settings$var) {
      j <- match(col.name, process.settings$var)

      # Processing method being NA means leave it alone...
      if(!is.na(process.settings$method[j])) {
        # ...so, if not NA, use the function provided
        process.fun <- match.fun(process.settings$method[j])

        # If there are no process settings for this, just call the function
        if(isExactlyNA(process.settings$settings[[j]])) {
          df[, col.name] <- process.fun(df[, col.name])
        # Otherwise, call the function with settings
        } else {
          df[, col.name] <-
            process.fun(
              df[, col.name],
              process.settings$settings[[j]]
            )
        }
      }
    # Otherwise, no specific processing specified, so perform defaults
    } else {
      # If it's a character column, make it a factor
      if(is.character(df[, col.name])) {
        df[, col.name] <- factor(df[, col.name])
      }
      # Then, if there are any NAs, go through and make them a level of their own
      if(is.factor(df[, col.name]) & anyNA(df[, col.name])){
        df[, col.name] <-
          factorNAfix(df[, col.name], NAval = NAval)
      }
      # If it's numerical, then it needs discretising
      if(class(df[, col.name]) %in% c('numeric', 'integer')) {
        df[, col.name] <-
          binByQuantile(df[, col.name], default.quantiles)
      # Finally, if it's logical, turn it into a two-level factor
      } else if(class(df[, col.name]) == 'logical') {
        df[, col.name] <- factor(df[, col.name])
        # If there are missing values, fix them (and keep the result)
        if(anyNA(df[, col.name])) {
          df[, col.name] <- factorNAfix(df[, col.name], NAval = NAval)
        }
      }
    }
  }

  # Sort out the time to event and event class columns
  df <- prepSurvCol(df, col.time, col.event, event.yes)

  # If there's any more preprocessing to do, do it now!
  if(!is.null(extra.fun)) {
    df <- extra.fun(df)
  }

  # Return prepped data
  df
}

prepCoxMissing <- function(
  df, missing.cols = NA, missingReplace = missingToZero,
  missing.suffix = '_missing', NAval = 'missing'
){
  # If a list of columns which may contain missing data wasn't provided, then
  # find those columns which do, in fact, contain missing data.
  # (Check length == 1 first, or is.na gives a warning when testing a vector.)
  if(length(missing.cols) == 1) {
    if(is.na(missing.cols)) {
      missing.cols <- c()
      for(col.name in names(df)) {
        if(sum(is.na(df[, col.name])) > 0) {
          missing.cols <- c(missing.cols, col.name)
        }
      }
    }
  }

  # Go through missing.cols, processing appropriately for data type
  for(col.name in missing.cols) {
    # If it's a factor, NAs can simply become a level of their own
    if(is.factor(df[, col.name])) {
      df[, col.name] <-
        factorNAfix(df[, col.name], NAval = NAval)

    } else {
      # If it isn't a factor, first create a column designating missing values
      df[, paste0(col.name, missing.suffix)] <- is.na(df[, col.name])

      # If we want to replace the missing values...
      if(!isExactlyNA(missingReplace)) {

        # Then, deal with the actual values, depending on variable type
        if(is.logical(df[, col.name])) {
          # Set the NA values to baseline so they don't contribute to the model
          df[is.na(df[, col.name]), col.name] <- FALSE
        } else {
          # Set the NA values to the desired value, eg 0 (ie baseline in a Cox
          # model with missingToZero), missingToAverage, missingToBig, etc...
          df[, col.name] <- missingReplace(df[, col.name])
        }
      }

    }
  }

  df
}

medianImpute <- function(df, missing.cols = NA) {
  # If a list of columns which may contain missing data wasn't provided, then
  # find those columns which do, in fact, contain missing data.
  # (Check length == 1 first, or is.na gives a warning when testing a vector.)
  if(length(missing.cols) == 1) {
    if(is.na(missing.cols)) {
      missing.cols <- c()
      for(col.name in names(df)) {
        if(sum(is.na(df[, col.name])) > 0) {
          missing.cols <- c(missing.cols, col.name)
        }
      }
    }
  }

  # Go through missing.cols, imputing appropriately for each data type
  for(col.name in missing.cols) {
    df[, col.name] <- missingToAverage(df[, col.name])
  }

  df
}

modalLevel <- function(x) {
  # Return the name of the most common level in a factor x
  tt <- table(x)
  names(tt[which.max(tt)])
}

plotConfusionMatrix <- function(truth, prediction, title = NA) {
  confusion.matrix <- table(truth, prediction)

  # normalise by row, so the probabilities for each true class sum to 1
  confusion.matrix.n <- sweep(confusion.matrix, 1, rowSums(confusion.matrix),
                              FUN = "/")

  confusion.matrix.n <- melt(confusion.matrix.n)

  confusion.matrix.plot <-
    ggplot(confusion.matrix.n,
           aes(x = truth,
               y = prediction,
               fill = value)) +
    geom_tile()

  if(!is.na(title)) {
    confusion.matrix.plot <-
      confusion.matrix.plot + ggtitle(title)
  }

  print(confusion.matrix.plot)

  # return the raw confusion matrix
  confusion.matrix
}

convertFactorsToBinaryColumns <- function(df) {
  covariates <- colnames(df)
  return(
    model.matrix(
      formula(paste0('~', paste0(covariates, collapse = '+'))),
      data = df
    )[, -1] # -1 to remove 'Intercept' column at start which is all 1s
  )
}

getTopStates <- function(df, n = 10) {
  # Given a data frame, find the top n unique 'states', ie combinations of
  # common values, and return a factor giving each row's state, with NA for
  # rows which aren't in the top n states.
  # df = a data frame
  # n = the number of top states
  all.states <- do.call(paste, df)
  top.states <-
    head(
      sort(
        table(all.states),
        decreasing = TRUE
      ),
      n
    )
  factor(all.states, levels = names(top.states))
}
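
# Toy sketch (not run on source): rows collapse to paste()-ed states, and
# asking for the top 2 keeps the two most common combinations as factor levels,
# leaving rarer rows as NA (ties are broken by the table's name order).
if(FALSE) {
  df <- data.frame(
    a = c('x', 'x', 'x', 'y', 'y'),
    b = c(1, 1, 1, 2, 9)
  )
  getTopStates(df, n = 2) # 'x 1' and 'y 2' survive; the 'y 9' row becomes NA
}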

cvFolds <- function(n.data, n.folds = 3) {
  # Return a list of n.folds vectors of row indices which together partition
  # 1:n.data at random, for use as cross-validation folds.
  split(
    sample(1:n.data),
    ceiling((1:n.data)/(n.data/n.folds))
  )
}
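
# Sketch (not run on source): three cross-validation folds over ten rows; each
# fold is a vector of row indices, and together the folds partition 1:10.
if(FALSE) {
  folds <- cvFolds(10, n.folds = 3)
  str(folds)
  sort(unlist(folds)) # 1:10, each index appearing exactly once
}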

modelType <- function(model.fit) {
  # Take a model fit and return a single string representing its type so as to
  # deal with it correctly

  # rfsrc for some reason has multiple classes associated with its fit objects
  if('rfsrc' %in% class(model.fit)) {
    return('rfsrc')
  # Other models are more sensible; their first class will do (taking only the
  # first also keeps if() conditions on the result scalar)
  } else {
    return(class(model.fit)[1])
  }
}

cIndex <- function(model.fit, df, risk.time = 5, tod.round = 0.1, ...) {
  # Calculate the C-index for a fitted survival model on the data in df

  if(modelType(model.fit) == 'rfsrc') {
    # rfsrc throws an error unless the y-values in the provided data are
    # identical to those used to train the model, so recreate the rounded ones
    df$surv_time_round <-
      round_any(df$surv_time, tod.round)
    # This means we need to use surv_time_round in the formula
    surv.time <- 'surv_time_round'
  } else {
    # Otherwise, our survival time variable is just surv_time
    surv.time <- 'surv_time'
  }

  # First, get some risks, or values proportional to them
  risk <- getRisk(model.fit, df, ...)

  # Then, get the C-index and, since we probably don't want to do any further
  # work with it, simply return the numerical value of the index itself.
  as.numeric(
    survConcordance(
      as.formula(paste0('Surv(', surv.time, ', surv_event) ~ risk')),
      df
    )$concordance
  )
}
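
# Illustrative sketch (not run on source) of cIndex on a cph fit, using the
# survival package's lung data renamed to the surv_time/surv_event columns
# this file expects; the covariates chosen are assumptions for the demo.
if(FALSE) {
  data(lung, package = 'survival')
  df.lung <- data.frame(
    surv_time  = lung$time / 365.25, # convert days to years
    surv_event = lung$status == 2,   # status 2 = death in this dataset
    age = lung$age,
    sex = factor(lung$sex)
  )
  dd <- datadist(df.lung); options(datadist = 'dd')
  fit <- cph(Surv(surv_time, surv_event) ~ age + sex, df.lung, surv = TRUE)
  cIndex(fit, df.lung) # concordance on the training data
}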

generalVarImp <-
  function(
    model.fit, df, vars = NA, risk.time = 5, tod.round = 0.1,
    statistic = cIndex, ...
  ) {
  baseline.statistic <- statistic(model.fit, df, risk.time, tod.round, ...)

  # If no variables were passed, let's do it on all of the variables
  if(isExactlyNA(vars)) {
    if(modelType(model.fit) == 'survreg') {
      vars <- attr(model.fit$terms, 'term.labels')
    } else {
      vars <- names(model.fit$xvar)
    }
    # Then, remove any variables which don't appear in the dataset, because we
    # can't test them (these might be interaction terms like age:gender, for
    # example)
    vars <- vars[vars %in% names(df)]
  }

  var.imp <- data.frame(
    var = vars,
    var.imp = NA,
    stringsAsFactors = FALSE
  )
  for(i in 1:nrow(var.imp)) {
    # Make a new, temporary data frame
    df2 <- df
    # Resample (with replacement) the values of the variable in question
    df2[, var.imp[i, 'var']] <- sample(df[, var.imp[i, 'var']], replace = TRUE)
    # Calculate the statistic based on the permuted data
    var.statistic <- statistic(model.fit, df2, risk.time, tod.round, ...)
    var.imp[i, 'var.imp'] <- baseline.statistic - var.statistic
  }

  # Return the data frame of variable importances
  var.imp
}

modelFactorLevelName <- function(factor.name, level.name, model.type) {
  if(model.type == 'cph') {
    # factor=Level
    return(paste0(factor.name, '=', level.name))
  } else if(model.type == 'survreg') {
    # factorLevel
    return(paste0(factor.name, level.name))
  } else if(model.type == 'boot.survreg') {
    # factorLevel
    return(paste0(factor.name, level.name))
  } else if(model.type == 'boot.foreach') {
    return(make.names(paste0(factor.name, level.name)))
  }
}

cphCoeffs <- function(cph.model, df, surv.predict, model.type = 'cph') {
  # Depending on the model type, get a vector of the Cox coefficient names...
  if(model.type == 'cph') {
    coeff.names <- names(cph.model$coefficients)
    coeff.vals  <- cph.model$coefficients
    # cph fits don't come with precomputed bounds, so fill with NA
    coeff.lower <- rep(NA, length(coeff.vals))
    coeff.upper <- rep(NA, length(coeff.vals))
  } else {
    # Otherwise, it will come as a data frame of some kind
    coeff.names <- rownames(cph.model)
    coeff.vals  <- cph.model$val
    coeff.lower <- cph.model$lower
    coeff.upper <- cph.model$upper
  }

  # Get the names and levels from each of the factors used to create the
  # survival model. Models by cph are good enough to separate with = (ie
  # factor=level), but this is not universal so it's a more general solution to
  # create these coefficient names from the data in a per-model-type way.
  surv.vars.levels <- sapply(surv.predict, function(x){levels(df[, x])})
  surv.vars.df <-
    data.frame(
      var   = rep(surv.predict, unlist(sapply(surv.vars.levels, length))),
      level = unlist(surv.vars.levels),
      val   = 0, # betas are zero for all baselines so make that the default val
      err   = 0, # uncertainty is zero for a baseline too!
      stringsAsFactors = FALSE
    )
  # go through each coefficient in the survival fit...
  for(i in 1:nrow(surv.vars.df)) {
    # ...create the factor/level coefficient name...
    needle <-
      modelFactorLevelName(
        surv.vars.df[i, 'var'], surv.vars.df[i, 'level'],
        model.type
      )
    # ...find where in the coefficients that name occurs...
    if(sum(coeff.names == needle) > 0) {
      needle.i <- which(coeff.names == needle)
      # ...and set the relevant value and bounds
      surv.vars.df[i, 'val']   <- coeff.vals[needle.i]
      surv.vars.df[i, 'lower'] <- coeff.lower[needle.i]
      surv.vars.df[i, 'upper'] <- coeff.upper[needle.i]
    }
  }
  surv.vars.df
}

# Create per-patient survival curves from a data frame and a Cox model
cphSurvivalCurves <-
  function(
    df,
    surv.model,
    surv.times = max(df$surv_time)*seq(0, 1, length.out = 100)
  ) {
    # return a large, melted data frame of the relevant curves
    data.frame(
      #anonpatid = rep(df$anonpatid, each = length(surv.times)),
      id = rep(1:nrow(df), each = length(surv.times)),
      surv_time = rep(df$surv_time, each = length(surv.times)),
      surv_event = rep(df$surv_event, each = length(surv.times)),
      t = rep(surv.times, times = nrow(df)),
      s =
        c(
          t(
            survest(surv.model,
                    newdata = df,
                    times = surv.times,
                    conf.int = FALSE # we don't want confidence intervals
            )$surv
          )
        )
    )
  }

# Create per-patient survival curves from a data frame and a random forest
rfSurvivalCurves <-
  function(
    df,
    predict.rf
  ) {
    surv.times <- predict.rf$unique.death.times
    # return a large, melted data frame of the relevant curves
    data.frame(
      #anonpatid = rep(df$anonpatid, each = length(surv.times)),
      id = rep(1:nrow(df), each = length(surv.times)),
      surv_time = rep(df$surv_time, each = length(surv.times)),
      surv_event = rep(df$surv_event, each = length(surv.times)),
      t = rep(surv.times, times = nrow(df)),
      s = c(t(predict.rf$survival))
    )
  }

getSurvCurves <- function(
  df,
  # For model.type = 'cph' this should be the fitted model itself (survest
  # needs it); for the random forest types it should be a predict() result
  predictions,
  model.type = 'cph',
  surv.times = max(df$surv_time)*seq(0, 1, length.out = 100)
) {
  if(model.type == 'cph') {
    # return a large, melted data frame of the relevant curves
    data.frame(
      #anonpatid = rep(df$anonpatid, each = length(surv.times)),
      id = rep(1:nrow(df), each = length(surv.times)),
      surv_time = rep(df$surv_time, each = length(surv.times)),
      surv_event = rep(df$surv_event, each = length(surv.times)),
      t = rep(surv.times, times = nrow(df)),
      s =
        c(
          t(
            survest(predictions,
                    newdata = df,
                    times = surv.times,
                    conf.int = FALSE # we don't want confidence intervals
            )$surv
          )
        )
    )
  } else if(model.type == 'ranger') {
    surv.times <- predictions$unique.death.times
    # return a large, melted data frame of the relevant curves
    data.frame(
      #anonpatid = rep(df$anonpatid, each = length(surv.times)),
      id = rep(1:nrow(df), each = length(surv.times)),
      surv_time = rep(df$surv_time, each = length(surv.times)),
      surv_event = rep(df$surv_event, each = length(surv.times)),
      t = rep(surv.times, times = nrow(df)),
      s = c(t(predictions$survival))
    )
  } else if(model.type == 'rfsrc') {
    surv.times <- predictions$time.interest
    # return a large, melted data frame of the relevant curves
    data.frame(
      #anonpatid = rep(df$anonpatid, each = length(surv.times)),
      id = rep(1:nrow(df), each = length(surv.times)),
      surv_time = rep(df$surv_time, each = length(surv.times)),
      surv_event = rep(df$surv_event, each = length(surv.times)),
      t = rep(surv.times, times = nrow(df)),
      s = c(t(predictions$survival))
    )
  }
}

survivalFit <- function(
  predict.vars, df, model.type = 'cph',
  n.trees = 500, split.rule = 'logrank', n.threads = 1, tod.round = 0.1,
  bootstraps = 200, ...
) {

  # Depending on model.type, change the name of the variable for survival time
  if(model.type %in% c('cph', 'survreg', 'survreg.boot')) {
    # Cox and parametric models can use straight death time
    surv.time = 'surv_time'
  } else {
    # Random forests need to use rounded death time
    surv.time = 'surv_time_round'

    df$surv_time_round <-
      round_any(df$surv_time, tod.round)
  }

  # Create a survival formula with the provided variable names...
  surv.formula <-
    formula(
      paste0(
        # Survival object made in-formula
        'Surv(', surv.time, ', surv_event) ~ ',
        # Predictor variables then make up the other side
        paste(predict.vars, collapse = '+')
      )
    )

  # Then, perform the relevant type of fit depending on the model type requested
  if(model.type == 'cph') {
    return(
      cph(surv.formula, df, surv = TRUE)
    )
  } else if(model.type == 'survreg') {
    return(
      survreg(surv.formula, df, dist = 'exponential')
    )
  } else if(model.type == 'survreg.boot') {
    return(
      boot(
        formula = surv.formula,
        data = df,
        statistic = bootstrapFit,
        fit.fun = survreg,
        R = bootstraps,
        dist = 'exponential'
      )
    )
  } else if(model.type == 'ranger') {
    return(
      ranger(
        surv.formula,
        df,
        num.trees = n.trees,
        splitrule = split.rule,
        num.threads = n.threads,
        ...
      )
    )
  } else if(model.type == 'rfsrc') {
    # rfsrc, if you installed it correctly, controls threading via the rf.cores
    # option
    options(rf.cores = n.threads)

    # Fit and return
    return(
      rfsrc(
        surv.formula,
        df,
        ntree = n.trees,
        ...
      )
    )
  }
}

survivalFitBoot <- function(
  predict.vars, df, df.test, model.type = 'cph', bootstraps = 200,
  filename = NULL, n.threads = 1, n.trees = 500, split.rule = 'logrank',
  tod.round = 0.1, ...
) {
  # This function should use foreach, but currently doesn't run in parallel.
  # Running in parallel causes some kind of error which is very hard to debug
  # with the calibration score functions (it may be that the LOESS estimation
  # runs out of memory, but it's not clear). This error is not reproducible
  # when running the processes in serial. This isn't too much of an issue
  # because the slowest models are random forests, and these already train in
  # parallel.
  # This should therefore be reproduced in foreach, but for now I'll just use a
  # for loop so it can write out bootstrap results as you go.
  # If implementing parallel, do a nested for/foreach loop combo which does
  # 1:(bootstraps/n.threads) in the for and 1:n.threads in the foreach, so you
  # can write out after n.threads processes and not lose everything if anything
  # bad happens.

  # Instantiate a blank data frame
  bootstrap.params <- data.frame()
  # And set the start bootstrap index to 1
  boot.so.far <- 1

  # If a filename was specified...
  if(!is.null(filename)) {
    # ...and it exists already...
    if(file.exists(filename)) {
      # ...read it, see how far we got, and resume from the next iteration
      bootstrap.params <- read.csv(filename)
      boot.so.far <- nrow(bootstrap.params) + 1

      # If we're already done, return the bootstraps
      if(boot.so.far > bootstraps) {
        return(bootstrap.params)
      }
    }
  }
  # Otherwise, stick with a blank data frame and starting at 1

  # Run a for loop to get the bootstrapped parameter estimates.
  for(i in boot.so.far:bootstraps) {

    # Bootstrap-sampled training set
    df.boot <- bootstrapSampleDf(df)

    surv.model.fit.i <-
      survivalFit(
        predict.vars, df.boot, model.type = model.type,
        n.trees = n.trees, split.rule = split.rule,
        # n.threads to take advantage of random forest parallelisation. Change
        # to n.threads = 1 if foreach is parallelised, so everything is done
        # in parallel.
        n.threads = n.threads,
        ...
      )

    # Work out other quantities of interest
    var.imp.vector <- bootstrapVarImp(surv.model.fit.i, df.boot, ...)
    c.index <- cIndex(surv.model.fit.i, df.test, ...)
    # This function causes the error when run in parallel.
    calibration.score <- calibrationScoreWrapper(surv.model.fit.i, df.test, ...)

    # Some models (eg random forests!) don't return coefficients...so only try
    # to add these to the data frame to return from this function if they exist.
    if(!is.null(coef(surv.model.fit.i))) {
      bootstrap.params <-
        rbind(
          bootstrap.params,
          data.frame(
            t(coef(surv.model.fit.i)),
            t(var.imp.vector),
            c.index,
            calibration.score
          )
        )
    } else {
      bootstrap.params <-
        rbind(
          bootstrap.params,
          data.frame(
            t(var.imp.vector),
            c.index,
            calibration.score
          )
        )
    }

    # At the end of each iteration, save progress if a filename was provided
    # (row.names = FALSE keeps the file readable by the resume logic above)
    if(!is.null(filename)){
      write.csv(bootstrap.params, filename, row.names = FALSE)
    }
  }

  # At the end of the function, return the parameters
  bootstrap.params
}

survivalBootstrap <- function(
  predict.vars, df, df.test, model.type = 'survreg',
  n.trees = 500, split.rule = 'logrank', n.threads = 1, tod.round = 0.1,
  bootstraps = 200, nimpute = 1, nsplit = 0
) {

  # Depending on model.type, change the name of the variable for survival time
  if(model.type %in% c('survreg')) {
    # Parametric survival models can use straight death time
    surv.time = 'surv_time'
  } else {
    # Random forests need to use rounded death time
    surv.time = 'surv_time_round'

    df$surv_time_round <-
      round_any(df$surv_time, tod.round)
  }

  # Create a survival formula with the provided variable names...
  surv.formula <-
    formula(
      paste0(
        # Survival object made in-formula
        'Surv(', surv.time, ', surv_event) ~ ',
        # Predictor variables then make up the other side
        paste(predict.vars, collapse = '+')
      )
    )

  # Then, perform the relevant type of fit depending on the model type requested
  if(model.type == 'cph') {
    stop('model.type cph not yet implemented')
  } else if(model.type == 'survreg') {
    return(
      boot(
        formula = surv.formula,
        data = df,
        statistic = bootstrapFitSurvreg,
        R = bootstraps,
        parallel = 'multicore',
        ncpus = n.threads,
        test.data = df.test
      )
    )
  } else if(model.type == 'ranger') {
    stop('model.type ranger not yet implemented')
  } else if(model.type == 'rfsrc') {
    # Make rfsrc single-threaded, so we can parallelise across bootstrap
    # replicates instead.
    # (This helps with things like C-index calculation which may not use all
    # cores, though in edge cases of very few bootstraps doing it this way will
    # slow things down)
    options(rf.cores = 1)

    return(
      boot(
        formula = surv.formula,
        data = df,
        statistic = bootstrapFitRfsrc,
        R = bootstraps,
        parallel = 'multicore',
        ncpus = n.threads,
        n.trees = n.trees,
        test.data = df.test,
        # Boot requires named variables, so we can't use ... here. This slight
        # kludge means that this will fail unless these three variables are
        # explicitly specified in the survivalBootstrap call.
        nimpute = nimpute,
        nsplit = nsplit
      )
    )
  }
}

bootstrapFit <- function(formula, data, indices, fit.fun, ...) {
  # Wrapper function to pass generic fitting functions to boot for
  # bootstrapping. This is actually called by boot, so much of this isn't
  # specified manually.
  #
  # Args:
  #   formula: The formula to fit with, given by the formula argument in boot.
  #      data: The data to fit, given by the data argument in boot.
  #   indices: Used internally by boot to select each bootstrap sample.
  #   fit.fun: The function you'd like to use to fit with, eg lm, cph, survreg,
  #            etc. You pass this to boot as part of its ... arguments, so
  #            provide it as fit.fun. It must return something sensible when
  #            acted on by the coef function.
  #       ...: Other arguments to your fitting function. This is now a nested
  #            ..., since you'll put these hypothetical arguments in boot's ...
  #            to pass here, to pass to your fitting function.
  #
  # Returns:
  #   The coefficients of the fit, which are then aggregated over multiple
  #   passes by boot to construct estimates of variation in parameters.

  d <- data[indices, ]
  fit <- fit.fun(formula, data = d, ...)
  return(coef(fit))
}
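
# Sketch (not run on source) of bootstrapFit as boot's statistic argument, here
# with a plain lm on the built-in mtcars data; fit.fun and any extra fitting
# arguments travel through boot's ... to this wrapper.
if(FALSE) {
  boot.fit <-
    boot(
      formula = mpg ~ wt + hp,
      data = mtcars,
      statistic = bootstrapFit,
      fit.fun = lm,
      R = 100
    )
  boot.fit$t0 # coefficients of the full-data fit
}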

bootstrapVarImp <- function(fit, data, ...) {
  # Variable importance by C-index
  var.imp.c.index <- generalVarImp(fit, data, statistic = cIndex, ...)

  # Flatten into a named vector so it can sit alongside the other bootstrap
  # statistics
  var.imp.vector <- var.imp.c.index$var.imp
  names(var.imp.vector) <- paste0('vimp.c.index.', var.imp.c.index$var)

  # Return that vector
  var.imp.vector
}

bootstrapFitSurvreg <- function(formula, data, indices, test.data) {
  # Wrapper function to pass a survreg fit with c-index calculations to boot.

  d <- data[indices, ]
  fit <- survreg(formula, data = d, dist = 'exponential')

  # Get variable importances by C-index
  var.imp.vector <- bootstrapVarImp(fit, d)

  c.index <- cIndex(fit, test.data)
  calibration.score <- calibrationScoreWrapper(fit, test.data)

  # Return fit coefficients, variable importances, and the c-index and
  # calibration score on the test data
  return(
    c(
      coef(fit),
      var.imp.vector,
      c.index = c.index,
      calibration.score = calibration.score
    )
  )
}

bootstrapFitRfsrc <-
  function(
    formula, data, indices, n.trees, test.data, nimpute, nsplit
  )
{
  # Wrapper function to pass an rfsrc fit with c-index calculations to boot.

  fit <-
    rfsrc(
      formula, data[indices, ], ntree = n.trees,
      nimpute = nimpute, nsplit = nsplit, na.action = 'na.impute'
    )

  # Check the model calibration on the test set
  calibration.table <-
    calibrationTable(fit, test.data, na.action = 'na.impute')
  calibration.score <- calibrationScore(calibration.table, curve = FALSE)

  # Get variable importances by C-index
  var.imp.vector <- bootstrapVarImp(fit, data[indices, ], na.action = 'na.impute')

  # Return variable importances, and the c-index and calibration score on the
  # test data (random forests don't have coefficients to return)
  return(
    c(
      var.imp.vector,
      c.index = cIndex(fit, test.data, na.action = 'na.impute'),
      calibration.score = calibration.score
    )
  )
}

bootStats <- function(bootfit, uncertainty = 'sd', transform = identity) {
  # Return a data frame with the statistics from a bootstrapped fit
  #
  # Args:
  #     bootfit: A boot object.
  # uncertainty: String selecting the uncertainty measure: 'sd' returns the
  #              value plus its standard deviation, '95ci' returns the median
  #              with a 95% confidence interval.
  #   transform: Optional transform for the statistics, defaults to identity, ie
  #              leave the values as they are. Useful if you want the value and
  #              variance of the exp(statistic), etc.

  if(uncertainty == 'sd'){
    return(
      data.frame(
        val  = transform(bootfit$t0),
        err  = apply(transform(bootfit$t), 2, sd)
      )
    )
  } else if(uncertainty == '95ci') {
    ci <- apply(transform(bootfit$t), 2, quantile, probs = c(0.025, 0.5, 0.975))
    return(
      data.frame(
        val  = t(ci)[, 2],
        lower = t(ci)[, 1],
        upper = t(ci)[, 3],
        row.names = names(bootfit$t0)
      )
    )
  } else {
    stop("Unknown value '", uncertainty, "' for uncertainty parameter.")
  }
}
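
# Sketch (not run on source): pulling value/uncertainty summaries out of a boot
# object, continuing the hypothetical lm bootstrap used for bootstrapFit above.
if(FALSE) {
  boot.fit <-
    boot(
      formula = mpg ~ wt, data = mtcars, statistic = bootstrapFit,
      fit.fun = lm, R = 100
    )
  bootStats(boot.fit)                       # value and bootstrap SD
  bootStats(boot.fit, uncertainty = '95ci') # median and 95% CI
}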

bootStatsDf <- function(df, transform = identity) {
  data.frame(
    val = sapply(df, FUN = function(x) {median(transform(x))}),
    lower =
      sapply(df, FUN = function(x) {quantile(transform(x), probs = c(0.025))}),
    upper =
      sapply(df, FUN = function(x) {quantile(transform(x), probs = c(0.975))})
  )
}

bootMIStats <- function(boot.mi, uncertainty = '95ci', transform = identity) {
  # Return a data frame with the pooled statistics from a list of bootstrapped
  # fits, eg one per multiply imputed dataset
  #
  # Args:
  #     boot.mi: A list of boot objects.
  # uncertainty: String selecting the uncertainty measure: 'sd' returns the
  #              mean plus its standard deviation, '95ci' returns the median
  #              with a 95% confidence interval.
  #   transform: Optional transform for the statistics, defaults to identity, ie
  #              leave the values as they are. Useful if you want the value and
  #              variance of the exp(statistic), etc.

  boot.mi.combined <-
    do.call(
      # rbind together...
      rbind,
      # ...a list of matrices of bootstrap estimates extracted from the list of
      # bootstrap fits
      lapply(boot.mi, function(x){x$t})
    )

  if(uncertainty == 'sd'){
    return(
      data.frame(
        val  = apply(transform(boot.mi.combined), 2, mean),
        err  = apply(transform(boot.mi.combined), 2, sd),
        row.names = names(boot.mi[[1]]$t0)
      )
    )
  } else if(uncertainty == '95ci') {
    ci <-
      apply(
        transform(boot.mi.combined), 2, quantile, probs = c(0.025, 0.5, 0.975)
      )
    return(
      data.frame(
        val  = t(ci)[, 2],
        lower = t(ci)[, 1],
        upper = t(ci)[, 3],
        row.names = names(boot.mi[[1]]$t0)
      )
    )
  } else {
    stop("Unknown value '", uncertainty, "' for uncertainty parameter.")
  }
}

bootstrapDiff <- function(x1, x2, uncertainty = '95ci') {
  # Work out the difference between two values calculated by bootstrapping

  x2mx1 <-
    sample(x2, size = length(x1) * 10, replace = TRUE) -
      sample(x1, size = length(x1) * 10, replace = TRUE)

  if(uncertainty == '95ci') {
    est <- quantile(x2mx1, probs = c(0.5, 0.025, 0.975))
    names(est) <- c('val', 'lower', 'upper')
    return(est)
  } else if(uncertainty == 'sd') {
    val <- mean(x2mx1)
    stdev <- sd(x2mx1)
    return(
      c(
        val = val,
        lower = val - stdev,
        upper = val + stdev
      )
    )
  } else {
    stop("Unknown value '", uncertainty, "' for uncertainty parameter.")
  }
}
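
# Sketch (not run on source): comparing two bootstrapped statistics, eg
# C-indices from two models. The rnorm draws stand in for real columns of
# bootstrap estimates; bootstrapDiff returns the distribution of x2 - x1.
if(FALSE) {
  c.index.model.a <- rnorm(200, mean = 0.78, sd = 0.01)
  c.index.model.b <- rnorm(200, mean = 0.80, sd = 0.01)
  bootstrapDiff(c.index.model.a, c.index.model.b) # val, lower, upper
}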

negExp <- function(x) {
  exp(-x)
}

getRisk <- function(model.fit, df, risk.time = 5, tod.round = 0.1, ...) {
  # If needed, create the rounded survival time
  if(modelType(model.fit) %in% c('ranger', 'rfsrc')) {
    df$surv_time_round <- round_any(df$surv_time, tod.round)
  }

  # Make predictions for the data df based on the model model.fit if it doesn't
  # require special treatment (in which case it will be done manually below)
  if(modelType(model.fit) != 'cv.glmnet') {
    predictions <- predict(model.fit, df, ...)
  }

  # Then, for any model other than cph, they will need to be transformed in some
  # way to get a proxy for risk...

  # If we're dealing with a ranger model, then we need to get a proxy for risk
  if(modelType(model.fit) == 'ranger') {
    risk.bin <- which.min(abs(predictions$unique.death.times - risk.time))
    # Get the chance of having died (ie 1 - survival) for all patients at that
    # time (ie in that bin)
    predictions <- 1 - predictions$survival[, risk.bin]
  } else if(modelType(model.fit) == 'rfsrc') {
    # If we're dealing with a randomForestSRC model, extract the 'predicted' var
    predictions <- predictions$predicted
  } else if(modelType(model.fit) == 'survreg') {
    # survreg type models give larger numbers for longer survival...this is a
    # hack to make this return C-indices which make sense!
    predictions <- max(predictions) - predictions
  } else if(modelType(model.fit) == 'cv.glmnet') {
    predictions <-
      predict(
        model.fit,
        # Use model which is least complex but still within 1 SE of lowest MSE
        s = model.fit$lambda.1se,
        # cv.glmnet takes a matrix, not a data frame, and it must be passed with
        # the correct dimensions, ie time/event columns removed
        newx = df,
        ...
      )
    # cv.glmnet predictions are returned as a matrix, so convert to vector
    predictions <- as.vector(predictions)
  }

  predictions
}

getRiskAtTime <- function(model.fit, df, risk.time = 5, ...) {

  # If we're dealing with a ranger model, then we need to get a proxy for risk
  if(modelType(model.fit) == 'ranger') {
    # Make predictions for the data df based on the model model.fit
    predictions <- predict(model.fit, df, ...)

    risk.bin <- which.min(abs(predictions$unique.death.times - risk.time))
    # Get the chance of having died (ie 1 - survival) for all patients at that
    # time (ie in that bin)
    predictions <- 1 - predictions$survival[, risk.bin]

  } else if(modelType(model.fit) == 'rfsrc') {
    # Make predictions for the data df based on the model model.fit
    predictions <- predict(model.fit, df, ...)

    # If we're dealing with a randomForestSRC model, do the same as ranger but
    # with different variable names
    risk.bin <- which.min(abs(predictions$time.interest - risk.time))
    # Get the chance of having died (ie 1 - survival) for all patients at that
    # time (ie in that bin)
    predictions <- 1 - predictions$survival[, risk.bin]

  } else if(modelType(model.fit) == 'survreg') {
    # Make predictions for the data df based on the model.
    # 'quantile' returns the quantiles of risk, ie the 0.01 quantile gives the
    # time by which 1% of patients would be dead. Returning the risk of death
    # at a time t requires reverse-engineering this table.
    # It doesn't make sense to go to p = 1 because technically by any model the
    # 100th percentile is at infinity.
    # It's really fast, so do 1000 quantiles for accuracy. Could make this a
    # passable parameter...
    risk.quantiles <- seq(0, 0.999, 0.001)

    predictions <-
      predict(model.fit, df, type = 'quantile', p = risk.quantiles, ...)

    predictions <-
      # Find the risk quantile...
      risk.quantiles[
        # ...by choosing the element corresponding to the matrix output of
        # predict, which has the same number of rows as df and a column per
        # risk.quantiles...
        apply(
          predictions,
          # ...and find the quantile closest to the risk.time being sought
          FUN = function(x) {
            which.min(abs(x - risk.time))
          },
          MARGIN = 1
        )
      ]
  } else if(modelType(model.fit) == 'survfit') {
    # For now, survfit is just a Kaplan-Meier fit, and it only deals with a
    # single variable for KM strata. For multiple strata, this would require a
    # bit of parsing to turn names like 'age=93, gender=Men' into an n-column
    # data frame.
    varname <- substring(
      names(model.fit$strata)[1], 0,
      # Position of the = sign
      strPos('=', names(model.fit$strata)[1]) - 1
    )

    km.df <- data.frame(
      var = rep(
        # Chop off characters before and including = (eg 'age=') and turn into a
        # number (would also need generalising for non-numerics, eg factors)
        as.numeric(
          substring(
            names(model.fit$strata),
            # Position of the = sign
            strPos('=', names(model.fit$strata)[1]) + 1
          )
        ),
        # Repeat each number as many times as there are patients that age
        times = model.fit$strata
      ),
      time = model.fit$time,
      surv = model.fit$surv
    )

    risk.by.var <-
      data.frame(
        var = unique(km.df$var),
        risk = NA
      )

    for(var in unique(km.df$var)) {
      # If anyone with that variable value lived long enough for us to make a
      # prediction...
      if(max(km.df$time[km.df$var == var]) > risk.time) {
        # Find the first event after that point, which gives us the survival,
        # and do 1 - surv to get risk
        risk.by.var$risk[risk.by.var$var == var] <- 1 -
          km.df$surv[
            # The datapoint needs to be for the correct age of patient
            km.df$var == var &
              # And pick the time which is the smallest value greater than the
              # time in which we're interested.
              km.df$time ==
                minGt(km.df$time[km.df$var == var], risk.time)
          ]
      }
    }

    # The predictions are then the risk for a given value of var
    predictions <-
      # join from plyr preserves row order
      join(
        # Slight kludge...make a data frame with one column called 'var' from
        # the relevant (varname) column of the data
        data.frame(var = df[, varname]),
        risk.by.var[, c('var', 'risk')]
      )$risk
  }

  # However obtained, return the predictions
  predictions
}

partialEffectTable <-
  function(
    model.fit, df, variable, n.patients = 1000, max.values = 200,
    risk.time = 5, ...
  ) {
    # The number of values we look at will be either max.values, or the number
    # of unique values if that's lower. Remove NAs because they cause errors.
    n.values <- min(max.values, length(NArm(unique(df[, variable]))))

    # Take a sample of df, but repeat each one of those samples n.values times
    df.sample <- df[rep(sample(1:nrow(df), n.patients), each = n.values), ]
    # Give each value from the original df an id, so we can keep track
    df.sample$id <- rep(1:n.patients, each = n.values)

    # Each individual patient from the original sample is then assigned every
    # value of the variable we're interested in exploring
    df.sample[, variable] <-
      sort(
        # We sample in case n.values is less than the total number of unique
        # values for a given variable
        samplePlus(df[, variable], n.values, na.rm = TRUE, only.unique = TRUE)
      )
    # (This sorted samplePlus will be a factor of n.patients too short, but
    # that's OK because it'll just be recycled)

    # Use the model to make predictions
    df.sample$risk <- getRisk(model.fit, df.sample, risk.time, ...)

    # Use dplyr to normalise the risk for each patient by the mean risk for that
    # patient across all values of variable, thus averaging out any risk offsets
    # between patients, and return that data frame.
    as.data.frame(
      df.sample %>%
        group_by(id) %>%
        mutate(risk.normalised = risk/mean(risk))
    )[, c('id', variable, 'risk.normalised')] # discard all unnecessary columns
  }

calibrationTable <- function(
  model.fit, df, risk.time = 5, tod.round = 0.1, ...
  ) {

  if(modelType(model.fit) == 'rfsrc') {
    # rfsrc throws an error unless the y-values in the provided data are
    # identical to those used to train the model, so recreate the rounded ones
    df$surv_time_round <-
      round_any(df$surv_time, tod.round)
    # This means we need to use surv_time_round in the formula
    surv.time <- 'surv_time_round'
  } else {
    # Otherwise, our survival time variable is just surv_time
    surv.time <- 'surv_time'
  }

  # Get risk values given this model
  df$risk <- getRiskAtTime(model.fit, df, risk.time, ...)

  # Was there an event? Start with NA, because default is unknown (ie censored)
  df$event <- NA
  # Event before risk.time
  df$event[df$surv_event & df$surv_time <= risk.time] <- TRUE
  # Event after, whether censorship or not, means no event by risk.time
  df$event[df$surv_time > risk.time] <- FALSE
  # Otherwise, censored before risk.time, leave as NA

  df[, c('risk', 'event')]
}

calibrationPlot <- function(df, max.points = NA, show.censored = FALSE) {
  # Convert event to numeric, because ggplot treats logicals like categoricals
  df$event <- as.numeric(df$event)

  # Make points.df which will be used to plot the points (we need to keep the
  # full df to make sure the smoothed curve is accurate). If max.points is NA,
  # keep all the points, but if it's specified then sample the data frame.
  points.df <- df
  if(!is.na(max.points)) {
    if(nrow(df) > max.points) {
      points.df <- sample.df(df, max.points)
    }
  }
  # Either way, let's manually jitter the points in points.df, because ggplot's
  # jitter adds both positive and negative which is confusing
  points.no.event <- points.df$event == 0 & !is.na(points.df$event)
  points.df$event[points.no.event] <-
    runif(sum(points.no.event), min = 0, max = 0.1)
  points.event <- points.df$event == 1 & !is.na(points.df$event)
  points.df$event[points.event] <-
    runif(sum(points.event), min = 0.9, max = 1)

  # Start the calibration plot
  calibration.plot <-
    ggplot(df, aes(x = risk, y = event)) +
      # At the back, a 1:1 line for the 'perfect' result
      geom_abline(slope = 1, intercept = 0) +
      # Then, plot the points
      geom_point(data = points.df, alpha = 0.1) +
      # axis limits
      coord_cartesian(xlim = c(0, 1), ylim = c(0, 1))

  # If the censored points need to be added...
  if(show.censored) {
    # Create a dummy data frame of censored values to plot
    censored.df <- df[is.na(df$event), ]
    censored.df$event <- 0.5

    calibration.plot <-
      calibration.plot +
      geom_point(
        data = censored.df, colour = 'grey', alpha = 0.1,
        position = position_jitter(w = 0, h = 0.05)
      )
  }

  # Finally, plot a smoothed calibration curve on top
  calibration.plot <-
    calibration.plot + geom_smooth()

  calibration.plot
}

calibrationScore <- function(
  calibration.table, risk.breaks = seq(0, 1, 0.01), curve = FALSE,
  extremes = TRUE
  ) {
  #
  # extremes: If set to TRUE, this assumes predictions of 0 below risk 0.5, and
  # 1 above 0.5, providing a worst-case estimate for cases when the prediction
  # model only provides predictions within a narrower range. This allows such
  # models to be fairly compared to others with broader predictive values.
  #
  # * Could rewrite this with the integrate built-in function
  # * Not totally sure about the standard error here...I assume just integrating
  #   the uncertainty region will result in an overestimate?


  # Fit a LOESS model to the data
  loess.curve <- loess(event ~ risk, data = calibration.table)

  # Get the bin widths, which we'll need in a bit when integrating
  risk.binwidths <- diff(risk.breaks)
  # And the midpoints of the risk bins to calculate predictions at
  risk.mids <- risk.breaks[1:(length(risk.breaks) - 1)] + risk.binwidths / 2

  predictions <-
    predict(loess.curve, data.frame(risk = risk.mids), se = FALSE)

  if(anyNA(predictions)) {
    if(extremes) {
      # Get the bins where we don't have a valid prediction
      missing.risks <- risk.mids[is.na(predictions)]
      # And predict 0 if < 0.5, 1 if greater, for a worst-case step function
      missing.risks <- as.numeric(missing.risks > 0.5)
      # Finally, substitute them in
      predictions[is.na(predictions)] <- missing.risks
    } else {
      # If there are missing values but extremes = FALSE, ie don't extend, then
      # issue a warning to let the user know.
      if(sum(is.na(predictions)) < 10) {
        warning.examples <- paste(risk.mids[is.na(predictions)], collapse = ', ')
      } else {
        warning.examples <-
          paste(
            paste(head(risk.mids[is.na(predictions)], 3), collapse = ', '),
            '...',
            paste(tail(risk.mids[is.na(predictions)], 3), collapse = ', ')
          )
      }
      warning(
        'Some predictions (for risk bins at ', warning.examples, ') return ',
        'NA. This means calibration is being performed outside the range of ',
        'the data which may mean values are not comparable. Set extremes = ',
        'TRUE to assume worst-case predictions beyond the bounds of the ',
        'actual predictions.'
      )
    }

  }

  curve.area <-
    sum(
      abs(predictions - risk.mids) * risk.binwidths,
      na.rm = TRUE
    )

  # If the curve was requested...
  if(curve) {
    # ...return the area between the curves, plus the curve itself
    list(
      area = curve.area,
      curve = predictions
    )
  } else {
    # ...otherwise, just return the summary statistic
    return(curve.area)
  }
}
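
# Sketch (not run on source) of calibrationScore on synthetic, perfectly
# calibrated predictions: the event probability equals the stated risk, so the
# area between the LOESS curve and the diagonal should be close to zero.
if(FALSE) {
  n <- 5000
  risk <- runif(n)
  synthetic <- data.frame(
    risk  = risk,
    event = as.numeric(runif(n) < risk)
  )
  calibrationScore(synthetic) # small area = good calibration
}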

calibrationScoreWrapper <- function(
  model.fit, df, risk.time = 5, tod.round = 0.1, ...
) {
  # Simple wrapper function for working out the calibration score directly from
  # a model fit, a data frame and extra variables if needed.
  # Returns 1 - area so higher is better.
  1 -
    calibrationScore(
      calibrationTable(model.fit, df, risk.time, tod.round, ...)
    )
}

testSetIndices <- function(df, test.fraction = 1/3, random.seed = NA) {
  # Get indices for the test set in a data frame, with a random seed to make the
  # process deterministic if requested.

  n.data <- nrow(df)
  if(!is.na(random.seed)) set.seed(random.seed)

  sample.int(n.data, round(n.data * test.fraction))
}
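
# Sketch (not run on source) of a deterministic train/test split built from
# testSetIndices.
if(FALSE) {
  test.indices <- testSetIndices(mtcars, test.fraction = 1/3, random.seed = 35)
  mtcars.test  <- mtcars[test.indices, ]
  mtcars.train <- mtcars[-test.indices, ]
}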

summary2 <- function(x) {
  # Practical summary function for summarising medical records data columns
  # depending on number of unique values...
  if('data.frame' %in% class(x)) {
    lapply(x, summary2)
  } else {
    if(length(unique(x)) < 30) {
      if(length(unique(x)) < 10) {
        return(round(c(table(x))/length(x), 3)*100)
      } else {
        summ <- sort(table(x), decreasing = TRUE)
        return(
          round(
            c(
              summ[1:5],
              other = sum(summ[6:length(summ)]),
              missing = sum(is.na(x))
              # divide all by the length and turn into %
            )/length(x), 3)*100
        )
      }
    } else {
      return(
        c(
          min = min(x, na.rm = TRUE),
          max = max(x, na.rm = TRUE),
          median = median(x, na.rm = TRUE),
          missing = round(sum(is.na(x))/length(x), 3)*100
        )
      )
    }
  }
}

lookUpDescriptions <- function(
  x, bnf.lookup.filename = '../../data/product.txt'
  ) {
  # Take a vector of variable names, work out which dictionary each comes from
  # and its code in that dictionary, and return a human-readable description
  # looked up from the CALIBER tables

  data("CALIBER_DICT")

  # If there's a BNF lookup filename, load that
  if(!isExactlyNA(bnf.lookup.filename)) {
    bnf.lookup <- fread(bnf.lookup.filename)
  }

  # Make a vector to hold descriptions, fill it with x so it's a) the right
  # length and b) as a fallback
  description <- x
  thecode <- x # slightly silly name to avoid data table clash with code column

  # Look up ICD and OPCS codes
  relevant.rows <- startsWith(x, 'hes.icd.')
  thecode[relevant.rows] <- textAfter(x, 'hes.icd.')
  for(i in which(relevant.rows)) {
    # Some of these don't work, so add in an if statement to catch the error
    if(
      length(CALIBER_DICT[dict == 'icd10' & code == thecode[i], term]) > 0
    ){
      description[i] <-
        CALIBER_DICT[dict == 'icd10' & code == thecode[i], term]
    } else {
      description[i] <- 'ERROR: ICD not matched'
    }
  }

  relevant.rows <- startsWith(x, 'hes.opcs.')
  thecode[relevant.rows] <- textAfter(x, 'hes.opcs.')
  for(i in which(relevant.rows)) {
    if(
      length(CALIBER_DICT[dict == 'opcs' & code == thecode[i], term]) > 0
    ){
      description[i] <-
        CALIBER_DICT[dict == 'opcs' & code == thecode[i], term]
    } else {
      description[i] <- 'ERROR: OPCS not matched'
    }
  }

  relevant.rows <- startsWith(x, 'clinical.history.')
  thecode[relevant.rows] <- textAfter(x, 'clinical.history.')
  for(i in which(relevant.rows)) {
    # Some of these don't work, so add in an if statement to catch the error
    if(
      length(CALIBER_DICT[dict == 'read' & medcode == thecode[i], term]) > 0
    ){
      description[i] <-
        CALIBER_DICT[dict == 'read' & medcode == thecode[i], term]
    } else {
      description[i] <- 'ERROR: medcode not matched'
    }
  }

  relevant.rows <- startsWith(x, 'clinical.values.')
  thecode[relevant.rows] <- textAfter(x, 'clinical.values.')
  for(i in which(relevant.rows)) {
    testtype.datatype <- strsplit(thecode[i], '_', fixed = TRUE)[[1]]
    description[i] <-
      paste0(
        CALIBER_ENTITY[enttype == testtype.datatype[1], description],
        ', ',
        CALIBER_ENTITY[
          enttype == testtype.datatype[1], testtype.datatype[2], with = FALSE
        ]
      )
  }

  relevant.rows <- startsWith(x, 'bnf.')
  thecode[relevant.rows] <- textAfter(x, 'bnf.')
  for(i in which(relevant.rows)) {
    # Some of these don't work, so add in an if statement to catch the error
    if(
      length(CALIBER_BNFCODES[bnfcode == thecode[i], bnf]) > 0
    ){
      description[i] <-
        CALIBER_BNFCODES[bnfcode == thecode[i], bnf]

      # If a BNF product dictionary was supplied
      if(!isExactlyNA(bnf.lookup.filename)) {
        # If there's a matching BNF code, take the first element of the product
        # table (there will often be many because many drugs fit into one code/
        # BNF chapter)
        if(!is.na(bnf.lookup[bnfcode == description[i], bnfchapter][1])) {
          description[i] <- bnf.lookup[bnfcode == description[i], bnfchapter][1]
        }
        # Otherwise, leave it as the BNF code for future parsing
      }
    } else {
      description[i] <- 'ERROR: BNF code not matched'
    }
  }

  relevant.rows <- startsWith(x, 'tests.enttype.data3.')
  thecode[relevant.rows] <- textAfter(x, 'tests.enttype.data3.')
  for(i in which(relevant.rows)) {
    testtype.datatype <- strsplit(thecode[i], '_', fixed = TRUE)[[1]]
    description[i] <-
      CALIBER_ENTITY[enttype == testtype.datatype[1], description]
  }

  description
}

getVarNums <- function(x, frac = 0.2, min = 1) {
  # Return a shrinking schedule of variable counts, removing a fraction frac
  # per step, from x down to (roughly) min, in ascending order
  n <- -ceiling(log(x/min)/log(1 - frac))
  unique(round(x*((1 - frac)^(n:0))))
}
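
# Sketch (not run on source): the shrinking variable-count schedule getVarNums
# produces, eg for backwards elimination dropping ~20% of variables per step.
if(FALSE) {
  getVarNums(30) # roughly geometric sequence from 1 up to 30, ascending
}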

percentMissing <- function(x) {
  sum(is.na(x))/length(x) * 100
}