Diff of /lib/shared.R [000000] .. [0375db]

Switch to unified view

a b/lib/shared.R
1
#' # prep-data.R
2
#' We start by preparing the data for reproducible comparisons...
3
4
# Set the random seed for reproducibility
5
random.seed <- 35498L
6
7
# Specify the data file containing the patient cohort
8
data.filename <- '../../data/cohort-sanitised.csv'
9
10
# Specify file to write performance characteristics to
11
performance.file <- '../../output/models-performance.tsv'
12
13
# The fraction of the data to use as the test set (1 - this will be used as the
14
# training set)
15
test.fraction <- 1/3
16
17
# If surv.predict wasn't already specified, use the defaults...
18
if(!exists('surv.predict')) {
19
  # Column names of variables to use for predictions
20
  surv.predict <- c(
21
    'age', 'gender', 'diagnosis', 'pci_6mo', 'cabg_6mo',
22
    'hx_mi', 'long_nitrate', 'smokstatus', 'hypertension', 'diabetes',
23
    'total_chol_6mo', 'hdl_6mo', 'heart_failure', 'pad', 'hx_af', 'hx_stroke',
24
    'hx_renal', 'hx_copd', 'hx_cancer', 'hx_liver', 'hx_depression',
25
    'hx_anxiety', 'pulse_6mo', 'crea_6mo', 'total_wbc_6mo','haemoglobin_6mo',
26
    'most_deprived'
27
  )
28
}
29
30
cols.keep <- c(surv.predict, 'exclude', 'imd_score')
31
32
exclude.vars <- c('hx_mi')
33
surv.predict <- surv.predict[!(surv.predict %in% exclude.vars)]
34
35
# Check to see if endpoint exists to avoid error
36
if(!exists('endpoint')) {
37
  # Default is all-cause mortality
38
  endpoint <- 'death'
39
}
40
41
# If we're looking at MI...
42
if(endpoint == 'mi') {
43
  surv.time      <- 'time_coronary'
44
  surv.event     <- 'endpoint_coronary'
45
  surv.event.yes <- c('Nonfatal MI', 'Coronary death')
46
  
47
# If dealing with death in an imputed dataset...
48
} else if(endpoint == 'death.imputed') {
49
  surv.time    <- 'time_death'
50
  surv.event   <- 'endpoint_death' 
51
  surv.event.yes <- 1 # Coded as 1s and 0s for imputation
52
  
53
# Default is all-cause mortality...
54
} else {
55
  # Name of column giving time for use in survival object
56
  surv.time    <- 'time_death'
57
  # Name of event column for survival object
58
  surv.event   <- 'endpoint_death' # Cannot be 'surv_event' or will break later!
59
  # Value of surv.event column if an event is recorded
60
  surv.event.yes <- 'Death'
61
}
62
63
64
65
# Quantile boundaries for discretisation
66
discretise.quantiles <- c(0, 0.1, 0.25, 0.5, 0.75, 0.9, 1)
67
68
# Columns to discretise in specific ways, or not discretise at all. Those not
69
# listed here will be discretised by quantile with the default quantiles listed
70
# above.
71
discretise.settings <-
72
  list(
73
    var        = c('anonpatid', 'surv_time', 'imd_score', 'exclude'),
74
    method     = c(NA, NA, NA, NA),
75
    settings   = list(NA, NA, NA, NA)
76
  )
77
78
################################################################################
79
### END USER VARIABLES #########################################################
80
################################################################################
81
82
if(!is.na(random.seed)) {
83
  set.seed(random.seed)
84
}
85
86
source('../lib/handymedical.R', chdir = TRUE)
87
88
# Define a function of extra non-general prep to be done on this dataset
89
caliberExclude <- function(df) {
90
  df <-
91
    df[
92
      # remove negative times to death
93
      df$surv_time > 0 &
94
        # remove patients who should be excluded
95
        !df$exclude
96
      ,
97
      ]
98
  # Remove the exclude column, which we don't need any more
99
  df$exclude <- NULL
100
  
101
  df
102
}
103
104
caliberExtraPrep <- function(df) {
105
  df <- caliberExclude(df)
106
  
107
  # Create most_deprived, as defined in the paper: the bottom 20%
108
  df$most_deprived <- df$imd_score > quantile(df$imd_score, 0.8, na.rm = TRUE)
109
  df$most_deprived <- factorNAfix(factor(df$most_deprived), NAval = 'missing')
110
  # Remove the imd_score, to avoid confusion later
111
  df$imd_score <- NULL
112
  
113
  df
114
}