#' # prep-data.R
#' We start by preparing the data for reproducible comparisons...
# Set the random seed for reproducibility
random.seed <- 35498L
# Specify the data file containing the patient cohort
data.filename <- '../../data/cohort-sanitised.csv'
# Specify file to write performance characteristics to
performance.file <- '../../output/models-performance.tsv'
# The fraction of the data to use as the test set (1 - this will be used as the
# training set)
test.fraction <- 1/3
# If surv.predict wasn't already specified, use the defaults...
if(!exists('surv.predict')) {
# Column names of variables to use for predictions
surv.predict <- c(
'age', 'gender', 'diagnosis', 'pci_6mo', 'cabg_6mo',
'hx_mi', 'long_nitrate', 'smokstatus', 'hypertension', 'diabetes',
'total_chol_6mo', 'hdl_6mo', 'heart_failure', 'pad', 'hx_af', 'hx_stroke',
'hx_renal', 'hx_copd', 'hx_cancer', 'hx_liver', 'hx_depression',
'hx_anxiety', 'pulse_6mo', 'crea_6mo', 'total_wbc_6mo','haemoglobin_6mo',
'most_deprived'
)
}
cols.keep <- c(surv.predict, 'exclude', 'imd_score')
exclude.vars <- c('hx_mi')
surv.predict <- surv.predict[!(surv.predict %in% exclude.vars)]
# Check to see if endpoint exists to avoid error
if(!exists('endpoint')) {
# Default is all-cause mortality
endpoint <- 'death'
}
# If we're looking at MI...
if(endpoint == 'mi') {
surv.time <- 'time_coronary'
surv.event <- 'endpoint_coronary'
surv.event.yes <- c('Nonfatal MI', 'Coronary death')
# If dealing with death in an imputed dataset...
} else if(endpoint == 'death.imputed') {
surv.time <- 'time_death'
surv.event <- 'endpoint_death'
surv.event.yes <- 1 # Coded as 1s and 0s for imputation
# Default is all-cause mortality...
} else {
# Name of column giving time for use in survival object
surv.time <- 'time_death'
# Name of event column for survival object
surv.event <- 'endpoint_death' # Cannot be 'surv_event' or will break later!
# Value of surv.event column if an event is recorded
surv.event.yes <- 'Death'
}
# Quantile boundaries for discretisation
discretise.quantiles <- c(0, 0.1, 0.25, 0.5, 0.75, 0.9, 1)
# Columns to discretise in specific ways, or not discretise at all. Those not
# listed here will be discretised by quantile with the default quantiles listed
# above.
discretise.settings <-
list(
var = c('anonpatid', 'surv_time', 'imd_score', 'exclude'),
method = c(NA, NA, NA, NA),
settings = list(NA, NA, NA, NA)
)
################################################################################
### END USER VARIABLES #########################################################
################################################################################
if(!is.na(random.seed)) {
set.seed(random.seed)
}
source('../lib/handymedical.R', chdir = TRUE)
# Define a function of extra non-general prep to be done on this dataset
caliberExclude <- function(df) {
df <-
df[
# remove negative times to death
df$surv_time > 0 &
# remove patients who should be excluded
!df$exclude
,
]
# Remove the exclude column, which we don't need any more
df$exclude <- NULL
df
}
caliberExtraPrep <- function(df) {
df <- caliberExclude(df)
# Create most_deprived, as defined in the paper: the bottom 20%
df$most_deprived <- df$imd_score > quantile(df$imd_score, 0.8, na.rm = TRUE)
df$most_deprived <- factorNAfix(factor(df$most_deprived), NAval = 'missing')
# Remove the imd_score, to avoid confusion later
df$imd_score <- NULL
df
}