MLehealth / Git / [0375db] /lib/shared.R

Models:
philipB/
MLehealth
Downloads: 1
[0375db]: / lib / shared.R
History
Download this file
114 lines (91 with data), 3.6 kB

#' # prep-data.R
#' We start by preparing the data for reproducible comparisons...

# Set the random seed for reproducibility
random.seed <- 35498L

# Specify the data file containing the patient cohort
data.filename <- '../../data/cohort-sanitised.csv'

# Specify file to write performance characteristics to
performance.file <- '../../output/models-performance.tsv'

# The fraction of the data to use as the test set (1 - this will be used as the
# training set)
test.fraction <- 1/3

# If surv.predict wasn't already specified, use the defaults...
if(!exists('surv.predict')) {
  # Column names of variables to use for predictions
  surv.predict <- c(
    'age', 'gender', 'diagnosis', 'pci_6mo', 'cabg_6mo',
    'hx_mi', 'long_nitrate', 'smokstatus', 'hypertension', 'diabetes',
    'total_chol_6mo', 'hdl_6mo', 'heart_failure', 'pad', 'hx_af', 'hx_stroke',
    'hx_renal', 'hx_copd', 'hx_cancer', 'hx_liver', 'hx_depression',
    'hx_anxiety', 'pulse_6mo', 'crea_6mo', 'total_wbc_6mo','haemoglobin_6mo',
    'most_deprived'
  )
}

cols.keep <- c(surv.predict, 'exclude', 'imd_score')

exclude.vars <- c('hx_mi')
surv.predict <- surv.predict[!(surv.predict %in% exclude.vars)]

# Check to see if endpoint exists to avoid error
if(!exists('endpoint')) {
  # Default is all-cause mortality
  endpoint <- 'death'
}

# If we're looking at MI...
if(endpoint == 'mi') {
  surv.time      <- 'time_coronary'
  surv.event     <- 'endpoint_coronary'
  surv.event.yes <- c('Nonfatal MI', 'Coronary death')
  
# If dealing with death in an imputed dataset...
} else if(endpoint == 'death.imputed') {
  surv.time    <- 'time_death'
  surv.event   <- 'endpoint_death' 
  surv.event.yes <- 1 # Coded as 1s and 0s for imputation
  
# Default is all-cause mortality...
} else {
  # Name of column giving time for use in survival object
  surv.time    <- 'time_death'
  # Name of event column for survival object
  surv.event   <- 'endpoint_death' # Cannot be 'surv_event' or will break later!
  # Value of surv.event column if an event is recorded
  surv.event.yes <- 'Death'
}



# Quantile boundaries for discretisation
discretise.quantiles <- c(0, 0.1, 0.25, 0.5, 0.75, 0.9, 1)

# Columns to discretise in specific ways, or not discretise at all. Those not
# listed here will be discretised by quantile with the default quantiles listed
# above.
discretise.settings <-
  list(
    var        = c('anonpatid', 'surv_time', 'imd_score', 'exclude'),
    method     = c(NA, NA, NA, NA),
    settings   = list(NA, NA, NA, NA)
  )

################################################################################
### END USER VARIABLES #########################################################
################################################################################

if(!is.na(random.seed)) {
  set.seed(random.seed)
}

source('../lib/handymedical.R', chdir = TRUE)

# Define a function of extra non-general prep to be done on this dataset
caliberExclude <- function(df) {
  df <-
    df[
      # remove negative times to death
      df$surv_time > 0 &
        # remove patients who should be excluded
        !df$exclude
      ,
      ]
  # Remove the exclude column, which we don't need any more
  df$exclude <- NULL
  
  df
}

caliberExtraPrep <- function(df) {
  df <- caliberExclude(df)
  
  # Create most_deprived, as defined in the paper: the bottom 20%
  df$most_deprived <- df$imd_score > quantile(df$imd_score, 0.8, na.rm = TRUE)
  df$most_deprived <- factorNAfix(factor(df$most_deprived), NAval = 'missing')
  # Remove the imd_score, to avoid confusion later
  df$imd_score <- NULL
  
  df
}