Diff of /lib/shared.R [000000] .. [0375db]

Switch to side-by-side view

--- a
+++ b/lib/shared.R
@@ -0,0 +1,114 @@
+#' # prep-data.R
+#' We start by preparing the data for reproducible comparisons...
+
+# Set the random seed for reproducibility
+random.seed <- 35498L
+
+# Specify the data file containing the patient cohort
+data.filename <- '../../data/cohort-sanitised.csv'
+
+# Specify file to write performance characteristics to
+performance.file <- '../../output/models-performance.tsv'
+
+# The fraction of the data to use as the test set (1 - this will be used as the
+# training set)
+test.fraction <- 1/3
+
+# If surv.predict wasn't already specified, use the defaults...
+if(!exists('surv.predict')) {
+  # Column names of variables to use for predictions
+  surv.predict <- c(
+    'age', 'gender', 'diagnosis', 'pci_6mo', 'cabg_6mo',
+    'hx_mi', 'long_nitrate', 'smokstatus', 'hypertension', 'diabetes',
+    'total_chol_6mo', 'hdl_6mo', 'heart_failure', 'pad', 'hx_af', 'hx_stroke',
+    'hx_renal', 'hx_copd', 'hx_cancer', 'hx_liver', 'hx_depression',
+    'hx_anxiety', 'pulse_6mo', 'crea_6mo', 'total_wbc_6mo','haemoglobin_6mo',
+    'most_deprived'
+  )
+}
+
+cols.keep <- c(surv.predict, 'exclude', 'imd_score')
+
+exclude.vars <- c('hx_mi')
+surv.predict <- surv.predict[!(surv.predict %in% exclude.vars)]
+
+# Check to see if endpoint exists to avoid error
+if(!exists('endpoint')) {
+  # Default is all-cause mortality
+  endpoint <- 'death'
+}
+
+# If we're looking at MI...
+if(endpoint == 'mi') {
+  surv.time      <- 'time_coronary'
+  surv.event     <- 'endpoint_coronary'
+  surv.event.yes <- c('Nonfatal MI', 'Coronary death')
+  
+# If dealing with death in an imputed dataset...
+} else if(endpoint == 'death.imputed') {
+  surv.time    <- 'time_death'
+  surv.event   <- 'endpoint_death' 
+  surv.event.yes <- 1 # Coded as 1s and 0s for imputation
+  
+# Default is all-cause mortality...
+} else {
+  # Name of column giving time for use in survival object
+  surv.time    <- 'time_death'
+  # Name of event column for survival object
+  surv.event   <- 'endpoint_death' # Cannot be 'surv_event' or will break later!
+  # Value of surv.event column if an event is recorded
+  surv.event.yes <- 'Death'
+}
+
+
+
+# Quantile boundaries for discretisation
+discretise.quantiles <- c(0, 0.1, 0.25, 0.5, 0.75, 0.9, 1)
+
+# Columns to discretise in specific ways, or not discretise at all. Those not
+# listed here will be discretised by quantile with the default quantiles listed
+# above.
+discretise.settings <-
+  list(
+    var        = c('anonpatid', 'surv_time', 'imd_score', 'exclude'),
+    method     = c(NA, NA, NA, NA),
+    settings   = list(NA, NA, NA, NA)
+  )
+
+################################################################################
+### END USER VARIABLES #########################################################
+################################################################################
+
+if(!is.na(random.seed)) {
+  set.seed(random.seed)
+}
+
+source('../lib/handymedical.R', chdir = TRUE)
+
+# Define a function of extra non-general prep to be done on this dataset
+caliberExclude <- function(df) {
+  df <-
+    df[
+      # remove negative times to death
+      df$surv_time > 0 &
+        # remove patients who should be excluded
+        !df$exclude
+      ,
+      ]
+  # Remove the exclude column, which we don't need any more
+  df$exclude <- NULL
+  
+  df
+}
+
+caliberExtraPrep <- function(df) {
+  df <- caliberExclude(df)
+  
+  # Create most_deprived, as defined in the paper: the bottom 20%
+  df$most_deprived <- df$imd_score > quantile(df$imd_score, 0.8, na.rm = TRUE)
+  df$most_deprived <- factorNAfix(factor(df$most_deprived), NAval = 'missing')
+  # Remove the imd_score, to avoid confusion later
+  df$imd_score <- NULL
+  
+  df
+}
\ No newline at end of file