|
a |
|
b/lib/shared.R |
|
|
1 |
#' # prep-data.R |
|
|
2 |
#' We start by preparing the data for reproducible comparisons... |
|
|
3 |
|
|
|
4 |
# Set the random seed for reproducibility |
|
|
5 |
random.seed <- 35498L |
|
|
6 |
|
|
|
7 |
# Specify the data file containing the patient cohort |
|
|
8 |
data.filename <- '../../data/cohort-sanitised.csv' |
|
|
9 |
|
|
|
10 |
# Specify file to write performance characteristics to |
|
|
11 |
performance.file <- '../../output/models-performance.tsv' |
|
|
12 |
|
|
|
13 |
# The fraction of the data to use as the test set (1 - this will be used as the |
|
|
14 |
# training set) |
|
|
15 |
test.fraction <- 1/3 |
|
|
16 |
|
|
|
17 |
# If surv.predict wasn't already specified, use the defaults... |
|
|
18 |
if(!exists('surv.predict')) { |
|
|
19 |
# Column names of variables to use for predictions |
|
|
20 |
surv.predict <- c( |
|
|
21 |
'age', 'gender', 'diagnosis', 'pci_6mo', 'cabg_6mo', |
|
|
22 |
'hx_mi', 'long_nitrate', 'smokstatus', 'hypertension', 'diabetes', |
|
|
23 |
'total_chol_6mo', 'hdl_6mo', 'heart_failure', 'pad', 'hx_af', 'hx_stroke', |
|
|
24 |
'hx_renal', 'hx_copd', 'hx_cancer', 'hx_liver', 'hx_depression', |
|
|
25 |
'hx_anxiety', 'pulse_6mo', 'crea_6mo', 'total_wbc_6mo','haemoglobin_6mo', |
|
|
26 |
'most_deprived' |
|
|
27 |
) |
|
|
28 |
} |
|
|
29 |
|
|
|
30 |
cols.keep <- c(surv.predict, 'exclude', 'imd_score') |
|
|
31 |
|
|
|
32 |
exclude.vars <- c('hx_mi') |
|
|
33 |
surv.predict <- surv.predict[!(surv.predict %in% exclude.vars)] |
|
|
34 |
|
|
|
35 |
# Check to see if endpoint exists to avoid error |
|
|
36 |
if(!exists('endpoint')) { |
|
|
37 |
# Default is all-cause mortality |
|
|
38 |
endpoint <- 'death' |
|
|
39 |
} |
|
|
40 |
|
|
|
41 |
# If we're looking at MI... |
|
|
42 |
if(endpoint == 'mi') { |
|
|
43 |
surv.time <- 'time_coronary' |
|
|
44 |
surv.event <- 'endpoint_coronary' |
|
|
45 |
surv.event.yes <- c('Nonfatal MI', 'Coronary death') |
|
|
46 |
|
|
|
47 |
# If dealing with death in an imputed dataset... |
|
|
48 |
} else if(endpoint == 'death.imputed') { |
|
|
49 |
surv.time <- 'time_death' |
|
|
50 |
surv.event <- 'endpoint_death' |
|
|
51 |
surv.event.yes <- 1 # Coded as 1s and 0s for imputation |
|
|
52 |
|
|
|
53 |
# Default is all-cause mortality... |
|
|
54 |
} else { |
|
|
55 |
# Name of column giving time for use in survival object |
|
|
56 |
surv.time <- 'time_death' |
|
|
57 |
# Name of event column for survival object |
|
|
58 |
surv.event <- 'endpoint_death' # Cannot be 'surv_event' or will break later! |
|
|
59 |
# Value of surv.event column if an event is recorded |
|
|
60 |
surv.event.yes <- 'Death' |
|
|
61 |
} |
|
|
62 |
|
|
|
63 |
|
|
|
64 |
|
|
|
65 |
# Quantile boundaries for discretisation |
|
|
66 |
discretise.quantiles <- c(0, 0.1, 0.25, 0.5, 0.75, 0.9, 1) |
|
|
67 |
|
|
|
68 |
# Columns to discretise in specific ways, or not discretise at all. Those not |
|
|
69 |
# listed here will be discretised by quantile with the default quantiles listed |
|
|
70 |
# above. |
|
|
71 |
discretise.settings <- |
|
|
72 |
list( |
|
|
73 |
var = c('anonpatid', 'surv_time', 'imd_score', 'exclude'), |
|
|
74 |
method = c(NA, NA, NA, NA), |
|
|
75 |
settings = list(NA, NA, NA, NA) |
|
|
76 |
) |
|
|
77 |
|
|
|
78 |
################################################################################ |
|
|
79 |
### END USER VARIABLES ######################################################### |
|
|
80 |
################################################################################ |
|
|
81 |
|
|
|
82 |
if(!is.na(random.seed)) { |
|
|
83 |
set.seed(random.seed) |
|
|
84 |
} |
|
|
85 |
|
|
|
86 |
source('../lib/handymedical.R', chdir = TRUE) |
|
|
87 |
|
|
|
88 |
# Define a function of extra non-general prep to be done on this dataset |
|
|
89 |
caliberExclude <- function(df) { |
|
|
90 |
df <- |
|
|
91 |
df[ |
|
|
92 |
# remove negative times to death |
|
|
93 |
df$surv_time > 0 & |
|
|
94 |
# remove patients who should be excluded |
|
|
95 |
!df$exclude |
|
|
96 |
, |
|
|
97 |
] |
|
|
98 |
# Remove the exclude column, which we don't need any more |
|
|
99 |
df$exclude <- NULL |
|
|
100 |
|
|
|
101 |
df |
|
|
102 |
} |
|
|
103 |
|
|
|
104 |
caliberExtraPrep <- function(df) { |
|
|
105 |
df <- caliberExclude(df) |
|
|
106 |
|
|
|
107 |
# Create most_deprived, as defined in the paper: the bottom 20% |
|
|
108 |
df$most_deprived <- df$imd_score > quantile(df$imd_score, 0.8, na.rm = TRUE) |
|
|
109 |
df$most_deprived <- factorNAfix(factor(df$most_deprived), NAval = 'missing') |
|
|
110 |
# Remove the imd_score, to avoid confusion later |
|
|
111 |
df$imd_score <- NULL |
|
|
112 |
|
|
|
113 |
df |
|
|
114 |
} |