|
a |
|
b/random-forest/rf-classification.R |
|
|
1 |
#+ knitr_setup, include = FALSE |
|
|
2 |
|
|
|
3 |
# Whether to cache the intensive code sections. Set to FALSE to recalculate |
|
|
4 |
# everything afresh. |
|
|
5 |
cacheoption <- FALSE |
|
|
6 |
# Disable lazy caching globally, because it fails for large objects, and all the |
|
|
7 |
# objects we wish to cache are large... |
|
|
8 |
opts_chunk$set(cache.lazy = FALSE) |
|
|
9 |
|
|
|
10 |
#' # Cross-validating discretisation of input variables in a survival model |
|
|
11 |
#' |
|
|
12 |
#' In difference to previous attempts at cross-validation, this uses between 10 |
|
|
13 |
#' and 20 bins, not between 2 and 20, in an attempt to avoid throwing away data. |
|
|
14 |
|
|
|
15 |
# The first part of the filename for any output |
|
|
16 |
output.filename.base <- '../../output/rfsrc-classification-try1' |
|
|
17 |
|
|
|
18 |
risk.time <- 5 |
|
|
19 |
|
|
|
20 |
n.data <- NA |
|
|
21 |
split.rule <- 'logrank' |
|
|
22 |
n.trees <- 2000 |
|
|
23 |
n.threads <- 19 |
|
|
24 |
|
|
|
25 |
continuous.vars <- |
|
|
26 |
c( |
|
|
27 |
'age', 'total_chol_6mo', 'hdl_6mo', 'pulse_6mo', 'crea_6mo', |
|
|
28 |
'total_wbc_6mo', 'haemoglobin_6mo' |
|
|
29 |
) |
|
|
30 |
|
|
|
31 |
untransformed.vars <- c('anonpatid', 'surv_time', 'imd_score', 'exclude') |
|
|
32 |
|
|
|
33 |
# If surv.vars is defined as a character vector here, the model only uses those |
|
|
34 |
# variables specified, eg c('age') would build a model purely based on age. If |
|
|
35 |
# not specified (ie commented out), it will use the defaults. |
|
|
36 |
# surv.predict <- c('age') |
|
|
37 |
|
|
|
38 |
#' ## Fit the model |
|
|
39 |
#' |
|
|
40 |
#' Now, let's fit the model, but without cross-validating the number of factor |
|
|
41 |
#' levels! The issue is that, if we're allowing factor levels to be grouped into |
|
|
42 |
#' two branches arbitrarily, there are 2^n - 2 combinations, which rapidly |
|
|
43 |
#' becomes a huge number. Thus, cross-validating, especially with large numbers |
|
|
44 |
#' of factor levels, is very impractical. |
|
|
45 |
#' |
|
|
46 |
#' We'll also leave age as a pure number: we know that it's both a very |
|
|
47 |
#' significant variable, and also that it makes sense to treat it as though it's |
|
|
48 |
#' ordered, because risk should increase monotonically with it. |
|
|
49 |
#' |
|
|
50 |
#+ rf_discretised, cache=cacheoption |
|
|
51 |
|
|
|
52 |
source('../lib/shared.R') |
|
|
53 |
|
|
|
54 |
# Load the data and convert to data frame to make column-selecting code in |
|
|
55 |
# prepData simpler |
|
|
56 |
COHORT.full <- data.frame(fread(data.filename)) |
|
|
57 |
|
|
|
58 |
# If n.data was specified... |
|
|
59 |
if(!is.na(n.data)){ |
|
|
60 |
# Take a subset n.data in size |
|
|
61 |
COHORT.use <- sample.df(COHORT.full, n.data) |
|
|
62 |
rm(COHORT.full) |
|
|
63 |
} else { |
|
|
64 |
# Use all the data |
|
|
65 |
COHORT.use <- COHORT.full |
|
|
66 |
rm(COHORT.full) |
|
|
67 |
} |
|
|
68 |
|
|
|
69 |
# Process settings: don't touch anything!! |
|
|
70 |
process.settings <- |
|
|
71 |
list( |
|
|
72 |
var = c(untransformed.vars, continuous.vars), |
|
|
73 |
method = rep(NA, length(untransformed.vars) + length(continuous.vars)), |
|
|
74 |
settings = rep(NA, length(untransformed.vars) + length(continuous.vars)) |
|
|
75 |
) |
|
|
76 |
|
|
|
77 |
COHORT.prep <- |
|
|
78 |
prepData( |
|
|
79 |
COHORT.use, |
|
|
80 |
cols.keep, |
|
|
81 |
process.settings, |
|
|
82 |
surv.time, surv.event, |
|
|
83 |
surv.event.yes, |
|
|
84 |
extra.fun = caliberExtraPrep |
|
|
85 |
) |
|
|
86 |
n.data <- nrow(COHORT.prep) |
|
|
87 |
|
|
|
88 |
# Define indices of test set |
|
|
89 |
test.set <- sample(1:n.data, (1/3)*n.data) |
|
|
90 |
|
|
|
91 |
# Create column for whether or not the patient had an event before risk.time |
|
|
92 |
COHORT.prep$event <- NA |
|
|
93 |
# Event before risk.time |
|
|
94 |
COHORT.prep$event[ |
|
|
95 |
COHORT.prep$surv_event & COHORT.prep$surv_time <= risk.time |
|
|
96 |
] <- TRUE |
|
|
97 |
# Event after, whether censorship or not, means no event by risk.time |
|
|
98 |
COHORT.prep$event[COHORT.prep$surv_time > risk.time] <- FALSE |
|
|
99 |
# Otherwise, censored before risk.time, let's remove the row |
|
|
100 |
COHORT.prep <- COHORT.prep[!is.na(COHORT.prep$event), ] |
|
|
101 |
|
|
|
102 |
surv.model.fit <- |
|
|
103 |
rfsrc( |
|
|
104 |
as.formula( |
|
|
105 |
paste('event ~', paste(surv.predict, collapse = '+')) |
|
|
106 |
), |
|
|
107 |
COHORT.prep[-test.set,], # Training set |
|
|
108 |
ntree = n.trees, |
|
|
109 |
splitrule = 'gini', |
|
|
110 |
n.threads = n.threads, |
|
|
111 |
na.action = 'na.impute', |
|
|
112 |
nimpute = 3 |
|
|
113 |
) |