Switch to unified view

a b/overview/variable-importances.R
1
source('../lib/handymedical.R', chdir = TRUE)
2
requirePlus('cowplot')
3
4
5
# Load the variable importances from the Cox model
6
cox.miss <-
7
  readRDS('../../output/caliber-replicate-with-missing-survreg-6-linear-age-surv-boot.rds')
8
cox.miss.vars <- bootStats(cox.miss, uncertainty = '95ci')
9
cox.miss.vars$var <- rownames(cox.miss.vars)
10
cox.miss.vars <- subset(cox.miss.vars, startsWith(var, 'vimp.c.index'))
11
cox.miss.vars$var <-
12
  substring(cox.miss.vars$var, nchar('vimp.c.index.') + 1)
13
14
# Load the variable importances from the random forest
15
rf.boot <- read.csv('../../output/rfsrc-cv-nsplit-try3-boot-all.csv')
16
rf.boot.vars <- bootStatsDf(rf.boot)
17
rf.boot.vars$var <- rownames(rf.boot.vars)
18
rf.boot.vars <- subset(rf.boot.vars, startsWith(var, 'vimp.c.index'))
19
rf.boot.vars$var <-
20
  substring(rf.boot.vars$var, nchar('vimp.c.index.') + 1)
21
22
var.imp.compare <- merge(cox.miss.vars, rf.boot.vars, by = c('var'))
23
24
25
# Plot a scatterplot of them
26
rf.vs.cox <-
27
  ggplot(
28
    var.imp.compare,
29
    aes(
30
      x = val.x, xmin = lower.x, xmax = upper.x,
31
      y = val.y, ymin = lower.y, ymax = upper.y
32
    )
33
  ) +
34
    geom_point() +
35
    geom_errorbar() +
36
    geom_errorbarh() +
37
    coord_cartesian(xlim = c(0, 0.03), ylim = c(0, 0.03))
38
39
print('Spearman correlation coefficient of variable importances:')
40
print(cor(var.imp.compare$val.x, var.imp.compare$val.y, method = 'spearman'))
41
42
# Load the variable importances from the big data model
43
cox.bigdata <- read.csv('../../output/cox-bigdata-varsellogrank-01-boot-all.csv')
44
cox.bigdata.vars <- bootStatsDf(cox.bigdata)
45
cox.bigdata.vars$var <- rownames(cox.bigdata.vars)
46
cox.bigdata.vars <- subset(cox.bigdata.vars, startsWith(var, 'vimp.c.index'))
47
cox.bigdata.vars$var <-
48
  substring(cox.bigdata.vars$var, nchar('vimp.c.index.') + 1)
49
50
cox.bigdata.vars <-
51
  cox.bigdata.vars[order(cox.bigdata.vars$val, decreasing = TRUE)[1:20], ]
52
53
cox.bigdata.vars <-
54
  cox.bigdata.vars[order(cox.bigdata.vars$val, decreasing = FALSE), ]
55
56
cox.bigdata.vars$description <- lookUpDescriptions(cox.bigdata.vars$var)
57
58
cat('c(', paste0("'", as.character(cox.bigdata.vars$description), "',"), ')', sep = '\n')
59
60
cox.bigdata.vars$description.manual <-
61
  factorOrderedLevels(
62
      c(
63
        'ALT',
64
        'PVD',
65
        'Hb',
66
        'Dementia',
67
        'Albumin',
68
        'Cardiac glycosides',
69
        'LV failure',
70
        'Home visit',
71
        'Oestrogens/HRT',
72
        'Chest pain',
73
        'Na',
74
        'WCC',
75
        'ALP',
76
        'Lymphocyte count',
77
        'Diabetes',
78
        'BMI ',
79
        'Weight',
80
        'Loop diuretics',
81
        'Smoking status',
82
        'Age'
83
      )
84
  )
85
86
# Plot a bar graph of them
87
cox.bigdata.plot <-
88
  ggplot(
89
    cox.bigdata.vars,
90
    aes(x = description.manual, y = val, ymin = lower, ymax = upper)
91
    ) +
92
    geom_bar(stat = 'identity') + 
93
    geom_errorbar(width = 0.25) +
94
    coord_flip() +
95
  theme(axis.title.y = element_blank(), axis.text.y = element_text(size = 10)) +
96
  ylim(0, 0.17)
97
98
# Random forest big data
99
rf.bigdata <- read.csv('../../output/rf-bigdata-varsellogrank-02-boot-all.csv')
100
rf.bigdata.vars <- bootStatsDf(rf.bigdata)
101
rf.bigdata.vars$var <- rownames(rf.bigdata.vars)
102
rf.bigdata.vars <- subset(rf.bigdata.vars, startsWith(var, 'vimp.c.index'))
103
rf.bigdata.vars$var <-
104
  substring(rf.bigdata.vars$var, nchar('vimp.c.index.') + 1)
105
106
rf.bigdata.vars <-
107
  rf.bigdata.vars[order(rf.bigdata.vars$val, decreasing = TRUE)[1:20], ]
108
109
rf.bigdata.vars <-
110
  rf.bigdata.vars[order(rf.bigdata.vars$val, decreasing = FALSE), ]
111
112
cat('c(', paste0("'", as.character(rf.bigdata.vars$description), "',"), ')', sep = '\n')
113
114
rf.bigdata.vars$description <- lookUpDescriptions(rf.bigdata.vars$var)
115
116
rf.bigdata.vars$description.manual <-
117
  factorOrderedLevels(
118
    c(
119
      'Fit note',
120
      'Stimulant laxatives',
121
      'Urea',
122
      'Hypertension',
123
      'Cardiac glycosides',
124
      'Beta2 agonists',
125
      'Telephone encounter',
126
      'Feet examination',
127
      'Creatinine',
128
      'Screening',
129
      'Osmotic laxatives',
130
      'ACE inhibitors',
131
      'Beta blockers',
132
      'Home visit',
133
      'Analgesics',
134
      'Blood pressure',
135
      'Chest pain',
136
      'Loop diuretics',
137
      'Smoking status',
138
      'Age'
139
    )
140
  )
141
142
# Plot a bar graph of them
143
rf.bigdata.plot <-
144
  ggplot(
145
    rf.bigdata.vars,
146
    aes(x = description.manual, y = val, ymin = lower, ymax = upper)
147
  ) +
148
    geom_bar(stat = 'identity') + 
149
    geom_errorbar(width = 0.25) +
150
    coord_flip() +
151
  theme(axis.title.y = element_blank(), axis.text.y = element_text(size = 10)) +
152
  ylim(0, 0.17)
153
154
155
156
# Elastic net Cox model
157
elastic.bigdata <- read.csv('../../output/cox-discrete-elasticnet-01-boot-all.csv')
158
elastic.bigdata.vars <- bootStatsDf(elastic.bigdata)
159
elastic.bigdata.vars$var <- rownames(elastic.bigdata.vars)
160
elastic.bigdata.vars <- subset(elastic.bigdata.vars, startsWith(var, 'vimp.c.index'))
161
elastic.bigdata.vars$var <-
162
  substring(elastic.bigdata.vars$var, nchar('vimp.c.index.') + 1)
163
164
elastic.bigdata.vars <-
165
  elastic.bigdata.vars[order(elastic.bigdata.vars$val, decreasing = TRUE)[1:20], ]
166
167
elastic.bigdata.vars <-
168
  elastic.bigdata.vars[order(elastic.bigdata.vars$val, decreasing = FALSE), ]
169
170
cat('c(', paste0("'", as.character(elastic.bigdata.vars$description), "',"), ')', sep = '\n')
171
172
elastic.bigdata.vars$description <- factorOrderedLevels(lookUpDescriptions(elastic.bigdata.vars$var))
173
174
elastic.bigdata.vars$description.manual <-
175
  factorOrderedLevels(
176
    c(
177
      'Biguanides',
178
      'CKD',
179
      'Osmotic laxatives',
180
      'MCV',
181
      'IMD score',
182
      'Dementia',
183
      'Home visit',
184
      'Sulphonylureas',
185
      'Insulin',
186
      'LV failure',
187
      'Cardiac glycosides',
188
      'Telephone encounter',
189
      'Records held date',
190
      'Chest pain',
191
      'Diabetes',
192
      'Smoking status',
193
      'Loop diuretics',
194
      'Gender',
195
      'Type 2 diabetes',
196
      'Age'
197
      )
198
  )
199
200
# Plot a bar graph of them
201
elastic.bigdata.plot <-
202
  ggplot(
203
    elastic.bigdata.vars,
204
    aes(x = description.manual, y = val, ymin = lower, ymax = upper)
205
  ) +
206
  geom_bar(stat = 'identity') + 
207
  geom_errorbar(width = 0.25) +
208
  coord_flip() +
209
  theme(axis.title.y = element_blank(), axis.text.y = element_text(size = 10)) +
210
  ylim(0, 0.17)
211
212
213
# Combine for output
214
plot_grid(
215
  rf.bigdata.plot, cox.bigdata.plot, elastic.bigdata.plot,
216
  labels = c('A', 'B', 'C'),
217
  ncol = 3
218
)
219
220
ggsave(
221
  '../../output/variable-importances.pdf',
222
  width = 24,
223
  height = 8,
224
  units = 'cm',
225
  useDingbats = FALSE
226
)
227
228
229
# Print a helpful list of overlapping predictors
230
print('RF vs Cox')
231
print(
232
  paste(
233
    rf.bigdata.vars$description.manual[
234
      rf.bigdata.vars$description.manual %in%
235
        cox.bigdata.vars$description.manual
236
      ], collapse = ', ')
237
)
238
print('Cox vs elastic net')
239
print(
240
  paste(
241
    elastic.bigdata.vars$description.manual[
242
      elastic.bigdata.vars$description.manual %in%
243
        cox.bigdata.vars$description.manual
244
      ], collapse = ', ')
245
)
246
print('RF vs elastic net not Cox')
247
print(
248
  paste(
249
    elastic.bigdata.vars$description.manual[
250
      elastic.bigdata.vars$description.manual %in%
251
        rf.bigdata.vars$description.manual & !(
252
          elastic.bigdata.vars$description.manual %in%
253
          cox.bigdata.vars$description.manual)
254
      ], collapse = ', ')
255
)
256
print('Cox vs elastic net not RF')
257
print(
258
  paste(
259
    elastic.bigdata.vars$description.manual[
260
      elastic.bigdata.vars$description.manual %in%
261
        cox.bigdata.vars$description.manual & !(
262
          elastic.bigdata.vars$description.manual %in%
263
            rf.bigdata.vars$description.manual)
264
      ], collapse = ', ')
265
)
266
# Should be none or the graph will be challenging to draw!
267
268
# Aborted idea to draw a rank-change chart which is too messy to be useful...
269
elastic.bigdata.vars$model <- 'enet'
270
rf.bigdata.vars$model <- 'rf'
271
cox.bigdata.vars$model <- 'cox'
272
273
all.models <- rbind(cox.bigdata.vars, rf.bigdata.vars, elastic.bigdata.vars)
274
275
all.models$model <- factor(all.models$model, levels = c('rf', 'cox', 'enet'))
276
277
ggplot(
278
  all.models,
279
  aes(
280
    x = model, y = log(val), ymin = log(lower), ymax = log(upper),
281
    label = description.manual, group = description.manual
282
  )) +
283
  geom_text(position = position_dodge(width = 0.7)) +
284
  geom_errorbar(width = 0.1, position = position_dodge(width = 0.7)) +
285
  geom_point(position = position_dodge(width = 0.7))
286