|
a |
|
b/overview/variable-importances.R |
|
|
1 |
source('../lib/handymedical.R', chdir = TRUE) |
|
|
2 |
requirePlus('cowplot') |
|
|
3 |
|
|
|
4 |
|
|
|
5 |
# Load the variable importances from the Cox model |
|
|
6 |
cox.miss <- |
|
|
7 |
readRDS('../../output/caliber-replicate-with-missing-survreg-6-linear-age-surv-boot.rds') |
|
|
8 |
cox.miss.vars <- bootStats(cox.miss, uncertainty = '95ci') |
|
|
9 |
cox.miss.vars$var <- rownames(cox.miss.vars) |
|
|
10 |
cox.miss.vars <- subset(cox.miss.vars, startsWith(var, 'vimp.c.index')) |
|
|
11 |
cox.miss.vars$var <- |
|
|
12 |
substring(cox.miss.vars$var, nchar('vimp.c.index.') + 1) |
|
|
13 |
|
|
|
14 |
# Load the variable importances from the random forest |
|
|
15 |
rf.boot <- read.csv('../../output/rfsrc-cv-nsplit-try3-boot-all.csv') |
|
|
16 |
rf.boot.vars <- bootStatsDf(rf.boot) |
|
|
17 |
rf.boot.vars$var <- rownames(rf.boot.vars) |
|
|
18 |
rf.boot.vars <- subset(rf.boot.vars, startsWith(var, 'vimp.c.index')) |
|
|
19 |
rf.boot.vars$var <- |
|
|
20 |
substring(rf.boot.vars$var, nchar('vimp.c.index.') + 1) |
|
|
21 |
|
|
|
22 |
var.imp.compare <- merge(cox.miss.vars, rf.boot.vars, by = c('var')) |
|
|
23 |
|
|
|
24 |
|
|
|
25 |
# Plot a scatterplot of them |
|
|
26 |
rf.vs.cox <- |
|
|
27 |
ggplot( |
|
|
28 |
var.imp.compare, |
|
|
29 |
aes( |
|
|
30 |
x = val.x, xmin = lower.x, xmax = upper.x, |
|
|
31 |
y = val.y, ymin = lower.y, ymax = upper.y |
|
|
32 |
) |
|
|
33 |
) + |
|
|
34 |
geom_point() + |
|
|
35 |
geom_errorbar() + |
|
|
36 |
geom_errorbarh() + |
|
|
37 |
coord_cartesian(xlim = c(0, 0.03), ylim = c(0, 0.03)) |
|
|
38 |
|
|
|
39 |
print('Spearman correlation coefficient of variable importances:') |
|
|
40 |
print(cor(var.imp.compare$val.x, var.imp.compare$val.y, method = 'spearman')) |
|
|
41 |
|
|
|
42 |
# Load the variable importances from the big data model |
|
|
43 |
cox.bigdata <- read.csv('../../output/cox-bigdata-varsellogrank-01-boot-all.csv') |
|
|
44 |
cox.bigdata.vars <- bootStatsDf(cox.bigdata) |
|
|
45 |
cox.bigdata.vars$var <- rownames(cox.bigdata.vars) |
|
|
46 |
cox.bigdata.vars <- subset(cox.bigdata.vars, startsWith(var, 'vimp.c.index')) |
|
|
47 |
cox.bigdata.vars$var <- |
|
|
48 |
substring(cox.bigdata.vars$var, nchar('vimp.c.index.') + 1) |
|
|
49 |
|
|
|
50 |
cox.bigdata.vars <- |
|
|
51 |
cox.bigdata.vars[order(cox.bigdata.vars$val, decreasing = TRUE)[1:20], ] |
|
|
52 |
|
|
|
53 |
cox.bigdata.vars <- |
|
|
54 |
cox.bigdata.vars[order(cox.bigdata.vars$val, decreasing = FALSE), ] |
|
|
55 |
|
|
|
56 |
cox.bigdata.vars$description <- lookUpDescriptions(cox.bigdata.vars$var) |
|
|
57 |
|
|
|
58 |
cat('c(', paste0("'", as.character(cox.bigdata.vars$description), "',"), ')', sep = '\n') |
|
|
59 |
|
|
|
60 |
cox.bigdata.vars$description.manual <- |
|
|
61 |
factorOrderedLevels( |
|
|
62 |
c( |
|
|
63 |
'ALT', |
|
|
64 |
'PVD', |
|
|
65 |
'Hb', |
|
|
66 |
'Dementia', |
|
|
67 |
'Albumin', |
|
|
68 |
'Cardiac glycosides', |
|
|
69 |
'LV failure', |
|
|
70 |
'Home visit', |
|
|
71 |
'Oestrogens/HRT', |
|
|
72 |
'Chest pain', |
|
|
73 |
'Na', |
|
|
74 |
'WCC', |
|
|
75 |
'ALP', |
|
|
76 |
'Lymphocyte count', |
|
|
77 |
'Diabetes', |
|
|
78 |
'BMI ', |
|
|
79 |
'Weight', |
|
|
80 |
'Loop diuretics', |
|
|
81 |
'Smoking status', |
|
|
82 |
'Age' |
|
|
83 |
) |
|
|
84 |
) |
|
|
85 |
|
|
|
86 |
# Plot a bar graph of them |
|
|
87 |
cox.bigdata.plot <- |
|
|
88 |
ggplot( |
|
|
89 |
cox.bigdata.vars, |
|
|
90 |
aes(x = description.manual, y = val, ymin = lower, ymax = upper) |
|
|
91 |
) + |
|
|
92 |
geom_bar(stat = 'identity') + |
|
|
93 |
geom_errorbar(width = 0.25) + |
|
|
94 |
coord_flip() + |
|
|
95 |
theme(axis.title.y = element_blank(), axis.text.y = element_text(size = 10)) + |
|
|
96 |
ylim(0, 0.17) |
|
|
97 |
|
|
|
98 |
# Random forest big data |
|
|
99 |
rf.bigdata <- read.csv('../../output/rf-bigdata-varsellogrank-02-boot-all.csv') |
|
|
100 |
rf.bigdata.vars <- bootStatsDf(rf.bigdata) |
|
|
101 |
rf.bigdata.vars$var <- rownames(rf.bigdata.vars) |
|
|
102 |
rf.bigdata.vars <- subset(rf.bigdata.vars, startsWith(var, 'vimp.c.index')) |
|
|
103 |
rf.bigdata.vars$var <- |
|
|
104 |
substring(rf.bigdata.vars$var, nchar('vimp.c.index.') + 1) |
|
|
105 |
|
|
|
106 |
rf.bigdata.vars <- |
|
|
107 |
rf.bigdata.vars[order(rf.bigdata.vars$val, decreasing = TRUE)[1:20], ] |
|
|
108 |
|
|
|
109 |
rf.bigdata.vars <- |
|
|
110 |
rf.bigdata.vars[order(rf.bigdata.vars$val, decreasing = FALSE), ] |
|
|
111 |
|
|
|
112 |
cat('c(', paste0("'", as.character(rf.bigdata.vars$description), "',"), ')', sep = '\n') |
|
|
113 |
|
|
|
114 |
rf.bigdata.vars$description <- lookUpDescriptions(rf.bigdata.vars$var) |
|
|
115 |
|
|
|
116 |
rf.bigdata.vars$description.manual <- |
|
|
117 |
factorOrderedLevels( |
|
|
118 |
c( |
|
|
119 |
'Fit note', |
|
|
120 |
'Stimulant laxatives', |
|
|
121 |
'Urea', |
|
|
122 |
'Hypertension', |
|
|
123 |
'Cardiac glycosides', |
|
|
124 |
'Beta2 agonists', |
|
|
125 |
'Telephone encounter', |
|
|
126 |
'Feet examination', |
|
|
127 |
'Creatinine', |
|
|
128 |
'Screening', |
|
|
129 |
'Osmotic laxatives', |
|
|
130 |
'ACE inhibitors', |
|
|
131 |
'Beta blockers', |
|
|
132 |
'Home visit', |
|
|
133 |
'Analgesics', |
|
|
134 |
'Blood pressure', |
|
|
135 |
'Chest pain', |
|
|
136 |
'Loop diuretics', |
|
|
137 |
'Smoking status', |
|
|
138 |
'Age' |
|
|
139 |
) |
|
|
140 |
) |
|
|
141 |
|
|
|
142 |
# Plot a bar graph of them |
|
|
143 |
rf.bigdata.plot <- |
|
|
144 |
ggplot( |
|
|
145 |
rf.bigdata.vars, |
|
|
146 |
aes(x = description.manual, y = val, ymin = lower, ymax = upper) |
|
|
147 |
) + |
|
|
148 |
geom_bar(stat = 'identity') + |
|
|
149 |
geom_errorbar(width = 0.25) + |
|
|
150 |
coord_flip() + |
|
|
151 |
theme(axis.title.y = element_blank(), axis.text.y = element_text(size = 10)) + |
|
|
152 |
ylim(0, 0.17) |
|
|
153 |
|
|
|
154 |
|
|
|
155 |
|
|
|
156 |
# Elastic net Cox model |
|
|
157 |
elastic.bigdata <- read.csv('../../output/cox-discrete-elasticnet-01-boot-all.csv') |
|
|
158 |
elastic.bigdata.vars <- bootStatsDf(elastic.bigdata) |
|
|
159 |
elastic.bigdata.vars$var <- rownames(elastic.bigdata.vars) |
|
|
160 |
elastic.bigdata.vars <- subset(elastic.bigdata.vars, startsWith(var, 'vimp.c.index')) |
|
|
161 |
elastic.bigdata.vars$var <- |
|
|
162 |
substring(elastic.bigdata.vars$var, nchar('vimp.c.index.') + 1) |
|
|
163 |
|
|
|
164 |
elastic.bigdata.vars <- |
|
|
165 |
elastic.bigdata.vars[order(elastic.bigdata.vars$val, decreasing = TRUE)[1:20], ] |
|
|
166 |
|
|
|
167 |
elastic.bigdata.vars <- |
|
|
168 |
elastic.bigdata.vars[order(elastic.bigdata.vars$val, decreasing = FALSE), ] |
|
|
169 |
|
|
|
170 |
cat('c(', paste0("'", as.character(elastic.bigdata.vars$description), "',"), ')', sep = '\n') |
|
|
171 |
|
|
|
172 |
elastic.bigdata.vars$description <- factorOrderedLevels(lookUpDescriptions(elastic.bigdata.vars$var)) |
|
|
173 |
|
|
|
174 |
elastic.bigdata.vars$description.manual <- |
|
|
175 |
factorOrderedLevels( |
|
|
176 |
c( |
|
|
177 |
'Biguanides', |
|
|
178 |
'CKD', |
|
|
179 |
'Osmotic laxatives', |
|
|
180 |
'MCV', |
|
|
181 |
'IMD score', |
|
|
182 |
'Dementia', |
|
|
183 |
'Home visit', |
|
|
184 |
'Sulphonylureas', |
|
|
185 |
'Insulin', |
|
|
186 |
'LV failure', |
|
|
187 |
'Cardiac glycosides', |
|
|
188 |
'Telephone encounter', |
|
|
189 |
'Records held date', |
|
|
190 |
'Chest pain', |
|
|
191 |
'Diabetes', |
|
|
192 |
'Smoking status', |
|
|
193 |
'Loop diuretics', |
|
|
194 |
'Gender', |
|
|
195 |
'Type 2 diabetes', |
|
|
196 |
'Age' |
|
|
197 |
) |
|
|
198 |
) |
|
|
199 |
|
|
|
200 |
# Plot a bar graph of them |
|
|
201 |
elastic.bigdata.plot <- |
|
|
202 |
ggplot( |
|
|
203 |
elastic.bigdata.vars, |
|
|
204 |
aes(x = description.manual, y = val, ymin = lower, ymax = upper) |
|
|
205 |
) + |
|
|
206 |
geom_bar(stat = 'identity') + |
|
|
207 |
geom_errorbar(width = 0.25) + |
|
|
208 |
coord_flip() + |
|
|
209 |
theme(axis.title.y = element_blank(), axis.text.y = element_text(size = 10)) + |
|
|
210 |
ylim(0, 0.17) |
|
|
211 |
|
|
|
212 |
|
|
|
213 |
# Combine for output |
|
|
214 |
plot_grid( |
|
|
215 |
rf.bigdata.plot, cox.bigdata.plot, elastic.bigdata.plot, |
|
|
216 |
labels = c('A', 'B', 'C'), |
|
|
217 |
ncol = 3 |
|
|
218 |
) |
|
|
219 |
|
|
|
220 |
ggsave( |
|
|
221 |
'../../output/variable-importances.pdf', |
|
|
222 |
width = 24, |
|
|
223 |
height = 8, |
|
|
224 |
units = 'cm', |
|
|
225 |
useDingbats = FALSE |
|
|
226 |
) |
|
|
227 |
|
|
|
228 |
|
|
|
229 |
# Print a helpful list of overlapping predictors |
|
|
230 |
print('RF vs Cox') |
|
|
231 |
print( |
|
|
232 |
paste( |
|
|
233 |
rf.bigdata.vars$description.manual[ |
|
|
234 |
rf.bigdata.vars$description.manual %in% |
|
|
235 |
cox.bigdata.vars$description.manual |
|
|
236 |
], collapse = ', ') |
|
|
237 |
) |
|
|
238 |
print('Cox vs elastic net') |
|
|
239 |
print( |
|
|
240 |
paste( |
|
|
241 |
elastic.bigdata.vars$description.manual[ |
|
|
242 |
elastic.bigdata.vars$description.manual %in% |
|
|
243 |
cox.bigdata.vars$description.manual |
|
|
244 |
], collapse = ', ') |
|
|
245 |
) |
|
|
246 |
print('RF vs elastic net not Cox') |
|
|
247 |
print( |
|
|
248 |
paste( |
|
|
249 |
elastic.bigdata.vars$description.manual[ |
|
|
250 |
elastic.bigdata.vars$description.manual %in% |
|
|
251 |
rf.bigdata.vars$description.manual & !( |
|
|
252 |
elastic.bigdata.vars$description.manual %in% |
|
|
253 |
cox.bigdata.vars$description.manual) |
|
|
254 |
], collapse = ', ') |
|
|
255 |
) |
|
|
256 |
print('Cox vs elastic net not RF') |
|
|
257 |
print( |
|
|
258 |
paste( |
|
|
259 |
elastic.bigdata.vars$description.manual[ |
|
|
260 |
elastic.bigdata.vars$description.manual %in% |
|
|
261 |
cox.bigdata.vars$description.manual & !( |
|
|
262 |
elastic.bigdata.vars$description.manual %in% |
|
|
263 |
rf.bigdata.vars$description.manual) |
|
|
264 |
], collapse = ', ') |
|
|
265 |
) |
|
|
266 |
# Should be none or the graph will be challenging to draw! |
|
|
267 |
|
|
|
268 |
# Aborted idea to draw a rank-change chart which is too messy to be useful... |
|
|
269 |
elastic.bigdata.vars$model <- 'enet' |
|
|
270 |
rf.bigdata.vars$model <- 'rf' |
|
|
271 |
cox.bigdata.vars$model <- 'cox' |
|
|
272 |
|
|
|
273 |
all.models <- rbind(cox.bigdata.vars, rf.bigdata.vars, elastic.bigdata.vars) |
|
|
274 |
|
|
|
275 |
all.models$model <- factor(all.models$model, levels = c('rf', 'cox', 'enet')) |
|
|
276 |
|
|
|
277 |
ggplot( |
|
|
278 |
all.models, |
|
|
279 |
aes( |
|
|
280 |
x = model, y = log(val), ymin = log(lower), ymax = log(upper), |
|
|
281 |
label = description.manual, group = description.manual |
|
|
282 |
)) + |
|
|
283 |
geom_text(position = position_dodge(width = 0.7)) + |
|
|
284 |
geom_errorbar(width = 0.1, position = position_dodge(width = 0.7)) + |
|
|
285 |
geom_point(position = position_dodge(width = 0.7)) |
|
|
286 |
|