Switch to side-by-side view

--- a
+++ b/overview/variable-importances.R
@@ -0,0 +1,286 @@
+source('../lib/handymedical.R', chdir = TRUE)
+requirePlus('cowplot')
+
+
+# Load the variable importances from the Cox model
+cox.miss <-
+  readRDS('../../output/caliber-replicate-with-missing-survreg-6-linear-age-surv-boot.rds')
+cox.miss.vars <- bootStats(cox.miss, uncertainty = '95ci')
+cox.miss.vars$var <- rownames(cox.miss.vars)
+cox.miss.vars <- subset(cox.miss.vars, startsWith(var, 'vimp.c.index'))
+cox.miss.vars$var <-
+  substring(cox.miss.vars$var, nchar('vimp.c.index.') + 1)
+
+# Load the variable importances from the random forest
+rf.boot <- read.csv('../../output/rfsrc-cv-nsplit-try3-boot-all.csv')
+rf.boot.vars <- bootStatsDf(rf.boot)
+rf.boot.vars$var <- rownames(rf.boot.vars)
+rf.boot.vars <- subset(rf.boot.vars, startsWith(var, 'vimp.c.index'))
+rf.boot.vars$var <-
+  substring(rf.boot.vars$var, nchar('vimp.c.index.') + 1)
+
+var.imp.compare <- merge(cox.miss.vars, rf.boot.vars, by = c('var'))
+
+
+# Plot a scatterplot of them
+rf.vs.cox <-
+  ggplot(
+    var.imp.compare,
+    aes(
+      x = val.x, xmin = lower.x, xmax = upper.x,
+      y = val.y, ymin = lower.y, ymax = upper.y
+    )
+  ) +
+    geom_point() +
+    geom_errorbar() +
+    geom_errorbarh() +
+    coord_cartesian(xlim = c(0, 0.03), ylim = c(0, 0.03))
+
+print('Spearman correlation coefficient of variable importances:')
+print(cor(var.imp.compare$val.x, var.imp.compare$val.y, method = 'spearman'))
+
+# Load the variable importances from the big data model
+cox.bigdata <- read.csv('../../output/cox-bigdata-varsellogrank-01-boot-all.csv')
+cox.bigdata.vars <- bootStatsDf(cox.bigdata)
+cox.bigdata.vars$var <- rownames(cox.bigdata.vars)
+cox.bigdata.vars <- subset(cox.bigdata.vars, startsWith(var, 'vimp.c.index'))
+cox.bigdata.vars$var <-
+  substring(cox.bigdata.vars$var, nchar('vimp.c.index.') + 1)
+
+cox.bigdata.vars <-
+  cox.bigdata.vars[order(cox.bigdata.vars$val, decreasing = TRUE)[1:20], ]
+
+cox.bigdata.vars <-
+  cox.bigdata.vars[order(cox.bigdata.vars$val, decreasing = FALSE), ]
+
+cox.bigdata.vars$description <- lookUpDescriptions(cox.bigdata.vars$var)
+
+cat('c(', paste0("'", as.character(cox.bigdata.vars$description), "',"), ')', sep = '\n')
+
+cox.bigdata.vars$description.manual <-
+  factorOrderedLevels(
+      c(
+        'ALT',
+        'PVD',
+        'Hb',
+        'Dementia',
+        'Albumin',
+        'Cardiac glycosides',
+        'LV failure',
+        'Home visit',
+        'Oestrogens/HRT',
+        'Chest pain',
+        'Na',
+        'WCC',
+        'ALP',
+        'Lymphocyte count',
+        'Diabetes',
+        'BMI ',
+        'Weight',
+        'Loop diuretics',
+        'Smoking status',
+        'Age'
+      )
+  )
+
+# Plot a bar graph of them
+cox.bigdata.plot <-
+  ggplot(
+    cox.bigdata.vars,
+    aes(x = description.manual, y = val, ymin = lower, ymax = upper)
+    ) +
+    geom_bar(stat = 'identity') + 
+    geom_errorbar(width = 0.25) +
+    coord_flip() +
+  theme(axis.title.y = element_blank(), axis.text.y = element_text(size = 10)) +
+  ylim(0, 0.17)
+
+# Random forest big data
+rf.bigdata <- read.csv('../../output/rf-bigdata-varsellogrank-02-boot-all.csv')
+rf.bigdata.vars <- bootStatsDf(rf.bigdata)
+rf.bigdata.vars$var <- rownames(rf.bigdata.vars)
+rf.bigdata.vars <- subset(rf.bigdata.vars, startsWith(var, 'vimp.c.index'))
+rf.bigdata.vars$var <-
+  substring(rf.bigdata.vars$var, nchar('vimp.c.index.') + 1)
+
+rf.bigdata.vars <-
+  rf.bigdata.vars[order(rf.bigdata.vars$val, decreasing = TRUE)[1:20], ]
+
+rf.bigdata.vars <-
+  rf.bigdata.vars[order(rf.bigdata.vars$val, decreasing = FALSE), ]
+
+cat('c(', paste0("'", as.character(rf.bigdata.vars$description), "',"), ')', sep = '\n')
+
+rf.bigdata.vars$description <- lookUpDescriptions(rf.bigdata.vars$var)
+
+rf.bigdata.vars$description.manual <-
+  factorOrderedLevels(
+    c(
+      'Fit note',
+      'Stimulant laxatives',
+      'Urea',
+      'Hypertension',
+      'Cardiac glycosides',
+      'Beta2 agonists',
+      'Telephone encounter',
+      'Feet examination',
+      'Creatinine',
+      'Screening',
+      'Osmotic laxatives',
+      'ACE inhibitors',
+      'Beta blockers',
+      'Home visit',
+      'Analgesics',
+      'Blood pressure',
+      'Chest pain',
+      'Loop diuretics',
+      'Smoking status',
+      'Age'
+    )
+  )
+
+# Plot a bar graph of them
+rf.bigdata.plot <-
+  ggplot(
+    rf.bigdata.vars,
+    aes(x = description.manual, y = val, ymin = lower, ymax = upper)
+  ) +
+    geom_bar(stat = 'identity') + 
+    geom_errorbar(width = 0.25) +
+    coord_flip() +
+  theme(axis.title.y = element_blank(), axis.text.y = element_text(size = 10)) +
+  ylim(0, 0.17)
+
+
+
+# Elastic net Cox model
+elastic.bigdata <- read.csv('../../output/cox-discrete-elasticnet-01-boot-all.csv')
+elastic.bigdata.vars <- bootStatsDf(elastic.bigdata)
+elastic.bigdata.vars$var <- rownames(elastic.bigdata.vars)
+elastic.bigdata.vars <- subset(elastic.bigdata.vars, startsWith(var, 'vimp.c.index'))
+elastic.bigdata.vars$var <-
+  substring(elastic.bigdata.vars$var, nchar('vimp.c.index.') + 1)
+
+elastic.bigdata.vars <-
+  elastic.bigdata.vars[order(elastic.bigdata.vars$val, decreasing = TRUE)[1:20], ]
+
+elastic.bigdata.vars <-
+  elastic.bigdata.vars[order(elastic.bigdata.vars$val, decreasing = FALSE), ]
+
+cat('c(', paste0("'", as.character(elastic.bigdata.vars$description), "',"), ')', sep = '\n')
+
+elastic.bigdata.vars$description <- factorOrderedLevels(lookUpDescriptions(elastic.bigdata.vars$var))
+
+elastic.bigdata.vars$description.manual <-
+  factorOrderedLevels(
+    c(
+      'Biguanides',
+      'CKD',
+      'Osmotic laxatives',
+      'MCV',
+      'IMD score',
+      'Dementia',
+      'Home visit',
+      'Sulphonylureas',
+      'Insulin',
+      'LV failure',
+      'Cardiac glycosides',
+      'Telephone encounter',
+      'Records held date',
+      'Chest pain',
+      'Diabetes',
+      'Smoking status',
+      'Loop diuretics',
+      'Gender',
+      'Type 2 diabetes',
+      'Age'
+      )
+  )
+
+# Plot a bar graph of them
+elastic.bigdata.plot <-
+  ggplot(
+    elastic.bigdata.vars,
+    aes(x = description.manual, y = val, ymin = lower, ymax = upper)
+  ) +
+  geom_bar(stat = 'identity') + 
+  geom_errorbar(width = 0.25) +
+  coord_flip() +
+  theme(axis.title.y = element_blank(), axis.text.y = element_text(size = 10)) +
+  ylim(0, 0.17)
+
+
+# Combine for output
+plot_grid(
+  rf.bigdata.plot, cox.bigdata.plot, elastic.bigdata.plot,
+  labels = c('A', 'B', 'C'),
+  ncol = 3
+)
+
+ggsave(
+  '../../output/variable-importances.pdf',
+  width = 24,
+  height = 8,
+  units = 'cm',
+  useDingbats = FALSE
+)
+
+
+# Print a helpful list of overlapping predictors
+print('RF vs Cox')
+print(
+  paste(
+    rf.bigdata.vars$description.manual[
+      rf.bigdata.vars$description.manual %in%
+        cox.bigdata.vars$description.manual
+      ], collapse = ', ')
+)
+print('Cox vs elastic net')
+print(
+  paste(
+    elastic.bigdata.vars$description.manual[
+      elastic.bigdata.vars$description.manual %in%
+        cox.bigdata.vars$description.manual
+      ], collapse = ', ')
+)
+print('RF vs elastic net not Cox')
+print(
+  paste(
+    elastic.bigdata.vars$description.manual[
+      elastic.bigdata.vars$description.manual %in%
+        rf.bigdata.vars$description.manual & !(
+          elastic.bigdata.vars$description.manual %in%
+          cox.bigdata.vars$description.manual)
+      ], collapse = ', ')
+)
+print('Cox vs elastic net not RF')
+print(
+  paste(
+    elastic.bigdata.vars$description.manual[
+      elastic.bigdata.vars$description.manual %in%
+        cox.bigdata.vars$description.manual & !(
+          elastic.bigdata.vars$description.manual %in%
+            rf.bigdata.vars$description.manual)
+      ], collapse = ', ')
+)
+# Should be none or the graph will be challenging to draw!
+
+# Aborted idea to draw a rank-change chart which is too messy to be useful...
+elastic.bigdata.vars$model <- 'enet'
+rf.bigdata.vars$model <- 'rf'
+cox.bigdata.vars$model <- 'cox'
+
+all.models <- rbind(cox.bigdata.vars, rf.bigdata.vars, elastic.bigdata.vars)
+
+all.models$model <- factor(all.models$model, levels = c('rf', 'cox', 'enet'))
+
+ggplot(
+  all.models,
+  aes(
+    x = model, y = log(val), ymin = log(lower), ymax = log(upper),
+    label = description.manual, group = description.manual
+  )) +
+  geom_text(position = position_dodge(width = 0.7)) +
+  geom_errorbar(width = 0.1, position = position_dodge(width = 0.7)) +
+  geom_point(position = position_dodge(width = 0.7))
+