--- a
+++ b/overview/missing-values-risk.R
@@ -0,0 +1,176 @@
+source('../lib/handymedical.R', chdir = TRUE)
+
+requirePlus('survminer', 'cowplot')
+
+models.base <- '../../output'
+cox.missing.filename <- 'caliber-replicate-with-missing-model-survreg-bootstrap-1.rds'
+cox.missing.riskdists.filename <- 'caliber-replicate-with-missing-survreg-3-risk-violins.csv'
+cox.missing.riskcats.filename <- 'caliber-replicate-with-missing-survreg-3-risk-cats.csv'
+
+cox.imp.filename <- 'caliber-replicate-imputed-survreg-4-surv-boot-imp.rds'
+
+cox.disc.filename <- 'all-cv-survreg-boot-try5-surv-boot.csv'
+
+data.filename <- '../../data/cohort-sanitised.csv'
+
+survPlots <- function(...) {
+  surv.fits <- list(...)
+  
+  df <- data.frame()
+  for(i in 1:length(surv.fits)) {
+    df <-
+      rbind(
+        df,
+        data.frame(
+          variable = as.character(surv.fits[[i]]$call)[2],
+          stratum = '1',
+          time = surv.fits[[i]][1]$time,
+          surv = surv.fits[[i]][1]$surv,
+          lower = surv.fits[[i]][1]$lower,
+          upper = surv.fits[[i]][1]$upper
+        ),
+        data.frame(
+          variable = as.character(surv.fits[[i]]$call)[2],
+          stratum = '2',
+          time = surv.fits[[i]][2]$time,
+          surv = surv.fits[[i]][2]$surv,
+          lower = surv.fits[[i]][2]$lower,
+          upper = surv.fits[[i]][2]$upper
+        )
+      )
+  }
+  
+  ggplot(
+    df,
+    aes(x = time, y = surv, ymin = lower, ymax = upper)
+  ) +
+    geom_line(aes(colour = stratum)) +
+    geom_ribbon(aes(fill = stratum), alpha = 0.4) +
+    facet_grid(variable ~ .)
+}
+
+cox.missing.boot <- readRDS(file.path(models.base, cox.missing.filename))
+fit.risks.miss <- bootStats(cox.missing.boot, '95ci')
+fit.risks.miss$var <- rownames(fit.risks.miss)
+
+cox.imp.boot <- readRDS(file.path(models.base, cox.imp.filename))
+fit.risks.imp <- bootMIStats(cox.imp.boot, '95ci')
+fit.risks.imp$var <- rownames(fit.risks.imp)
+
+cox.disc.boot <- read.csv(file.path(models.base, cox.disc.filename))
+fit.risks.disc <- bootStatsDf(cox.disc.boot)
+fit.risks.disc$var <- rownames(fit.risks.disc)
+
+# Create two data frames, one for missing vs imputed and one for discrete 
+fit.risks.imp.vs.miss <- merge(fit.risks.imp, fit.risks.miss, by = c('var'))
+fit.risks.imp.vs.miss$model <- 'miss'
+# This is a slight cheat... The discrete model here was done by my home-made
+# bootstrap function, whose variable/level names have been preprocessed with
+# make.names for slightly annoying internal reasons. Luckily, all the logical
+# variables escape unscathed from this, and they're the only ones we can compare
+# anyway!
+fit.risks.imp.vs.disc <- merge(fit.risks.imp, fit.risks.disc, by = c('var'))
+fit.risks.imp.vs.disc$model <- 'disc'
+
+fit.risks <- rbind(fit.risks.imp.vs.miss, fit.risks.imp.vs.disc)
+
+# Lose a few irrelevant variables
+fit.risks <- 
+  subset(
+    fit.risks, !(var %in% c('(Intercept)', 'c.index', 'calibration.score'))
+  )
+
+# Make them
+
+disc.vs.cont.risks.plot <-
+  ggplot(
+    fit.risks,
+    aes(
+      x = val.x, xmin = lower.x, xmax = upper.x,
+      y = val.y, ymin = lower.y, ymax = upper.y,
+      label = var, colour = model
+    )
+  ) +
+  geom_point() +
+  geom_errorbar() +
+  geom_errorbarh() +
+  geom_text() +
+  # Remove the legend, massively compresses the plot horizontally
+  theme(legend.position="none")
+
+
+
+risk.dist.by.var <-
+  read.csv(file.path(models.base, cox.missing.riskdists.filename))
+risk.cats <- read.csv(file.path(models.base, cox.missing.riskcats.filename))
+
+
+# Plot the results
+risk.violins.plot <- 
+  ggplot() +
+    # First, and therefore at the bottom, draw the reference line at risk = 1
+    geom_hline(yintercept = 1) +
+    # Then, on top of that, draw the violin plot of the risk from the data
+    geom_violin(data = risk.dist.by.var, aes(x = quantity, y = risk)) +
+    geom_pointrange(
+      data = risk.cats,
+      aes(x = quantity, y = our_value, ymin = our_lower,
+          ymax = our_upper),
+      
+      position = position_jitter(width = 0.1)
+    ) +
+    geom_text(
+      data = risk.cats,
+      aes(
+        x = quantity,
+        y = our_value,
+        label = quantity.level
+      )
+    ) +
+  scale_y_continuous(breaks = c(0.75, 1.0, 1.25, 1.5))
+
+
+# Kaplan-Meier survival curves for a few example variables being missing
+COHORT <- fread(data.filename)
+COHORT <- subset(COHORT, !exclude & time_death > 0)
+
+# Calculate the curves
+km.hdl <-
+ survfit(
+    Surv(time_death, endpoint_death == 'Death') ~ is.na(hdl_6mo),
+    data = COHORT
+ )
+km.total_chol <-
+  survfit(
+    Surv(time_death, endpoint_death == 'Death') ~ is.na(total_chol_6mo),
+    data = COHORT
+  )
+km.crea <-
+  survfit(
+    Surv(time_death, endpoint_death == 'Death') ~ is.na(crea_6mo),
+    data = COHORT
+  )
+
+km.missingness <-
+  survPlots(km.hdl, km.total_chol, km.crea) +
+  theme(
+    legend.position = "none",
+    # Remove grey labels on facets
+    strip.background = element_blank(),
+    strip.text = element_blank()
+  )
+
+theme_set(theme_cowplot(font_size = 10))
+plot_grid(
+  disc.vs.cont.risks.plot, risk.violins.plot,
+  km.missingness,
+  labels = c("A", "B", "C"),
+  align = "v", nrow = 1
+)
+ggsave(
+  '../../output/missing-values-risk.pdf',
+  width = 16,
+  height = 5,
+  units = 'cm',
+  useDingbats = FALSE
+)