MLehealth / Git / [0375db] /overview/missing-values-risk.R

Models:
philipB/
MLehealth
Downloads: 1
[0375db]: / overview / missing-values-risk.R
History
Download this file
177 lines (149 with data), 5.1 kB

source('../lib/handymedical.R', chdir = TRUE)

requirePlus('survminer', 'cowplot')

models.base <- '../../output'
cox.missing.filename <- 'caliber-replicate-with-missing-model-survreg-bootstrap-1.rds'
cox.missing.riskdists.filename <- 'caliber-replicate-with-missing-survreg-3-risk-violins.csv'
cox.missing.riskcats.filename <- 'caliber-replicate-with-missing-survreg-3-risk-cats.csv'

cox.imp.filename <- 'caliber-replicate-imputed-survreg-4-surv-boot-imp.rds'

cox.disc.filename <- 'all-cv-survreg-boot-try5-surv-boot.csv'

data.filename <- '../../data/cohort-sanitised.csv'

survPlots <- function(...) {
  surv.fits <- list(...)
  
  df <- data.frame()
  for(i in 1:length(surv.fits)) {
    df <-
      rbind(
        df,
        data.frame(
          variable = as.character(surv.fits[[i]]$call)[2],
          stratum = '1',
          time = surv.fits[[i]][1]$time,
          surv = surv.fits[[i]][1]$surv,
          lower = surv.fits[[i]][1]$lower,
          upper = surv.fits[[i]][1]$upper
        ),
        data.frame(
          variable = as.character(surv.fits[[i]]$call)[2],
          stratum = '2',
          time = surv.fits[[i]][2]$time,
          surv = surv.fits[[i]][2]$surv,
          lower = surv.fits[[i]][2]$lower,
          upper = surv.fits[[i]][2]$upper
        )
      )
  }
  
  ggplot(
    df,
    aes(x = time, y = surv, ymin = lower, ymax = upper)
  ) +
    geom_line(aes(colour = stratum)) +
    geom_ribbon(aes(fill = stratum), alpha = 0.4) +
    facet_grid(variable ~ .)
}

cox.missing.boot <- readRDS(file.path(models.base, cox.missing.filename))
fit.risks.miss <- bootStats(cox.missing.boot, '95ci')
fit.risks.miss$var <- rownames(fit.risks.miss)

cox.imp.boot <- readRDS(file.path(models.base, cox.imp.filename))
fit.risks.imp <- bootMIStats(cox.imp.boot, '95ci')
fit.risks.imp$var <- rownames(fit.risks.imp)

cox.disc.boot <- read.csv(file.path(models.base, cox.disc.filename))
fit.risks.disc <- bootStatsDf(cox.disc.boot)
fit.risks.disc$var <- rownames(fit.risks.disc)

# Create two data frames, one for missing vs imputed and one for discrete 
fit.risks.imp.vs.miss <- merge(fit.risks.imp, fit.risks.miss, by = c('var'))
fit.risks.imp.vs.miss$model <- 'miss'
# This is a slight cheat... The discrete model here was done by my home-made
# bootstrap function, whose variable/level names have been preprocessed with
# make.names for slightly annoying internal reasons. Luckily, all the logical
# variables escape unscathed from this, and they're the only ones we can compare
# anyway!
fit.risks.imp.vs.disc <- merge(fit.risks.imp, fit.risks.disc, by = c('var'))
fit.risks.imp.vs.disc$model <- 'disc'

fit.risks <- rbind(fit.risks.imp.vs.miss, fit.risks.imp.vs.disc)

# Lose a few irrelevant variables
fit.risks <- 
  subset(
    fit.risks, !(var %in% c('(Intercept)', 'c.index', 'calibration.score'))
  )

# Make them

disc.vs.cont.risks.plot <-
  ggplot(
    fit.risks,
    aes(
      x = val.x, xmin = lower.x, xmax = upper.x,
      y = val.y, ymin = lower.y, ymax = upper.y,
      label = var, colour = model
    )
  ) +
  geom_point() +
  geom_errorbar() +
  geom_errorbarh() +
  geom_text() +
  # Remove the legend, massively compresses the plot horizontally
  theme(legend.position="none")



risk.dist.by.var <-
  read.csv(file.path(models.base, cox.missing.riskdists.filename))
risk.cats <- read.csv(file.path(models.base, cox.missing.riskcats.filename))


# Plot the results
risk.violins.plot <- 
  ggplot() +
    # First, and therefore at the bottom, draw the reference line at risk = 1
    geom_hline(yintercept = 1) +
    # Then, on top of that, draw the violin plot of the risk from the data
    geom_violin(data = risk.dist.by.var, aes(x = quantity, y = risk)) +
    geom_pointrange(
      data = risk.cats,
      aes(x = quantity, y = our_value, ymin = our_lower,
          ymax = our_upper),
      
      position = position_jitter(width = 0.1)
    ) +
    geom_text(
      data = risk.cats,
      aes(
        x = quantity,
        y = our_value,
        label = quantity.level
      )
    ) +
  scale_y_continuous(breaks = c(0.75, 1.0, 1.25, 1.5))


# Kaplan-Meier survival curves for a few example variables being missing
COHORT <- fread(data.filename)
COHORT <- subset(COHORT, !exclude & time_death > 0)

# Calculate the curves
km.hdl <-
 survfit(
    Surv(time_death, endpoint_death == 'Death') ~ is.na(hdl_6mo),
    data = COHORT
 )
km.total_chol <-
  survfit(
    Surv(time_death, endpoint_death == 'Death') ~ is.na(total_chol_6mo),
    data = COHORT
  )
km.crea <-
  survfit(
    Surv(time_death, endpoint_death == 'Death') ~ is.na(crea_6mo),
    data = COHORT
  )

km.missingness <-
  survPlots(km.hdl, km.total_chol, km.crea) +
  theme(
    legend.position = "none",
    # Remove grey labels on facets
    strip.background = element_blank(),
    strip.text = element_blank()
  )

theme_set(theme_cowplot(font_size = 10))
plot_grid(
  disc.vs.cont.risks.plot, risk.violins.plot,
  km.missingness,
  labels = c("A", "B", "C"),
  align = "v", nrow = 1
)
ggsave(
  '../../output/missing-values-risk.pdf',
  width = 16,
  height = 5,
  units = 'cm',
  useDingbats = FALSE
)