--- a
+++ b/cox-ph/caliber-replicate-with-imputation.R
@@ -0,0 +1,327 @@
+#+ knitr_setup, include = FALSE
+
+# Whether to cache the intensive code sections. Set to FALSE to recalculate
+# everything afresh.
+cacheoption <- FALSE
+# Disable lazy caching globally, because it fails for large objects, and all the
+# objects we wish to cache are large...
+knitr::opts_chunk$set(cache.lazy = FALSE)
+
+#' # Replicating Rapsomaniki _et al._ 2014
+#' 
+#' ## User variables
+#' 
+#' First, define some variables...
+
+#+ define_vars
+
+imputed.data.filename <- '../../data/COHORT_complete.rds'
+n.data <- NA # Size of the full dataset; further rows may be excluded in prep
+endpoint <- 'death.imputed'
+
+old.coefficients.filename <- 'rapsomaniki-cox-values-from-paper.csv'
+
+output.filename.base <- '../../output/caliber-replicate-imputed-survreg-4'
+
+
+cox.var.imp.perm.filename <-
+  '../../output/caliber-replicate-imputed-survreg-bootstrap-var-imp-perm-1.csv'
+cox.var.imp.perm.missing.filename <-
+  '../../output/caliber-replicate-with-missing-survreg-bootstrap-var-imp-perm-1.csv'
+
+
+bootstraps <- 100
+n.threads <- 10
+
+#' ## Setup
+
+#+ setup, message=FALSE
+
+source('../lib/shared.R')
+library(xtable)
+library(ggrepel)
+
+# Load the list of multiply imputed datasets
+imputed.data <- readRDS(imputed.data.filename)
+
+# Remove rows with a survival time of 0, which cause errors in model fitting
+for(i in seq_along(imputed.data)) {
+  imputed.data[[i]] <- imputed.data[[i]][imputed.data[[i]][, surv.time] > 0, ]
+}
+
+# Define n.data based on the imputed data, which has already been preprocessed
+n.data <- nrow(imputed.data[[1]])
+# Define indices of test set
+test.set <- testSetIndices(imputed.data[[1]], random.seed = 78361)
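+# testSetIndices is defined in ../lib/shared.R; presumably it draws a
+# reproducible random subset of row indices, along the lines of this
+# hypothetical sketch (test.fraction is illustrative, not a real argument):
+#   set.seed(random.seed)
+#   sample(nrow(df), round(nrow(df) * test.fraction))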
+
+#' OK, we've now got **`r n.data`** patients, split into a training set of
+#' `r n.data - length(test.set)` and a test set of `r length(test.set)`.
+#'
+#'
+#' ## Transform variables
+#' 
+#' The model uses variables which have been standardised in various ways, so
+#' let's go through and transform our input variables in the same way...
+
+#+ transform_vars
+
+source('caliber-scale.R')
+# Remove the age splining
+ageSpline <- identity
+
+for(i in seq_along(imputed.data)) {
+  imputed.data[[i]] <- caliberScale(imputed.data[[i]], surv.time, surv.event)
+}
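+
+# For illustration: caliberScale standardises each continuous variable to the
+# per-increment units reported in the paper (see caliber-scale.R for the real
+# transformations). A hypothetical sketch of the kind of rescaling involved:
+#   data$pulse_6mo <- data$pulse_6mo / 10  # heart rate, per 10 b.p.m.
+#   data$crea_6mo  <- data$crea_6mo / 30   # creatinine, per 30 μmol/L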
+
+#' ## Survival fitting
+#' 
+#' Fit a Cox model to the preprocessed data. As in the paper, this is a
+#' proportional hazards model with an exponential baseline hazard, fitted here
+#' with survreg. The paper calculated standard errors with 200 bootstrap
+#' samples; we use `r bootstraps` here.
+
+#+ fit_cox_model, cache=cacheoption
+
+surv.formula <-
+  Surv(surv_time, surv_event) ~
+    ### Sociodemographic characteristics #######################################
+    ## Age in men, per year
+    ## Age in women, per year
+    ## Women vs. men
+    # i.e. include an interaction between age and gender!
+    age*gender +
+    ## Most deprived quintile, yes vs. no
+    most_deprived +
+    ### SCAD diagnosis and severity ############################################
+    ## Other CHD vs. stable angina
+    ## Unstable angina vs. stable angina
+    ## NSTEMI vs. stable angina
+    ## STEMI vs. stable angina
+    diagnosis +
+    #diagnosis_missing +
+    ## PCI in last 6 months, yes vs. no
+    pci_6mo +
+    ## CABG in last 6 months, yes vs. no
+    cabg_6mo +
+    ## Previous/recurrent MI, yes vs. no
+    hx_mi +
+    ## Use of nitrates, yes vs. no
+    long_nitrate +
+    ### CVD risk factors #######################################################
+    ## Ex-smoker / current smoker / missing data vs. never
+    smokstatus +
+    ## Hypertension, present vs. absent
+    hypertension +
+    ## Diabetes mellitus, present vs. absent
+    diabetes_logical +
+    ## Total cholesterol, per 1 mmol/L increase
+    total_chol_6mo +
+    ## HDL, per 0.5 mmol/L increase
+    hdl_6mo +
+    ### CVD co-morbidities #####################################################
+    ## Heart failure, present vs. absent
+    heart_failure +
+    ## Peripheral arterial disease, present vs. absent
+    pad +
+    ## Atrial fibrillation, present vs. absent
+    hx_af +
+    ## Stroke, present vs. absent
+    hx_stroke +
+    ### Non-CVD comorbidities ##################################################
+    ## Chronic kidney disease, present vs. absent
+    hx_renal +
+    ## Chronic obstructive pulmonary disease, present vs. absent
+    hx_copd +
+    ## Cancer, present vs. absent
+    hx_cancer +
+    ## Chronic liver disease, present vs. absent
+    hx_liver +
+    ### Psychosocial characteristics ###########################################
+    ## Depression at diagnosis, present vs. absent
+    hx_depression +
+    ## Anxiety at diagnosis, present vs. absent
+    hx_anxiety +
+    ### Biomarkers #############################################################
+    ## Heart rate, per 10 b.p.m. increase
+    pulse_6mo +
+    ## Creatinine, per 30 μmol/L increase
+    crea_6mo +
+    ## White cell count, per 1.5 × 10⁹/L increase
+    total_wbc_6mo +
+    ## Haemoglobin, per 1.5 g/dL increase
+    haemoglobin_6mo
+
+# Do a quick and dirty fit on a single imputed dataset, from which to draw a
+# calibration curve
+fit.exp <- survreg(
+  formula = surv.formula,
+  data = imputed.data[[1]][-test.set, ],
+  dist = "exponential"
+)
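+
+# For reference, bootstrapFitSurvreg (defined in ../lib/shared.R) is the
+# statistic passed to boot in the loop below: it refits the model on each
+# bootstrap resample and scores it against the held-out test data. A
+# simplified, hypothetical sketch of the idea (the real function also returns
+# calibration and training-set statistics):
+bootstrapFitSurvregSketch <- function(data, indices, formula, test.data) {
+  # Refit the model on the bootstrap resample
+  fit <- survreg(formula, data = data[indices, ], dist = 'exponential')
+  # In survreg's AFT parameterisation a larger linear predictor means longer
+  # predicted survival, so concordance with the observed outcomes can be
+  # computed on it directly
+  lp <- predict(fit, newdata = test.data, type = 'lp')
+  c(c.index = concordance(Surv(surv_time, surv_event) ~ lp,
+                          data = test.data)$concordance)
+}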
+
+fit.exp.boot <- list()
+
+# Perform bootstrap fitting for every multiply imputed dataset
+for(i in seq_along(imputed.data)) {
+  fit.exp.boot[[i]] <-
+    boot(
+      formula = surv.formula,
+      data = imputed.data[[i]][-test.set, ],
+      statistic = bootstrapFitSurvreg,
+      R = bootstraps,
+      parallel = 'multicore',
+      ncpus = n.threads,
+      test.data = imputed.data[[i]][test.set, ]
+    )
+}
+
+# Save the fits, because it might've taken a while!
+saveRDS(fit.exp.boot, paste0(output.filename.base, '-surv-boot-imp.rds'))
+
+# Unpackage the uncertainties from the bootstrapped data
+fit.exp.boot.ests <- bootMIStats(fit.exp.boot)
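+# bootMIStats (from ../lib/shared.R) pools the bootstrap replicates across all
+# of the imputed datasets; conceptually something like this hypothetical
+# sketch, taking the median and a 95% interval of the pooled replicates:
+#   replicates <- do.call(rbind, lapply(fit.exp.boot, function(x) x$t))
+#   apply(replicates, 2, quantile, probs = c(0.5, 0.025, 0.975))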
+
+# Save bootstrapped performance values
+varsToTable(
+  data.frame(
+    model = 'cox',
+    imputation = TRUE,
+    discretised = FALSE,
+    c.index = fit.exp.boot.ests['c.index', 'val'],
+    c.index.lower = fit.exp.boot.ests['c.index', 'lower'],
+    c.index.upper = fit.exp.boot.ests['c.index', 'upper'],
+    calibration.score = fit.exp.boot.ests['calibration.score', 'val'],
+    calibration.score.lower = fit.exp.boot.ests['calibration.score', 'lower'],
+    calibration.score.upper = fit.exp.boot.ests['calibration.score', 'upper']
+  ),
+  performance.file,
+  index.cols = c('model', 'imputation', 'discretised')
+)
+
+#' ## Performance
+#' 
+#' Having fitted the Cox model, how did we do? The c-indices were calculated as
+#' part of the bootstrapping, so we just need to take a look at those...
+#' 
+#' C-indices are **`r round(fit.exp.boot.ests['c.train', 'val'], 3)`
+#' (`r round(fit.exp.boot.ests['c.train', 'lower'], 3)` - 
+#' `r round(fit.exp.boot.ests['c.train', 'upper'], 3)`)** on the training set and
+#' **`r round(fit.exp.boot.ests['c.test', 'val'], 3)`
+#' (`r round(fit.exp.boot.ests['c.test', 'lower'], 3)` - 
+#' `r round(fit.exp.boot.ests['c.test', 'upper'], 3)`)** on the test set.
+#' Not too bad!
+#' 
+#' 
+#' ### Calibration
+#' 
+#' The bootstrapped calibration score is
+#' **`r round(fit.exp.boot.ests['calibration.score', 'val'], 3)`
+#' (`r round(fit.exp.boot.ests['calibration.score', 'lower'], 3)` - 
+#' `r round(fit.exp.boot.ests['calibration.score', 'upper'], 3)`)**.
+#' 
+#' Let's draw a representative curve from the unbootstrapped fit... (It would be
+#' better to draw all the curves from the bootstrap fit to get an idea of
+#' variability, but I've not implemented this yet.)
+#' 
+#+ calibration_plot
+
+# Use the first imputed dataset, to match the quick fit above
+calibration.table <-
+  calibrationTable(fit.exp, imputed.data[[1]][test.set, ])
+
+calibration.score <- calibrationScore(calibration.table)
+
+calibrationPlot(calibration.table)
+
+#' The area between the calibration curve and the diagonal is 
+#' **`r round(calibration.score, 3)`**.
+#'  
+#' ## Coefficients
+#' 
+#' As well as getting comparable C-indices, it's worth checking how the risk
+#' coefficients we calculated compare to those found in the original paper.
+#' Let's compare...
+
+#+ compare_coefficients
+
+# Load CSV of values from paper
+old.coefficients <- read.csv(old.coefficients.filename)
+
+# Get coefficients from this fit
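+# (survreg uses the accelerated failure time parameterisation, so for an
+# exponential model the hazard ratio per unit increase is exp(-coefficient),
+# hence the negExp transform)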
+new.coefficients <-
+  bootMIStats(fit.exp.boot, uncertainty = '95ci', transform = negExp)
+names(new.coefficients) <- c('our_value', 'our_lower', 'our_upper')
+new.coefficients$quantity.level <- rownames(new.coefficients)
+
+# Create a data frame comparing them
+compare.coefficients <- merge(old.coefficients, new.coefficients)
+
+# Kludge: age:genderWomen is the pure interaction term, not the risk for a
+# woman per unit of advancing spline-transformed age. Coefficients add on the
+# log-hazard scale, so hazard ratios multiply: scale the interaction term's
+# hazard ratio by the age term's to get the per-unit value for women.
+compare.coefficients[
+  compare.coefficients$quantity.level == 'age:genderWomen', 'our_value'
+  ] <-
+  compare.coefficients[
+    compare.coefficients$quantity.level == 'age:genderWomen', 'our_value'
+    ] *
+  compare.coefficients[
+    compare.coefficients$quantity.level == 'age', 'our_value'
+    ]
+
+# Save CSV of results
+write.csv(
+  compare.coefficients,
+  paste0(output.filename.base, '-coefficients.csv')
+)
+
+# Plot a graph by which to judge success
+ggplot(compare.coefficients, aes(x = their_value, y = our_value)) +
+  geom_abline(intercept = 0, slope = 1) +
+  geom_hline(yintercept = 1, colour = 'grey') +
+  geom_vline(xintercept = 1, colour = 'grey') +
+  geom_point() +
+  geom_errorbar(aes(ymin = our_lower, ymax = our_upper)) +
+  geom_errorbarh(aes(xmin = their_lower, xmax = their_upper)) +
+  geom_text_repel(aes(label = long_name)) +
+  theme_classic(base_size = 8)
+
+#+ coefficients_table, results='asis'
+
+print(
+  xtable(
+    data.frame(
+      variable =
+        paste(
+          compare.coefficients$long_name, compare.coefficients$unit, sep=', '
+        ),
+      compare.coefficients[c('our_value', 'their_value')]
+    ),
+    digits = c(0,0,3,3)
+  ),
+  type = 'html',
+  include.rownames = FALSE
+)
+
+#' ### Variable importance
+#' 
+#' Let's compare the variable importance from this method with that obtained
+#' when accounting for missing values explicitly. This is a slight kludge,
+#' since it scores a fit based on one imputed dataset against another imputed
+#' dataset, but it should give some idea.
+#' 
+#+ cox_variable_importance
+
+cox.var.imp.perm <- 
+  generalVarImp(
+    fit.exp, imputed.data[[2]][test.set, ], model.type = 'survreg'
+  )
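+
+# generalVarImp (from ../lib/shared.R) computes a permutation importance:
+# roughly, permute one variable at a time in the test data and measure the
+# drop in model performance. A hypothetical sketch for a single variable,
+# where performance() stands in for whatever metric is actually used:
+#   shuffled <- test.data
+#   shuffled[[var]] <- sample(shuffled[[var]])
+#   var.imp <- performance(fit, test.data) - performance(fit, shuffled)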
+
+write.csv(cox.var.imp.perm, cox.var.imp.perm.filename, row.names = FALSE)
+
+cox.var.imp.perm.missing <- read.csv(cox.var.imp.perm.missing.filename)
+
+cox.var.imp.comparison <-
+  merge(
+    cox.var.imp.perm,
+    cox.var.imp.perm.missing,
+    by = 'var',
+    suffixes = c('', '.missing')
+  )
+
+ggplot(cox.var.imp.comparison, aes(x = var.imp.missing, y = var.imp)) +
+  geom_point() +
+  scale_x_log10() +
+  scale_y_log10()
+
+#' There's a good correlation!