--- a
+++ b/cox-ph/cox-discrete-elasticnet.R
@@ -0,0 +1,678 @@
+#+ knitr_setup, include = FALSE
+
+# Whether to cache the intensive code sections. Set to FALSE to recalculate
+# everything afresh.
+# NOTE(review): cacheoption is not referenced anywhere in this file;
+# presumably it is read by chunk options when run via knitr::spin -- confirm.
+cacheoption <- TRUE
+# Disable lazy caching globally, because it fails for large objects, and all the
+# objects we wish to cache are large...
+# opts_chunk is provided by knitr, which must already be attached when this
+# script is spun.
+opts_chunk$set(cache.lazy = FALSE)
+
+#' # Variable selection in data-driven health records with discretised
+#' # Cox models
+#'
+#' Having extracted around 600 variables which occur most frequently in patient
+#' records, let's try to narrow these down using a methodology based on varSelRF
+#' combined with survival modelling. We'll find the predictability of variables
+#' as defined by the p-value of a logrank test on survival curves of different
+#' categories within that variable, and then iteratively throw out unimportant
+#' variables, cross-validating for optimum performance.
+#'
+#' ## User variables
+#'
+#+ user_variables
+
+# Prefix for all output files produced by this script
+output.filename.base <- '../../output/cox-discrete-elasticnet-08'
+
+# Number of bootstrap resamples for the performance statistics
+bootstraps <- 100
+bootstrap.filename <- paste0(output.filename.base, '-boot-all.csv')
+
+# If not NA, subsample the cohort to this many rows (applied after exclusions)
+n.data <- NA # This is after any variables being excluded in prep
+
+# Threads for the parallel cross-validation below
+n.threads <- 16
+
+#' ## Data set-up
+#'
+#+ data_setup
+
+data.filename.big <- '../../data/cohort-datadriven-02.csv'
+
+# Baseline predictors used by the older (non-data-driven) models
+surv.predict.old <- c('age', 'smokstatus', 'imd_score', 'gender')
+# Columns carried through untouched: survival outcome plus the exclusion flag
+untransformed.vars <- c('time_death', 'endpoint_death', 'exclude')
+
+# shared.R supplies the project helpers used throughout (startsWithAny,
+# percentMissing, prepData, handyTimer, ...)
+source('../lib/shared.R')
+require(xtable)
+
+# Define these after shared.R or they will be overwritten!
+# Columns to drop from the data-driven variable pool. Each group is annotated
+# with the reason it breaks the downstream discretisation/modelling.
+# FIX(review): "clinical.values.65_data1" was listed twice in the original;
+# the duplicate has been removed (harmless to %in%, but misleading to readers).
+exclude.vars <-
+  c(
+    # Entity type 4 is smoking status, which we already have
+    "clinical.values.4_data1", "clinical.values.4_data5",
+    "clinical.values.4_data6",
+    # Entity 13 data2 is the patient's weight centile, and not a single one is
+    # entered, but they come out as 0 so the algorithm, looking for NAs, thinks
+    # it's a useful column
+    "clinical.values.13_data2",
+    # Entities 148 and 149 are to do with death certification. I'm not sure how
+    # it made it into the dataset, but since all the datapoints in this are
+    # looking back in time, they're all NA. This causes rfsrc to fail.
+    "clinical.values.148_data1", "clinical.values.148_data2",
+    "clinical.values.148_data3", "clinical.values.148_data4",
+    "clinical.values.148_data5",
+    "clinical.values.149_data1", "clinical.values.149_data2",
+    # These are all the same value except where NA, which causes issues with
+    # discretisation
+    "clinical.values.14_data2", "clinical.values.62_data1",
+    "clinical.values.64_data1", "clinical.values.65_data1",
+    "clinical.values.67_data1",
+    "clinical.values.68_data2", "clinical.values.70_data1"
+  )
+
+# Read the full cohort as a data.table
+COHORT <- fread(data.filename.big)
+
+# Prefixes identifying the data-driven ("big data") column families
+bigdata.prefixes <-
+  c(
+    'hes.icd.',
+    'hes.opcs.',
+    'tests.enttype.',
+    'clinical.history.',
+    'clinical.values.',
+    'bnf.'
+  )
+
+bigdata.columns <-
+  colnames(COHORT)[
+    which(
+      # Does it start with one of the data column names?
+      startsWithAny(names(COHORT), bigdata.prefixes) &
+      # And it's not one of the columns we want to exclude?
+      !(colnames(COHORT) %in% exclude.vars)
+    )
+  ]
+
+# Working table: outcome columns + baseline predictors + data-driven columns
+COHORT.bigdata <-
+  COHORT[, c(
+    untransformed.vars, surv.predict.old, bigdata.columns
+  ),
+  with = FALSE
+  ]
+
+# Get the missingness before we start removing missing values
+missingness <- sort(sapply(COHORT.bigdata, percentMissing))
+# Remove values for the 'untransformed.vars' above, which are the survival
+# values plus exclude column
+missingness <- missingness[!(names(missingness) %in% untransformed.vars)]
+
+# Deal appropriately with missing data
+# Most of the variables are number of days since the first record of that type
+time.based.vars <-
+  names(COHORT.bigdata)[
+    startsWithAny(
+      names(COHORT.bigdata),
+      c('hes.icd.', 'hes.opcs.', 'clinical.history.')
+    )
+  ]
+# We're dealing with this as a logical, so we want non-NA values to be TRUE,
+# if there is something in the history. data.table::set modifies in place.
+for (j in time.based.vars) {
+  set(COHORT.bigdata, j = j, value = !is.na(COHORT.bigdata[[j]]))
+}
+
+# Again, taking this as a logical, set any non-NA value to TRUE.
+prescriptions.vars <- names(COHORT.bigdata)[startsWith(names(COHORT.bigdata), 'bnf.')]
+for (j in prescriptions.vars) {
+  set(COHORT.bigdata, j = j, value = !is.na(COHORT.bigdata[[j]]))
+}
+
+# This leaves tests and clinical.values, which are test results and should be
+# imputed.
+
+# Manually fix clinical values items...
+#
+# "clinical.values.1_data1" "clinical.values.1_data2"
+# These are just blood pressure values...fine to impute
+#
+# "clinical.values.13_data1" "clinical.values.13_data3"
+# These are weight and BMI...also fine to impute
+#
+# Entity 5 is alcohol consumption status, 1 = Yes, 2 = No, 3 = Ex, so should be
+# a factor, and NA can be a factor level
+COHORT.bigdata$clinical.values.5_data1 <-
+  factorNAfix(factor(COHORT.bigdata$clinical.values.5_data1), NAval = 'missing')
+
+# Both gender and smokstatus are factors...fix that
+COHORT.bigdata$gender <- factor(COHORT.bigdata$gender)
+COHORT.bigdata$smokstatus <-
+  factorNAfix(factor(COHORT.bigdata$smokstatus), NAval = 'missing')
+
+# Exclude invalid patients (exclude is a logical column from the cohort prep)
+COHORT.bigdata <- COHORT.bigdata[!COHORT.bigdata$exclude]
+COHORT.bigdata$exclude <- NULL
+
+# Remove negative survival times
+COHORT.bigdata <- subset(COHORT.bigdata, time_death > 0)
+
+# Define test set (indices held out from all model fitting below)
+test.set <- testSetIndices(COHORT.bigdata, random.seed = 78361)
+
+# If n.data was specified, trim the data table down to size
+if(!is.na(n.data)) {
+  COHORT.bigdata <- sample.df(COHORT.bigdata, n.data)
+}
+
+# Create an appropriate survival column
+COHORT.bigdata <-
+  prepSurvCol(
+    data.frame(COHORT.bigdata), 'time_death', 'endpoint_death', 'Death'
+  )
+
+# Start by predicting survival with all the variables provided
+surv.predict <- c(surv.predict.old, bigdata.columns)
+
+# Set up a csv file to store calibration data, or retrieve previous data
+calibration.filename <- paste0(output.filename.base, '-varselcalibration.csv')
+
+# Create process settings
+
+# Variables to leave alone, including those whose logrank p-value is NA because
+# that means there is only one value in the column and so it can't be discretised
+# properly anyway
+vars.noprocess <- c('surv_time', 'surv_event')
+process.settings <-
+  list(
+    var = vars.noprocess,
+    method = rep(NA, length(vars.noprocess)),
+    settings = rep(list(NA), length(vars.noprocess))
+  )
+# Find continuous variables which will need discretising
+continuous.vars <- names(COHORT.bigdata)[sapply(COHORT.bigdata, class) %in% c('integer', 'numeric')]
+# Remove those variables already explicitly excluded, mainly for those whose
+# logrank score was NA
+continuous.vars <- continuous.vars[!(continuous.vars %in% process.settings$var)]
+# Every continuous variable is discretised into (up to) 9 bins by quantile
+process.settings$var <- c(process.settings$var, continuous.vars)
+process.settings$method <-
+  c(process.settings$method,
+    rep('binByQuantile', length(continuous.vars))
+  )
+process.settings$settings <-
+  c(
+    process.settings$settings,
+    rep(
+      list(
+        seq(
+          # Quantiles are obviously between 0 and 1
+          0, 1,
+          # All have the same number of bins
+          length.out = 10
+        )
+      ),
+      length(continuous.vars)
+    )
+  )
+
+
+# TODO(review): need a way to ID in advance those which are going to fail here,
+# ie those where there are no quantiles. (Original comment was left unfinished.)
+
+# NOTE(review): the meaning of the trailing TRUE argument is defined by
+# prepData in ../lib/shared.R -- confirm before changing.
+COHORT.prep <-
+  prepData(
+    # Data for cross-validation excludes test set
+    COHORT.bigdata,
+    names(COHORT.bigdata),
+    process.settings,
+    'surv_time', 'surv_event',
+    TRUE
+  )
+
+# Kludge...remove surv_time.1 and rename surv_event.1
+COHORT.prep$surv_time.1 <- NULL
+names(COHORT.prep)[names(COHORT.prep) == 'surv_event.1'] <- 'surv_event'
+
+# Expand factors into one binary (dummy) column per level for glmnet
+COHORT.bin <- convertFactorsToBinaryColumns(COHORT.prep)
+# model.matrix renames logicals to varTRUE, so fix that for status
+colnames(COHORT.bin)[colnames(COHORT.bin) == 'surv_eventTRUE'] <- 'surv_event'
+
+# Coxnet code, should you ever decide to go that route
+# test <-
+#   Coxnet(
+#     data.matrix(COHORT.bin[-test.set, !(colnames(COHORT.bin) %in% c('time', 'status'))]),
+#     data.matrix(COHORT.bin[-test.set, c('time', 'status')]),
+#     penalty = 'Enet',
+#     alpha = 0,
+#     nlambda = 50, nfolds = 10, maxit = 1e+5
+#   )
+
+#' ## Elastic net regression
+#'
+#' Run a loop over alphas running from LASSO to ridge regression, and see which
+#' is best after tenfold cross-validation...
+#'
+#+ elastic_net_full
+
+require(glmnet)
+initParallel(n.threads)
+
+time.start <- handyTimer()
+
+# Grid of elastic-net mixing parameters: 0 = ridge, 1 = LASSO
+alphas <- seq(0, 1, length.out = 101)
+# NOTE: despite the name, for family = "cox" cv.glmnet's cvm is the
+# cross-validated partial-likelihood deviance, not a mean-squared error.
+mse <- c()
+
+# NOTE(review): no set.seed before cv.glmnet, so fold assignment (and hence
+# the chosen alpha) will vary between runs -- confirm this is intended.
+for(alpha in alphas) {
+  cv.fit <-
+    cv.glmnet(
+      COHORT.bin[-test.set, !(colnames(COHORT.bin) %in% c('surv_time', 'surv_event'))],
+      Surv(COHORT.bin[-test.set, 'surv_time'], COHORT.bin[-test.set, 'surv_event']),
+      family = "cox",
+      maxit = 1000,
+      alpha = alpha,
+      parallel = TRUE
+    )
+  # Record the CV error at the lambda minimising it for this alpha
+  best.lambda.i <- which(cv.fit$lambda == cv.fit$lambda.min)
+  mse <- c(mse, cv.fit$cvm[best.lambda.i])
+}
+
+time.cv <- handyTimer(time.start)
+
+write.csv(
+  data.frame(
+    alphas, mse
+  ),
+  paste0(output.filename.base, '-alpha-calibration.csv')
+)
+
+#' `r length(alphas)` alpha values tested in `r time.cv` seconds!
+
+alpha.best <- alphas[which.min(mse)]
+
+# To avoid saving all the fits, let's just refit the best one
+# (new CV folds are drawn here, so lambda.min may differ slightly from the
+# value found in the loop above)
+cv.fit <-
+  cv.glmnet(
+    COHORT.bin[-test.set, !(colnames(COHORT.bin) %in% c('surv_time', 'surv_event'))],
+    Surv(COHORT.bin[-test.set, 'surv_time'], COHORT.bin[-test.set, 'surv_event']),
+    family = "cox",
+    maxit = 1000,
+    alpha = alpha.best,
+    parallel = TRUE
+  )
+
+# Save for future use
+saveRDS(cv.fit, 'cv.fit.rds')
+
+#' The best alpha was `r alpha.best`, and the lambda with the lowest mean-square
+#' error was `r cv.fit$lambda.min`. We'll be using the strictest lambda which is
+#' within 1 se of the minimum, `r cv.fit$lambda.1se`.
+#'
+#' ## Performance
+#'
+#' ### C-index
+#'
+#' Calculate C-index manually. The glmnet interface requiring matrices is
+#' sufficiently different to the usual one that I've not spent time integrating
+#' it with the rest of the ``handymedical.R`` functions yet.
+#'
+#+ c_index
+
+# Compute Harrell's C-index for a glmnet Cox fit.
+#
+# model.fit: a (cv.)glmnet Cox model, consumed via the project helper getRisk.
+# dm:        binary model matrix including 'surv_time' and 'surv_event' columns.
+#
+# Returns a single numeric concordance value.
+# NOTE(review): survConcordance is deprecated in recent versions of the
+# survival package in favour of concordance() -- confirm package version.
+glmnetCIndex <- function(model.fit, dm) {
+  test.predictions <-
+    getRisk(
+      model.fit,
+      dm[, !(colnames(dm) %in% c('surv_time', 'surv_event'))]
+    )
+
+  as.numeric(
+    survConcordance(
+      as.formula(paste0('Surv(surv_time, surv_event) ~ risk')),
+      data.frame(
+        surv_time = dm[, 'surv_time'],
+        surv_event = dm[, 'surv_event'],
+        risk = test.predictions
+      )
+    )$concordance
+  )
+}
+
+c.index <- glmnetCIndex(cv.fit, COHORT.bin[test.set, ])
+
+#' C-index is `r c.index`.
+#'
+#' ### Calibration
+#'
+#' For now, calibration is manual too just to get it working. It's surprisingly
+#' hard... A package called `c060` should contain a function `predictProb`, but
+#' on loading it, is says function not found. So here is a very manual solution,
+#' creating a dummy Cox model using the `survival` package, inspired by
+#' [this](https://stat.ethz.ch/pipermail/r-help/2012-May/312029.html).
+#'
+#+ calibration_plot
+
+# Build a calibration table (predicted risk vs observed event) for a glmnet
+# Cox fit at a fixed horizon.
+#
+# model.fit: a cv.glmnet Cox model (lambda.1se is used for predictions).
+# dm:        binary model matrix containing BOTH training and test rows,
+#            including 'surv_time' and 'surv_event' columns.
+# test.set:  row indices of dm forming the test set; all other rows are
+#            treated as the training set for the baseline hazard.
+# risk.time: horizon at which risk is evaluated, in the units of surv_time
+#            (presumably years -- confirm against cohort prep).
+#
+# Returns a data.frame with columns 'risk' (predicted probability of death by
+# risk.time) and 'event' (TRUE/FALSE/NA; NA = censored before risk.time).
+glmnetCalibrationTable <- function(model.fit, dm, test.set, risk.time = 5) {
+  # Work out risks at risk.time for the special case of a glmnet model
+
+  # Derive baseline hazard from cv.glmnet model, heavily based on the
+  # glmnet.survcurve and glmnet.basesurv functions in hdnom...
+
+  # Get predictions from the training set, because it's the training set whose
+  # baseline hazard we need
+
+  # This is the relevant section of glmnet.survcurve from 02-hdnom-nomogram.R:
+  #  lp = as.numeric(predict(object, newx = data.matrix(x),
+  #                          s = object$'lambda', type = 'link'))
+  # lp means linear predictor from predict.glmnet, because type = 'link'
+  lp <-
+    as.numeric(
+      predict(
+        model.fit,
+        newx =
+          data.matrix(
+            dm[-test.set, !(colnames(dm) %in% c('surv_time', 'surv_event'))]
+          ),
+        s = model.fit$lambda.1se,
+        type = 'link'
+      )
+    )
+  # At all unique times in the training set...
+  t.unique <-
+    # MUST sort these or the cumulative sum below will go crazy!
+    sort(unique(dm[-test.set, 'surv_time'][dm[-test.set, 'surv_event'] == 1L]))
+
+  # Breslow-type estimate of the baseline hazard increments: at each unique
+  # event time, (number of events) / (sum of exp(lp) over the at-risk set).
+  # NOTE: this 'alpha' is a hazard increment, unrelated to the elastic-net
+  # alpha used elsewhere in this script.
+  # NOTE(review): if there were no events at all, t.unique would be empty and
+  # 1:length(t.unique) would misbehave -- cannot happen for this cohort, but
+  # seq_along would be safer.
+  alpha <- c()
+  for (i in 1:length(t.unique)) {
+    # ...loop over calculating the fraction of the population which dies at each
+    # timepoint
+    alpha[i] <-
+      sum(
+        # Training set
+        dm[-test.set, 'surv_time'][
+          dm[-test.set, 'surv_event'] == 1
+        ] == t.unique[i]
+      ) /
+      sum(
+        exp(lp[dm[-test.set, 'surv_time'] >= t.unique[i]])
+      )
+  }
+
+  # Get the cumulative hazard at risk.time by interpolating...
+  # yleft = 0: cumulative hazard is zero before the first event time;
+  # rule = 2: carry the last value forward beyond the final event time.
+  baseline.cumhaz <-
+    approx(
+      t.unique, cumsum(alpha), yleft = 0, xout = risk.time, rule = 2
+    )$y
+
+  # Get predictions from the test set to modify the baseline hazard with
+  lp.test <-
+    as.numeric(
+      predict(
+        model.fit,
+        newx =
+          data.matrix(
+            dm[test.set, !(colnames(dm) %in% c('surv_time', 'surv_event'))]
+          ),
+        s = model.fit$lambda.1se,
+        type = 'link'
+      )
+    )
+
+  # 1 minus to get % dead rather than alive
+  risks <- 1 - exp(-exp(lp.test) * (baseline.cumhaz))
+
+  calibration.table <-
+    data.frame(
+      surv_event = dm[test.set, 'surv_event'],
+      surv_time = dm[test.set, 'surv_time'],
+      risk = risks
+    )
+
+  # Was there an event? Start with NA, because default is unknown (ie censored)
+  calibration.table$event <- NA
+  # Event before risk.time
+  calibration.table$event[
+    calibration.table$surv_event & calibration.table$surv_time <= risk.time
+  ] <- TRUE
+  # Event after, whether censorship or not, means no event by risk.time
+  calibration.table$event[calibration.table$surv_time > risk.time] <- FALSE
+  # Otherwise, censored before risk.time, leave as NA
+
+  # Drop unnecessary columns and return
+  calibration.table[, c('risk', 'event')]
+}
+
+calibration.table <- glmnetCalibrationTable(cv.fit, COHORT.bin, test.set)
+
+calibration.score <- calibrationScore(calibration.table)
+
+calibrationPlot(calibration.table, show.censored = TRUE, max.points = 10000)
+
+#' Calibration score is `r calibration.score`.
+
+#' ### Coefficients
+#'
+#' The elastic net regression generates coefficients for every factor/level
+#' combination (for continuous values, this means that every decile gets its own
+#' coefficient). The lambda penalty means that some amount of variable selection
+#' is performed in this process, penalising too many large coefficients.
+#' Depending on the value of alpha, quite a few of the coefficients can be
+#' exactly zero. Let's have a look at what we got...
+#'
+#+ coefficients
+
+# Make a data frame of the coefficients in decreasing order
+# NOTE(review): coef(cv.fit, s = "lambda.1se") is evaluated three times here;
+# caching it in a local would be cheaper, but the result is identical.
+cv.fit.coefficients.ordered <-
+  data.frame(
+    factorlevel = # Names are ordered in decreasing order of absolute value
+      factorOrderedLevels(
+        colnames(COHORT.bin)[
+          order(abs(coef(cv.fit, s = "lambda.1se")), decreasing = TRUE)
+        ]
+      ),
+    val =
+      coef(cv.fit, s = "lambda.1se")[
+        order(abs(coef(cv.fit, s = "lambda.1se")), decreasing = TRUE)
+      ]
+  )
+
+# Get the variable names by removing TRUE, (x,y] or missing from the end
+cv.fit.coefficients.ordered$var <-
+  gsub('TRUE', '', cv.fit.coefficients.ordered$factorlevel)
+cv.fit.coefficients.ordered$var <-
+  # Can contain numbers, decimals, e+/- notation and commas separating bounds
+  gsub('\\([0-9,.e\\+-]+\\]', '', cv.fit.coefficients.ordered$var)
+cv.fit.coefficients.ordered$var <-
+  gsub('missing', '', cv.fit.coefficients.ordered$var)
+# Kludgey manual fix for 5_data1 which can take values 1, 2 or 3 and is
+# therefore very hard to catch
+cv.fit.coefficients.ordered$var <-
+  gsub(
+    'clinical.values.5_data1[0-9]', 'clinical.values.5_data1',
+    cv.fit.coefficients.ordered$var
+  )
+
+# And then get human-readable descriptions
+cv.fit.coefficients.ordered$desc <-
+  lookUpDescriptions(cv.fit.coefficients.ordered$var)
+
+#' #### Top 30 coefficients
+#'
+#+ coefficients_table, results='asis'
+print(
+  xtable(
+    cv.fit.coefficients.ordered[1:30, c('desc', 'factorlevel', 'val')],
+    digits = c(0, 0, 0, 3)
+  ),
+  type = 'html',
+  include.rownames = FALSE
+)
+
+#' #### Graph of all coefficient values
+#'
+#' Nonzero values are red, zero values are blue.
+
+ggplot(cv.fit.coefficients.ordered, aes(x = factorlevel, y = val, colour = val == 0)) +
+  geom_point() +
+  theme(
+    axis.title.x=element_blank(), axis.text.x=element_blank(),
+    axis.ticks.x=element_blank()
+  )
+
+#' Overall, there are `r sum(cv.fit.coefficients.ordered$val != 0)` nonzero
+#' coefficients out of `r nrow(cv.fit.coefficients.ordered)`. In the case of
+#' multilevel factors or continuous values, multiple coefficients may result
+#' from a single variable in the original data. Correcting for this, there are
+#' `r length(unique(cv.fit.coefficients.ordered$var[cv.fit.coefficients.ordered$val != 0]))`
+#' unique variables represented out of
+#' `r length(unique(cv.fit.coefficients.ordered$var))` total variables.
+#'
+#' ## Bootstrapping
+#'
+#' Having got those results for a single run on all the data, now bootstrap to
+#' find sample-induced variability in performance statistics. Again, because
+#' glmnet requires a matrix rather than a data frame this would require a large
+#' amount of code in ``handymedical.R``, so do this manually.
+#'
+#+ bootstrap_performance
+
+time.start <- handyTimer()
+
+# Instantiate a blank data frame
+# (grown by rbind each iteration; acceptable for 100 bootstraps)
+bootstrap.params <- data.frame()
+
+for(i in 1:bootstraps) {
+  # Take a bootstrap sample of the training set. We do this with COHORT.prep for
+  # the variable importance calculations later.
+  COHORT.prep.boot <- bootstrapSampleDf(COHORT.prep[-test.set, ])
+
+  # Create a binary matrix for fitting
+  COHORT.boot <- convertFactorsToBinaryColumns(COHORT.prep.boot)
+  # model.matrix renames logicals to varTRUE, so fix that for status
+  colnames(COHORT.boot)[colnames(COHORT.boot) == 'surv_eventTRUE'] <- 'surv_event'
+
+  # Fit, but with alpha fixed on the optimal value
+  cv.fit.boot <- #readRDS('cv.fit.rds')
+    cv.glmnet(
+      COHORT.boot[, !(colnames(COHORT.boot) %in% c('surv_time', 'surv_event'))],
+      Surv(COHORT.boot[, 'surv_time'], COHORT.boot[, 'surv_event']),
+      family = "cox",
+      maxit = 1000,
+      alpha = alpha.best
+    )
+
+  # C-index on the (fixed) held-out test set
+  c.index.boot <- glmnetCIndex(cv.fit.boot, COHORT.bin[test.set,])
+
+  # Stack bootstrap training rows on top of the test rows so the baseline
+  # hazard comes from this bootstrap sample, and pass the test-row indices
+  calibration.table.boot <-
+    glmnetCalibrationTable(
+      cv.fit.boot, rbind(COHORT.boot, COHORT.bin[test.set, ]),
+      test.set = (nrow(COHORT.boot) + 1):(nrow(COHORT.boot) + nrow(COHORT.bin[test.set, ]))
+    )
+
+  calibration.boot <- calibrationScore(calibration.table.boot)
+
+  print(calibrationPlot(calibration.table.boot))
+
+  # Permutation variable importance: scramble one column at a time and record
+  # the drop in test-set C-index relative to the unscrambled model.
+  var.imp.vector <- c()
+  # Loop over variables to get variable importance
+  for(
+    var in
+    colnames(
+      COHORT.prep.boot[, !(colnames(COHORT.prep.boot) %in% c('surv_time', 'surv_event'))]
+    )
+  ) {
+    # Create a dummy data frame and scramble the column var
+    COHORT.vimp <- COHORT.prep.boot
+    COHORT.vimp[, var] <- sample(COHORT.vimp[, var], replace = TRUE)
+    # Make it into a model matrix for fitting
+    COHORT.vimp <- convertFactorsToBinaryColumns(COHORT.vimp)
+    # model.matrix renames logicals to varTRUE, so fix that for status
+    colnames(COHORT.vimp)[colnames(COHORT.vimp) == 'surv_eventTRUE'] <- 'surv_event'
+    # Calculate the new C-index
+    c.index.vimp <- glmnetCIndex(cv.fit.boot, COHORT.vimp)
+
+    # Append the difference between the C-index with scrambling and the original
+    var.imp.vector <-
+      c(
+        var.imp.vector,
+        c.index.boot - c.index.vimp
+      )
+  }
+
+  names(var.imp.vector) <-
+    paste0(
+      'vimp.c.index.',
+      colnames(
+        COHORT.prep.boot[
+          ,
+          !(colnames(COHORT.prep.boot) %in% c('surv_time', 'surv_event'))
+        ]
+      )
+    )
+
+  bootstrap.params <-
+    rbind(
+      bootstrap.params,
+      data.frame(
+        t(var.imp.vector),
+        c.index = c.index.boot,
+        calibration.score = calibration.boot
+      )
+    )
+
+  # Save the bootstrap parameters for later use
+  # (written every iteration as a checkpoint against long-run failures)
+  write.csv(bootstrap.params, bootstrap.filename)
+}
+
+time.boot.final <- handyTimer(time.start)
+
+#' `r bootstraps` bootstrap fits completed in `r time.boot.final` seconds!
+
+# Get coefficients and variable importances from bootstrap fits
+surv.model.fit.coeffs <- bootStatsDf(bootstrap.params)
+
+print(surv.model.fit.coeffs)
+
+# Save performance results
+# NOTE(review): performance.file is not defined in this file; presumably it
+# comes from ../lib/shared.R -- confirm.
+varsToTable(
+  data.frame(
+    model = 'cox-elnet',
+    imputation = FALSE,
+    discretised = TRUE,
+    c.index = surv.model.fit.coeffs['c.index', 'val'],
+    c.index.lower = surv.model.fit.coeffs['c.index', 'lower'],
+    c.index.upper = surv.model.fit.coeffs['c.index', 'upper'],
+    calibration.score = surv.model.fit.coeffs['calibration.score', 'val'],
+    calibration.score.lower =
+      surv.model.fit.coeffs['calibration.score', 'lower'],
+    calibration.score.upper =
+      surv.model.fit.coeffs['calibration.score', 'upper']
+  ),
+  performance.file,
+  index.cols = c('model', 'imputation', 'discretised')
+)
+
+#' The bootstrapped C-index is
+#' **`r round(surv.model.fit.coeffs['c.index', 'val'], 3)`
+#' (`r round(surv.model.fit.coeffs['c.index', 'lower'], 3)` -
+#' `r round(surv.model.fit.coeffs['c.index', 'upper'], 3)`)**
+#' on the held-out test set.
+#'
+#' The bootstrapped calibration score is
+#' **`r round(surv.model.fit.coeffs['calibration.score', 'val'], 3)`
+#' (`r round(surv.model.fit.coeffs['calibration.score', 'lower'], 3)` -
+#' `r round(surv.model.fit.coeffs['calibration.score', 'upper'], 3)`)**.
+#'
+#' ### Variable importances
+#'
+#' Top 20 most important variables from the most recent bootstrap. (This is
+#' obviously indicative but just to plot a quick graph and get an idea.)
+#'
+#+ bootstrap_var_imp
+
+# Variable importances from the final bootstrap iteration: var.imp.vector is
+# whatever the last loop iteration left behind, named 'vimp.c.index.<var>'.
+boot.var.imp.ordered <-
+  data.frame(
+    var = textAfter(names(var.imp.vector), 'vimp.c.index.'),
+    val = var.imp.vector,
+    stringsAsFactors = FALSE
+  )
+
+# Human-readable descriptions for the plot labels
+boot.var.imp.ordered$desc <- lookUpDescriptions(boot.var.imp.ordered$var)
+
+# FIX(review): the original indexed with order(val[1:20], decreasing = TRUE),
+# which merely permutes the FIRST 20 rows rather than selecting the 20 most
+# important variables. Select the top 20 by importance instead, and reorder
+# the x axis by descending importance so the bars are meaningfully sorted.
+ggplot(
+  boot.var.imp.ordered[order(boot.var.imp.ordered$val, decreasing = TRUE)[1:20], ],
+  aes(x = reorder(var, -val), y = val)
+  ) +
+  geom_bar(stat = 'identity')
\ No newline at end of file