--- a
+++ b/random-forest/rf-varselrf-eqv.R
@@ -0,0 +1,553 @@
+#+ knitr_setup, include = FALSE
+
+# Whether to cache the intensive code sections. Set to FALSE to recalculate
+# everything afresh.
+cacheoption <- TRUE
+# Disable lazy caching globally, because it fails for large objects, and all
+# the objects we wish to cache are large...
+knitr::opts_chunk$set(cache.lazy = FALSE)
+
+#' # Variable selection in data-driven health records
+#'
+#' Having extracted around 600 variables which occur most frequently in patient
+#' records, let's try to narrow these down using the methodology of varSelRF,
+#' with a couple of custom additions: potentially multiple variable-importance
+#' calculations at the outset, and cross-validation to choose the best number
+#' of variables at the end.
+#'
+#' ## User variables
+#'
+#+ user_variables
+
+output.filename.base <- '../../output/rf-bigdata-try12-varselrf2'
+
+nsplit <- 20
+n.trees.initial <- 500
+n.forests.initial <- 5
+n.trees.cv <- 500
+n.trees.final <- 2000
+split.rule <- 'logrank'
+n.imputations <- 3
+cv.n.folds <- 3
+vars.drop.frac <- 0.2 # Fraction of variables to drop at each iteration
+bootstraps <- 200
+
+n.data <- NA # This is after any variables have been excluded in prep
+
+n.threads <- 20
+
+#' ## Data set-up
+#'
+#+ data_setup
+
+data.filename.big <- '../../data/cohort-datadriven-02.csv'
+
+surv.predict.old <- c('age', 'smokstatus', 'imd_score', 'gender')
+untransformed.vars <- c('time_death', 'endpoint_death', 'exclude')
+
+source('../lib/shared.R')
+require(xtable)
+
+# Define these after shared.R or they will be overwritten!
+exclude.vars <-
+  c(
+    # Entity type 4 is smoking status, which we already have
+    "clinical.values.4_data1", "clinical.values.4_data5",
+    "clinical.values.4_data6",
+    # Entity 13 data2 is the patient's weight centile, and not a single one is
+    # entered, but they come out as 0 so the algorithm, looking for NAs, thinks
+    # it's a useful column
+    "clinical.values.13_data2",
+    # Entities 148 and 149 are to do with death certification. I'm not sure how
+    # they made it into the dataset, but since all the datapoints here are
+    # looking back in time, they're all NA. This causes rfsrc to fail.
+    "clinical.values.148_data1", "clinical.values.148_data2",
+    "clinical.values.148_data3", "clinical.values.148_data4",
+    "clinical.values.148_data5",
+    "clinical.values.149_data1", "clinical.values.149_data2"
+  )
+
+COHORT <- fread(data.filename.big)
+
+l2df <- function(x) {
+  # Simple function to turn a list into a data frame, so we can compare
+  # variable importances across iterations of the initial forests
+  rownames <- unique(unlist(lapply(x, names)))
+  df <-
+    data.frame(
+      matrix(
+        unlist(lapply(x, FUN = function(xi) { xi[rownames] })),
+        ncol = length(x), byrow = FALSE
+      )
+    )
+  rownames(df) <- rownames
+  df
+}
+
+bigdata.prefixes <-
+  c(
+    'hes.icd.',
+    'hes.opcs.',
+    'tests.enttype.',
+    'clinical.history.',
+    'clinical.values.',
+    'bnf.'
+  )
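+
+# startsWithAny() comes from shared.R, which isn't reproduced here. A minimal
+# sketch of the behaviour assumed below (TRUE wherever a string starts with
+# any one of the given prefixes), defined only as a fallback in case shared.R
+# didn't provide it:
+if (!exists('startsWithAny')) {
+  startsWithAny <- function(x, prefixes) {
+    # OR together the per-prefix matches from base R's startsWith()
+    Reduce(`|`, lapply(prefixes, function(prefix) startsWith(x, prefix)))
+  }
+}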
+
+bigdata.columns <-
+  colnames(COHORT)[
+    which(
+      # Does it start with one of the data column prefixes?
+      startsWithAny(names(COHORT), bigdata.prefixes) &
+        # And it's not one of the columns we want to exclude?
+        !(colnames(COHORT) %in% exclude.vars)
+    )
+  ]
+
+COHORT.bigdata <-
+  COHORT[, c(
+      untransformed.vars, surv.predict.old, bigdata.columns
+    ),
+    with = FALSE
+  ]
+
+# Deal appropriately with missing data
+# Most of the variables are number of days since the first record of that type
+time.based.vars <-
+  names(COHORT.bigdata)[
+    startsWithAny(
+      names(COHORT.bigdata),
+      c('hes.icd.', 'hes.opcs.', 'clinical.history.')
+    )
+  ]
+# We're dealing with these as logicals, so we want non-NA values to be TRUE,
+# ie there is something in the history
+for (j in time.based.vars) {
+  set(COHORT.bigdata, j = j, value = !is.na(COHORT.bigdata[[j]]))
+}
+
+# Again, taking these as logicals, set any non-NA value to TRUE.
+prescriptions.vars <-
+  names(COHORT.bigdata)[startsWith(names(COHORT.bigdata), 'bnf.')]
+for (j in prescriptions.vars) {
+  set(COHORT.bigdata, j = j, value = !is.na(COHORT.bigdata[[j]]))
+}
+
+# This leaves tests and clinical.values, which are test results and should be
+# imputed.
+
+# Manually fix clinical values items...
+#
+# "clinical.values.1_data1"  "clinical.values.1_data2"
+# These are just blood pressure values...fine to impute
+#
+# "clinical.values.13_data1" "clinical.values.13_data3"
+# These are weight and BMI...also fine to impute
+#
+# Entity 5 is alcohol consumption status, 1 = Yes, 2 = No, 3 = Ex, so it should
+# be a factor, and NA can be a factor level
+COHORT.bigdata$clinical.values.5_data1 <-
+  factorNAfix(factor(COHORT.bigdata$clinical.values.5_data1), NAval = 'missing')
+
+# Both gender and smokstatus are factors...fix that
+COHORT.bigdata$gender <- factor(COHORT.bigdata$gender)
+COHORT.bigdata$smokstatus <-
+  factorNAfix(factor(COHORT.bigdata$smokstatus), NAval = 'missing')
+
+# Exclude invalid patients
+COHORT.bigdata <- COHORT.bigdata[!COHORT.bigdata$exclude]
+COHORT.bigdata$exclude <- NULL
+
+COHORT.bigdata <-
+  prepSurvCol(
+    data.frame(COHORT.bigdata), 'time_death', 'endpoint_death', 'Death'
+  )
+
+# If n.data was specified, trim the data table down to size
+if(!is.na(n.data)) {
+  COHORT.bigdata <- sample.df(COHORT.bigdata, n.data)
+}
+
+# Define test set
+test.set <- testSetIndices(COHORT.bigdata, random.seed = 78361)
+
+# Start by predicting survival with all the variables provided
+surv.predict <- c(surv.predict.old, bigdata.columns)
+
+# Set up a csv file to store calibration data, or retrieve previous data
+calibration.filename <- paste0(output.filename.base, '-varselcalibration.csv')
+
+#' ## Run random forest calibration
+#'
+#' If there's not already a calibration file, we run the varSelRF methodology:
+#'
+#' 1. Fit several large forests to the whole training set to obtain averaged
+#'    variable importances.
+#' 2. Cross-validate as the number of most important variables retained is
+#'    reduced (see the sketch of the size schedule below).
+#'
+#' (If there is already a calibration file, we just load the previous work.)
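+#'
+#' The schedule of model sizes comes from `getVarNums()` in shared.R, which
+#' isn't reproduced here. A minimal sketch of the assumed behaviour, dropping
+#' a fraction of the variables at each step in the style of varSelRF
+#' (`getVarNumsSketch` is a hypothetical stand-in, presumably driven by the
+#' `vars.drop.frac` user variable; it is not the function the pipeline
+#' actually calls):
+#'
+#+ get_var_nums_sketch
+
+getVarNumsSketch <- function(n.vars, drop.frac = 0.2, min.vars = 2) {
+  # Start with all the variables, then repeatedly drop a fraction of them,
+  # giving a descending grid of model sizes to cross-validate over
+  sizes <- n.vars
+  while (tail(sizes, 1) > min.vars) {
+    sizes <- c(sizes, max(min.vars, floor(tail(sizes, 1) * (1 - drop.frac))))
+  }
+  sizes
+}
+# For example, getVarNumsSketch(600) starts 600, 480, 384, 307, ... and ends
+# at 2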
+
+#+ rf_var_sel_calibration
+
+# If we've not already done a calibration, then do one
+if(!file.exists(calibration.filename)) {
+  # Start by fitting several forests to all the variables
+  vimps.initial <- list()
+  time.fit <- c()
+  time.vimp <- c()
+
+  # If we haven't already made the initial forests...
+  if(!file.exists(
+      paste0(output.filename.base, '-varimp-', n.forests.initial, '.rds')
+    )
+  ) {
+    for(i in 1:n.forests.initial) {
+      time.start <- handyTimer()
+      surv.model.fit.full <-
+        survivalFit(
+          surv.predict,
+          COHORT.bigdata[-test.set,],
+          model.type = 'rfsrc',
+          n.trees = n.trees.initial,
+          split.rule = split.rule,
+          n.threads = n.threads,
+          nimpute = n.imputations,
+          nsplit = nsplit,
+          na.action = 'na.impute'
+        )
+      time.fit <- c(time.fit, handyTimer(time.start))
+
+      # Save the model
+      saveRDS(
+        surv.model.fit.full,
+        paste0(output.filename.base, '-initialmodel-', i, '.rds')
+      )
+
+      # Calculate variable importance
+      time.start <- handyTimer()
+      var.imp <- vimp(surv.model.fit.full, importance = 'permute.ensemble')
+      time.vimp <- c(time.vimp, handyTimer(time.start))
+      # Save it
+      saveRDS(var.imp, paste0(output.filename.base, '-varimp-', i, '.rds'))
+
+      # Make a vector of variable importances and append it to the list
+      vimps.initial[[i]] <- sort(var.imp$importance, decreasing = TRUE)
+    }
+
+    cat('Total initial vimp time = ', sum(time.fit) + sum(time.vimp), '\n')
+    cat('Average fit time = ', mean(time.fit), '\n')
+    cat('Average vimp time = ', mean(time.vimp), '\n')
+
+  } else {
+    # If we already made the initial forests, just load their importances
+    for(i in 1:n.forests.initial) {
+      var.imp <- readRDS(paste0(output.filename.base, '-varimp-', i, '.rds'))
+      vimps.initial[[i]] <- sort(var.imp$importance, decreasing = TRUE)
+    }
+  }
+
+  # Convert the vimps.initial list to a data frame, one column per forest
+  vimps.initial <- l2df(vimps.initial)
+
+  # Take averages across rows to give the variable importances to use
+  var.importances <- sort(apply(vimps.initial, 1, mean), decreasing = TRUE)
+
+  # Save the result
+  saveRDS(var.importances, paste0(output.filename.base, '-varimp.rds'))
+
+  # Create an empty data frame to aggregate stats per fold
+  cv.performance <- data.frame()
+  # Numbers of variables to try in the cross-validation
+  cv.vars <- getVarNums(length(var.importances))
+  COHORT.cv <- COHORT.bigdata[-test.set, ]
+  # Run cross-validations. No need to parallelise at this level, because rfsrc
+  # is itself parallelised.
+  for(i in 1:length(cv.vars)) {
+    # Get the subset of most important variables to use
+    surv.predict.partial <- names(var.importances)[1:cv.vars[i]]
+
+    # Get folds for cross-validation
+    cv.folds <- cvFolds(nrow(COHORT.cv), cv.n.folds)
+
+    cv.fold.performance <- data.frame()
+
+    for(j in 1:cv.n.folds) {
+      time.start <- handyTimer()
+      # Fit the model to the training set
+      surv.model.fit <-
+        survivalFit(
+          surv.predict.partial,
+          COHORT.cv[-cv.folds[[j]],],
+          model.type = 'rfsrc',
+          n.trees = n.trees.cv,
+          split.rule = split.rule,
+          n.threads = n.threads,
+          nsplit = nsplit,
+          nimpute = n.imputations,
+          na.action = 'na.impute'
+        )
+      time.learn <- handyTimer(time.start)
+
+      time.start <- handyTimer()
+      # Get the C-index on the validation set
+      c.index.val <-
+        cIndex(
+          surv.model.fit, COHORT.cv[cv.folds[[j]],],
+          na.action = 'na.impute'
+        )
+      time.c.index <- handyTimer(time.start)
+
+      time.start <- handyTimer()
+      # Get the calibration score on the validation set
+      calibration.score <-
+        calibrationScore(
+          calibrationTable(
+            surv.model.fit, COHORT.cv[cv.folds[[j]],], na.action = 'na.impute'
+          )
+        )
+      time.calibration <- handyTimer(time.start)
+
+      # Append the stats we've obtained from this fold
+      cv.fold.performance <-
+        rbind(
+          cv.fold.performance,
+          data.frame(
+            calibration = i,
+            cv.fold = j,
+            n.vars = cv.vars[i],
+            c.index.val,
+            calibration.score,
+            time.learn,
+            time.c.index,
+            time.calibration
+          )
+        )
+
+    } # End cross-validation loop (j)
+
+    # rbind the performance by fold
+    cv.performance <-
+      rbind(
+        cv.performance,
+        cv.fold.performance
+      )
+
+    # Save output at the end of each loop
+    write.csv(cv.performance, calibration.filename)
+
+  } # End calibration loop (i)
+} else {
+  cv.performance <- read.csv(calibration.filename)
+  var.importances <- readRDS(paste0(output.filename.base, '-varimp.rds'))
+}
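+
+#' `cvFolds()` above also comes from shared.R. A minimal sketch of the assumed
+#' behaviour, returning a random partition of the row indices into `k` roughly
+#' equal folds (`cvFoldsSketch` is a hypothetical stand-in for illustration):
+#'
+#+ cv_folds_sketch
+
+cvFoldsSketch <- function(n, k) {
+  # Shuffle the row indices, then deal them out into k roughly equal groups
+  split(sample(n), rep(1:k, length.out = n))
+}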
+
+#' ## Find the best model from the calibrations
+#'
+#' ### Plot model performance
+#'
+#+ model_performance
+
+# Find the best calibration...
+# First, average performance across cross-validation folds
+cv.performance.average <-
+  aggregate(
+    c.index.val ~ n.vars,
+    data = cv.performance,
+    mean
+  )
+
+cv.calibration.average <-
+  aggregate(
+    area ~ n.vars,
+    data = cv.performance,
+    mean
+  )
+
+ggplot(cv.performance.average, aes(x = n.vars, y = c.index.val)) +
+  geom_line() +
+  geom_point(data = cv.performance) +
+  ggtitle(label = 'C-index by n.vars')
+
+ggplot(cv.calibration.average, aes(x = n.vars, y = area)) +
+  geom_line() +
+  geom_point(data = cv.performance) +
+  ggtitle(label = 'Calibration performance by n.vars')
+
+# Find the number of variables with the highest average C-index
+n.vars <-
+  cv.performance.average$n.vars[
+    which.max(cv.performance.average$c.index.val)
+  ]
+
+# Select that many of the most important variables for the final model
+surv.predict.partial <- names(var.importances)[1:n.vars]
+
+#' ## Best model
+#'
+#' The best model contained `r n.vars` variables. Let's see what those were...
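+#'
+#' The table below relies on `lookUpDescriptions()` and `percentMissing()`
+#' from shared.R. A minimal sketch of what `percentMissing()` is assumed to do
+#' (`percentMissingSketch` is a hypothetical stand-in; the real helper may
+#' differ, eg by returning a fraction rather than a percentage):
+#'
+#+ percent_missing_sketch
+
+percentMissingSketch <- function(x) {
+  # Percentage of values in the vector which are missing
+  100 * mean(is.na(x))
+}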
+#'
+#+ variables_used
+
+vars.df <-
+  data.frame(
+    vars = surv.predict.partial
+  )
+
+vars.df$descriptions <- lookUpDescriptions(surv.predict.partial)
+
+vars.df$vimp <- var.importances[1:n.vars] / max(var.importances)
+
+missingness <- sapply(COHORT, percentMissing)
+
+vars.df$missingness <- missingness[names(var.importances)[1:n.vars]]
+
+#+ variables_table, results='asis'
+
+print(
+  xtable(vars.df),
+  type = 'html',
+  include.rownames = FALSE
+)
+
+#' ## Perform the final fit
+#'
+#' Having found the best number of variables by cross-validation, let's perform
+#' the final fit with the full training set and `r n.trees.final` trees.
+#'
+#+ final_fit
+
+time.start <- handyTimer()
+surv.model.fit.final <-
+  survivalFit(
+    surv.predict.partial,
+    COHORT.bigdata[-test.set,],
+    model.type = 'rfsrc',
+    n.trees = n.trees.final,
+    split.rule = split.rule,
+    n.threads = n.threads,
+    nimpute = n.imputations,
+    nsplit = nsplit,
+    na.action = 'na.impute'
+  )
+time.fit.final <- handyTimer(time.start)
+
+saveRDS(surv.model.fit.final, paste0(output.filename.base, '-finalmodel.rds'))
+
+#' Final model of `r n.trees.final` trees fitted in `r round(time.fit.final)`
+#' seconds!
+#'
+#' Also bootstrap this final fitting stage. A fully proper bootstrap would
+#' iterate over the whole model-building process, including variable selection,
+#' but that would be prohibitively expensive in computational time.
+#'
+#+ bootstrap_final
+
+COHORT.bigdata$surv_time_round <-
+  round_any(COHORT.bigdata$surv_time, tod.round)
+
+# Create a survival formula with the provided variable names...
+surv.formula <-
+  formula(
+    paste0(
+      # Survival object made in-formula
+      'Surv(surv_time_round, surv_event) ~ ',
+      # Predictor variables then make up the other side
+      paste(surv.predict.partial, collapse = '+')
+    )
+  )
+
+# rfsrc, if installed with OpenMP support, controls threading via the rf.cores
+# option, so set that here
+options(rf.cores = n.threads)
+
+surv.model.fit.boot <-
+  boot(
+    formula = surv.formula,
+    data = COHORT.bigdata[-test.set, ],
+    statistic = bootstrapFitRfsrc,
+    R = bootstraps,
+    parallel = 'no',
+    ncpus = 1, # no parallelism at the boot level; rfsrc is already parallel
+    n.trees = n.trees.final,
+    test.data = COHORT.bigdata[test.set, ],
+    # boot() passes extra arguments to the statistic by name, so we can't
+    # forward them with ... here; n.trees, nimpute and nsplit must therefore
+    # be specified explicitly in this call.
+    nimpute = n.imputations,
+    nsplit = nsplit,
+    na.action = 'na.impute'
+  )
+
+# Get performance statistics, with uncertainties, from the bootstrap fits
+surv.model.fit.coeffs <- bootStats(surv.model.fit.boot, uncertainty = '95ci')
+
+#' ## Performance
+#'
+#' ### C-index
+#'
+#' C-indices are **`r round(surv.model.fit.coeffs['c.train', 'val'], 3)`
+#' (`r round(surv.model.fit.coeffs['c.train', 'lower'], 3)` -
+#' `r round(surv.model.fit.coeffs['c.train', 'upper'], 3)`)**
+#' on the training set and
+#' **`r round(surv.model.fit.coeffs['c.test', 'val'], 3)`
+#' (`r round(surv.model.fit.coeffs['c.test', 'lower'], 3)` -
+#' `r round(surv.model.fit.coeffs['c.test', 'upper'], 3)`)** on the test set.
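+#'
+#' These intervals come from `bootStats()`, another shared.R helper. A minimal
+#' sketch of the assumed behaviour, taking point estimates from the original
+#' fit and 95% percentile confidence intervals from the bootstrap replicates
+#' (`bootStatsSketch` is a hypothetical stand-in for illustration, assuming a
+#' named statistic vector):
+#'
+#+ boot_stats_sketch
+
+bootStatsSketch <- function(boot.obj) {
+  # boot.obj$t0 holds the statistics from the original data; boot.obj$t holds
+  # one row per bootstrap replicate and one column per statistic
+  data.frame(
+    val   = boot.obj$t0,
+    lower = apply(boot.obj$t, 2, quantile, probs = 0.025, na.rm = TRUE),
+    upper = apply(boot.obj$t, 2, quantile, probs = 0.975, na.rm = TRUE),
+    row.names = names(boot.obj$t0)
+  )
+}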
+#'
+#' ### Calibration
+#'
+#' The bootstrapped calibration score is
+#' **`r round(surv.model.fit.coeffs['calibration.score', 'val'], 3)`
+#' (`r round(surv.model.fit.coeffs['calibration.score', 'lower'], 3)` -
+#' `r round(surv.model.fit.coeffs['calibration.score', 'upper'], 3)`)**.
+#'
+#' Let's draw a representative curve from the unbootstrapped fit... (It would
+#' be better to draw all the curves from the bootstrap fit to get an idea of
+#' variability, but I've not implemented this yet.)
+#'
+#+ calibration_plot
+
+calibration.table <-
+  calibrationTable(
+    # Standard calibration options
+    surv.model.fit.final, COHORT.bigdata[test.set,],
+    # Always need to specify NA imputation for rfsrc
+    na.action = 'na.impute'
+  )
+
+calibration.score <- calibrationScore(calibration.table)
+
+calibrationPlot(calibration.table)
+
+#' The area between the calibration curve and the diagonal is
+#' **`r round(calibration.score[['area']], 3)`** +/-
+#' **`r round(calibration.score[['se']], 3)`**.
+#'
+#+ save_results
+
+# Save performance results
+varsToTable(
+  data.frame(
+    model = 'rf-varselrf2',
+    imputation = FALSE,
+    discretised = FALSE,
+    c.index = surv.model.fit.coeffs['c.train', 'val'],
+    c.index.lower = surv.model.fit.coeffs['c.train', 'lower'],
+    c.index.upper = surv.model.fit.coeffs['c.train', 'upper'],
+    calibration.score = surv.model.fit.coeffs['calibration.score', 'val'],
+    calibration.score.lower =
+      surv.model.fit.coeffs['calibration.score', 'lower'],
+    calibration.score.upper =
+      surv.model.fit.coeffs['calibration.score', 'upper']
+  ),
+  performance.file,
+  index.cols = c('model', 'imputation', 'discretised')
+)
\ No newline at end of file