MLehealth / Git / Diff of /cox-ph/cox-discrete-elasticnet.R

Models:

philipB/

MLehealth

Downloads: 1

Diff of /cox-ph/cox-discrete-elasticnet.R [000000] .. [0375db]

Switch to unified view

 b/cox-ph/cox-discrete-elasticnet.R
+#+ knitr_setup, include = FALSE
+# Whether to cache the intensive code sections. Set to FALSE to recalculate
+# everything afresh.
+cacheoption <- TRUE
+# Disable lazy caching globally, because it fails for large objects, and all the
+# objects we wish to cache are large...
+opts_chunk$set(cache.lazy = FALSE)
+#' # Variable selection in data-driven health records with discretised
+#' # Cox models
+#'
+#' Having extracted around 600 variables which occur most frequently in patient
+#' records, let's try to narrow these down using a methodology based on varSelRf
+#' combined with survival modelling. We'll find the predictability of variables
+#' as defined by the p-value of a logrank test on survival curves of different
+#' categories within that variable, and then iteratively throw out unimportant
+#' variables, cross-validating for optimum performance.
+#'
+#' ## User variables
+#'
+#+ user_variables
+output.filename.base <- '../../output/cox-discrete-elasticnet-08'
+bootstraps <- 100
+bootstrap.filename <- paste0(output.filename.base, '-boot-all.csv')
+n.data <- NA # This is after any variables being excluded in prep
+n.threads <- 16
+#' ## Data set-up
+#'
+#+ data_setup
+data.filename.big <- '../../data/cohort-datadriven-02.csv'
+surv.predict.old <- c('age', 'smokstatus', 'imd_score', 'gender')
+untransformed.vars <- c('time_death', 'endpoint_death', 'exclude')
+source('../lib/shared.R')
+require(xtable)
+# Define these after shared.R or they will be overwritten!
+exclude.vars <-
+  c(
+    # Entity type 4 is smoking status, which we already have
+    "clinical.values.4_data1", "clinical.values.4_data5",
+    "clinical.values.4_data6",
+    # Entity 13 data2 is the patient's weight centile, and not a single one is
+    # entered, but they come out as 0 so the algorithm, looking for NAs, thinks
+    # it's a useful column
+    "clinical.values.13_data2",
+    # Entities 148 and 149 are to do with death certification. I'm not sure how
+    # it made it into the dataset, but since all the datapoints in this are
+    # looking back in time, they're all NA. This causes rfsrc to fail.
+    "clinical.values.148_data1", "clinical.values.148_data2",
+    "clinical.values.148_data3", "clinical.values.148_data4",
+    "clinical.values.148_data5",
+    "clinical.values.149_data1", "clinical.values.149_data2",
+    # These are all the same value except where NA, which causes issues with
+    # discretisation
+    "clinical.values.14_data2", "clinical.values.62_data1",
+    "clinical.values.64_data1", "clinical.values.65_data1",
+    "clinical.values.65_data1", "clinical.values.67_data1",
+    "clinical.values.68_data2", "clinical.values.70_data1"
+  )
+COHORT <- fread(data.filename.big)
+bigdata.prefixes <-
+  c(
+    'hes.icd.',
+    'hes.opcs.',
+    'tests.enttype.',
+    'clinical.history.',
+    'clinical.values.',
+    'bnf.'
+  )
+bigdata.columns <-
+  colnames(COHORT)[
+    which(
+      # Does is start with one of the data column names?
+      startsWithAny(names(COHORT), bigdata.prefixes) &
+        # And it's not one of the columns we want to exclude?
+        !(colnames(COHORT) %in% exclude.vars)
+    )
+    ]
+COHORT.bigdata <-
+  COHORT[, c(
+    untransformed.vars, surv.predict.old, bigdata.columns
+  ),
+  with = FALSE
+  ]
+# Get the missingness before we start removing missing values
+missingness <- sort(sapply(COHORT.bigdata, percentMissing))
+# Remove values for the 'untransformed.vars' above, which are the survival
+# values plus exclude column
+missingness <- missingness[!(names(missingness) %in% untransformed.vars)]
+# Deal appropriately with missing data
+# Most of the variables are number of days since the first record of that type
+time.based.vars <-
+  names(COHORT.bigdata)[
+    startsWithAny(
+      names(COHORT.bigdata),
+      c('hes.icd.', 'hes.opcs.', 'clinical.history.')
+    )
+    ]
+# We're dealing with this as a logical, so we want non-NA values to be TRUE,
+# is there is something in the history
+for (j in time.based.vars) {
+  set(COHORT.bigdata, j = j, value = !is.na(COHORT.bigdata[[j]]))
+}
+# Again, taking this as a logical, set any non-NA value to TRUE.
+prescriptions.vars <- names(COHORT.bigdata)[startsWith(names(COHORT.bigdata), 'bnf.')]
+for (j in prescriptions.vars) {
+  set(COHORT.bigdata, j = j, value = !is.na(COHORT.bigdata[[j]]))
+}
+# This leaves tests and clinical.values, which are test results and should be
+# imputed.
+# Manually fix clinical values items...
+#
+# "clinical.values.1_data1"  "clinical.values.1_data2"
+# These are just blood pressure values...fine to impute
+#
+# "clinical.values.13_data1" "clinical.values.13_data3"
+# These are weight and BMI...also fine to impute
+#
+# Entity 5 is alcohol consumption status, 1 = Yes, 2 = No, 3 = Ex, so should be
+# a factor, and NA can be a factor level
+COHORT.bigdata$clinical.values.5_data1 <-
+  factorNAfix(factor(COHORT.bigdata$clinical.values.5_data1), NAval = 'missing')
+# Both gender and smokstatus are factors...fix that
+COHORT.bigdata$gender <- factor(COHORT.bigdata$gender)
+COHORT.bigdata$smokstatus <-
+  factorNAfix(factor(COHORT.bigdata$smokstatus), NAval = 'missing')
+# Exclude invalid patients
+COHORT.bigdata <- COHORT.bigdata[!COHORT.bigdata$exclude]
+COHORT.bigdata$exclude <- NULL
+# Remove negative survival times
+COHORT.bigdata <- subset(COHORT.bigdata, time_death > 0)
+# Define test set
+test.set <- testSetIndices(COHORT.bigdata, random.seed = 78361)
+# If n.data was specified, trim the data table down to size
+if(!is.na(n.data)) {
+  COHORT.bigdata <- sample.df(COHORT.bigdata, n.data)
+}
+# Create an appropraite survival column
+COHORT.bigdata <-
+  prepSurvCol(
+    data.frame(COHORT.bigdata), 'time_death', 'endpoint_death', 'Death'
+  )
+# Start by predicting survival with all the variables provided
+surv.predict <- c(surv.predict.old, bigdata.columns)
+# Set up a csv file to store calibration data, or retrieve previous data
+calibration.filename <- paste0(output.filename.base, '-varselcalibration.csv')
+# Create process settings
+# Variables to leave alone, including those whose logrank p-value is NA because
+# that means there is only one value in the column and so it can't be discretised
+# properly anyway
+vars.noprocess <- c('surv_time', 'surv_event')
+process.settings <-
+  list(
+    var        = vars.noprocess,
+    method     = rep(NA, length(vars.noprocess)),
+    settings   = rep(list(NA), length(vars.noprocess))
+  )
+# Find continuous variables which will need discretising
+continuous.vars <- names(COHORT.bigdata)[sapply(COHORT.bigdata, class) %in% c('integer', 'numeric')]
+# Remove those variables already explicitly excluded, mainly for those whose
+# logrank score was NA
+continuous.vars <- continuous.vars[!(continuous.vars %in% process.settings$var)]
+process.settings$var <- c(process.settings$var, continuous.vars)
+process.settings$method <-
+  c(process.settings$method,
+    rep('binByQuantile', length(continuous.vars))
+  )
+process.settings$settings <-
+  c(
+    process.settings$settings,
+    rep(
+      list(
+        seq(
+          # Quantiles are obviously between 0 and 1
+, 1,
+          # All have the same number of bins
+          length.out = 10
+        )
+      ),
+      length(continuous.vars)
+    )
+  )
+# Need a way to ID in advance those which are going to fail here, ie those where
+# there are no quantiles. The
+COHORT.prep <-
+  prepData(
+    # Data for cross-validation excludes test set
+    COHORT.bigdata,
+    names(COHORT.bigdata),
+    process.settings,
+    'surv_time', 'surv_event',
+    TRUE
+  )
+# Kludge...remove surv_time.1 and rename surv_event.1
+COHORT.prep$surv_time.1 <- NULL
+names(COHORT.prep)[names(COHORT.prep) == 'surv_event.1'] <- 'surv_event'
+COHORT.bin <- convertFactorsToBinaryColumns(COHORT.prep)
+# model.matrix renames logicals to varTRUE, so fix that for status
+colnames(COHORT.bin)[colnames(COHORT.bin) == 'surv_eventTRUE'] <- 'surv_event'
+# Coxnet code, should you ever decide to go that route
+# test <-
+#   Coxnet(
+#     data.matrix(COHORT.bin[-test.set, !(colnames(COHORT.bin) %in% c('time', 'status'))]),
+#     data.matrix(COHORT.bin[-test.set, c('time', 'status')]),
+#     penalty = 'Enet',
+#     alpha = 0,
+#     nlambda = 50, nfolds = 10, maxit = 1e+5
+#   )
+#' ## Elastic net regression
+#'
+#' Run a loop over alphas running from LASSO to ridge regression, and see which
+#' is best after tenfold cross-validation...
+#'
+#+ elastic_net_full
+require(glmnet)
+initParallel(n.threads)
+time.start <- handyTimer()
+alphas <- seq(0, 1, length.out = 101)
+mse <- c()
+for(alpha in alphas) {
+  cv.fit <-
+    cv.glmnet(
+      COHORT.bin[-test.set, !(colnames(COHORT.bin) %in% c('surv_time', 'surv_event'))],
+      Surv(COHORT.bin[-test.set, 'surv_time'], COHORT.bin[-test.set, 'surv_event']),
+      family = "cox",
+      maxit = 1000,
+      alpha = alpha,
+      parallel = TRUE
+    )
+  best.lambda.i <- which(cv.fit$lambda == cv.fit$lambda.min)
+  mse <- c(mse, cv.fit$cvm[best.lambda.i])
+}
+time.cv <- handyTimer(time.start)
+write.csv(
+  data.frame(
+    alphas, mse
+  ),
+  paste0(output.filename.base, '-alpha-calibration.csv')
+)
+#' `r length(alphas)` alpha values tested in `r time.cv` seconds!
+alpha.best <- alphas[which.min(mse)]
+# To avoid saving all the fits, let's just refit the best one
+cv.fit <-
+  cv.glmnet(
+    COHORT.bin[-test.set, !(colnames(COHORT.bin) %in% c('surv_time', 'surv_event'))],
+    Surv(COHORT.bin[-test.set, 'surv_time'], COHORT.bin[-test.set, 'surv_event']),
+    family = "cox",
+    maxit = 1000,
+    alpha = alpha.best,
+    parallel = TRUE
+  )
+# Save for future use
+saveRDS(cv.fit, 'cv.fit.rds')
+#' The best alpha was `r alpha.best`, and the lambda with the lowest mean-square
+#' error was `r cv.fit$lambda.min`. We'll be using the strictest lambda which is
+#' within 1 se of the minimum, `r cv.fit$lambda.1se`.
+#'
+#' ## Performance
+#'
+#' ### C-index
+#'
+#' Calculate C-index manually. The glmnet interface requiring matrices is
+#' sufficiently different to the usual one that I've not spent time integrating
+#' it with the rest of the ``handymedical.R`` functions yet.
+#'
+#+ c_index
+glmnetCIndex <- function(model.fit, dm) {
+  test.predictions <-
+    getRisk(
+      model.fit,
+      dm[, !(colnames(dm) %in% c('surv_time', 'surv_event'))]
+    )
+  as.numeric(
+    survConcordance(
+      as.formula(paste0('Surv(surv_time, surv_event) ~ risk')),
+      data.frame(
+        surv_time = dm[, 'surv_time'],
+        surv_event = dm[, 'surv_event'],
+        risk = test.predictions
+      )
+    )$concordance
+  )
+}
+c.index <- glmnetCIndex(cv.fit, COHORT.bin[test.set, ])
+#' C-index is `r c.index`.
+#'
+#' ### Calibration
+#'
+#' For now, calibration is manual too just to get it working. It's surprisingly
+#' hard... A package called `c060` should contain a function `predictProb`, but
+#' on loading it, is says function not found. So here is a very manual solution,
+#' creating a dummy Cox model using the `survival` package, inspired by
+#' [this](https://stat.ethz.ch/pipermail/r-help/2012-May/312029.html).
+#'
+#+ calibration_plot
+glmnetCalibrationTable <- function(model.fit, dm, test.set, risk.time = 5) {
+  # Work out risks at risk.time for the special case of a glmnet model
+  # Derive baseline hazard from cv.glmnet model, heavily based on the
+  # glmnet.survcurve and glmnet.basesurv functions in hdnom...
+  # Get predictions from the training set, because it's the training set whose
+  # baseline hazard we need
+  # This is the relevant section of glmnet.survcurve from 02-hdnom-nomogram.R:
+  # lp = as.numeric(predict(object, newx = data.matrix(x),
+  #                         s = object$'lambda', type = 'link'))
+  # lp means linear predictor from predict.glmnet, because type = 'link'
+  lp <-
+    as.numeric(
+      predict(
+        model.fit,
+        newx =
+          data.matrix(
+            dm[-test.set, !(colnames(dm) %in% c('surv_time', 'surv_event'))]
+          ),
+        s = model.fit$lambda.1se,
+        type = 'link'
+      )
+    )
+  # At all unique times in the training set...
+  t.unique <-
+    # MUST sort these or the cumulative sum below will go crazy!
+    sort(unique(dm[-test.set, 'surv_time'][dm[-test.set, 'surv_event'] == 1L]))
+  alpha <- c()
+  for (i in 1:length(t.unique)) {
+    # ...loop over calculating the fraction of the population which dies at each
+    # timepoint
+    alpha[i] <-
+      sum(
+        # Training set
+        dm[-test.set, 'surv_time'][
+          dm[-test.set, 'surv_event'] == 1
+        ] == t.unique[i]
+      ) /
+      sum(
+        exp(lp[dm[-test.set, 'surv_time'] >= t.unique[i]])
+      )
+  }
+  # Get the cumulative hazard at risk.time by interpolating...
+  baseline.cumhaz <-
+    approx(
+      t.unique, cumsum(alpha), yleft = 0, xout = risk.time, rule = 2
+    )$y
+  # Get predictions from the test set to modify the baseline hazard with
+  lp.test <-
+    as.numeric(
+      predict(
+        model.fit,
+        newx =
+          data.matrix(
+            dm[test.set, !(colnames(dm) %in% c('surv_time', 'surv_event'))]
+          ),
+        s = model.fit$lambda.1se,
+        type = 'link'
+      )
+    )
+  # 1 minus to get % dead rather than alive
+  risks <- 1 - exp(-exp(lp.test) * (baseline.cumhaz))
+  calibration.table <-
+    data.frame(
+      surv_event = dm[test.set, 'surv_event'],
+      surv_time = dm[test.set, 'surv_time'],
+      risk = risks
+    )
+  # Was there an event? Start with NA, because default is unknown (ie censored)
+  calibration.table$event <- NA
+  # Event before risk.time
+  calibration.table$event[
+    calibration.table$surv_event & calibration.table$surv_time <= risk.time
+    ] <- TRUE
+  # Event after, whether censorship or not, means no event by risk.time
+  calibration.table$event[calibration.table$surv_time > risk.time] <- FALSE
+  # Otherwise, censored before risk.time, leave as NA
+  # Drop unnecessary columns and return
+  calibration.table[, c('risk', 'event')]
+}
+calibration.table <- glmnetCalibrationTable(cv.fit, COHORT.bin, test.set)
+calibration.score <- calibrationScore(calibration.table)
+calibrationPlot(calibration.table, show.censored = TRUE, max.points = 10000)
+#' Calibration score is `r calibration.score`.
+#' ### Coefficients
+#'
+#' The elastic net regression generates coefficients for every factor/level
+#' combination (for continuous values, this means that every decile gets its own
+#' coefficient). The lambda penalty means that some amount of variable selection
+#' is performed in this process, penalising too many large coefficients.
+#' Depending on the value of alpha, quite a few of the coefficients can be
+#' exactly zero. Let's have a look at what we got...
+#'
+#+ coefficients
+# Make a data frame of the coefficients in decreasing order
+cv.fit.coefficients.ordered <-
+  data.frame(
+    factorlevel = # Names are ordered in decreasing order of absolute value
+      factorOrderedLevels(
+        colnames(COHORT.bin)[
+          order(abs(coef(cv.fit, s = "lambda.1se")), decreasing = TRUE)
+          ]
+      ),
+    val =
+      coef(cv.fit, s = "lambda.1se")[
+        order(abs(coef(cv.fit, s = "lambda.1se")), decreasing = TRUE)
+        ]
+  )
+# Get the variable names by removing TRUE, (x,y] or missing from the end
+cv.fit.coefficients.ordered$var <-
+  gsub('TRUE', '', cv.fit.coefficients.ordered$factorlevel)
+cv.fit.coefficients.ordered$var <-
+  # Can contain numbers, decimals, e+/- notation and commas separating bounds
+  gsub('\\([0-9,.e\\+-]+\\]', '', cv.fit.coefficients.ordered$var)
+cv.fit.coefficients.ordered$var <-
+  gsub('missing', '', cv.fit.coefficients.ordered$var)
+# Kludgey manual fix for 5_data1 which can take values 1, 2 or 3 and is
+# therefore very hard to catch
+cv.fit.coefficients.ordered$var <-
+  gsub(
+    'clinical.values.5_data1[0-9]', 'clinical.values.5_data1',
+    cv.fit.coefficients.ordered$var
+  )
+# And then get human-readable descriptions
+cv.fit.coefficients.ordered$desc <-
+  lookUpDescriptions(cv.fit.coefficients.ordered$var)
+#' #### Top 30 coefficients
+#'
+#+ coefficients_table, results='asis'
+print(
+  xtable(
+    cv.fit.coefficients.ordered[1:30, c('desc', 'factorlevel', 'val')],
+    digits = c(0, 0, 0, 3)
+  ),
+  type = 'html',
+  include.rownames = FALSE
+)
+#' #### Graph of all coefficient values
+#'
+#' Nonzero values are red, zero values are blue.
+ggplot(cv.fit.coefficients.ordered, aes(x = factorlevel, y = val, colour = val == 0)) +
+  geom_point() +
+  theme(
+    axis.title.x=element_blank(), axis.text.x=element_blank(),
+    axis.ticks.x=element_blank()
+  )
+#' Overall, there are `r sum(cv.fit.coefficients.ordered$val != 0)` nonzero
+#' coefficients out of `r nrow(cv.fit.coefficients.ordered)`. In the case of
+#' multilevel factors or continuous values, multiple coefficients may result
+#' from a single variable in the original data. Correcting for this, there are
+#' `r length(unique(cv.fit.coefficients.ordered$var[cv.fit.coefficients.ordered$val != 0]))`
+#' unique variables represented out of
+#' `r length(unique(cv.fit.coefficients.ordered$var))` total variables.
+#'
+#' ## Bootstrapping
+#'
+#' Having got those results for a single run on all the data, now bootstrap to
+#' find sample-induced variability in performance statistics. Again, because
+#' glmnet requires a matrix rather than a data frame this would require a large
+#' amount of code in ``handymedical.R``, so do this manually.
+#'
+#+ bootstrap_performance
+time.start <- handyTimer()
+# Instantiate a blank data frame
+bootstrap.params <- data.frame()
+for(i in 1:bootstraps) {
+  # Take a bootstrap sample of the training set. We do this with COHORT.prep for
+  # the variable importance calculations later.
+  COHORT.prep.boot <- bootstrapSampleDf(COHORT.prep[-test.set, ])
+  # Create a binary matrix for fitting
+  COHORT.boot <- convertFactorsToBinaryColumns(COHORT.prep.boot)
+  # model.matrix renames logicals to varTRUE, so fix that for status
+  colnames(COHORT.boot)[colnames(COHORT.boot) == 'surv_eventTRUE'] <- 'surv_event'
+  # Fit, but with alpha fixed on the optimal value
+  cv.fit.boot <- #readRDS('cv.fit.rds')
+    cv.glmnet(
+      COHORT.boot[, !(colnames(COHORT.boot) %in% c('surv_time', 'surv_event'))],
+      Surv(COHORT.boot[, 'surv_time'], COHORT.boot[, 'surv_event']),
+      family = "cox",
+      maxit = 1000,
+      alpha = alpha.best
+    )
+  c.index.boot <- glmnetCIndex(cv.fit.boot, COHORT.bin[test.set,])
+  calibration.table.boot <-
+    glmnetCalibrationTable(
+      cv.fit.boot, rbind(COHORT.boot, COHORT.bin[test.set, ]),
+      test.set = (nrow(COHORT.boot) + 1):(nrow(COHORT.boot) + nrow(COHORT.bin[test.set, ]))
+    )
+  calibration.boot <- calibrationScore(calibration.table.boot)
+  print(calibrationPlot(calibration.table.boot))
+  var.imp.vector <- c()
+  # Loop over variables to get variable importance
+  for(
+    var in
+    colnames(
+      COHORT.prep.boot[, !(colnames(COHORT.prep.boot) %in% c('surv_time', 'surv_event'))]
+    )
+  ) {
+    # Create a dummy data frame and scramble the column var
+    COHORT.vimp <- COHORT.prep.boot
+    COHORT.vimp[, var] <- sample(COHORT.vimp[, var], replace = TRUE)
+    # Make it into a model matrix for fitting
+    COHORT.vimp <- convertFactorsToBinaryColumns(COHORT.vimp)
+    # model.matrix renames logicals to varTRUE, so fix that for status
+    colnames(COHORT.vimp)[colnames(COHORT.vimp) == 'surv_eventTRUE'] <- 'surv_event'
+    # Calculate the new C-index
+    c.index.vimp <- glmnetCIndex(cv.fit.boot, COHORT.vimp)
+    # Append the difference between the C-index with scrambling and the original
+    var.imp.vector <-
+      c(
+        var.imp.vector,
+        c.index.boot - c.index.vimp
+      )
+  }
+  names(var.imp.vector) <-
+    paste0(
+      'vimp.c.index.',
+      colnames(
+        COHORT.prep.boot[
+          ,
+          !(colnames(COHORT.prep.boot) %in% c('surv_time', 'surv_event'))
+          ]
+      )
+    )
+  bootstrap.params <-
+    rbind(
+      bootstrap.params,
+      data.frame(
+        t(var.imp.vector),
+        c.index = c.index.boot,
+        calibration.score = calibration.boot
+      )
+    )
+  # Save the bootstrap parameters for later use
+  write.csv(bootstrap.params, bootstrap.filename)
+}
+time.boot.final <- handyTimer(time.start)
+#' `r bootstraps` bootstrap fits completed in `r time.boot.final` seconds!
+# Get coefficients and variable importances from bootstrap fits
+surv.model.fit.coeffs <- bootStatsDf(bootstrap.params)
+print(surv.model.fit.coeffs)
+# Save performance results
+varsToTable(
+  data.frame(
+    model = 'cox-elnet',
+    imputation = FALSE,
+    discretised = TRUE,
+    c.index = surv.model.fit.coeffs['c.index', 'val'],
+    c.index.lower = surv.model.fit.coeffs['c.index', 'lower'],
+    c.index.upper = surv.model.fit.coeffs['c.index', 'upper'],
+    calibration.score = surv.model.fit.coeffs['calibration.score', 'val'],
+    calibration.score.lower =
+      surv.model.fit.coeffs['calibration.score', 'lower'],
+    calibration.score.upper =
+      surv.model.fit.coeffs['calibration.score', 'upper']
+  ),
+  performance.file,
+  index.cols = c('model', 'imputation', 'discretised')
+)
+#' The bootstrapped C-index is
+#' **`r round(surv.model.fit.coeffs['c.index', 'val'], 3)`
+#' (`r round(surv.model.fit.coeffs['c.index', 'lower'], 3)` -
+#' `r round(surv.model.fit.coeffs['c.index', 'upper'], 3)`)**
+#' on the held-out test set.
+#'
+#' The bootstrapped calibration score is
+#' **`r round(surv.model.fit.coeffs['calibration.score', 'val'], 3)`
+#' (`r round(surv.model.fit.coeffs['calibration.score', 'lower'], 3)` -
+#' `r round(surv.model.fit.coeffs['calibration.score', 'upper'], 3)`)**.
+#'
+#' ### Variable importances
+#'
+#' Top 20 most important variables from the most recent bootstrap. (This is
+#' obviously indicative but just to plot a quick graph and get an idea.)
+#'
+#+ bootstrap_var_imp
+boot.var.imp.ordered <-
+  data.frame(
+    var = textAfter(names(var.imp.vector), 'vimp.c.index.'),
+    val = var.imp.vector,
+    stringsAsFactors = FALSE
+  )
+boot.var.imp.ordered$desc <- lookUpDescriptions(boot.var.imp.ordered$var)
+ggplot(
+    boot.var.imp.ordered[order(boot.var.imp.ordered$val[1:20], decreasing = TRUE), ],
+    aes(x = var, y = val)
+  ) +
+  geom_bar(stat = 'identity')