--- a/R/projection.R
+++ b/R/projection.R
@@ -0,0 +1,350 @@
+##=============================================================================
+##
+## Copyright (c) 2017-2021 Marco Colombo and Paul McKeigue
+##
+## Parts of the code are based on https://github.com/jpiironen/rstan-varsel
+## Portions copyright (c) 2015-2017 Juho Piironen
+##
+## This program is free software: you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation, either version 3 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program. If not, see <http://www.gnu.org/licenses/>.
+##
+##=============================================================================
+
+
+#' Compute projections of full predictors onto a subspace of predictors
+#'
+#' @param x Design matrix.
+#' @param fit Matrix of fitted values for the full model.
+#' @param sigma2 Residual variance (1 for logistic regression).
+#' @param indproj Vector of indices of the columns of `x` that form the
+#'        projection subspace.
+#' @param is.logistic Set to `TRUE` for a logistic regression model.
+#'
+#' @importFrom stats binomial glm.fit quasibinomial
+#' @keywords internal
+lm_proj <- function(x, fit, sigma2, indproj, is.logistic) {
+
+    P <- ncol(x)
+    S <- ncol(fit)
+
+    ## pick the Q columns of x that form the projection subspace
+    xp <- x[, indproj, drop=FALSE] # matrix of dimension N x Q
+
+    ## logistic regression model
+    if (is.logistic) {
+
+        ## compute the projection for each posterior sample
+        par.fun <- function(s) glm.fit(xp, fit[, s],
+                                       family=quasibinomial())$coefficients
+        if (.Platform$OS.type != "windows") {
+            wp <- parallel::mclapply(X=1:S, mc.preschedule=TRUE, FUN=par.fun)
+        } else { # windows
+            cl <- parallel::makePSOCKcluster(getOption("mc.cores", 1))
+            on.exit(parallel::stopCluster(cl))
+            wp <- parallel::parLapply(X=1:S, cl=cl, fun=par.fun)
+        }
+        wp <- matrix(unlist(wp, use.names=FALSE), ncol=S)
+
+        ## estimate the KL divergence between full and projected model
+        fitp <- binomial()$linkinv(multiplyAB(xp, wp))
+        kl <- 0.5 * mean(colMeans(binomial()$dev.resids(fit, fitp, 1)))
+        sigma2p <- 1
+    }
+
+    ## linear regression model
+    else {
+        ## solve the projection equations
+        wp <- solve(crossprod(xp, xp), multiplyAtB(xp, fit)) # Q x S matrix
+
+        ## fit of the projected model
+        fitp <- multiplyAB(xp, wp)
+
+        ## estimate the KL divergence between full and projected model
+        sigma2p <- sigma2 + colMeans((fit - fitp)^2)
+        kl <- mean(0.5 * log(sigma2p / sigma2))
+    }
+
+    ## reshape wp so that it has the same dimension (P x S) as t(x), with zeros
+    ## for those variables that are not included in the projected model
+    wp.all <- matrix(0, P, S)
+    wp.all[indproj, ] <- wp
+    return(list(w=wp.all, sigma2=sigma2p, fit=fitp, kl=kl))
+}
+
+#' Next variable to enter the current submodel
+#'
+#' Return the index of the variable that should be added to the current model
+#' according to the smallest KL-divergence (linear regression) or the largest
+#' score test statistic (logistic regression).
+#'
+#' @param x Design matrix.
+#' @param sigma2 Residual variance (1 for logistic regression).
+#' @param fit Matrix of fitted values for the full model.
+#' @param fitp Matrix of fitted values for the projected model. +#' @param chosen Vector of indices of the columns of `x` in the current submodel. +#' @param is.logistic Set to `TRUE` for a logistic regression model. +#' +#' @keywords internal +choose.next <- function(x, sigma2, fit, fitp, chosen, is.logistic) { + notchosen <- setdiff(1:ncol(x), chosen) + par.fun <- function(idx) { + lm_proj(x, fit, sigma2, sort(c(chosen, notchosen[idx])), FALSE)$kl + } + if (!is.logistic) { + if (.Platform$OS.type != "windows") { + kl <- parallel::mclapply(X=1:length(notchosen), mc.preschedule=TRUE, + FUN=par.fun) + } else { # windows + cl <- parallel::makePSOCKcluster(getOption("mc.cores", 1)) + on.exit(parallel::stopCluster(cl)) + kl <- parallel::parLapply(X=1:length(notchosen), cl=cl, fun=par.fun) + } + idx.selected <- which.min(unlist(kl)) + return(notchosen[idx.selected]) + } + + ## score test + yminusexp <- fit - fitp + dinv.link <- fitp * (1 - fitp) + U <- colMeans(multiplyAtB(yminusexp, x[, notchosen])) + V <- colMeans(multiplyAtB(dinv.link, x[, notchosen]^2)) + idx.selected <- which.max(U^2 / V) + return(notchosen[idx.selected]) +} + +#' Compute the fit of a submodel +#' +#' @param x Design matrix. +#' @param sigma2 Residual variance (1 for logistic regression). +#' @param mu Matrix of fitted values for the full model. +#' @param chosen Vector of indices of the columns of `x` in the current submodel. +#' @param xt Design matrix for test data. +#' @param yt Outcome variable for test data. +#' @param logistic Set to `TRUE` for a logistic regression model. +#' +#' @keywords internal +fit.submodel <- function(x, sigma2, mu, chosen, xt, yt, logistic) { + ## projected parameters + submodel <- lm_proj(x, mu, sigma2, chosen, logistic) + eta <- multiplyAB(xt, submodel$w) + return(c(submodel, elpd=elpd(yt, eta, submodel$sigma2, logistic))) +} + +#' Expected log predictive density +#' +#' @param yt Outcome variable. +#' @param eta Matrix of linear predictors, with as many rows as the number of +#' observations. +#' @param sigma2 Residual variance (1 for logistic regression). +#' @param logistic Set to `TRUE` for a logistic regression model. +#' +#' @noRd +elpd <- function(yt, eta, sigma2, logistic) { + if (logistic) + pd <- stats::dbinom(yt, 1, binomial()$linkinv(eta), log=TRUE) + else { + sigma <- sqrt(sigma2) + pd <- t(cbind(sapply(1:nrow(eta), + function(z) stats::dnorm(yt[z], eta[z, ], + sigma, log=TRUE)))) + } + sum(rowMeans(pd)) +} + +#' Forward selection minimizing KL-divergence in projection +#' +#' @param obj Object of class `hsstan`. +#' @param max.iters Maximum number of iterations (number of predictors selected) +#' after which the selection procedure should stop. +#' @param start.from Vector of variable names to be used in the starting +#' submodel. If `NULL` (default), selection starts from the set of +#' unpenalized covariates if the model contains penalized predictors, +#' otherwise selection starts from the intercept-only model. +#' @param out.csv If not `NULL`, the name of a CSV file to save the +#' output to. +#' +#' @return +#' A data frame of class `projsel` where each row corresponds to a +#' forward-selected submodel that contains all variables listed up to that row. +#' Attribute `start.from` reports the predictors in the initial model. 
+#' The data frame contains the following columns: +#' \item{var}{names of the variables selected.} +#' \item{kl}{KL-divergence from the full model to the submodel.} +#' \item{rel.kl.null}{relative explanatory power of predictors starting from the +#' intercept-only model.} +#' \item{rel.kl}{relative explanatory power of predictors starting from the +#' initial submodel.} +#' \item{elpd}{the expected log predictive density of the submodels.} +#' \item{delta.elpd}{the difference in elpd from the full model.} +#' +#' @examples +#' \donttest{ +#' \dontshow{utils::example("hsstan", echo=FALSE)} +#' \dontshow{oldopts <- options(mc.cores=2)} +#' # continued from ?hsstan +#' sel <- projsel(hs.biom, max.iters=3) +#' plot(sel) +#' \dontshow{options(oldopts)} +#' } +#' +#' @importFrom utils write.csv +#' @export +projsel <- function(obj, max.iters=30, start.from=NULL, + out.csv=NULL) { + validate.hsstan(obj) + validate.samples(obj) + validate.nonnegative.scalar(max.iters, "max.iters", int=TRUE) + + x <- xt <- validate.newdata(obj, obj$data) + yt <- obj$data[[obj$model.terms$outcome]] + is.logistic <- is.logistic(obj) + sigma2 <- if (is.logistic) 1 else as.matrix(obj$stanfit, pars="sigma")^2 + + ## set of variables in the initial submodel + vsf <- validate.start.from(obj, start.from) + start.from <- vsf$start.from + chosen <- start.idx <- vsf$idx + + ## number of model parameters (including intercept) + P <- sum(sapply(obj$betas, length)) + + ## number of iterations to run + K <- min(P - length(start.idx), max.iters) + + message(sprintf("%58s %8s %11s", "Model", "KL", "ELPD")) + report.iter <- function(msg, kl, elpd) + message(sprintf("%58s %8.5f %8.5f", substr(msg, 1, 55), kl, elpd)) + + ## fitted values for the full model (N x S) + fit <- t(posterior_linpred(obj, transform=TRUE)) + + ## intercept only model + sub <- fit.submodel(x, sigma2, fit, 1, xt, yt, is.logistic) + kl.elpd <- cbind(sub$kl, sub$elpd) + report.iter("Intercept only", sub$kl, sub$elpd) + + ## initial submodel with the set of chosen covariates + if (length(start.idx) > 1) { + sub <- fit.submodel(x, sigma2, fit, chosen, xt, yt, is.logistic) + kl.elpd <- rbind(kl.elpd, c(sub$kl, sub$elpd)) + report.iter("Initial submodel", sub$kl, sub$elpd) + } + + ## add variables one at a time + for (iter in seq_len(K)) { + sel.idx <- choose.next(x, sigma2, fit, sub$fit, chosen, is.logistic) + chosen <- c(chosen, sel.idx) + + ## evaluate current submodel according to projected parameters + sub <- fit.submodel(x, sigma2, fit, chosen, xt, yt, is.logistic) + kl.elpd <- rbind(kl.elpd, c(sub$kl, sub$elpd)) + report.iter(colnames(x)[sel.idx], sub$kl, sub$elpd) + + ## check if the current submodel is fully saturated + if (sub$kl < 1e-8 && iter < K) { + message("Fully saturated model reached, selection terminated.") + break() + } + } + + ## evaluate the full model if selection stopped before reaching it + full.elpd <- if (length(chosen) == P) + sub$elpd else elpd(yt, t(posterior_linpred(obj)), sigma2, is.logistic) + + kl <- kl.elpd[, 1] + rel.kl <- c(NA, if (length(kl) > 1) 1 - kl[-1] / kl[2]) + if (length(rel.kl) == 2 && is.nan(rel.kl[2])) + rel.kl[2] <- 1 + + res <- data.frame(var=c("Intercept only", + if (length(start.idx) > 1) "Initial submodel", + colnames(x)[setdiff(chosen, start.idx)]), + kl=kl, + rel.kl.null=1 - kl / kl[1], + rel.kl=rel.kl, + elpd=kl.elpd[, 2], + delta.elpd=kl.elpd[, 2] - full.elpd, + stringsAsFactors=FALSE, row.names=NULL) + attr(res, "start.from") <- start.from + if (!is.null(out.csv)) + write.csv(file=out.csv, res, 
+                  row.names=FALSE)
+
+    class(res) <- c("projsel", "data.frame")
+    return(res)
+}
+
+#' Plot of relative explanatory power of predictors
+#'
+#' The function plots the relative explanatory power of each predictor in order
+#' of selection. The relative explanatory power of predictors is computed
+#' according to the KL divergence from the full model to each submodel, scaled
+#' in such a way that the baseline model (either the null model or the model
+#' containing only unpenalized covariates) is at 0, while the full model is at 1.
+#'
+#' @param x A data frame created by [projsel()].
+#' @param title Title of the plot. If `NULL`, no title is displayed.
+#' @param max.points Maximum number of predictors to be plotted. If `NULL`
+#'        (default) or 0, all points are plotted.
+#' @param max.labels Maximum number of predictors to be labelled. If `NULL`
+#'        (default), all predictor labels present in `x` are displayed, which
+#'        may result in overprinting.
+#' @param from.covariates Whether the plotting should start from the unpenalized
+#'        covariates (`TRUE` by default). If set to `FALSE`, the plot includes a
+#'        point for the null (intercept-only) model.
+#' @param font.size Size of the textual elements (labels and axes).
+#' @param hadj,vadj Horizontal and vertical adjustment for the labels.
+#' @param ... Currently ignored.
+#'
+#' @return
+#' A **ggplot2** object showing the relative incremental contribution of each
+#' predictor starting from the initial set of unpenalized covariates.
+#'
+#' @import ggplot2
+#' @method plot projsel
+#' @export
+plot.projsel <- function(x, title=NULL, max.points=NULL, max.labels=NULL,
+                         from.covariates=TRUE,
+                         font.size=12, hadj=0.05, vadj=0, ...) {
+
+    ## prepare the set of points to plot
+    sel <- x
+    if (!is.null(max.points) && max.points > 0)
+        sel <- utils::head(sel, n=max.points + 2)
+    if (!is.null(max.labels))
+        sel$var[-c(1:(max.labels + 2))] <- ""
+    if (from.covariates)
+        sel <- sel[-1, ]
+    labs <- sel$var
+
+    ## convert from points to millimetres
+    geom.text.size <- font.size * 25.4 / 72
+
+    x <- seq(nrow(sel)) - 1
+    text_idx <- x < mean(x) | x %% 2 == 1
+    rel <- if (from.covariates) sel$rel.kl else sel$rel.kl.null
+    p <- ggplot(data=sel, aes(x=x, y=rel, label=labs)) +
+        coord_cartesian(ylim=c(0, 1)) +
+        geom_line() + geom_point(size=geom.text.size / 3) +
+        geom_text(aes(x=x + ifelse(text_idx, hadj, -hadj),
+                      y=rel + ifelse(text_idx, -vadj, vadj)),
+                  size=geom.text.size,
+                  hjust=ifelse(text_idx, "left", "right")) +
+        xlab("Number of biomarkers") +
+        ylab("Relative explanatory power") +
+        theme(text=element_text(size=font.size))
+
+    if (!is.null(title))
+        p <- p + ggtitle(title)
+
+    return(p)
+}
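
Note (reading aid, not part of the diff): the linear-case projection performed by lm_proj() solves the normal equations of the submodel columns against the full model's fitted values, inflates the residual variance by the lack of fit, and averages the KL divergence over posterior draws. The following minimal standalone sketch restates those steps in base R on simulated inputs; the objects N, P, S, w and the data are hypothetical, and the package's multiplyAB()/multiplyAtB() helpers are replaced by %*% and crossprod().

## Illustration only: the linear projection equations used by lm_proj() above.
set.seed(1)
N <- 100; P <- 4; S <- 50
x <- cbind(1, matrix(rnorm(N * (P - 1)), N, P - 1))  # design matrix with intercept
w <- matrix(rnorm(P * S), P, S)                      # stand-in for S posterior draws of coefficients
fit <- x %*% w                                       # fitted values of the "full" model (N x S)
sigma2 <- rep(1, S)                                  # stand-in residual variances, one per draw

indproj <- c(1, 2)                                   # submodel: intercept + first predictor
xp <- x[, indproj, drop=FALSE]
wp <- solve(crossprod(xp), crossprod(xp, fit))       # projected coefficients (Q x S)
fitp <- xp %*% wp                                    # fitted values of the projected model
sigma2p <- sigma2 + colMeans((fit - fitp)^2)         # residual variance inflated by lack of fit
kl <- mean(0.5 * log(sigma2p / sigma2))              # KL divergence averaged over draws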