Diff of /R/projection.R [000000] .. [ede2d4]

Switch to side-by-side view

--- a
+++ b/R/projection.R
@@ -0,0 +1,350 @@
+##=============================================================================
+##
+## Copyright (c) 2017-2021 Marco Colombo and Paul McKeigue
+##
+## Parts of the code are based on https://github.com/jpiironen/rstan-varsel
+## Portions copyright (c) 2015-2017 Juho Piironen
+##
+## This program is free software: you can redistribute it and/or modify
+## it under the terms of the GNU General Public License as published by
+## the Free Software Foundation, either version 3 of the License, or
+## (at your option) any later version.
+##
+## This program is distributed in the hope that it will be useful,
+## but WITHOUT ANY WARRANTY; without even the implied warranty of
+## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+## GNU General Public License for more details.
+##
+## You should have received a copy of the GNU General Public License
+## along with this program.  If not, see <http://www.gnu.org/licenses/>.
+##
+##=============================================================================
+
+
+#' Compute projections of full predictors on to subspace of predictors
+#'
+#' @param x Design matrix.
+#' @param fit Matrix of fitted values for the full model.
+#' @param sigma2 Residual variance (1 for logistic regression).
+#' @param indproj Vector of indices of the columns of `x` that form the
+#'        projection subspace.
+#' @param is.logistic Set to `TRUE` for a logistic regression model.
+#'
+#' @importFrom stats binomial glm.fit quasibinomial
+#' @keywords internal
+lm_proj <- function(x, fit, sigma2, indproj, is.logistic) {
+
+    P <- ncol(x)
+    S <- ncol(fit)
+
+    ## pick the Q columns of x that form the projection subspace
+    xp <- x[, indproj, drop=FALSE] # matrix of dimension N x Q
+
+    ## logistic regression model
+    if (is.logistic) {
+
+        ## compute the projection for each sample
+        par.fun <- function(s) glm.fit(xp, fit[, s],
+                                       family=quasibinomial())$coefficients
+        if (.Platform$OS.type != "windows") {
+            wp <- parallel::mclapply(X=1:S, mc.preschedule=TRUE, FUN=par.fun)
+        } else { # windows
+            cl <- parallel::makePSOCKcluster(getOption("mc.cores", 1))
+            on.exit(parallel::stopCluster(cl))
+            wp <- parallel::parLapply(X=1:S, cl=cl, fun=par.fun)
+        }
+        wp <- matrix(unlist(wp, use.names=FALSE), ncol=S)
+
+        ## estimate the KL divergence between full and projected model
+        fitp <- binomial()$linkinv(multiplyAB(xp, wp))
+        kl <- 0.5 * mean(colMeans(binomial()$dev.resids(fit, fitp, 1)))
+        sigma2p <- 1
+    }
+
+    ## linear regression model
+    else {
+        ## solve the projection equations
+        wp <- solve(crossprod(xp, xp), multiplyAtB(xp, fit)) # Q x S matrix
+
+        ## fit of the projected model
+        fitp <- multiplyAB(xp, wp)
+
+        ## estimate the KL divergence between full and projected model
+        sigma2p <- sigma2 + colMeans((fit - fitp)^2)
+        kl <- mean(0.5 * log(sigma2p / sigma2))
+    }
+
+    ## reshape wp so that it has same dimension (P x S) as t(x), and zeros for
+    ## those variables that are not included in the projected model
+    wp.all <- matrix(0, P, S)
+    wp.all[indproj, ] <- wp
+    return(list(w=wp.all, sigma2=sigma2p, fit=fitp, kl=kl))
+}
+
+#' Next variable to enter the current submodel
+#'
+#' Return the index of the variable that should be added to the current model
+#' according to the smallest KL-divergence (linear regression) or the largest
+#' score test (logistic regression).
+#'
+#' @param x Design matrix.
+#' @param sigma2 Residual variance (1 for logistic regression).
+#' @param fit Matrix of fitted values for the full model.
+#' @param fitp Matrix of fitted values for the projected model.
+#' @param chosen Vector of indices of the columns of `x` in the current submodel.
+#' @param is.logistic Set to `TRUE` for a logistic regression model.
+#'
+#' @keywords internal
+choose.next <- function(x, sigma2, fit, fitp, chosen, is.logistic) {
+    notchosen <- setdiff(1:ncol(x), chosen)
+    par.fun <- function(idx) {
+        lm_proj(x, fit, sigma2, sort(c(chosen, notchosen[idx])), FALSE)$kl
+    }
+    if (!is.logistic) {
+        if (.Platform$OS.type != "windows") {
+            kl <- parallel::mclapply(X=1:length(notchosen), mc.preschedule=TRUE,
+                                     FUN=par.fun)
+        } else { # windows
+            cl <- parallel::makePSOCKcluster(getOption("mc.cores", 1))
+            on.exit(parallel::stopCluster(cl))
+            kl <- parallel::parLapply(X=1:length(notchosen), cl=cl, fun=par.fun)
+        }
+        idx.selected <- which.min(unlist(kl))
+        return(notchosen[idx.selected])
+    }
+
+    ## score test
+    yminusexp <- fit - fitp
+    dinv.link <- fitp * (1 - fitp)
+    U <- colMeans(multiplyAtB(yminusexp, x[, notchosen]))
+    V <- colMeans(multiplyAtB(dinv.link, x[, notchosen]^2))
+    idx.selected <- which.max(U^2 / V)
+    return(notchosen[idx.selected])
+}
+
+#' Compute the fit of a submodel
+#'
+#' @param x Design matrix.
+#' @param sigma2 Residual variance (1 for logistic regression).
+#' @param mu Matrix of fitted values for the full model.
+#' @param chosen Vector of indices of the columns of `x` in the current submodel.
+#' @param xt Design matrix for test data.
+#' @param yt Outcome variable for test data.
+#' @param logistic Set to `TRUE` for a logistic regression model.
+#'
+#' @keywords internal
+fit.submodel <- function(x, sigma2, mu, chosen, xt, yt, logistic) {
+    ## projected parameters
+    submodel <- lm_proj(x, mu, sigma2, chosen, logistic)
+    eta <- multiplyAB(xt, submodel$w)
+    return(c(submodel, elpd=elpd(yt, eta, submodel$sigma2, logistic)))
+}
+
+#' Expected log predictive density
+#'
+#' @param yt Outcome variable.
+#' @param eta Matrix of linear predictors, with as many rows as the number of
+#'        observations.
+#' @param sigma2 Residual variance (1 for logistic regression).
+#' @param logistic Set to `TRUE` for a logistic regression model.
+#'
+#' @noRd
+elpd <- function(yt, eta, sigma2, logistic) {
+    if (logistic)
+        pd <- stats::dbinom(yt, 1, binomial()$linkinv(eta), log=TRUE)
+    else {
+        sigma <- sqrt(sigma2)
+        pd <- t(cbind(sapply(1:nrow(eta),
+                             function(z) stats::dnorm(yt[z], eta[z, ],
+                                                      sigma, log=TRUE))))
+    }
+    sum(rowMeans(pd))
+}
+
+#' Forward selection minimizing KL-divergence in projection
+#'
+#' @param obj Object of class `hsstan`.
+#' @param max.iters Maximum number of iterations (number of predictors selected)
+#'        after which the selection procedure should stop.
+#' @param start.from Vector of variable names to be used in the starting
+#'        submodel. If `NULL` (default), selection starts from the set of
+#'        unpenalized covariates if the model contains penalized predictors,
+#'        otherwise selection starts from the intercept-only model.
+#' @param out.csv If not `NULL`, the name of a CSV file to save the
+#'        output to.
+#'
+#' @return
+#' A data frame of class `projsel` where each row corresponds to a
+#' forward-selected submodel that contains all variables listed up to that row.
+#' Attribute `start.from` reports the predictors in the initial model.
+#' The data frame contains the following columns:
+#' \item{var}{names of the variables selected.}
+#' \item{kl}{KL-divergence from the full model to the submodel.}
+#' \item{rel.kl.null}{relative explanatory power of predictors starting from the
+#'       intercept-only model.}
+#' \item{rel.kl}{relative explanatory power of predictors starting from the
+#'       initial submodel.}
+#' \item{elpd}{the expected log predictive density of the submodels.}
+#' \item{delta.elpd}{the difference in elpd from the full model.}
+#'
+#' @examples
+#' \donttest{
+#' \dontshow{utils::example("hsstan", echo=FALSE)}
+#' \dontshow{oldopts <- options(mc.cores=2)}
+#' # continued from ?hsstan
+#' sel <- projsel(hs.biom, max.iters=3)
+#' plot(sel)
+#' \dontshow{options(oldopts)}
+#' }
+#'
+#' @importFrom utils write.csv
+#' @export
+projsel <- function(obj, max.iters=30, start.from=NULL,
+                    out.csv=NULL) {
+    validate.hsstan(obj)
+    validate.samples(obj)
+    validate.nonnegative.scalar(max.iters, "max.iters", int=TRUE)
+
+    x <- xt <- validate.newdata(obj, obj$data)
+    yt <- obj$data[[obj$model.terms$outcome]]
+    is.logistic <- is.logistic(obj)
+    sigma2 <- if (is.logistic) 1 else as.matrix(obj$stanfit, pars="sigma")^2
+
+    ## set of variables in the initial submodel
+    vsf <- validate.start.from(obj, start.from)
+    start.from <- vsf$start.from
+    chosen <- start.idx <- vsf$idx
+
+    ## number of model parameters (including intercept)
+    P <- sum(sapply(obj$betas, length))
+
+    ## number of iterations to run
+    K <- min(P - length(start.idx), max.iters)
+
+    message(sprintf("%58s  %8s %11s", "Model", "KL", "ELPD"))
+    report.iter <- function(msg, kl, elpd)
+        message(sprintf("%58s  %8.5f  %8.5f", substr(msg, 1, 55), kl, elpd))
+
+    ## fitted values for the full model (N x S)
+    fit <- t(posterior_linpred(obj, transform=TRUE))
+
+    ## intercept only model
+    sub <- fit.submodel(x, sigma2, fit, 1, xt, yt, is.logistic)
+    kl.elpd <- cbind(sub$kl, sub$elpd)
+    report.iter("Intercept only", sub$kl, sub$elpd)
+
+    ## initial submodel with the set of chosen covariates
+    if (length(start.idx) > 1) {
+        sub <- fit.submodel(x, sigma2, fit, chosen, xt, yt, is.logistic)
+        kl.elpd <- rbind(kl.elpd, c(sub$kl, sub$elpd))
+        report.iter("Initial submodel", sub$kl, sub$elpd)
+    }
+
+    ## add variables one at a time
+    for (iter in seq_len(K)) {
+        sel.idx <- choose.next(x, sigma2, fit, sub$fit, chosen, is.logistic)
+        chosen <- c(chosen, sel.idx)
+
+        ## evaluate current submodel according to projected parameters
+        sub <- fit.submodel(x, sigma2, fit, chosen, xt, yt, is.logistic)
+        kl.elpd <- rbind(kl.elpd, c(sub$kl, sub$elpd))
+        report.iter(colnames(x)[sel.idx], sub$kl, sub$elpd)
+
+        ## check if the current submodel is fully saturated
+        if (sub$kl < 1e-8 && iter < K) {
+            message("Fully saturated model reached, selection terminated.")
+            break()
+        }
+    }
+
+    ## evaluate the full model if selection stopped before reaching it
+    full.elpd <- if (length(chosen) == P)
+        sub$elpd else elpd(yt, t(posterior_linpred(obj)), sigma2, is.logistic)
+
+    kl <- kl.elpd[, 1]
+    rel.kl <- c(NA, if (length(kl) > 1) 1 - kl[-1] / kl[2])
+    if (length(rel.kl) == 2 && is.nan(rel.kl[2]))
+        rel.kl[2] <- 1
+
+    res <- data.frame(var=c("Intercept only",
+                            if (length(start.idx) > 1) "Initial submodel",
+                            colnames(x)[setdiff(chosen, start.idx)]),
+                      kl=kl,
+                      rel.kl.null=1 - kl / kl[1],
+                      rel.kl=rel.kl,
+                      elpd=kl.elpd[, 2],
+                      delta.elpd=kl.elpd[, 2] - full.elpd,
+                      stringsAsFactors=FALSE, row.names=NULL)
+    attr(res, "start.from") <- start.from
+    if (!is.null(out.csv))
+        write.csv(file=out.csv, res, row.names=FALSE)
+
+    class(res) <- c("projsel", "data.frame")
+    return(res)
+}
+
+#' Plot of relative explanatory power of predictors
+#'
+#' The function plots the relative explanatory power of each predictor in order
+#' of selection. The relative explanatory power of predictors is computed
+#' according to the KL divergence from the full model to each submodel, scaled
+#' in such a way that the baseline model (either the null model or the model
+#' containing only unpenalized covariates) is at 0, while the full model is at 1.
+#'
+#' @param x A data frame created by [projsel()].
+#' @param title Title of the plot. If `NULL`, no title is displayed.
+#' @param max.points Maximum number of predictors to be plotted. If `NULL`
+#'        (default) or 0, all points are plotted.
+#' @param max.labels Maximum number of predictors to be labelled. If `NULL`
+#'        (default), all predictor labels present in `x` are displayed, which
+#'        may result in overprinting.
+#' @param from.covariates Whether the plotting should start from the unpenalized
+#'        covariates (`TRUE` by default). If set to `FALSE`, the plot includes a
+#'        point for the null (intercept-only) model.
+#' @param font.size Size of the textual elements (labels and axes).
+#' @param hadj,vadj Horizontal and vertical adjustment for the labels.
+#' @param ... Currently ignored.
+#'
+#' @return
+#' A **ggplot2** object showing the relative incremental contribution of each
+#' predictor starting from the initial set of unpenalized covariates.
+#'
+#' @import ggplot2
+#' @method plot projsel
+#' @export
+plot.projsel <- function(x, title=NULL, max.points=NULL, max.labels=NULL,
+                         from.covariates=TRUE,
+                         font.size=12, hadj=0.05, vadj=0, ...) {
+
+    ## prepare the set of points to plot
+    sel <- x
+    if (!is.null(max.points) && max.points > 0)
+        sel <- utils::head(sel, n=max.points + 2)
+    if (!is.null(max.labels))
+        sel$var[-c(1:(max.labels + 2))] <- ""
+    if (from.covariates)
+        sel <- sel[-1, ]
+    labs <- sel$var
+
+    ## convert from points to millimetres
+    geom.text.size <- font.size * 25.4 / 72
+
+    x <- seq(nrow(sel)) - 1
+    text_idx <- x < mean(x) | x - floor(x / 2) * 2 == 1
+    rel <- if (from.covariates) sel$rel.kl else sel$rel.kl.null
+    p <- ggplot(data=sel, aes(x=x, y=rel, label=labs)) +
+      coord_cartesian(ylim=range(c(0, 1))) +
+      geom_line() + geom_point(size=geom.text.size / 3) +
+      geom_text(aes(x=x + ifelse(text_idx, hadj, -hadj),
+                    y=rel + ifelse(text_idx, -vadj, vadj)),
+                size=geom.text.size,
+                hjust=ifelse(text_idx, "left", "right")) +
+      xlab("Number of biomarkers") +
+      ylab("Relative explanatory power") +
+      theme(text=element_text(size=font.size))
+
+    if (!is.null(title))
+        p <- p + ggtitle(title)
+
+    return(p)
+}