Diff of /R/projection.R [000000] .. [ede2d4]

##=============================================================================
##
## Copyright (c) 2017-2021 Marco Colombo and Paul McKeigue
##
## Parts of the code are based on https://github.com/jpiironen/rstan-varsel
## Portions copyright (c) 2015-2017 Juho Piironen
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program.  If not, see <http://www.gnu.org/licenses/>.
##
##=============================================================================

#' Compute projections of the full predictors onto a subspace of the predictors
#'
#' @param x Design matrix.
#' @param fit Matrix of fitted values for the full model.
#' @param sigma2 Residual variance (1 for logistic regression).
#' @param indproj Vector of indices of the columns of `x` that form the
#'        projection subspace.
#' @param is.logistic Set to `TRUE` for a logistic regression model.
#'
#' @importFrom stats binomial glm.fit quasibinomial
#' @keywords internal
lm_proj <- function(x, fit, sigma2, indproj, is.logistic) {

    P <- ncol(x)
    S <- ncol(fit)

    ## pick the Q columns of x that form the projection subspace
    xp <- x[, indproj, drop=FALSE] # matrix of dimension N x Q

    ## logistic regression model
    if (is.logistic) {

        ## compute the projection for each posterior sample
        par.fun <- function(s) glm.fit(xp, fit[, s],
                                       family=quasibinomial())$coefficients
        if (.Platform$OS.type != "windows") {
            wp <- parallel::mclapply(X=1:S, mc.preschedule=TRUE, FUN=par.fun)
        } else { # windows
            cl <- parallel::makePSOCKcluster(getOption("mc.cores", 1))
            on.exit(parallel::stopCluster(cl))
            wp <- parallel::parLapply(X=1:S, cl=cl, fun=par.fun)
        }
        wp <- matrix(unlist(wp, use.names=FALSE), ncol=S)

        ## estimate the KL divergence between full and projected model
        fitp <- binomial()$linkinv(multiplyAB(xp, wp))
        kl <- 0.5 * mean(colMeans(binomial()$dev.resids(fit, fitp, 1)))
        sigma2p <- 1
    }

    ## linear regression model
    else {
        ## solve the projection equations
        wp <- solve(crossprod(xp, xp), multiplyAtB(xp, fit)) # Q x S matrix

        ## fit of the projected model
        fitp <- multiplyAB(xp, wp)

        ## estimate the KL divergence between full and projected model
        sigma2p <- sigma2 + colMeans((fit - fitp)^2)
        kl <- mean(0.5 * log(sigma2p / sigma2))
    }

    ## reshape wp so that it has the same dimension (P x S) as t(x), with zeros
    ## for the variables that are not included in the projected model
    wp.all <- matrix(0, P, S)
    wp.all[indproj, ] <- wp
    return(list(w=wp.all, sigma2=sigma2p, fit=fitp, kl=kl))
}
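
## Illustrative sketch (not part of the package code): how lm_proj can be
## called on simulated data for a linear model. All objects below are made up
## for this example; multiplyAB/multiplyAtB are internal hsstan routines, so
## the package must be loaded. The block is wrapped in `if (FALSE)` so that it
## is not executed when the file is sourced.
if (FALSE) {
    set.seed(1)
    N <- 50; S <- 20                          # observations, posterior draws
    x <- cbind(1, matrix(rnorm(N * 3), N, 3)) # design matrix with intercept
    colnames(x) <- c("(Intercept)", paste0("X", 1:3))
    fit <- matrix(rnorm(N * S), N, S)         # posterior fitted values (N x S)
    sigma2 <- rep(1, S)                       # posterior draws of the residual variance
    proj <- lm_proj(x, fit, sigma2, indproj=c(1, 2), is.logistic=FALSE)
    dim(proj$w) # P x S matrix of projected coefficients, zeros for excluded columns
    proj$kl     # estimated KL divergence from the full model to the projection
}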

#' Next variable to enter the current submodel
#'
#' Return the index of the variable that should be added to the current model
#' according to the smallest KL-divergence (linear regression) or the largest
#' score test statistic (logistic regression).
#'
#' @param x Design matrix.
#' @param sigma2 Residual variance (1 for logistic regression).
#' @param fit Matrix of fitted values for the full model.
#' @param fitp Matrix of fitted values for the projected model.
#' @param chosen Vector of indices of the columns of `x` in the current submodel.
#' @param is.logistic Set to `TRUE` for a logistic regression model.
#'
#' @keywords internal
choose.next <- function(x, sigma2, fit, fitp, chosen, is.logistic) {
    notchosen <- setdiff(1:ncol(x), chosen)
    par.fun <- function(idx) {
        lm_proj(x, fit, sigma2, sort(c(chosen, notchosen[idx])), FALSE)$kl
    }
    if (!is.logistic) {
        if (.Platform$OS.type != "windows") {
            kl <- parallel::mclapply(X=1:length(notchosen), mc.preschedule=TRUE,
                                     FUN=par.fun)
        } else { # windows
            cl <- parallel::makePSOCKcluster(getOption("mc.cores", 1))
            on.exit(parallel::stopCluster(cl))
            kl <- parallel::parLapply(X=1:length(notchosen), cl=cl, fun=par.fun)
        }
        idx.selected <- which.min(unlist(kl))
        return(notchosen[idx.selected])
    }

    ## logistic regression: score test
    yminusexp <- fit - fitp
    dinv.link <- fitp * (1 - fitp)
    U <- colMeans(multiplyAtB(yminusexp, x[, notchosen]))
    V <- colMeans(multiplyAtB(dinv.link, x[, notchosen]^2))
    idx.selected <- which.max(U^2 / V)
    return(notchosen[idx.selected])
}
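
## Illustrative sketch (not part of the package code): one forward-selection
## step for a linear model, reusing the simulated objects from the lm_proj
## sketch above. Starting from the intercept-only submodel (column 1),
## choose.next returns the column index whose addition yields the smallest KL
## divergence from the full model.
if (FALSE) {
    sub <- lm_proj(x, fit, sigma2, indproj=1, is.logistic=FALSE)
    nxt <- choose.next(x, sigma2, fit, sub$fit, chosen=1, is.logistic=FALSE)
    colnames(x)[nxt] # name of the selected variable
}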

#' Compute the fit of a submodel
#'
#' @param x Design matrix.
#' @param sigma2 Residual variance (1 for logistic regression).
#' @param mu Matrix of fitted values for the full model.
#' @param chosen Vector of indices of the columns of `x` in the current submodel.
#' @param xt Design matrix for test data.
#' @param yt Outcome variable for test data.
#' @param logistic Set to `TRUE` for a logistic regression model.
#'
#' @keywords internal
fit.submodel <- function(x, sigma2, mu, chosen, xt, yt, logistic) {
    ## projected parameters
    submodel <- lm_proj(x, mu, sigma2, chosen, logistic)
    eta <- multiplyAB(xt, submodel$w)
    return(c(submodel, elpd=elpd(yt, eta, submodel$sigma2, logistic)))
}

#' Expected log predictive density
#'
#' @param yt Outcome variable.
#' @param eta Matrix of linear predictors, with as many rows as the number of
#'        observations.
#' @param sigma2 Residual variance (1 for logistic regression).
#' @param logistic Set to `TRUE` for a logistic regression model.
#'
#' @noRd
elpd <- function(yt, eta, sigma2, logistic) {
    if (logistic)
        pd <- stats::dbinom(yt, 1, binomial()$linkinv(eta), log=TRUE)
    else {
        sigma <- sqrt(sigma2)
        pd <- t(cbind(sapply(1:nrow(eta),
                             function(z) stats::dnorm(yt[z], eta[z, ],
                                                      sigma, log=TRUE))))
    }
    sum(rowMeans(pd))
}
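
## Illustrative sketch (not part of the package code): evaluating elpd for a
## linear model on a tiny made-up test set. Each column of `eta` holds the
## linear predictors for one posterior draw; the result is the log predictive
## density averaged over draws and summed over observations.
if (FALSE) {
    set.seed(1)
    yt <- c(0.2, -1.1, 0.5)              # test outcomes
    eta <- matrix(rnorm(3 * 20), nrow=3) # linear predictors, 3 obs x 20 draws
    elpd(yt, eta, sigma2=rep(1, 20), logistic=FALSE)
}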

#' Forward selection minimizing KL-divergence in projection
#'
#' @param obj Object of class `hsstan`.
#' @param max.iters Maximum number of iterations (number of predictors selected)
#'        after which the selection procedure should stop.
#' @param start.from Vector of variable names to be used in the starting
#'        submodel. If `NULL` (default), selection starts from the set of
#'        unpenalized covariates if the model contains penalized predictors,
#'        otherwise selection starts from the intercept-only model.
#' @param out.csv If not `NULL`, the name of a CSV file to save the
#'        output to.
#'
#' @return
#' A data frame of class `projsel` where each row corresponds to a
#' forward-selected submodel that contains all variables listed up to that row.
#' Attribute `start.from` reports the predictors in the initial model.
#' The data frame contains the following columns:
#' \item{var}{names of the variables selected.}
#' \item{kl}{KL-divergence from the full model to the submodel.}
#' \item{rel.kl.null}{relative explanatory power of predictors starting from the
#'       intercept-only model.}
#' \item{rel.kl}{relative explanatory power of predictors starting from the
#'       initial submodel.}
#' \item{elpd}{the expected log predictive density of the submodels.}
#' \item{delta.elpd}{the difference in elpd from the full model.}
#'
#' @examples
#' \donttest{
#' \dontshow{utils::example("hsstan", echo=FALSE)}
#' \dontshow{oldopts <- options(mc.cores=2)}
#' # continued from ?hsstan
#' sel <- projsel(hs.biom, max.iters=3)
#' plot(sel)
#' \dontshow{options(oldopts)}
#' }
#'
#' @importFrom utils write.csv
#' @export
projsel <- function(obj, max.iters=30, start.from=NULL,
                    out.csv=NULL) {
    validate.hsstan(obj)
    validate.samples(obj)
    validate.nonnegative.scalar(max.iters, "max.iters", int=TRUE)

    x <- xt <- validate.newdata(obj, obj$data)
    yt <- obj$data[[obj$model.terms$outcome]]
    is.logistic <- is.logistic(obj)
    sigma2 <- if (is.logistic) 1 else as.matrix(obj$stanfit, pars="sigma")^2

    ## set of variables in the initial submodel
    vsf <- validate.start.from(obj, start.from)
    start.from <- vsf$start.from
    chosen <- start.idx <- vsf$idx

    ## number of model parameters (including intercept)
    P <- sum(sapply(obj$betas, length))

    ## number of iterations to run
    K <- min(P - length(start.idx), max.iters)

    message(sprintf("%58s  %8s %11s", "Model", "KL", "ELPD"))
    report.iter <- function(msg, kl, elpd)
        message(sprintf("%58s  %8.5f  %8.5f", substr(msg, 1, 55), kl, elpd))

    ## fitted values for the full model (N x S)
    fit <- t(posterior_linpred(obj, transform=TRUE))

    ## intercept-only model
    sub <- fit.submodel(x, sigma2, fit, 1, xt, yt, is.logistic)
    kl.elpd <- cbind(sub$kl, sub$elpd)
    report.iter("Intercept only", sub$kl, sub$elpd)

    ## initial submodel with the set of chosen covariates
    if (length(start.idx) > 1) {
        sub <- fit.submodel(x, sigma2, fit, chosen, xt, yt, is.logistic)
        kl.elpd <- rbind(kl.elpd, c(sub$kl, sub$elpd))
        report.iter("Initial submodel", sub$kl, sub$elpd)
    }

    ## add variables one at a time
    for (iter in seq_len(K)) {
        sel.idx <- choose.next(x, sigma2, fit, sub$fit, chosen, is.logistic)
        chosen <- c(chosen, sel.idx)

        ## evaluate the current submodel using the projected parameters
        sub <- fit.submodel(x, sigma2, fit, chosen, xt, yt, is.logistic)
        kl.elpd <- rbind(kl.elpd, c(sub$kl, sub$elpd))
        report.iter(colnames(x)[sel.idx], sub$kl, sub$elpd)

        ## check if the current submodel is fully saturated
        if (sub$kl < 1e-8 && iter < K) {
            message("Fully saturated model reached, selection terminated.")
            break
        }
    }

    ## evaluate the full model if selection stopped before reaching it
    full.elpd <- if (length(chosen) == P)
        sub$elpd else elpd(yt, t(posterior_linpred(obj)), sigma2, is.logistic)

    kl <- kl.elpd[, 1]
    rel.kl <- c(NA, if (length(kl) > 1) 1 - kl[-1] / kl[2])
    if (length(rel.kl) == 2 && is.nan(rel.kl[2]))
        rel.kl[2] <- 1

    res <- data.frame(var=c("Intercept only",
                            if (length(start.idx) > 1) "Initial submodel",
                            colnames(x)[setdiff(chosen, start.idx)]),
                      kl=kl,
                      rel.kl.null=1 - kl / kl[1],
                      rel.kl=rel.kl,
                      elpd=kl.elpd[, 2],
                      delta.elpd=kl.elpd[, 2] - full.elpd,
                      stringsAsFactors=FALSE, row.names=NULL)
    attr(res, "start.from") <- start.from
    if (!is.null(out.csv))
        write.csv(file=out.csv, res, row.names=FALSE)

    class(res) <- c("projsel", "data.frame")
    return(res)
}
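
## Illustrative sketch (not part of the package code): running the selection
## from a user-chosen starting submodel and saving the summary to disk.
## `hs.biom` is the fitted model from the ?hsstan example; the variable names
## passed to start.from and the output file name are made up for this example.
if (FALSE) {
    sel <- projsel(hs.biom, max.iters=5, start.from=c("sex", "age"),
                   out.csv="projsel-biom.csv")
    attr(sel, "start.from") # predictors in the initial submodel
    sel$delta.elpd          # difference in elpd from the full model
}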

#' Plot of relative explanatory power of predictors
#'
#' The function plots the relative explanatory power of each predictor in order
#' of selection. The relative explanatory power of predictors is computed
#' according to the KL divergence from the full model to each submodel, scaled
#' in such a way that the baseline model (either the null model or the model
#' containing only unpenalized covariates) is at 0, while the full model is at 1.
#'
#' @param x A data frame created by [projsel()].
#' @param title Title of the plot. If `NULL`, no title is displayed.
#' @param max.points Maximum number of predictors to be plotted. If `NULL`
#'        (default) or 0, all points are plotted.
#' @param max.labels Maximum number of predictors to be labelled. If `NULL`
#'        (default), all predictor labels present in `x` are displayed, which
#'        may result in overprinting.
#' @param from.covariates Whether the plotting should start from the unpenalized
#'        covariates (`TRUE` by default). If set to `FALSE`, the plot includes a
#'        point for the null (intercept-only) model.
#' @param font.size Size of the textual elements (labels and axes).
#' @param hadj,vadj Horizontal and vertical adjustment for the labels.
#' @param ... Currently ignored.
#'
#' @return
#' A **ggplot2** object showing the relative incremental contribution of each
#' predictor starting from the initial set of unpenalized covariates.
#'
#' @import ggplot2
#' @method plot projsel
#' @export
plot.projsel <- function(x, title=NULL, max.points=NULL, max.labels=NULL,
                         from.covariates=TRUE,
                         font.size=12, hadj=0.05, vadj=0, ...) {

    ## prepare the set of points to plot
    sel <- x
    if (!is.null(max.points) && max.points > 0)
        sel <- utils::head(sel, n=max.points + 2)
    if (!is.null(max.labels))
        sel$var[-c(1:(max.labels + 2))] <- ""
    if (from.covariates)
        sel <- sel[-1, ]
    labs <- sel$var

    ## convert from points to millimetres
    geom.text.size <- font.size * 25.4 / 72

    x <- seq(nrow(sel)) - 1
    text_idx <- x < mean(x) | x - floor(x / 2) * 2 == 1
    rel <- if (from.covariates) sel$rel.kl else sel$rel.kl.null
    p <- ggplot(data=sel, aes(x=x, y=rel, label=labs)) +
      coord_cartesian(ylim=range(c(0, 1))) +
      geom_line() + geom_point(size=geom.text.size / 3) +
      geom_text(aes(x=x + ifelse(text_idx, hadj, -hadj),
                    y=rel + ifelse(text_idx, -vadj, vadj)),
                size=geom.text.size,
                hjust=ifelse(text_idx, "left", "right")) +
      xlab("Number of biomarkers") +
      ylab("Relative explanatory power") +
      theme(text=element_text(size=font.size))

    if (!is.null(title))
        p <- p + ggtitle(title)

    return(p)
}
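
## Illustrative sketch (not part of the package code): customizing the
## selection plot. `sel` is assumed to be the output of projsel() as in the
## example in its documentation; the option values below are arbitrary.
if (FALSE) {
    plot(sel, title="Forward selection", max.labels=10, font.size=10,
         from.covariates=FALSE) # also show the intercept-only model as a point
}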