R/projection.R

##=============================================================================
##
## Copyright (c) 2017-2021 Marco Colombo and Paul McKeigue
##
## Parts of the code are based on https://github.com/jpiironen/rstan-varsel
## Portions copyright (c) 2015-2017 Juho Piironen
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
##
##=============================================================================
#' Compute projections of full predictors on to subspace of predictors
#'
#' @param x Design matrix.
#' @param fit Matrix of fitted values for the full model.
#' @param sigma2 Residual variance (1 for logistic regression).
#' @param indproj Vector of indices of the columns of `x` that form the
#' projection subspace.
#' @param is.logistic Set to `TRUE` for a logistic regression model.
#'
#' @importFrom stats binomial glm.fit quasibinomial
#' @keywords internal
lm_proj <- function(x, fit, sigma2, indproj, is.logistic) {
    P <- ncol(x)
    S <- ncol(fit)

    ## pick the Q columns of x that form the projection subspace
    xp <- x[, indproj, drop=FALSE] # matrix of dimension N x Q

    ## logistic regression model
    if (is.logistic) {

        ## compute the projection for each sample
        par.fun <- function(s) glm.fit(xp, fit[, s],
                                       family=quasibinomial())$coefficients
        if (.Platform$OS.type != "windows") {
            wp <- parallel::mclapply(X=1:S, mc.preschedule=TRUE, FUN=par.fun)
        } else { # windows
            cl <- parallel::makePSOCKcluster(getOption("mc.cores", 1))
            on.exit(parallel::stopCluster(cl))
            wp <- parallel::parLapply(X=1:S, cl=cl, fun=par.fun)
        }
        wp <- matrix(unlist(wp, use.names=FALSE), ncol=S)

        ## estimate the KL divergence between full and projected model
        fitp <- binomial()$linkinv(multiplyAB(xp, wp))
        kl <- 0.5 * mean(colMeans(binomial()$dev.resids(fit, fitp, 1)))
        sigma2p <- 1
    }

    ## linear regression model
    else {

        ## solve the projection equations
        wp <- solve(crossprod(xp, xp), multiplyAtB(xp, fit)) # Q x S matrix

        ## fit of the projected model
        fitp <- multiplyAB(xp, wp)

        ## estimate the KL divergence between full and projected model
        sigma2p <- sigma2 + colMeans((fit - fitp)^2)
        kl <- mean(0.5 * log(sigma2p / sigma2))
    }

    ## reshape wp so that it has same dimension (P x S) as t(x), and zeros for
    ## those variables that are not included in the projected model
    wp.all <- matrix(0, P, S)
    wp.all[indproj, ] <- wp

    return(list(w=wp.all, sigma2=sigma2p, fit=fitp, kl=kl))
}
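
## The sketch below (not part of the package) illustrates the linear-regression
## branch of lm_proj(): the projected coefficients solve the normal equations
## wp = (Xp'Xp)^{-1} Xp' fit, one column per posterior draw, here written with
## base crossprod() in place of the multiplyAtB() helper. All object names and
## dimensions (N, Q, S, xp, fit) are toy values invented for illustration.
if (FALSE) {
    set.seed(1)
    N <- 50; Q <- 3; S <- 10
    xp <- cbind(1, matrix(rnorm(N * (Q - 1)), N))   # N x Q submodel design
    fit <- matrix(rnorm(N * S), N, S)               # N x S posterior fitted values
    wp <- solve(crossprod(xp), crossprod(xp, fit))  # Q x S projected coefficients
    ## each column is the least-squares fit of that posterior draw on the submodel
    stopifnot(isTRUE(all.equal(wp[, 1], unname(lm.fit(xp, fit[, 1])$coefficients))))
}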
#' Next variable to enter the current submodel
#'
#' Return the index of the variable that should be added to the current model
#' according to the smallest KL-divergence (linear regression) or the largest
#' score test (logistic regression).
#'
#' @param x Design matrix.
#' @param sigma2 Residual variance (1 for logistic regression).
#' @param fit Matrix of fitted values for the full model.
#' @param fitp Matrix of fitted values for the projected model.
#' @param chosen Vector of indices of the columns of `x` in the current submodel.
#' @param is.logistic Set to `TRUE` for a logistic regression model.
#'
#' @keywords internal
choose.next <- function(x, sigma2, fit, fitp, chosen, is.logistic) {
    notchosen <- setdiff(1:ncol(x), chosen)
    par.fun <- function(idx) {
        lm_proj(x, fit, sigma2, sort(c(chosen, notchosen[idx])), FALSE)$kl
    }
    if (!is.logistic) {
        if (.Platform$OS.type != "windows") {
            kl <- parallel::mclapply(X=1:length(notchosen), mc.preschedule=TRUE,
                                     FUN=par.fun)
        } else { # windows
            cl <- parallel::makePSOCKcluster(getOption("mc.cores", 1))
            on.exit(parallel::stopCluster(cl))
            kl <- parallel::parLapply(X=1:length(notchosen), cl=cl, fun=par.fun)
        }
        idx.selected <- which.min(unlist(kl))
        return(notchosen[idx.selected])
    }

    ## score test
    yminusexp <- fit - fitp
    dinv.link <- fitp * (1 - fitp)
    U <- colMeans(multiplyAtB(yminusexp, x[, notchosen]))
    V <- colMeans(multiplyAtB(dinv.link, x[, notchosen]^2))
    idx.selected <- which.max(U^2 / V)
    return(notchosen[idx.selected])
}
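
## Illustrative sketch (not package code) of the score statistic used in the
## logistic branch above: U is the average cross-product between the residuals
## (fit - fitp) and a candidate column, V uses the variance function
## fitp * (1 - fitp), and candidates are ranked by U^2 / V. Base crossprod()
## stands in for multiplyAtB(); all objects below are toy values.
if (FALSE) {
    set.seed(1)
    N <- 50; S <- 10
    fit <- matrix(runif(N * S), N, S)     # full-model fitted probabilities
    fitp <- matrix(runif(N * S), N, S)    # submodel fitted probabilities
    x1 <- rnorm(N)                        # one candidate predictor column
    U <- mean(crossprod(fit - fitp, x1))  # averaged over posterior draws
    V <- mean(crossprod(fitp * (1 - fitp), x1^2))
    score <- U^2 / V                      # the largest score enters the model next
}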
#' Compute the fit of a submodel
#'
#' @param x Design matrix.
#' @param sigma2 Residual variance (1 for logistic regression).
#' @param mu Matrix of fitted values for the full model.
#' @param chosen Vector of indices of the columns of `x` in the current submodel.
#' @param xt Design matrix for test data.
#' @param yt Outcome variable for test data.
#' @param logistic Set to `TRUE` for a logistic regression model.
#'
#' @keywords internal
fit.submodel <- function(x, sigma2, mu, chosen, xt, yt, logistic) {
    ## projected parameters
    submodel <- lm_proj(x, mu, sigma2, chosen, logistic)
    eta <- multiplyAB(xt, submodel$w)
    return(c(submodel, elpd=elpd(yt, eta, submodel$sigma2, logistic)))
}
#' Expected log predictive density
#'
#' @param yt Outcome variable.
#' @param eta Matrix of linear predictors, with as many rows as the number of
#' observations.
#' @param sigma2 Residual variance (1 for logistic regression).
#' @param logistic Set to `TRUE` for a logistic regression model.
#'
#' @noRd
elpd <- function(yt, eta, sigma2, logistic) {
    if (logistic)
        pd <- stats::dbinom(yt, 1, binomial()$linkinv(eta), log=TRUE)
    else {
        sigma <- sqrt(sigma2)
        pd <- t(cbind(sapply(1:nrow(eta),
                             function(z) stats::dnorm(yt[z], eta[z, ],
                                                      sigma, log=TRUE))))
    }
    sum(rowMeans(pd))
}
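
## Shape sketch (toy values, not package code) for the Gaussian branch above:
## eta is N x S (observations by posterior draws), sigma2 holds one residual
## variance per draw, and the log predictive density is averaged over draws
## before being summed over observations.
if (FALSE) {
    set.seed(1)
    N <- 5; S <- 4
    yt <- rnorm(N)
    eta <- matrix(rnorm(N * S), N, S)
    sigma2 <- runif(S, 0.5, 1.5)
    pd <- sapply(1:S, function(s) stats::dnorm(yt, eta[, s], sqrt(sigma2[s]), log=TRUE))
    sum(rowMeans(pd))  # same value as elpd(yt, eta, sigma2, logistic=FALSE)
}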
#' Forward selection minimizing KL-divergence in projection
#'
#' @param obj Object of class `hsstan`.
#' @param max.iters Maximum number of iterations (number of predictors selected)
#' after which the selection procedure should stop.
#' @param start.from Vector of variable names to be used in the starting
#' submodel. If `NULL` (default), selection starts from the set of
#' unpenalized covariates if the model contains penalized predictors,
#' otherwise selection starts from the intercept-only model.
#' @param out.csv If not `NULL`, the name of a CSV file to save the
#' output to.
#'
#' @return
#' A data frame of class `projsel` where each row corresponds to a
#' forward-selected submodel that contains all variables listed up to that row.
#' Attribute `start.from` reports the predictors in the initial model.
#' The data frame contains the following columns:
#' \item{var}{names of the variables selected.}
#' \item{kl}{KL-divergence from the full model to the submodel.}
#' \item{rel.kl.null}{relative explanatory power of predictors starting from the
#' intercept-only model.}
#' \item{rel.kl}{relative explanatory power of predictors starting from the
#' initial submodel.}
#' \item{elpd}{the expected log predictive density of the submodels.}
#' \item{delta.elpd}{the difference in elpd from the full model.}
#'
#' @examples
#' \donttest{
#' \dontshow{utils::example("hsstan", echo=FALSE)}
#' \dontshow{oldopts <- options(mc.cores=2)}
#' # continued from ?hsstan
#' sel <- projsel(hs.biom, max.iters=3)
#' plot(sel)
#' \dontshow{options(oldopts)}
#' }
#'
#' @importFrom utils write.csv
#' @export
projsel <- function(obj, max.iters=30, start.from=NULL,
                    out.csv=NULL) {
    validate.hsstan(obj)
    validate.samples(obj)
    validate.nonnegative.scalar(max.iters, "max.iters", int=TRUE)
    x <- xt <- validate.newdata(obj, obj$data)
    yt <- obj$data[[obj$model.terms$outcome]]
    is.logistic <- is.logistic(obj)
    sigma2 <- if (is.logistic) 1 else as.matrix(obj$stanfit, pars="sigma")^2

    ## set of variables in the initial submodel
    vsf <- validate.start.from(obj, start.from)
    start.from <- vsf$start.from
    chosen <- start.idx <- vsf$idx

    ## number of model parameters (including intercept)
    P <- sum(sapply(obj$betas, length))

    ## number of iterations to run
    K <- min(P - length(start.idx), max.iters)

    message(sprintf("%58s %8s %11s", "Model", "KL", "ELPD"))
    report.iter <- function(msg, kl, elpd)
        message(sprintf("%58s %8.5f %8.5f", substr(msg, 1, 55), kl, elpd))

    ## fitted values for the full model (N x S)
    fit <- t(posterior_linpred(obj, transform=TRUE))

    ## intercept only model
    sub <- fit.submodel(x, sigma2, fit, 1, xt, yt, is.logistic)
    kl.elpd <- cbind(sub$kl, sub$elpd)
    report.iter("Intercept only", sub$kl, sub$elpd)

    ## initial submodel with the set of chosen covariates
    if (length(start.idx) > 1) {
        sub <- fit.submodel(x, sigma2, fit, chosen, xt, yt, is.logistic)
        kl.elpd <- rbind(kl.elpd, c(sub$kl, sub$elpd))
        report.iter("Initial submodel", sub$kl, sub$elpd)
    }

    ## add variables one at a time
    for (iter in seq_len(K)) {
        sel.idx <- choose.next(x, sigma2, fit, sub$fit, chosen, is.logistic)
        chosen <- c(chosen, sel.idx)

        ## evaluate current submodel according to projected parameters
        sub <- fit.submodel(x, sigma2, fit, chosen, xt, yt, is.logistic)
        kl.elpd <- rbind(kl.elpd, c(sub$kl, sub$elpd))
        report.iter(colnames(x)[sel.idx], sub$kl, sub$elpd)

        ## check if the current submodel is fully saturated
        if (sub$kl < 1e-8 && iter < K) {
            message("Fully saturated model reached, selection terminated.")
            break()
        }
    }

    ## evaluate the full model if selection stopped before reaching it
    full.elpd <- if (length(chosen) == P)
        sub$elpd else elpd(yt, t(posterior_linpred(obj)), sigma2, is.logistic)

    kl <- kl.elpd[, 1]
    rel.kl <- c(NA, if (length(kl) > 1) 1 - kl[-1] / kl[2])
    if (length(rel.kl) == 2 && is.nan(rel.kl[2]))
        rel.kl[2] <- 1
    res <- data.frame(var=c("Intercept only",
                            if (length(start.idx) > 1) "Initial submodel",
                            colnames(x)[setdiff(chosen, start.idx)]),
                      kl=kl,
                      rel.kl.null=1 - kl / kl[1],
                      rel.kl=rel.kl,
                      elpd=kl.elpd[, 2],
                      delta.elpd=kl.elpd[, 2] - full.elpd,
                      stringsAsFactors=FALSE, row.names=NULL)
    attr(res, "start.from") <- start.from
    if (!is.null(out.csv))
        write.csv(file=out.csv, res, row.names=FALSE)
    class(res) <- c("projsel", "data.frame")
    return(res)
}
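
## Usage sketch, assuming `sel` was created as in the roxygen example above;
## the elpd margin of 1 is an arbitrary choice made only for illustration.
## It picks the smallest forward-selected submodel whose expected log
## predictive density is close to that of the full model.
if (FALSE) {
    sel <- projsel(hs.biom, max.iters=3)
    sel[, c("var", "kl", "rel.kl", "elpd", "delta.elpd")]
    smallest <- which(sel$delta.elpd > -1)[1]  # first row within 1 elpd unit of the full model
    sel$var[seq_len(smallest)]                 # variables in that submodel
}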
#' Plot of relative explanatory power of predictors
#'
#' The function plots the relative explanatory power of each predictor in order
#' of selection. The relative explanatory power of predictors is computed
#' according to the KL divergence from the full model to each submodel, scaled
#' in such a way that the baseline model (either the null model or the model
#' containing only unpenalized covariates) is at 0, while the full model is at 1.
#'
#' @param x A data frame created by [projsel()].
#' @param title Title of the plot. If `NULL`, no title is displayed.
#' @param max.points Maximum number of predictors to be plotted. If `NULL`
#' (default) or 0, all points are plotted.
#' @param max.labels Maximum number of predictors to be labelled. If `NULL`
#' (default), all predictor labels present in `x` are displayed, which
#' may result in overprinting.
#' @param from.covariates Whether the plotting should start from the unpenalized
#' covariates (`TRUE` by default). If set to `FALSE`, the plot includes a
#' point for the null (intercept-only) model.
#' @param font.size Size of the textual elements (labels and axes).
#' @param hadj,vadj Horizontal and vertical adjustment for the labels.
#' @param ... Currently ignored.
#'
#' @return
#' A **ggplot2** object showing the relative incremental contribution of each
#' predictor starting from the initial set of unpenalized covariates.
#'
#' @import ggplot2
#' @method plot projsel
#' @export
plot.projsel <- function(x, title=NULL, max.points=NULL, max.labels=NULL,
                         from.covariates=TRUE,
                         font.size=12, hadj=0.05, vadj=0, ...) {

    ## prepare the set of points to plot
    sel <- x
    if (!is.null(max.points) && max.points > 0)
        sel <- utils::head(sel, n=max.points + 2)
    if (!is.null(max.labels))
        sel$var[-c(1:(max.labels + 2))] <- ""
    if (from.covariates)
        sel <- sel[-1, ]
    labs <- sel$var

    ## convert from points to millimetres
    geom.text.size <- font.size * 25.4 / 72

    x <- seq(nrow(sel)) - 1
    text_idx <- x < mean(x) | x - floor(x / 2) * 2 == 1
    rel <- if (from.covariates) sel$rel.kl else sel$rel.kl.null

    p <- ggplot(data=sel, aes(x=x, y=rel, label=labs)) +
        coord_cartesian(ylim=range(c(0, 1))) +
        geom_line() + geom_point(size=geom.text.size / 3) +
        geom_text(aes(x=x + ifelse(text_idx, hadj, -hadj),
                      y=rel + ifelse(text_idx, -vadj, vadj)),
                  size=geom.text.size,
                  hjust=ifelse(text_idx, "left", "right")) +
        xlab("Number of biomarkers") +
        ylab("Relative explanatory power") +
        theme(text=element_text(size=font.size))
    if (!is.null(title))
        p <- p + ggtitle(title)
    return(p)
}
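
## Usage sketch (assumes `sel` from the projsel() example above; argument
## values are arbitrary): restrict the plot to the first 10 selected
## predictors and label only the first 5 to limit overprinting.
if (FALSE) {
    plot(sel, max.points=10, max.labels=5, title="Forward selection")
}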