|
a |
|
b/R/projection.R |
|
|
1 |
##============================================================================= |
|
|
2 |
## |
|
|
3 |
## Copyright (c) 2017-2021 Marco Colombo and Paul McKeigue |
|
|
4 |
## |
|
|
5 |
## Parts of the code are based on https://github.com/jpiironen/rstan-varsel |
|
|
6 |
## Portions copyright (c) 2015-2017 Juho Piironen |
|
|
7 |
## |
|
|
8 |
## This program is free software: you can redistribute it and/or modify |
|
|
9 |
## it under the terms of the GNU General Public License as published by |
|
|
10 |
## the Free Software Foundation, either version 3 of the License, or |
|
|
11 |
## (at your option) any later version. |
|
|
12 |
## |
|
|
13 |
## This program is distributed in the hope that it will be useful, |
|
|
14 |
## but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
|
15 |
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
|
16 |
## GNU General Public License for more details. |
|
|
17 |
## |
|
|
18 |
## You should have received a copy of the GNU General Public License |
|
|
19 |
## along with this program. If not, see <http://www.gnu.org/licenses/>. |
|
|
20 |
## |
|
|
21 |
##============================================================================= |
|
|
22 |
|
|
|
23 |
|
|
|
24 |
#' Compute projections of full predictors on to subspace of predictors |
|
|
25 |
#' |
|
|
26 |
#' @param x Design matrix. |
|
|
27 |
#' @param fit Matrix of fitted values for the full model. |
|
|
28 |
#' @param sigma2 Residual variance (1 for logistic regression). |
|
|
29 |
#' @param indproj Vector of indices of the columns of `x` that form the |
|
|
30 |
#' projection subspace. |
|
|
31 |
#' @param is.logistic Set to `TRUE` for a logistic regression model. |
|
|
32 |
#' |
|
|
33 |
#' @importFrom stats binomial glm.fit quasibinomial |
|
|
34 |
#' @keywords internal |
|
|
35 |
lm_proj <- function(x, fit, sigma2, indproj, is.logistic) { |
|
|
36 |
|
|
|
37 |
P <- ncol(x) |
|
|
38 |
S <- ncol(fit) |
|
|
39 |
|
|
|
40 |
## pick the Q columns of x that form the projection subspace |
|
|
41 |
xp <- x[, indproj, drop=FALSE] # matrix of dimension N x Q |
|
|
42 |
|
|
|
43 |
## logistic regression model |
|
|
44 |
if (is.logistic) { |
|
|
45 |
|
|
|
46 |
## compute the projection for each sample |
|
|
47 |
par.fun <- function(s) glm.fit(xp, fit[, s], |
|
|
48 |
family=quasibinomial())$coefficients |
|
|
49 |
if (.Platform$OS.type != "windows") { |
|
|
50 |
wp <- parallel::mclapply(X=1:S, mc.preschedule=TRUE, FUN=par.fun) |
|
|
51 |
} else { # windows |
|
|
52 |
cl <- parallel::makePSOCKcluster(getOption("mc.cores", 1)) |
|
|
53 |
on.exit(parallel::stopCluster(cl)) |
|
|
54 |
wp <- parallel::parLapply(X=1:S, cl=cl, fun=par.fun) |
|
|
55 |
} |
|
|
56 |
wp <- matrix(unlist(wp, use.names=FALSE), ncol=S) |
|
|
57 |
|
|
|
58 |
## estimate the KL divergence between full and projected model |
|
|
59 |
fitp <- binomial()$linkinv(multiplyAB(xp, wp)) |
|
|
60 |
kl <- 0.5 * mean(colMeans(binomial()$dev.resids(fit, fitp, 1))) |
|
|
61 |
sigma2p <- 1 |
|
|
62 |
} |
|
|
63 |
|
|
|
64 |
## linear regression model |
|
|
65 |
else { |
|
|
66 |
## solve the projection equations |
|
|
67 |
wp <- solve(crossprod(xp, xp), multiplyAtB(xp, fit)) # Q x S matrix |
|
|
68 |
|
|
|
69 |
## fit of the projected model |
|
|
70 |
fitp <- multiplyAB(xp, wp) |
|
|
71 |
|
|
|
72 |
## estimate the KL divergence between full and projected model |
|
|
73 |
sigma2p <- sigma2 + colMeans((fit - fitp)^2) |
|
|
74 |
kl <- mean(0.5 * log(sigma2p / sigma2)) |
|
|
75 |
} |
|
|
76 |
|
|
|
77 |
## reshape wp so that it has same dimension (P x S) as t(x), and zeros for |
|
|
78 |
## those variables that are not included in the projected model |
|
|
79 |
wp.all <- matrix(0, P, S) |
|
|
80 |
wp.all[indproj, ] <- wp |
|
|
81 |
return(list(w=wp.all, sigma2=sigma2p, fit=fitp, kl=kl)) |
|
|
82 |
} |
|
|
83 |
|
|
|
84 |
#' Next variable to enter the current submodel |
|
|
85 |
#' |
|
|
86 |
#' Return the index of the variable that should be added to the current model |
|
|
87 |
#' according to the smallest KL-divergence (linear regression) or the largest |
|
|
88 |
#' score test (logistic regression). |
|
|
89 |
#' |
|
|
90 |
#' @param x Design matrix. |
|
|
91 |
#' @param sigma2 Residual variance (1 for logistic regression). |
|
|
92 |
#' @param fit Matrix of fitted values for the full model. |
|
|
93 |
#' @param fitp Matrix of fitted values for the projected model. |
|
|
94 |
#' @param chosen Vector of indices of the columns of `x` in the current submodel. |
|
|
95 |
#' @param is.logistic Set to `TRUE` for a logistic regression model. |
|
|
96 |
#' |
|
|
97 |
#' @keywords internal |
|
|
98 |
choose.next <- function(x, sigma2, fit, fitp, chosen, is.logistic) { |
|
|
99 |
notchosen <- setdiff(1:ncol(x), chosen) |
|
|
100 |
par.fun <- function(idx) { |
|
|
101 |
lm_proj(x, fit, sigma2, sort(c(chosen, notchosen[idx])), FALSE)$kl |
|
|
102 |
} |
|
|
103 |
if (!is.logistic) { |
|
|
104 |
if (.Platform$OS.type != "windows") { |
|
|
105 |
kl <- parallel::mclapply(X=1:length(notchosen), mc.preschedule=TRUE, |
|
|
106 |
FUN=par.fun) |
|
|
107 |
} else { # windows |
|
|
108 |
cl <- parallel::makePSOCKcluster(getOption("mc.cores", 1)) |
|
|
109 |
on.exit(parallel::stopCluster(cl)) |
|
|
110 |
kl <- parallel::parLapply(X=1:length(notchosen), cl=cl, fun=par.fun) |
|
|
111 |
} |
|
|
112 |
idx.selected <- which.min(unlist(kl)) |
|
|
113 |
return(notchosen[idx.selected]) |
|
|
114 |
} |
|
|
115 |
|
|
|
116 |
## score test |
|
|
117 |
yminusexp <- fit - fitp |
|
|
118 |
dinv.link <- fitp * (1 - fitp) |
|
|
119 |
U <- colMeans(multiplyAtB(yminusexp, x[, notchosen])) |
|
|
120 |
V <- colMeans(multiplyAtB(dinv.link, x[, notchosen]^2)) |
|
|
121 |
idx.selected <- which.max(U^2 / V) |
|
|
122 |
return(notchosen[idx.selected]) |
|
|
123 |
} |
|
|
124 |
|
|
|
125 |
#' Compute the fit of a submodel |
|
|
126 |
#' |
|
|
127 |
#' @param x Design matrix. |
|
|
128 |
#' @param sigma2 Residual variance (1 for logistic regression). |
|
|
129 |
#' @param mu Matrix of fitted values for the full model. |
|
|
130 |
#' @param chosen Vector of indices of the columns of `x` in the current submodel. |
|
|
131 |
#' @param xt Design matrix for test data. |
|
|
132 |
#' @param yt Outcome variable for test data. |
|
|
133 |
#' @param logistic Set to `TRUE` for a logistic regression model. |
|
|
134 |
#' |
|
|
135 |
#' @keywords internal |
|
|
136 |
fit.submodel <- function(x, sigma2, mu, chosen, xt, yt, logistic) { |
|
|
137 |
## projected parameters |
|
|
138 |
submodel <- lm_proj(x, mu, sigma2, chosen, logistic) |
|
|
139 |
eta <- multiplyAB(xt, submodel$w) |
|
|
140 |
return(c(submodel, elpd=elpd(yt, eta, submodel$sigma2, logistic))) |
|
|
141 |
} |
|
|
142 |
|
|
|
143 |
#' Expected log predictive density |
|
|
144 |
#' |
|
|
145 |
#' @param yt Outcome variable. |
|
|
146 |
#' @param eta Matrix of linear predictors, with as many rows as the number of |
|
|
147 |
#' observations. |
|
|
148 |
#' @param sigma2 Residual variance (1 for logistic regression). |
|
|
149 |
#' @param logistic Set to `TRUE` for a logistic regression model. |
|
|
150 |
#' |
|
|
151 |
#' @noRd |
|
|
152 |
elpd <- function(yt, eta, sigma2, logistic) { |
|
|
153 |
if (logistic) |
|
|
154 |
pd <- stats::dbinom(yt, 1, binomial()$linkinv(eta), log=TRUE) |
|
|
155 |
else { |
|
|
156 |
sigma <- sqrt(sigma2) |
|
|
157 |
pd <- t(cbind(sapply(1:nrow(eta), |
|
|
158 |
function(z) stats::dnorm(yt[z], eta[z, ], |
|
|
159 |
sigma, log=TRUE)))) |
|
|
160 |
} |
|
|
161 |
sum(rowMeans(pd)) |
|
|
162 |
} |
|
|
163 |
|
|
|
164 |
#' Forward selection minimizing KL-divergence in projection |
|
|
165 |
#' |
|
|
166 |
#' @param obj Object of class `hsstan`. |
|
|
167 |
#' @param max.iters Maximum number of iterations (number of predictors selected) |
|
|
168 |
#' after which the selection procedure should stop. |
|
|
169 |
#' @param start.from Vector of variable names to be used in the starting |
|
|
170 |
#' submodel. If `NULL` (default), selection starts from the set of |
|
|
171 |
#' unpenalized covariates if the model contains penalized predictors, |
|
|
172 |
#' otherwise selection starts from the intercept-only model. |
|
|
173 |
#' @param out.csv If not `NULL`, the name of a CSV file to save the |
|
|
174 |
#' output to. |
|
|
175 |
#' |
|
|
176 |
#' @return |
|
|
177 |
#' A data frame of class `projsel` where each row corresponds to a |
|
|
178 |
#' forward-selected submodel that contains all variables listed up to that row. |
|
|
179 |
#' Attribute `start.from` reports the predictors in the initial model. |
|
|
180 |
#' The data frame contains the following columns: |
|
|
181 |
#' \item{var}{names of the variables selected.} |
|
|
182 |
#' \item{kl}{KL-divergence from the full model to the submodel.} |
|
|
183 |
#' \item{rel.kl.null}{relative explanatory power of predictors starting from the |
|
|
184 |
#' intercept-only model.} |
|
|
185 |
#' \item{rel.kl}{relative explanatory power of predictors starting from the |
|
|
186 |
#' initial submodel.} |
|
|
187 |
#' \item{elpd}{the expected log predictive density of the submodels.} |
|
|
188 |
#' \item{delta.elpd}{the difference in elpd from the full model.} |
|
|
189 |
#' |
|
|
190 |
#' @examples |
|
|
191 |
#' \donttest{ |
|
|
192 |
#' \dontshow{utils::example("hsstan", echo=FALSE)} |
|
|
193 |
#' \dontshow{oldopts <- options(mc.cores=2)} |
|
|
194 |
#' # continued from ?hsstan |
|
|
195 |
#' sel <- projsel(hs.biom, max.iters=3) |
|
|
196 |
#' plot(sel) |
|
|
197 |
#' \dontshow{options(oldopts)} |
|
|
198 |
#' } |
|
|
199 |
#' |
|
|
200 |
#' @importFrom utils write.csv |
|
|
201 |
#' @export |
|
|
202 |
projsel <- function(obj, max.iters=30, start.from=NULL, |
|
|
203 |
out.csv=NULL) { |
|
|
204 |
validate.hsstan(obj) |
|
|
205 |
validate.samples(obj) |
|
|
206 |
validate.nonnegative.scalar(max.iters, "max.iters", int=TRUE) |
|
|
207 |
|
|
|
208 |
x <- xt <- validate.newdata(obj, obj$data) |
|
|
209 |
yt <- obj$data[[obj$model.terms$outcome]] |
|
|
210 |
is.logistic <- is.logistic(obj) |
|
|
211 |
sigma2 <- if (is.logistic) 1 else as.matrix(obj$stanfit, pars="sigma")^2 |
|
|
212 |
|
|
|
213 |
## set of variables in the initial submodel |
|
|
214 |
vsf <- validate.start.from(obj, start.from) |
|
|
215 |
start.from <- vsf$start.from |
|
|
216 |
chosen <- start.idx <- vsf$idx |
|
|
217 |
|
|
|
218 |
## number of model parameters (including intercept) |
|
|
219 |
P <- sum(sapply(obj$betas, length)) |
|
|
220 |
|
|
|
221 |
## number of iterations to run |
|
|
222 |
K <- min(P - length(start.idx), max.iters) |
|
|
223 |
|
|
|
224 |
message(sprintf("%58s %8s %11s", "Model", "KL", "ELPD")) |
|
|
225 |
report.iter <- function(msg, kl, elpd) |
|
|
226 |
message(sprintf("%58s %8.5f %8.5f", substr(msg, 1, 55), kl, elpd)) |
|
|
227 |
|
|
|
228 |
## fitted values for the full model (N x S) |
|
|
229 |
fit <- t(posterior_linpred(obj, transform=TRUE)) |
|
|
230 |
|
|
|
231 |
## intercept only model |
|
|
232 |
sub <- fit.submodel(x, sigma2, fit, 1, xt, yt, is.logistic) |
|
|
233 |
kl.elpd <- cbind(sub$kl, sub$elpd) |
|
|
234 |
report.iter("Intercept only", sub$kl, sub$elpd) |
|
|
235 |
|
|
|
236 |
## initial submodel with the set of chosen covariates |
|
|
237 |
if (length(start.idx) > 1) { |
|
|
238 |
sub <- fit.submodel(x, sigma2, fit, chosen, xt, yt, is.logistic) |
|
|
239 |
kl.elpd <- rbind(kl.elpd, c(sub$kl, sub$elpd)) |
|
|
240 |
report.iter("Initial submodel", sub$kl, sub$elpd) |
|
|
241 |
} |
|
|
242 |
|
|
|
243 |
## add variables one at a time |
|
|
244 |
for (iter in seq_len(K)) { |
|
|
245 |
sel.idx <- choose.next(x, sigma2, fit, sub$fit, chosen, is.logistic) |
|
|
246 |
chosen <- c(chosen, sel.idx) |
|
|
247 |
|
|
|
248 |
## evaluate current submodel according to projected parameters |
|
|
249 |
sub <- fit.submodel(x, sigma2, fit, chosen, xt, yt, is.logistic) |
|
|
250 |
kl.elpd <- rbind(kl.elpd, c(sub$kl, sub$elpd)) |
|
|
251 |
report.iter(colnames(x)[sel.idx], sub$kl, sub$elpd) |
|
|
252 |
|
|
|
253 |
## check if the current submodel is fully saturated |
|
|
254 |
if (sub$kl < 1e-8 && iter < K) { |
|
|
255 |
message("Fully saturated model reached, selection terminated.") |
|
|
256 |
break() |
|
|
257 |
} |
|
|
258 |
} |
|
|
259 |
|
|
|
260 |
## evaluate the full model if selection stopped before reaching it |
|
|
261 |
full.elpd <- if (length(chosen) == P) |
|
|
262 |
sub$elpd else elpd(yt, t(posterior_linpred(obj)), sigma2, is.logistic) |
|
|
263 |
|
|
|
264 |
kl <- kl.elpd[, 1] |
|
|
265 |
rel.kl <- c(NA, if (length(kl) > 1) 1 - kl[-1] / kl[2]) |
|
|
266 |
if (length(rel.kl) == 2 && is.nan(rel.kl[2])) |
|
|
267 |
rel.kl[2] <- 1 |
|
|
268 |
|
|
|
269 |
res <- data.frame(var=c("Intercept only", |
|
|
270 |
if (length(start.idx) > 1) "Initial submodel", |
|
|
271 |
colnames(x)[setdiff(chosen, start.idx)]), |
|
|
272 |
kl=kl, |
|
|
273 |
rel.kl.null=1 - kl / kl[1], |
|
|
274 |
rel.kl=rel.kl, |
|
|
275 |
elpd=kl.elpd[, 2], |
|
|
276 |
delta.elpd=kl.elpd[, 2] - full.elpd, |
|
|
277 |
stringsAsFactors=FALSE, row.names=NULL) |
|
|
278 |
attr(res, "start.from") <- start.from |
|
|
279 |
if (!is.null(out.csv)) |
|
|
280 |
write.csv(file=out.csv, res, row.names=FALSE) |
|
|
281 |
|
|
|
282 |
class(res) <- c("projsel", "data.frame") |
|
|
283 |
return(res) |
|
|
284 |
} |
|
|
285 |
|
|
|
286 |
#' Plot of relative explanatory power of predictors |
|
|
287 |
#' |
|
|
288 |
#' The function plots the relative explanatory power of each predictor in order |
|
|
289 |
#' of selection. The relative explanatory power of predictors is computed |
|
|
290 |
#' according to the KL divergence from the full model to each submodel, scaled |
|
|
291 |
#' in such a way that the baseline model (either the null model or the model |
|
|
292 |
#' containing only unpenalized covariates) is at 0, while the full model is at 1. |
|
|
293 |
#' |
|
|
294 |
#' @param x A data frame created by [projsel()]. |
|
|
295 |
#' @param title Title of the plot. If `NULL`, no title is displayed. |
|
|
296 |
#' @param max.points Maximum number of predictors to be plotted. If `NULL` |
|
|
297 |
#' (default) or 0, all points are plotted. |
|
|
298 |
#' @param max.labels Maximum number of predictors to be labelled. If `NULL` |
|
|
299 |
#' (default), all predictor labels present in `x` are displayed, which |
|
|
300 |
#' may result in overprinting. |
|
|
301 |
#' @param from.covariates Whether the plotting should start from the unpenalized |
|
|
302 |
#' covariates (`TRUE` by default). If set to `FALSE`, the plot includes a |
|
|
303 |
#' point for the null (intercept-only) model. |
|
|
304 |
#' @param font.size Size of the textual elements (labels and axes). |
|
|
305 |
#' @param hadj,vadj Horizontal and vertical adjustment for the labels. |
|
|
306 |
#' @param ... Currently ignored. |
|
|
307 |
#' |
|
|
308 |
#' @return |
|
|
309 |
#' A **ggplot2** object showing the relative incremental contribution of each |
|
|
310 |
#' predictor starting from the initial set of unpenalized covariates. |
|
|
311 |
#' |
|
|
312 |
#' @import ggplot2 |
|
|
313 |
#' @method plot projsel |
|
|
314 |
#' @export |
|
|
315 |
plot.projsel <- function(x, title=NULL, max.points=NULL, max.labels=NULL, |
|
|
316 |
from.covariates=TRUE, |
|
|
317 |
font.size=12, hadj=0.05, vadj=0, ...) { |
|
|
318 |
|
|
|
319 |
## prepare the set of points to plot |
|
|
320 |
sel <- x |
|
|
321 |
if (!is.null(max.points) && max.points > 0) |
|
|
322 |
sel <- utils::head(sel, n=max.points + 2) |
|
|
323 |
if (!is.null(max.labels)) |
|
|
324 |
sel$var[-c(1:(max.labels + 2))] <- "" |
|
|
325 |
if (from.covariates) |
|
|
326 |
sel <- sel[-1, ] |
|
|
327 |
labs <- sel$var |
|
|
328 |
|
|
|
329 |
## convert from points to millimetres |
|
|
330 |
geom.text.size <- font.size * 25.4 / 72 |
|
|
331 |
|
|
|
332 |
x <- seq(nrow(sel)) - 1 |
|
|
333 |
text_idx <- x < mean(x) | x - floor(x / 2) * 2 == 1 |
|
|
334 |
rel <- if (from.covariates) sel$rel.kl else sel$rel.kl.null |
|
|
335 |
p <- ggplot(data=sel, aes(x=x, y=rel, label=labs)) + |
|
|
336 |
coord_cartesian(ylim=range(c(0, 1))) + |
|
|
337 |
geom_line() + geom_point(size=geom.text.size / 3) + |
|
|
338 |
geom_text(aes(x=x + ifelse(text_idx, hadj, -hadj), |
|
|
339 |
y=rel + ifelse(text_idx, -vadj, vadj)), |
|
|
340 |
size=geom.text.size, |
|
|
341 |
hjust=ifelse(text_idx, "left", "right")) + |
|
|
342 |
xlab("Number of biomarkers") + |
|
|
343 |
ylab("Relative explanatory power") + |
|
|
344 |
theme(text=element_text(size=font.size)) |
|
|
345 |
|
|
|
346 |
if (!is.null(title)) |
|
|
347 |
p <- p + ggtitle(title) |
|
|
348 |
|
|
|
349 |
return(p) |
|
|
350 |
} |