[ede2d4]: / R / stan.R

Download this file

340 lines (314 with data), 15.2 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
##=============================================================================
##
## Copyright (c) 2017-2019 Marco Colombo and Paul McKeigue
##
## kfold.hsstan() is based on code from https://github.com/stan-dev/rstanarm
## Portions copyright (C) 2015, 2016, 2017 Trustees of Columbia University
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
##
##=============================================================================
#' Hierarchical shrinkage models
#'
#' Run the No-U-Turn Sampler (NUTS) as implemented in Stan to fit a hierarchical
#' shrinkage model.
#'
#' @param x Data frame containing outcome, covariates and penalized predictors.
#' Continuous predictors and outcome variable should be standardized
#' before fitting the models as priors assume them to have mean zero and
#' unit variance.
#' @param covs.model Formula containing the unpenalized covariates.
#' @param penalized Names of the variables to be used as penalized predictors.
#' Any variable that is already part of the `covs.model` formula will be
#' penalized. If `NULL` or an empty vector, a model with only unpenalized
#' covariates is fitted.
#' @param family Type of model fitted: either `gaussian()` for linear regression
#' (default) or `binomial()` for logistic regression.
#' @param seed Optional integer defining the seed for the pseudo-random number
#' generator.
#' @param qr Whether the thin QR decomposition should be used to decorrelate the
#' predictors (`TRUE` by default). This is silently set to `FALSE` if
#' there are more predictors than observations.
#' @param adapt.delta Target average proposal acceptance probability for
#' adaptation, a value between 0.8 and 1 (excluded). If unspecified,
#' it's set to 0.99 for hierarchical shrinkage models and to 0.95 for
#' base models.
#' @param iter Total number of iterations in each chain, including warmup
#' (2000 by default).
#' @param warmup Number of warmup iterations per chain (by default, half the
#' total number of iterations).
#' @param scale.u Prior scale (standard deviation) for the unpenalized
#' covariates.
#' @param regularized If `TRUE` (default), the regularized horseshoe prior
#' is used as opposed to the original horseshoe prior.
#' @param nu Number of degrees of freedom of the half-Student-t prior on the
#' local shrinkage parameters (by default, 1 if `regularized=TRUE`
#' and 3 otherwise).
#' @param par.ratio Expected ratio of non-zero to zero coefficients (ignored
#' if `regularized=FALSE`). The scale of the global shrinkage parameter
#' corresponds to `par.ratio` divided by the square root of the number of
#' observations; for linear regression only, it's further multiplied by
#' the residual standard deviation `sigma`.
#' @param global.df Number of degrees of freedom for the global shrinkage
#' parameter (ignored if `regularized=FALSE`). Larger values induce more
#' shrinkage.
#' @param slab.scale Scale of the regularization parameter (ignored if
#' `regularized=FALSE`).
#' @param slab.df Number of degrees of freedom of the regularization parameter
#' (ignored if `regularized=FALSE`).
#' @param keep.hs.pars Whether the parameters for the horseshoe prior should be
#' kept in the `stanfit` object returned (`FALSE` by default).
#' @param ... Further arguments passed to [rstan::sampling()],
#' such as `chains` (4 by default), `cores` (the value of
#' `options("mc.cores")` by default), `refresh` (`iter / 10` by default).
#'
#' @return
#' An object of class `hsstan` containing the following fields:
#' \item{stanfit}{an object of class `stanfit` containing the output
#' produced by Stan, including posterior samples and diagnostic summaries.
#' It can be manipulated using methods from the **rstan** package.}
#' \item{betas}{posterior means of the unpenalized and penalized regression
#' parameters.}
#' \item{call}{the matched call.}
#' \item{data}{the dataset used in fitting the model.}
#' \item{model.terms}{a list of names for the outcome variable, the unpenalized
#' covariates and the penalized predictors.}
#' \item{family}{the `family` object used.}
#' \item{hsstan.settings}{the optional settings used in the model.}
#'
#' @seealso
#' [kfold()] for cross-validating a fitted object.
#'
#' @examples
#' \dontshow{oldopts <- options(mc.cores=2)}
#' data(diabetes)
#'
#' # non-default settings for speed of the example
#' df <- diabetes[1:50, ]
#' hs.biom <- hsstan(df, Y ~ age + sex, penalized=colnames(df)[5:10],
#' chains=2, iter=250)
#' \dontshow{options(oldopts)}
#'
#' @importFrom stats gaussian
#' @export
hsstan <- function(x, covs.model, penalized=NULL, family=gaussian,
iter=2000, warmup=floor(iter / 2),
scale.u=2, regularized=TRUE, nu=ifelse(regularized, 1, 3),
par.ratio=0.05, global.df=1, slab.scale=2, slab.df=4,
qr=TRUE, seed=123, adapt.delta=NULL,
keep.hs.pars=FALSE, ...) {
model.terms <- validate.model(covs.model, penalized)
x <- validate.data(x, model.terms)
y <- x[[model.terms$outcome]]
family <- validate.family(family, y)
regularized <- as.integer(regularized)
validate.positive.scalar(iter, "iter", int=TRUE)
validate.positive.scalar(warmup, "warmup", int=TRUE)
if (iter <= warmup)
stop("'warmup' must be smaller than 'iter'.", call.=FALSE)
## stop if options to be passed to rstan::sampling are not valid, as
## to work around rstan issue #681
validate.rstan.args(...)
## parameter names not to include by default in the stanfit object
hs.pars <- c("lambda", "tau", "z", "c2")
if (keep.hs.pars)
hs.pars <- NA
## retrieve the call and its actual argument values
call <- match.call(expand.dots=TRUE)
args <- c(as.list(environment()), list(...))
for (nm in names(call)[-c(1:2)]) # exclude "" and "x"
call[[nm]] <- args[[nm]]
## choose the model to be fitted
model <- ifelse(length(penalized) == 0, "base0", "hs")
if (family$family == "binomial") model <- paste0(model, "_logit")
## set or check adapt.delta
if (is.null(adapt.delta)) {
adapt.delta <- ifelse(grepl("hs", model), 0.99, 0.95)
} else {
validate.adapt.delta(adapt.delta)
}
## create the design matrix
X <- ordered.model.matrix(x, model.terms$unpenalized, model.terms$penalized)
N <- nrow(X)
P <- ncol(X)
## number of penalized and unpenalized columns in the design matrix
K <- length(expand.terms(x, model.terms$penalized)[-1])
U <- P - K
## thin QR decomposition
if (P > N) qr <- FALSE
if (qr) {
qr.dec <- qr(X)
Q.qr <- qr.Q(qr.dec)
R.inv <- qr.solve(qr.dec, Q.qr) * sqrt(N - 1)
Q.qr <- Q.qr * sqrt(N - 1)
}
## global scale for regularized horseshoe prior
global.scale <- if (regularized) par.ratio / sqrt(N) else 1
## core block
{
## parameters not used by a model are ignored
data.input <- list(X=if (qr) Q.qr else X, y=y, N=N,
P=P, U=U, scale_u=scale.u,
regularized=regularized, nu=nu,
global_scale=global.scale, global_df=global.df,
slab_scale=slab.scale, slab_df=slab.df)
## run the stan model
samples <- rstan::sampling(stanmodels[[model]], data=data.input,
iter=iter, warmup=warmup, seed=seed, ...,
pars=hs.pars, include=keep.hs.pars,
control=list(adapt_delta=adapt.delta))
if (is.na(nrow(samples)))
stop("rstan::sampling failed, see error message above.", call.=FALSE)
## assign proper names
par.idx <- grep("^beta_[up]", names(samples))
stopifnot(length(par.idx) == ncol(X))
names(samples)[par.idx] <- colnames(X)
if (qr) {
pars <- grep("beta_", samples@sim$pars_oi, value=TRUE)
stopifnot(pars[1] == "beta_u")
beta.tilde <- rstan::extract(samples, pars=pars,
inc_warmup=TRUE, permuted=FALSE)
B <- apply(beta.tilde, 1:2, FUN=function(z) R.inv %*% z)
chains <- ncol(beta.tilde)
for (chain in 1:chains) {
for (p in 1:P)
samples@sim$samples[[chain]][[par.idx[p]]] <- B[p, , chain]
}
}
## store the hierarchical shrinkage settings
opts <- list(adapt.delta=adapt.delta, qr=qr, seed=seed, scale.u=scale.u)
if (K > 0)
opts <- c(opts, regularized=regularized, nu=nu, par.ratio=par.ratio,
global.scale=global.scale, global.df=global.df,
slab.scale=slab.scale, slab.df=slab.df)
## compute the posterior means of the regression coefficients
betas <- list(unpenalized=colMeans(as.matrix(samples, pars="beta_u")),
penalized=tryCatch(colMeans(as.matrix(samples,
pars="beta_p")),
error=function(e) NULL))
obj <- list(stanfit=samples, betas=betas, call=call, data=x,
model.terms=model.terms, family=family, hsstan.settings=opts)
class(obj) <- "hsstan"
}
return(obj)
}
#' K-fold cross-validation
#'
#' Perform K-fold cross-validation using the same settings used when fitting
#' the model on the whole data.
#'
#' @param x An object of class `hsstan`.
#' @param folds Integer vector with one element per observation indicating the
#' cross-validation fold in which the observation should be withdrawn.
#' @param chains Number of Markov chains to run. By default this is set to 1,
#' independently of the number of chains used for `x`.
#' @param store.fits Whether the fitted models for each fold should be stored
#' in the returned object (`TRUE` by default).
#' @param cores Number of cores to use for parallelization (the value of
#' `options("mc.cores")` by default). The cross-validation folds will
#' be distributed to the available cores, and the Markov chains for each
#' model will be run sequentially.
#' @param ... Further arguments passed to [rstan::sampling()].
#'
#' @return
#' An object with classes `kfold` and `loo` that has a similar structure as the
#' objects returned by [loo()] and [waic()] and is compatible with the
#' \code{\link[loo]{loo_compare}} function for
#' comparing models. The object contains the following fields:
#' \item{estimates}{a matrix containing point estimates and standard errors of
#' the expected log pointwise predictive density ("elpd_kfold"),
#' the effective number of parameters ("p_kfold", always `NA`) and the
#' K-fold information criterion "kfoldic" (which is `-2 * elpd_kfold`,
#' i.e., converted to the deviance scale).}
#' \item{pointwise}{a matrix containing the pointwise contributions of
#' "elpd_kfold", "p_kfold" and "kfoldic".}
#' \item{fits}{a matrix with two columns and number of rows equal to the number
#' of cross-validation folds. Column `fit` contains the fitted
#' `hsstan` objects for each fold, and column `test.idx` contains
#' the indices of the withdrawn observations for each fold. This is not
#' present if `store.fits=FALSE`.}
#' \item{data}{the dataset used in fitting the model (before withdrawing
#' observations). This is not present if `store.fits=FALSE`.}
#'
#' @examples
#' \donttest{
#' \dontshow{utils::example("hsstan", echo=FALSE)}
#' # continued from ?hsstan
#' # only 2 folds for speed of example
#' folds <- rep(1:2, length.out=length(df$Y))
#' cv.biom <- kfold(hs.biom, folds=folds, cores=2)
#' }
#'
#' @importFrom loo kfold
#' @method kfold hsstan
#' @aliases kfold
#' @export kfold
#' @export
kfold.hsstan <- function(x, folds, chains=1, store.fits=TRUE,
cores=getOption("mc.cores", 1), ...) {
data <- x$data
N <- nrow(data)
folds <- validate.folds(folds, N)
num.folds <- max(folds)
validate.positive.scalar(chains, "chains", int=TRUE)
validate.rstan.args(...)
## collect the list of calls to be evaluated in parallel
calls <- list()
for (fold in 1:num.folds) {
test.idx <- which(folds == fold)
fit.call <- stats::update(object=x, x=data[-test.idx, , drop=FALSE],
chains=chains, cores=1, refresh=0,
open_progress=FALSE, evaluate=FALSE, ...)
fit.call$x <- eval(fit.call$x)
calls[[fold]] <- fit.call
}
## evaluate the models
message("Fitting ", num.folds, " models using ",
min(cores, num.folds), " cores")
par.fun <- function(fold) {
fit <- eval(calls[[fold]])
## log pointwise predictive densities (pointwise test log-likelihood)
lppd <- log_lik(fit, newdata=data[which(folds == fold), , drop=FALSE])
return(list(lppd=lppd, fit=if (store.fits) fit else NULL))
}
if (.Platform$OS.type != "windows") {
cv <- parallel::mclapply(X=1:num.folds, mc.cores=cores,
mc.preschedule=FALSE, FUN=par.fun)
} else { # windows
cl <- parallel::makePSOCKcluster(cores)
on.exit(parallel::stopCluster(cl))
cv <- parallel::parLapply(X=1:num.folds, cl=cl, fun=par.fun)
}
## expected log predictive densities
elpds.unord <- unlist(lapply(cv, function(z) apply(z$lppd, 2, logMeanExp)))
obs.idx <- unlist(lapply(1:num.folds, function(z) which(folds == z)))
elpds <- elpds.unord[obs.idx]
pointwise <- cbind(elpd_kfold=elpds, p_kfold=NA, kfoldic=-2 * elpds)
estimates <- colSums(pointwise)
se.est <- sqrt(N * apply(pointwise, 2, stats::var))
out <- list(estimates=cbind(Estimate=estimates, SE=se.est),
pointwise=pointwise)
rownames(out$estimates) <- colnames(pointwise)
if (store.fits) {
fits <- array(list(), c(num.folds, 2), list(NULL, c("fit", "test.idx")))
for (fold in 1:num.folds)
fits[fold, ] <- list(fit=cv[[fold]][["fit"]],
test.idx=which(folds == fold))
out$fits <- fits
out$data <- data
}
attr(out, "K") <- num.folds
class(out) <- c("kfold", "loo")
return(out)
}