[d84c2d]: / R / biotmle.R

Download this file

269 lines (252 with data), 10.8 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
utils::globalVariables(c("assay<-"))
#' Biomarker Evaluation with Targeted Minimum Loss Estimation of the ATE
#'
#' Computes the causal target parameter defined as the difference between the
#' biomarker expression values under treatment and those same values under no
#' treatment, using Targeted Minimum Loss Estimation.
#'
#' @param se A \code{SummarizedExperiment} containing microarray expression
#' or next-generation sequencing data in the \code{assays} slot and a matrix
#' of phenotype-level data in the \code{colData} slot.
#' @param varInt A \code{numeric} indicating the column of the design matrix
#' corresponding to the treatment or outcome of interest (in the
#' \code{colData} slot of the \code{SummarizedExperiment} argument "se").
#' @param normalized A \code{logical} indicating whether the data included in
#' the \code{assay} slot of the input \code{SummarizedExperiment} object has
#' been normalized externally. The default is set to \code{TRUE} with the
#' expectation that an appropriate normalization method has been applied. If
#' set to \code{FALSE}, median normalization is performed for microarray data.
#' @param ngscounts A \code{logical} indicating whether the data are counts
#' generated from a next-generation sequencing experiment (e.g., RNA-seq). The
#' default setting assumes continuous expression measures as generated by
#' microarray platforms.
#' @param bppar_type A parallelization option specified by \code{BiocParallel}.
#' Consult the manual page for \code{\link[BiocParallel]{BiocParallelParam}}
#' for possible types and their descriptions. The default for this argument is
#' \code{\link[BiocParallel]{MulticoreParam}}, for multicore evaluation.
#' @param bppar_debug A \code{logical} indicating whether or not to rely upon
#' pkg{BiocParallel}. Setting this argument to \code{TRUE}, replaces the call
#' to \code{\link[BiocParallel]{bplapply}} by a call to \code{lapply}, which
#' significantly reduces the overhead of debugging. Note that invoking this
#' option overrides all other parallelization arguments.
#' @param cv_folds A \code{numeric} scalar indicating how many folds to use in
#' performing targeted minimum loss estimation. Cross-validated estimates have
#' been demonstrated to allow relaxation of certain theoretical conditions and
#' and accommodate the construction of more conservative variance estimates.
#' @param g_lib A \code{character} vector specifying the library of machine
#' learning algorithms for use in fitting the propensity score P(A = a | W).
#' @param Q_lib A \code{character} vector specifying the library of machine
#' learning algorithms for use in fitting the outcome regression E[Y | A,W].
#' @param ... Additional arguments to be passed to \code{\link[drtmle]{drtmle}}
#' in computing the targeted minimum loss estimator of the average treatment
#' effect.
#'
#' @importFrom SummarizedExperiment assay colData rowData SummarizedExperiment
#' @importFrom BiocParallel register bplapply bpprogressbar MulticoreParam
#' @importFrom tibble as_tibble
#'
#' @return S4 object of class \code{biotmle}, inheriting from
#' \code{SummarizedExperiment}, with additional slots \code{tmleOut} and
#' \code{call}, among others, containing TML estimates of the ATE of exposure
#' on biomarker expression.
#'
#' @export biomarkertmle
#'
#' @examples
#' library(dplyr)
#' library(biotmleData)
#' library(SuperLearner)
#' library(SummarizedExperiment)
#' data(illuminaData)
#'
#' colData(illuminaData) <- colData(illuminaData) %>%
#' data.frame() %>%
#' mutate(age = as.numeric(age > median(age))) %>%
#' DataFrame()
#' benz_idx <- which(names(colData(illuminaData)) %in% "benzene")
#'
#' biomarkerTMLEout <- biomarkertmle(
#' se = illuminaData[1:2, ],
#' varInt = benz_idx,
#' bppar_type = BiocParallel::SerialParam(),
#' g_lib = c("SL.mean", "SL.glm"),
#' Q_lib = c("SL.mean", "SL.glm")
#' )
biomarkertmle <- function(se,
varInt,
normalized = TRUE,
ngscounts = FALSE,
bppar_type = BiocParallel::MulticoreParam(),
bppar_debug = FALSE,
cv_folds = 1,
g_lib = c(
"SL.mean", "SL.glm", "SL.bayesglm"
),
Q_lib = c(
"SL.mean", "SL.bayesglm", "SL.earth", "SL.ranger"
),
...) {
# catch input and invoke S4 class constructor for "bioTMLE" object
call <- match.call(expand.dots = TRUE)
biotmle <- .biotmle(
SummarizedExperiment(
assays = list(expMeasures = assay(se)),
rowData = rowData(se),
colData = colData(se)
),
call = call,
tmleOut = tibble::as_tibble(matrix(NA, 10, 10), .name_repair = "minimal"),
topTable = tibble::as_tibble(matrix(NA, 10, 10), .name_repair = "minimal")
)
# invoke the voom transform from LIMMA if next-generation sequencing data)
if (ngscounts) {
voom_out <- rnaseq_ic(biotmle)
voom_exp <- 2^(voom_out$E)
assay(se) <- voom_exp
}
# set up parallelization based on input
BiocParallel::bpprogressbar(bppar_type) <- TRUE
BiocParallel::register(bppar_type, default = TRUE)
# TMLE procedure to identify biomarkers based on an EXPOSURE
if (!ngscounts && !normalized) {
# median normalization
exp_normed <- limma::normalizeBetweenArrays(as.matrix(assay(se)),
method = "scale"
)
Y <- tibble::as_tibble(t(exp_normed), .name_repair = "minimal")
} else {
Y <- tibble::as_tibble(t(as.matrix(assay(se))), .name_repair = "minimal")
}
# simple sanity check of whether Y includes array values
if (!all(apply(Y, 2, class) == "numeric")) {
stop("Warning - values in Y do not appear to be numeric.")
}
# exposure / treatment
A <- as.numeric(SummarizedExperiment::colData(se)[, varInt])
# baseline covariates
W <- tibble::as_tibble(SummarizedExperiment::colData(se)[, -varInt],
.name_repair = "minimal")
if (is.null(dim(W)[2])) {
W <- as.numeric(rep(1, length(A)))
}
# coerce matrix of baseline covariates to numeric
if (!all(is.numeric(apply(W, 2, class)))) {
W <- tibble::as_tibble(apply(W, 2, as.numeric), .name_repair = "minimal")
}
# perform multi-level TMLE (of the ATE) for genes as Y
if (!bppar_debug) {
biomarkertmle_out <- BiocParallel::bplapply(Y[, seq_along(Y)],
exp_biomarkertmle,
W = W,
A = A,
g_lib = g_lib,
Q_lib = Q_lib,
cv_folds = cv_folds,
...
)
} else {
biomarkertmle_out <- lapply(Y[, seq_along(Y)],
exp_biomarkertmle,
W = W,
A = A,
g_lib = g_lib,
Q_lib = Q_lib,
cv_folds = cv_folds,
...
)
}
biomarkertmle_params <- do.call(c, lapply(biomarkertmle_out, `[[`, "param"))
biomarkertmle_eifs <- do.call(
cbind.data.frame,
lapply(biomarkertmle_out, `[[`, "eif")
)
biotmle@ateOut <- as.numeric(biomarkertmle_params)
if (!ngscounts) {
biomarker_eifs <- t(as.matrix(biomarkertmle_eifs))
colnames(biomarker_eifs) <- colnames(se)
biotmle@tmleOut <- tibble::as_tibble(
biomarker_eifs,
.name_repair = "minimal"
)
} else {
voom_out$E <- t(as.matrix(biomarkertmle_eifs))
biotmle@tmleOut <- voom_out
}
return(biotmle)
}
###############################################################################
#' TMLE procedure using ATE for Biomarker Identication from Exposure
#'
#' This function performs influence curve-based estimation of the effect of an
#' exposure on biological expression values associated with a given biomarker,
#' controlling for a user-specified set of baseline covariates.
#'
#' @param Y A \code{numeric} vector of expression values for a given biomarker.
#' @param A A \code{numeric} vector of discretized exposure vector (e.g., from
#' a design matrix whose effect on expression values is of interest.
#' @param W A \code{Matrix} of \code{numeric} values corresponding to baseline
#' covariates to be marginalized over in the estimation process.
#' @param g_lib A \code{character} vector identifying the library of learning
#' algorithms to be used in fitting the propensity score P[A = a | W].
#' @param Q_lib A \code{character} vector identifying the library of learning
#' algorithms to be used in fitting the outcome regression E[Y | A, W].
#' @param cv_folds A \code{numeric} scalar indicating how many folds to use in
#' performing targeted minimum loss estimation. Cross-validated estimates are
#' more robust, allowing relaxing of theoretical conditions and construction
#' of conservative variance estimates.
#' @param ... Additional arguments passed to \code{\link[drtmle]{drtmle}} in
#' computing the targeted minimum loss estimator of the average treatment
#' effect.
#'
#' @importFrom assertthat assert_that
#' @importFrom drtmle drtmle
#'
#' @return TMLE-based estimate of the relationship between biomarker expression
#' and changes in an exposure variable, computed iteratively and saved in the
#' \code{tmleOut} slot in a \code{biotmle} object.
exp_biomarkertmle <- function(Y,
A,
W,
g_lib,
Q_lib,
cv_folds,
...) {
# check the case that Y is passed in as a column of a data.frame
if (any(class(Y) == "data.frame")) Y <- as.numeric(unlist(Y[, 1]))
if (any(class(A) == "data.frame")) A <- as.numeric(unlist(A[, 1]))
assertthat::assert_that(length(unique(A)) > 1)
# fit standard (possibly CV) TML estimator (n.b., guard = NULL)
a_0 <- sort(unique(A[!is.na(A)]))
suppressWarnings(
tmle_fit <- drtmle::drtmle(
Y = Y,
A = A,
W = W,
a_0 = a_0,
SL_g = g_lib,
SL_Q = Q_lib,
cvFolds = cv_folds,
stratify = TRUE,
guard = NULL,
parallel = FALSE,
use_future = FALSE,
...
)
)
# compute ATE and estimated EIF by delta method
ate_tmle <- tmle_fit$tmle$est[seq_along(a_0)[-1]] - tmle_fit$tmle$est[1]
eif_tmle_delta <- tmle_fit$ic$ic[, seq_along(a_0)[-1]] - tmle_fit$ic$ic[, 1]
# return only highest contrast (e.g., a[1] v a[5]) if many contrasts
if (!is.vector(eif_tmle_delta)) {
param_out <- ate_tmle[length(ate_tmle)]
eif_out <- eif_tmle_delta[, ncol(eif_tmle_delta)] +
ate_tmle[length(ate_tmle)]
} else {
param_out <- ate_tmle
eif_out <- eif_tmle_delta + ate_tmle
}
assertthat::assert_that(is.vector(eif_out))
# output
out <- list(param = param_out, eif = eif_out)
return(out)
}