a b/R/biotmle.R
1
utils::globalVariables(c("assay<-"))
2
3
#' Biomarker Evaluation with Targeted Minimum Loss Estimation of the ATE
4
#'
5
#' Computes the causal target parameter defined as the difference between the
6
#' biomarker expression values under treatment and those same values under no
7
#' treatment, using Targeted Minimum Loss Estimation.
8
#'
9
#' @param se A \code{SummarizedExperiment} containing microarray expression
10
#'  or next-generation sequencing data in the \code{assays} slot and a matrix
11
#'  of phenotype-level data in the \code{colData} slot.
12
#' @param varInt A \code{numeric} indicating the column of the design matrix
13
#'  corresponding to the treatment or outcome of interest (in the
14
#'  \code{colData} slot of the \code{SummarizedExperiment} argument "se").
15
#' @param normalized A \code{logical} indicating whether the data included in
16
#'  the \code{assay} slot of the input \code{SummarizedExperiment} object has
17
#'  been normalized externally. The default is set to \code{TRUE} with the
18
#'  expectation that an appropriate normalization method has been applied. If
19
#'  set to \code{FALSE}, median normalization is performed for microarray data.
20
#' @param ngscounts A \code{logical} indicating whether the data are counts
21
#'  generated from a next-generation sequencing experiment (e.g., RNA-seq). The
22
#'  default setting assumes continuous expression measures as generated by
23
#'  microarray platforms.
24
#' @param bppar_type A parallelization option specified by \code{BiocParallel}.
25
#'  Consult the manual page for \code{\link[BiocParallel]{BiocParallelParam}}
26
#'  for possible types and their descriptions. The default for this argument is
27
#'  \code{\link[BiocParallel]{MulticoreParam}}, for multicore evaluation.
28
#' @param bppar_debug A \code{logical} indicating whether or not to rely upon
29
#'  pkg{BiocParallel}. Setting this argument to \code{TRUE}, replaces the call
30
#'  to \code{\link[BiocParallel]{bplapply}} by a call to \code{lapply}, which
31
#'  significantly reduces the overhead of debugging. Note that invoking this
32
#'  option overrides all other parallelization arguments.
33
#' @param cv_folds A \code{numeric} scalar indicating how many folds to use in
34
#'  performing targeted minimum loss estimation. Cross-validated estimates have
35
#'  been demonstrated to allow relaxation of certain theoretical conditions and
36
#'  and accommodate the construction of more conservative variance estimates.
37
#' @param g_lib A \code{character} vector specifying the library of machine
38
#'  learning algorithms for use in fitting the propensity score P(A = a | W).
39
#' @param Q_lib A \code{character} vector specifying the library of machine
40
#'  learning algorithms for use in fitting the outcome regression E[Y | A,W].
41
#' @param ... Additional arguments to be passed to \code{\link[drtmle]{drtmle}}
42
#'  in computing the targeted minimum loss estimator of the average treatment
43
#'  effect.
44
#'
45
#' @importFrom SummarizedExperiment assay colData rowData SummarizedExperiment
46
#' @importFrom BiocParallel register bplapply bpprogressbar MulticoreParam
47
#' @importFrom tibble as_tibble
48
#'
49
#' @return S4 object of class \code{biotmle}, inheriting from
50
#'  \code{SummarizedExperiment}, with additional slots \code{tmleOut} and
51
#'  \code{call}, among others, containing TML estimates of the ATE of exposure
52
#'  on biomarker expression.
53
#'
54
#' @export biomarkertmle
55
#'
56
#' @examples
57
#' library(dplyr)
58
#' library(biotmleData)
59
#' library(SuperLearner)
60
#' library(SummarizedExperiment)
61
#' data(illuminaData)
62
#'
63
#' colData(illuminaData) <- colData(illuminaData) %>%
64
#'   data.frame() %>%
65
#'   mutate(age = as.numeric(age > median(age))) %>%
66
#'   DataFrame()
67
#' benz_idx <- which(names(colData(illuminaData)) %in% "benzene")
68
#'
69
#' biomarkerTMLEout <- biomarkertmle(
70
#'   se = illuminaData[1:2, ],
71
#'   varInt = benz_idx,
72
#'   bppar_type = BiocParallel::SerialParam(),
73
#'   g_lib = c("SL.mean", "SL.glm"),
74
#'   Q_lib = c("SL.mean", "SL.glm")
75
#' )
76
biomarkertmle <- function(se,
77
                          varInt,
78
                          normalized = TRUE,
79
                          ngscounts = FALSE,
80
                          bppar_type = BiocParallel::MulticoreParam(),
81
                          bppar_debug = FALSE,
82
                          cv_folds = 1,
83
                          g_lib = c(
84
                            "SL.mean", "SL.glm", "SL.bayesglm"
85
                          ),
86
                          Q_lib = c(
87
                            "SL.mean", "SL.bayesglm", "SL.earth", "SL.ranger"
88
                          ),
89
                          ...) {
90
91
  # catch input and invoke S4 class constructor for "bioTMLE" object
92
  call <- match.call(expand.dots = TRUE)
93
  biotmle <- .biotmle(
94
    SummarizedExperiment(
95
      assays = list(expMeasures = assay(se)),
96
      rowData = rowData(se),
97
      colData = colData(se)
98
    ),
99
    call = call,
100
    tmleOut = tibble::as_tibble(matrix(NA, 10, 10), .name_repair = "minimal"),
101
    topTable = tibble::as_tibble(matrix(NA, 10, 10), .name_repair = "minimal")
102
  )
103
104
  # invoke the voom transform from LIMMA if next-generation sequencing data)
105
  if (ngscounts) {
106
    voom_out <- rnaseq_ic(biotmle)
107
    voom_exp <- 2^(voom_out$E)
108
    assay(se) <- voom_exp
109
  }
110
111
  # set up parallelization based on input
112
  BiocParallel::bpprogressbar(bppar_type) <- TRUE
113
  BiocParallel::register(bppar_type, default = TRUE)
114
115
  # TMLE procedure to identify biomarkers based on an EXPOSURE
116
  if (!ngscounts && !normalized) {
117
    # median normalization
118
    exp_normed <- limma::normalizeBetweenArrays(as.matrix(assay(se)),
119
      method = "scale"
120
    )
121
    Y <- tibble::as_tibble(t(exp_normed), .name_repair = "minimal")
122
  } else {
123
    Y <- tibble::as_tibble(t(as.matrix(assay(se))), .name_repair = "minimal")
124
  }
125
  # simple sanity check of whether Y includes array values
126
  if (!all(apply(Y, 2, class) == "numeric")) {
127
    stop("Warning - values in Y do not appear to be numeric.")
128
  }
129
130
  # exposure / treatment
131
  A <- as.numeric(SummarizedExperiment::colData(se)[, varInt])
132
133
  # baseline covariates
134
  W <- tibble::as_tibble(SummarizedExperiment::colData(se)[, -varInt],
135
                         .name_repair = "minimal")
136
  if (is.null(dim(W)[2])) {
137
    W <- as.numeric(rep(1, length(A)))
138
  }
139
140
  # coerce matrix of baseline covariates to numeric
141
  if (!all(is.numeric(apply(W, 2, class)))) {
142
    W <- tibble::as_tibble(apply(W, 2, as.numeric), .name_repair = "minimal")
143
  }
144
145
  # perform multi-level TMLE (of the ATE) for genes as Y
146
  if (!bppar_debug) {
147
    biomarkertmle_out <- BiocParallel::bplapply(Y[, seq_along(Y)],
148
      exp_biomarkertmle,
149
      W = W,
150
      A = A,
151
      g_lib = g_lib,
152
      Q_lib = Q_lib,
153
      cv_folds = cv_folds,
154
      ...
155
    )
156
  } else {
157
    biomarkertmle_out <- lapply(Y[, seq_along(Y)],
158
      exp_biomarkertmle,
159
      W = W,
160
      A = A,
161
      g_lib = g_lib,
162
      Q_lib = Q_lib,
163
      cv_folds = cv_folds,
164
      ...
165
    )
166
  }
167
  biomarkertmle_params <- do.call(c, lapply(biomarkertmle_out, `[[`, "param"))
168
  biomarkertmle_eifs <- do.call(
169
    cbind.data.frame,
170
    lapply(biomarkertmle_out, `[[`, "eif")
171
  )
172
173
  biotmle@ateOut <- as.numeric(biomarkertmle_params)
174
  if (!ngscounts) {
175
    biomarker_eifs <- t(as.matrix(biomarkertmle_eifs))
176
    colnames(biomarker_eifs) <- colnames(se)
177
    biotmle@tmleOut <- tibble::as_tibble(
178
      biomarker_eifs,
179
      .name_repair = "minimal"
180
    )
181
  } else {
182
    voom_out$E <- t(as.matrix(biomarkertmle_eifs))
183
    biotmle@tmleOut <- voom_out
184
  }
185
  return(biotmle)
186
}
187
188
###############################################################################
189
190
#' TMLE procedure using ATE for Biomarker Identication from Exposure
191
#'
192
#' This function performs influence curve-based estimation of the effect of an
193
#' exposure on biological expression values associated with a given biomarker,
194
#' controlling for a user-specified set of baseline covariates.
195
#'
196
#' @param Y A \code{numeric} vector of expression values for a given biomarker.
197
#' @param A A \code{numeric} vector of discretized exposure vector (e.g., from
198
#'  a design matrix whose effect on expression values is of interest.
199
#' @param W A \code{Matrix} of \code{numeric} values corresponding to baseline
200
#'  covariates to be marginalized over in the estimation process.
201
#' @param g_lib A \code{character} vector identifying the library of learning
202
#'  algorithms to be used in fitting the propensity score P[A = a | W].
203
#' @param Q_lib A \code{character} vector identifying the library of learning
204
#'  algorithms to be used in fitting the outcome regression E[Y | A, W].
205
#' @param cv_folds A \code{numeric} scalar indicating how many folds to use in
206
#'  performing targeted minimum loss estimation. Cross-validated estimates are
207
#'  more robust, allowing relaxing of theoretical conditions and construction
208
#'  of conservative variance estimates.
209
#' @param ... Additional arguments passed to \code{\link[drtmle]{drtmle}} in
210
#'  computing the targeted minimum loss estimator of the average treatment
211
#'  effect.
212
#'
213
#' @importFrom assertthat assert_that
214
#' @importFrom drtmle drtmle
215
#'
216
#' @return TMLE-based estimate of the relationship between biomarker expression
217
#'  and changes in an exposure variable, computed iteratively and saved in the
218
#'  \code{tmleOut} slot in a \code{biotmle} object.
219
exp_biomarkertmle <- function(Y,
220
                              A,
221
                              W,
222
                              g_lib,
223
                              Q_lib,
224
                              cv_folds,
225
                              ...) {
226
  # check the case that Y is passed in as a column of a data.frame
227
  if (any(class(Y) == "data.frame")) Y <- as.numeric(unlist(Y[, 1]))
228
  if (any(class(A) == "data.frame")) A <- as.numeric(unlist(A[, 1]))
229
  assertthat::assert_that(length(unique(A)) > 1)
230
231
  # fit standard (possibly CV) TML estimator (n.b., guard = NULL)
232
  a_0 <- sort(unique(A[!is.na(A)]))
233
  suppressWarnings(
234
    tmle_fit <- drtmle::drtmle(
235
      Y = Y,
236
      A = A,
237
      W = W,
238
      a_0 = a_0,
239
      SL_g = g_lib,
240
      SL_Q = Q_lib,
241
      cvFolds = cv_folds,
242
      stratify = TRUE,
243
      guard = NULL,
244
      parallel = FALSE,
245
      use_future = FALSE,
246
      ...
247
    )
248
  )
249
250
  # compute ATE and estimated EIF by delta method
251
  ate_tmle <- tmle_fit$tmle$est[seq_along(a_0)[-1]] - tmle_fit$tmle$est[1]
252
  eif_tmle_delta <- tmle_fit$ic$ic[, seq_along(a_0)[-1]] - tmle_fit$ic$ic[, 1]
253
254
  # return only highest contrast (e.g., a[1] v a[5]) if many contrasts
255
  if (!is.vector(eif_tmle_delta)) {
256
    param_out <- ate_tmle[length(ate_tmle)]
257
    eif_out <- eif_tmle_delta[, ncol(eif_tmle_delta)] +
258
      ate_tmle[length(ate_tmle)]
259
  } else {
260
    param_out <- ate_tmle
261
    eif_out <- eif_tmle_delta + ate_tmle
262
  }
263
  assertthat::assert_that(is.vector(eif_out))
264
265
  # output
266
  out <- list(param = param_out, eif = eif_out)
267
  return(out)
268
}