--- a +++ b/R/biotmle.R @@ -0,0 +1,268 @@ +utils::globalVariables(c("assay<-")) + +#' Biomarker Evaluation with Targeted Minimum Loss Estimation of the ATE +#' +#' Computes the causal target parameter defined as the difference between the +#' biomarker expression values under treatment and those same values under no +#' treatment, using Targeted Minimum Loss Estimation. +#' +#' @param se A \code{SummarizedExperiment} containing microarray expression +#' or next-generation sequencing data in the \code{assays} slot and a matrix +#' of phenotype-level data in the \code{colData} slot. +#' @param varInt A \code{numeric} indicating the column of the design matrix +#' corresponding to the treatment or outcome of interest (in the +#' \code{colData} slot of the \code{SummarizedExperiment} argument "se"). +#' @param normalized A \code{logical} indicating whether the data included in +#' the \code{assay} slot of the input \code{SummarizedExperiment} object has +#' been normalized externally. The default is set to \code{TRUE} with the +#' expectation that an appropriate normalization method has been applied. If +#' set to \code{FALSE}, median normalization is performed for microarray data. +#' @param ngscounts A \code{logical} indicating whether the data are counts +#' generated from a next-generation sequencing experiment (e.g., RNA-seq). The +#' default setting assumes continuous expression measures as generated by +#' microarray platforms. +#' @param bppar_type A parallelization option specified by \code{BiocParallel}. +#' Consult the manual page for \code{\link[BiocParallel]{BiocParallelParam}} +#' for possible types and their descriptions. The default for this argument is +#' \code{\link[BiocParallel]{MulticoreParam}}, for multicore evaluation. +#' @param bppar_debug A \code{logical} indicating whether or not to rely upon +#' pkg{BiocParallel}. Setting this argument to \code{TRUE}, replaces the call +#' to \code{\link[BiocParallel]{bplapply}} by a call to \code{lapply}, which +#' significantly reduces the overhead of debugging. Note that invoking this +#' option overrides all other parallelization arguments. +#' @param cv_folds A \code{numeric} scalar indicating how many folds to use in +#' performing targeted minimum loss estimation. Cross-validated estimates have +#' been demonstrated to allow relaxation of certain theoretical conditions and +#' and accommodate the construction of more conservative variance estimates. +#' @param g_lib A \code{character} vector specifying the library of machine +#' learning algorithms for use in fitting the propensity score P(A = a | W). +#' @param Q_lib A \code{character} vector specifying the library of machine +#' learning algorithms for use in fitting the outcome regression E[Y | A,W]. +#' @param ... Additional arguments to be passed to \code{\link[drtmle]{drtmle}} +#' in computing the targeted minimum loss estimator of the average treatment +#' effect. +#' +#' @importFrom SummarizedExperiment assay colData rowData SummarizedExperiment +#' @importFrom BiocParallel register bplapply bpprogressbar MulticoreParam +#' @importFrom tibble as_tibble +#' +#' @return S4 object of class \code{biotmle}, inheriting from +#' \code{SummarizedExperiment}, with additional slots \code{tmleOut} and +#' \code{call}, among others, containing TML estimates of the ATE of exposure +#' on biomarker expression. +#' +#' @export biomarkertmle +#' +#' @examples +#' library(dplyr) +#' library(biotmleData) +#' library(SuperLearner) +#' library(SummarizedExperiment) +#' data(illuminaData) +#' +#' colData(illuminaData) <- colData(illuminaData) %>% +#' data.frame() %>% +#' mutate(age = as.numeric(age > median(age))) %>% +#' DataFrame() +#' benz_idx <- which(names(colData(illuminaData)) %in% "benzene") +#' +#' biomarkerTMLEout <- biomarkertmle( +#' se = illuminaData[1:2, ], +#' varInt = benz_idx, +#' bppar_type = BiocParallel::SerialParam(), +#' g_lib = c("SL.mean", "SL.glm"), +#' Q_lib = c("SL.mean", "SL.glm") +#' ) +biomarkertmle <- function(se, + varInt, + normalized = TRUE, + ngscounts = FALSE, + bppar_type = BiocParallel::MulticoreParam(), + bppar_debug = FALSE, + cv_folds = 1, + g_lib = c( + "SL.mean", "SL.glm", "SL.bayesglm" + ), + Q_lib = c( + "SL.mean", "SL.bayesglm", "SL.earth", "SL.ranger" + ), + ...) { + + # catch input and invoke S4 class constructor for "bioTMLE" object + call <- match.call(expand.dots = TRUE) + biotmle <- .biotmle( + SummarizedExperiment( + assays = list(expMeasures = assay(se)), + rowData = rowData(se), + colData = colData(se) + ), + call = call, + tmleOut = tibble::as_tibble(matrix(NA, 10, 10), .name_repair = "minimal"), + topTable = tibble::as_tibble(matrix(NA, 10, 10), .name_repair = "minimal") + ) + + # invoke the voom transform from LIMMA if next-generation sequencing data) + if (ngscounts) { + voom_out <- rnaseq_ic(biotmle) + voom_exp <- 2^(voom_out$E) + assay(se) <- voom_exp + } + + # set up parallelization based on input + BiocParallel::bpprogressbar(bppar_type) <- TRUE + BiocParallel::register(bppar_type, default = TRUE) + + # TMLE procedure to identify biomarkers based on an EXPOSURE + if (!ngscounts && !normalized) { + # median normalization + exp_normed <- limma::normalizeBetweenArrays(as.matrix(assay(se)), + method = "scale" + ) + Y <- tibble::as_tibble(t(exp_normed), .name_repair = "minimal") + } else { + Y <- tibble::as_tibble(t(as.matrix(assay(se))), .name_repair = "minimal") + } + # simple sanity check of whether Y includes array values + if (!all(apply(Y, 2, class) == "numeric")) { + stop("Warning - values in Y do not appear to be numeric.") + } + + # exposure / treatment + A <- as.numeric(SummarizedExperiment::colData(se)[, varInt]) + + # baseline covariates + W <- tibble::as_tibble(SummarizedExperiment::colData(se)[, -varInt], + .name_repair = "minimal") + if (is.null(dim(W)[2])) { + W <- as.numeric(rep(1, length(A))) + } + + # coerce matrix of baseline covariates to numeric + if (!all(is.numeric(apply(W, 2, class)))) { + W <- tibble::as_tibble(apply(W, 2, as.numeric), .name_repair = "minimal") + } + + # perform multi-level TMLE (of the ATE) for genes as Y + if (!bppar_debug) { + biomarkertmle_out <- BiocParallel::bplapply(Y[, seq_along(Y)], + exp_biomarkertmle, + W = W, + A = A, + g_lib = g_lib, + Q_lib = Q_lib, + cv_folds = cv_folds, + ... + ) + } else { + biomarkertmle_out <- lapply(Y[, seq_along(Y)], + exp_biomarkertmle, + W = W, + A = A, + g_lib = g_lib, + Q_lib = Q_lib, + cv_folds = cv_folds, + ... + ) + } + biomarkertmle_params <- do.call(c, lapply(biomarkertmle_out, `[[`, "param")) + biomarkertmle_eifs <- do.call( + cbind.data.frame, + lapply(biomarkertmle_out, `[[`, "eif") + ) + + biotmle@ateOut <- as.numeric(biomarkertmle_params) + if (!ngscounts) { + biomarker_eifs <- t(as.matrix(biomarkertmle_eifs)) + colnames(biomarker_eifs) <- colnames(se) + biotmle@tmleOut <- tibble::as_tibble( + biomarker_eifs, + .name_repair = "minimal" + ) + } else { + voom_out$E <- t(as.matrix(biomarkertmle_eifs)) + biotmle@tmleOut <- voom_out + } + return(biotmle) +} + +############################################################################### + +#' TMLE procedure using ATE for Biomarker Identication from Exposure +#' +#' This function performs influence curve-based estimation of the effect of an +#' exposure on biological expression values associated with a given biomarker, +#' controlling for a user-specified set of baseline covariates. +#' +#' @param Y A \code{numeric} vector of expression values for a given biomarker. +#' @param A A \code{numeric} vector of discretized exposure vector (e.g., from +#' a design matrix whose effect on expression values is of interest. +#' @param W A \code{Matrix} of \code{numeric} values corresponding to baseline +#' covariates to be marginalized over in the estimation process. +#' @param g_lib A \code{character} vector identifying the library of learning +#' algorithms to be used in fitting the propensity score P[A = a | W]. +#' @param Q_lib A \code{character} vector identifying the library of learning +#' algorithms to be used in fitting the outcome regression E[Y | A, W]. +#' @param cv_folds A \code{numeric} scalar indicating how many folds to use in +#' performing targeted minimum loss estimation. Cross-validated estimates are +#' more robust, allowing relaxing of theoretical conditions and construction +#' of conservative variance estimates. +#' @param ... Additional arguments passed to \code{\link[drtmle]{drtmle}} in +#' computing the targeted minimum loss estimator of the average treatment +#' effect. +#' +#' @importFrom assertthat assert_that +#' @importFrom drtmle drtmle +#' +#' @return TMLE-based estimate of the relationship between biomarker expression +#' and changes in an exposure variable, computed iteratively and saved in the +#' \code{tmleOut} slot in a \code{biotmle} object. +exp_biomarkertmle <- function(Y, + A, + W, + g_lib, + Q_lib, + cv_folds, + ...) { + # check the case that Y is passed in as a column of a data.frame + if (any(class(Y) == "data.frame")) Y <- as.numeric(unlist(Y[, 1])) + if (any(class(A) == "data.frame")) A <- as.numeric(unlist(A[, 1])) + assertthat::assert_that(length(unique(A)) > 1) + + # fit standard (possibly CV) TML estimator (n.b., guard = NULL) + a_0 <- sort(unique(A[!is.na(A)])) + suppressWarnings( + tmle_fit <- drtmle::drtmle( + Y = Y, + A = A, + W = W, + a_0 = a_0, + SL_g = g_lib, + SL_Q = Q_lib, + cvFolds = cv_folds, + stratify = TRUE, + guard = NULL, + parallel = FALSE, + use_future = FALSE, + ... + ) + ) + + # compute ATE and estimated EIF by delta method + ate_tmle <- tmle_fit$tmle$est[seq_along(a_0)[-1]] - tmle_fit$tmle$est[1] + eif_tmle_delta <- tmle_fit$ic$ic[, seq_along(a_0)[-1]] - tmle_fit$ic$ic[, 1] + + # return only highest contrast (e.g., a[1] v a[5]) if many contrasts + if (!is.vector(eif_tmle_delta)) { + param_out <- ate_tmle[length(ate_tmle)] + eif_out <- eif_tmle_delta[, ncol(eif_tmle_delta)] + + ate_tmle[length(ate_tmle)] + } else { + param_out <- ate_tmle + eif_out <- eif_tmle_delta + ate_tmle + } + assertthat::assert_that(is.vector(eif_out)) + + # output + out <- list(param = param_out, eif = eif_out) + return(out) +}