--- a +++ b/R/eif_moderated.R @@ -0,0 +1,102 @@ +#' Moderated Statistical Tests for Influence Functions +#' +#' Performs variance shrinkage via application of an empirical Bayes procedure +#' (of LIMMA) on the observed data after a transformation moving the data to +#' influence function space, based on the average treatment effect parameter. +#' +#' @param biotmle \code{biotmle} object as generated by \code{biomarkertmle} +#' @param adjust the multiple testing correction to be applied to p-values that +#' are generated from the moderated tests. The recommended (default) method +#' is that of Benjamini and Hochberg. See \link[limma]{topTable} for a list of +#' appropriate methods. +#' @param pval_type The reference distribution to be used for computing the +#' p-value. Those based on the normal approximation tend to provide misleading +#' inference when working with moderately sized (finite) samples. Use of the +#' logistic distribution has been found to empirically improve performance in +#' settings where multiple hypothesis testing is a concern. +#' @param ... Other arguments passed to \code{\link[limma]{topTable}}. +#' +#' @importFrom methods is +#' @importFrom dplyr "%>%" mutate select +#' @importFrom stats plogis p.adjust +#' @importFrom limma lmFit eBayes topTable +#' @importFrom tibble as_tibble rownames_to_column +#' @importFrom assertthat assert_that +#' +#' @return \code{biotmle} object containing the results of applying both +#' \code{\link[limma]{lmFit}} and \code{\link[limma]{topTable}}. +#' +#' @export modtest_ic +#' +#' @examples +#' library(dplyr) +#' library(biotmleData) +#' library(SuperLearner) +#' library(SummarizedExperiment) +#' data(illuminaData) +#' +#' colData(illuminaData) <- colData(illuminaData) %>% +#' data.frame() %>% +#' dplyr::mutate(age = as.numeric(age > median(age))) %>% +#' DataFrame() +#' benz_idx <- which(names(colData(illuminaData)) %in% "benzene") +#' +#' biomarkerTMLEout <- biomarkertmle( +#' se = illuminaData[1:2, ], +#' varInt = benz_idx, +#' bppar_type = BiocParallel::SerialParam(), +#' g_lib = c("SL.mean", "SL.glm"), +#' Q_lib = c("SL.mean", "SL.glm") +#' ) +#' +#' limmaTMLEout <- modtest_ic(biotmle = biomarkerTMLEout) +modtest_ic <- function(biotmle, + adjust = "BH", + pval_type = c("normal", "logistic"), + ...) { + # check for input type and set argument defaults + assertthat::assert_that(is(biotmle, "bioTMLE")) + pval_type <- match.arg(pval_type) + + # extract influence function estimates and number of observations + biomarkertmle_out <- as.matrix(biotmle@tmleOut) + n_obs <- nrow(colData(biotmle)) + + # build design matrix for forcing limma to only perform shrinkage + fit <- limma::lmFit( + object = biomarkertmle_out, + design = rep(1, n_obs) + ) + fit <- limma::eBayes(fit = fit) + + # compute inference + tt <- limma::topTable( + fit = fit, + coef = 1, + number = Inf, + adjust.method = adjust, + sort.by = "none", + ... + ) %>% + tibble::as_tibble(.name_repair = "minimal") %>% + dplyr::mutate( + # remove log-fold change and overwrite average expression with ATE + ID = biotmle@NAMES, + logFC = NULL, + AveExpr = biotmle@ateOut, + var_bayes = fit$s2.post / n_obs, + ) %>% + dplyr::select(ID, AveExpr, t, P.Value, adj.P.Val, B, var_bayes) + + # use logistic distribution as reference for p-values by default + if (pval_type == "logistic") { + p_val <- 2 * stats::plogis(-abs(tt$t), location = 0, scale = sqrt(3) / pi) + p_val_adj <- stats::p.adjust(p_val, method = adjust) + tt$P.Value <- p_val + tt$adj.P.Val <- p_val_adj + } + + # store in slot of output object + biotmle@topTable <- tt + return(biotmle) +}