--- a +++ b/man/Tweedieverse.Rd @@ -0,0 +1,324 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/Tweedieverse.R +\name{Tweedieverse} +\alias{Tweedieverse} +\title{Differential analysis of multi-omics data using Tweedie GLMs} +\usage{ +Tweedieverse( + input_features, + input_metadata = NULL, + output, + assay_name = "counts", + abd_threshold = 0, + prev_threshold = 0.1, + var_threshold = 0, + entropy_threshold = 0, + base_model = "CPLM", + link = "log", + fixed_effects = NULL, + random_effects = NULL, + cutoff_ZSCP = 0.3, + criteria_ZACP = "BIC", + adjust_offset = TRUE, + scale_factor = NULL, + max_significance = 0.05, + correction = "BH", + standardize = TRUE, + cores = 1, + optimizer = "nlminb", + na.action = na.exclude, + plot_heatmap = FALSE, + plot_scatter = FALSE, + heatmap_first_n = 50, + reference = NULL +) +} +\arguments{ +\item{input_features}{A tab-delimited input file or an R data frame of features (can be in rows/columns) +and samples (or cells). Samples are expected to have matching names with \code{input_metadata}. +\code{input_features} can also be an object of class \code{SummarizedExperiment} or \code{SingleCellExperiment} +that contains the expression or abundance matrix and other metadata; the \code{assays} +slot contains the expression or abundance matrix and is named \code{"counts"}. +This matrix should have one row for each feature and one sample for each column. +The \code{colData} slot should contain a data frame with one row per +sample and columns that contain metadata for each sample. +Additional information about the experiment can be contained in the +\code{metadata} slot as a list.} + +\item{input_metadata}{A tab-delimited input file or an R data frame of metadata (rows/columns). +Samples are expected to have matching sample names with \code{input_features}. +This file is ignored when \code{input_features} is a \code{SummarizedExperiment} +or a \code{SingleCellExperiment} object with \code{colData} containing the same information.} + +\item{output}{The output folder to write results.} + +\item{assay_name}{If the input is provided as one of the accepted Bioconductor objects +(e.g., SummarizedExperiment, RangedSummarizedExperiment, SingleCellExperiment, or TreeSummarizedExperiment), +this argument selects the name of the assay slot in the input object that contains the omics measurements.} + +\item{abd_threshold}{If prevalence-abundance filtering is desired, only features that are present (or detected) +in at least \code{prev_threshold} percent of samples at \code{abd_threshold} minimum abundance (read count or proportion) +are retained. Default value for \code{abd_threshold} is \code{0.0}. +To disable prevalence-abundance filtering, set \code{abd_threshold = -Inf}.} + +\item{prev_threshold}{If prevalence-abundance filtering is desired, only features that are present (or detected) +in at least \code{prev_threshold} percent of samples at \code{abd_threshold} minimum abundance (read count or proportion) +are retained. Default value for \code{prev_threshold} is \code{0.1}.} + +\item{var_threshold}{If variance filtering is desired, only features that have variances greater than +\code{var_threshold} are retained. This step is done after the prevalence-abundance filtering. +Default value for \code{var_threshold} is \code{0.0} (i.e. no variance filtering).} + +\item{entropy_threshold}{If entropy-based filtering is desired for metadata, only features that have entropy greater than +\code{entropy_threshold} are retained. Default value for \code{entropy_threshold} is \code{0.0} (i.e. no entropy filtering).} + +\item{base_model}{The per-feature base model. Default is "CPLM". Must be one of "CPLM", "ZICP", "ZSCP", or "ZACP".} + +\item{link}{A specification of the GLM link function. Default is "log". Must be one of "log", "identity", "sqrt", or "inverse".} + +\item{fixed_effects}{Metadata variable(s) describing the fixed effects coefficients.} + +\item{random_effects}{Metadata variable(s) describing the random effects part of the model.} + +\item{cutoff_ZSCP}{For \code{base_model = "ZSCP"}, the cutoff to stratify features for +adaptive ZI modeling based on sparsity (zero-inflation proportion). Default is 0.3. Must be between 0 and 1.} + +\item{criteria_ZACP}{For \code{base_model = "ZACP"}, the criteria to select the +best fitting model per feature. The possible options are 'AIC' and BIC' (default). +More criteria will be supported in a future release.} + +\item{adjust_offset}{If TRUE (default), an offset term will be included as the logarithm of \code{scale_factor}.} + +\item{scale_factor}{Name of the numerical variable containing library size (for non-normalized data) or scale factor +(for normalized data) across samples to be included as an offset in the base model (when \code{adjust_offset = TRUE}). +If not found in metadata, defaults to the sample-wise total sums, unless \code{adjust_offset = FALSE}.} + +\item{max_significance}{The q-value threshold for significance. Default is 0.05.} + +\item{correction}{The correction method for computing the q-value (see \code{\link[stats]{p.adjust}} for options, default is 'BH').} + +\item{standardize}{Should continuous metadata be standardized? Default is TRUE. Bypassed for categorical variables.} + +\item{cores}{An integer that indicates the number of R processes to run in parallel. Default is 1.} + +\item{optimizer}{The optimization routine to be used for estimating the parameters of the Tweedie model. +Possible choices are \code{"nlminb"} (the default, see \code{\link[stats]{nlminb}}), +\code{"bobyqa"} (\code{\link[minqa]{bobyqa}}), and \code{"L-BFGS-B"} (\code{\link[stats]{optim}}). +Ignored for random effects modeling which uses an alternative Template Model Builder (TMB) approach (\code{\link[glmmTMB]{glmmTMB}}).} + +\item{na.action}{How to handle missing values? See \code{\link{na.action}}. Default is \code{\link{na.exclude}}.} + +\item{plot_heatmap}{Logical. If TRUE (default is FALSE), generate a heatmap of the (top \code{heatmap_first_n}) significant associations.} + +\item{plot_scatter}{Logical. If TRUE (default is FALSE), generate scatter/box plots of individual associations.} + +\item{heatmap_first_n}{In heatmap, plot top N features with significant associations (default is 50).} + +\item{reference}{The factor to use as a reference for a variable with more than two levels provided as a string of 'variable,reference' semi-colon delimited for multiple variables (default is NULL).} +} +\value{ +A data frame containing coefficient estimates, p-values, and q-values (multiplicity-adjusted p-values) are returned. +} +\description{ +Fit a per-feature Tweedie generalized linear model to omics features. +} +\examples{ + +\dontrun{ + +############################################################################## +# Example 1 - Differential Abundance Analysis of Synthetic Microbiome Counts # +############################################################################## + +####################################### +# Install and Load Required Libraries # +####################################### + +library(devtools) +devtools::install_github('biobakery/sparseDOSSA@varyLibSize') +library(sparseDOSSA) +library(stringi) + +###################### +# Specify Parameters # +###################### + +n.microbes <- 200 # Number of Features +n.samples <- 100 # Number of Samples +spike.perc <- 0.02 # Percentage of Spiked-in Bugs +spikeStrength<-"20" # Effect Size + +########################### +# Specify Binary Metadata # +########################### + +n.metadata <- 1 +UserMetadata<-as.matrix(rep(c(0,1), each=n.samples/2)) +UserMetadata<-t(UserMetadata) # Transpose + +################################################### +# Spiked-in Metadata (Which Metadata to Spike-in) # +################################################### + +Metadatafrozenidx<-1 +spikeCount<-as.character(length(Metadatafrozenidx)) +significant_metadata<-paste('Metadata', Metadatafrozenidx, sep='') + +############################################# +# Generate SparseDOSSA Synthetic Abundances # +############################################# + +DD<-sparseDOSSA::sparseDOSSA(number_features = n.microbes, +number_samples = n.samples, +UserMetadata=UserMetadata, +Metadatafrozenidx=Metadatafrozenidx, +datasetCount = 1, +spikeCount = spikeCount, +spikeStrength = spikeStrength, +noZeroInflate=TRUE, +percent_spiked=spike.perc, +seed = 1234) + +############################## +# Gather SparseDOSSA Outputs # +############################## + +sparsedossa_results <- as.data.frame(DD$OTU_count) +rownames(sparsedossa_results)<-sparsedossa_results$X1 +sparsedossa_results<-sparsedossa_results[-1,-1] +colnames(sparsedossa_results)<-paste('Sample', 1:ncol(sparsedossa_results), sep='') +data<-as.matrix(sparsedossa_results[-c((n.metadata+1):(2*n.microbes+n.metadata)),]) +data<-data.matrix(data) +class(data) <- "numeric" +truth<-c(unlist(DD$truth)) +truth<-truth[!stri_detect_fixed(truth,":")] +truth<-truth[(5+n.metadata):length(truth)] +truth<-as.data.frame(truth) +significant_features<-truth[seq(1, +(as.numeric(spikeCount)+1)*(n.microbes*spike.perc), (as.numeric(spikeCount)+1)),] +significant_features<-as.vector(significant_features) + +#################### +# Extract Features # +#################### + +features<-as.data.frame(t(data[-c(1:n.metadata),])) + +#################### +# Extract Metadata # +#################### + +metadata<-as.data.frame(data[1,]) +colnames(metadata)<-rownames(data)[1] + +############################### +# Mark True Positive Features # +############################### + +wh.TP = colnames(features) \%in\% significant_features +colnames(features)<-paste("Feature", 1:n.microbes, sep = "") +newname = paste0(colnames(features)[wh.TP], "_TP") +colnames(features)[wh.TP] <- newname; +colnames(features)[grep('TP', colnames(features))] + +#################### +# Run Tweedieverse # +################### + +################### +# Default options # +################### + +CPLM <-Tweedieverse( +features, +metadata, +output = './demo_output/CPLM') # Assuming demo_output exists + +############################################### +# User-defined prevalence-abundance filtering # +############################################### + +ZICP<-Tweedieverse( +features, +metadata, +output = './demo_output/ZICP', # Assuming demo_output exists +base_model = 'ZICP', +abd_threshold = 0.0, +prev_threshold = 0.2) + +#################################### +# User-defined variance filtering # +#################################### + +sds<-apply(features, 2, sd) +var_threshold = median(sds)/2 +ZSCP<-Tweedieverse( +features, +metadata, +output = './demo_output/ZSCP', # Assuming demo_output exists +base_model = 'ZSCP', +var_threshold = var_threshold) + +################## +# Multiple cores # +################## + +ZACP<-Tweedieverse( +features, +metadata, +output = './demo_output/ZACP', # Assuming demo_output exists +base_model = 'ZACP', +cores = 4) + +########################################################################## +# Example 2 - Multivariable Association on HMP2 Longitudinal Microbiomes # +########################################################################## + +###################### +# HMP2 input_features Analysis # +###################### + +############# +# Load input_features # +############# + +library(data.table) +input_features <- fread("https://raw.githubusercontent.com/biobakery/Maaslin2/master/inst/extdata/HMP2_taxonomy.tsv", sep ="\t") +input_metadata <-fread("https://raw.githubusercontent.com/biobakery/Maaslin2/master/inst/extdata/HMP2_metadata.tsv", sep ="\t") + +############### +# Format data # +############### + +library(tibble) +features<- column_to_rownames(input_features, 'ID') +metadata<- column_to_rownames(input_metadata, 'ID') + +############# +# Fit Model # +############# + +library(Tweedieverse) +HMP2 <- Tweedieverse( +features, +metadata, +output = './demo_output/HMP2', # Assuming demo_output exists +fixed_effects = c('diagnosis', 'dysbiosisnonIBD','dysbiosisUC','dysbiosisCD', 'antibiotics', 'age'), +random_effects = c('site', 'subject'), +base_model = 'CPLM', +adjust_offset = FALSE, # No offset as the values are relative abundances +cores = 8, # Make sure your computer has the capability +standardize = FALSE, +reference = c('diagnosis,nonIBD')) + +} +} +\author{ +Himel Mallick, \email{him4004@med.cornell.edu} +} +\keyword{metagenomics,} +\keyword{microbiome,} +\keyword{multiomics,} +\keyword{scRNASeq,} +\keyword{singlecell} +\keyword{tweedie,}