--- a
+++ b/R/Tweedieverse.R
@@ -0,0 +1,1050 @@
+#' Differential analysis of multi-omics data using Tweedie GLMs
+#'
+#' Fit a per-feature Tweedie generalized linear model to omics features.
+#'
+#' @param input_features A tab-delimited input file or an R data frame of features (features can be in rows or columns)
+#' and samples (or cells). Samples are expected to have matching names with \code{input_metadata}.
+#' \code{input_features} can also be an object of class \code{SummarizedExperiment} or \code{SingleCellExperiment}
+#' that contains the expression or abundance matrix and other metadata; the \code{assays}
+#' slot contains the expression or abundance matrix and is named \code{"counts"}.
+#' This matrix should have one row for each feature and one column for each sample.
+#' The \code{colData} slot should contain a data frame with one row per
+#' sample and columns that contain metadata for each sample.
+#' Additional information about the experiment can be contained in the
+#' \code{metadata} slot as a list.
+#' @param input_metadata A tab-delimited input file or an R data frame of metadata (rows/columns).
+#' Samples are expected to have matching sample names with \code{input_features}.
+#' This file is ignored when \code{input_features} is a \code{SummarizedExperiment}
+#' or a \code{SingleCellExperiment} object with \code{colData} containing the same information.
+#' @param output The output folder to write results.
+#' @param assay_name If the input is provided as one of the accepted Bioconductor objects
+#' (e.g., SummarizedExperiment, RangedSummarizedExperiment, SingleCellExperiment, or TreeSummarizedExperiment),
+#' this argument selects the name of the assay slot in the input object that contains the omics measurements.
+#' @param abd_threshold If prevalence-abundance filtering is desired, only features that are present (or detected)
+#' in at least a \code{prev_threshold} proportion of samples at \code{abd_threshold} minimum abundance (read count or proportion)
+#' are retained. Default value for \code{abd_threshold} is \code{0.0}.
+#' To disable prevalence-abundance filtering, set \code{abd_threshold = -Inf}.
+#' @param prev_threshold If prevalence-abundance filtering is desired, only features that are present (or detected)
+#' in at least a \code{prev_threshold} proportion of samples at \code{abd_threshold} minimum abundance (read count or proportion)
+#' are retained. Default value for \code{prev_threshold} is \code{0.1}.
+#' @param var_threshold If variance filtering is desired, only features that have variances greater than
+#' \code{var_threshold} are retained. This step is applied after the prevalence-abundance filtering.
+#' Default value for \code{var_threshold} is \code{0.0} (i.e., no variance filtering).
+#' @param entropy_threshold If entropy-based filtering of metadata is desired, only metadata variables that have entropy greater than
+#' \code{entropy_threshold} are retained. Default value for \code{entropy_threshold} is \code{0.0} (i.e., no entropy filtering).
+#' @param base_model The per-feature base model. Default is "CPLM". Must be one of "CPLM", "ZICP", "ZSCP", or "ZACP".
+#' @param link A specification of the GLM link function. Default is "log". Must be one of "log", "identity", "sqrt", or "inverse".
+#' @param fixed_effects Metadata variable(s) describing the fixed effects coefficients.
+#' @param random_effects Metadata variable(s) describing the random effects part of the model.
+#' @param cutoff_ZSCP For \code{base_model = "ZSCP"}, the cutoff to stratify features for
+#' adaptive ZI modeling based on sparsity (zero-inflation proportion). Default is 0.3. Must be between 0 and 1.
+#' @param criteria_ZACP For \code{base_model = "ZACP"}, the criterion to select the
+#' best-fitting model per feature. The possible options are 'AIC' and 'BIC' (default).
+#' More criteria will be supported in a future release.
+#' @param adjust_offset If TRUE (default), an offset term will be included as the logarithm of \code{scale_factor}.
+#' @param scale_factor Name of the numerical metadata variable containing the library size (for non-normalized data) or scale factor
+#' (for normalized data) across samples to be included as an offset in the base model (when \code{adjust_offset = TRUE}).
+#' If not found in the metadata, it defaults to the sample-wise total sums, unless \code{adjust_offset = FALSE}.
+#' @param max_significance The q-value threshold for significance. Default is 0.05.
+#' @param correction The correction method for computing the q-value (see \code{\link[stats]{p.adjust}} for options; default is 'BH').
+#' @param standardize Should continuous metadata be standardized? Default is TRUE. Bypassed for categorical variables.
+#' @param cores An integer that indicates the number of R processes to run in parallel. Default is 1.
+#' @param optimizer The optimization routine to be used for estimating the parameters of the Tweedie model.
+#' Possible choices are \code{"nlminb"} (the default, see \code{\link[stats]{nlminb}}),
+#' \code{"bobyqa"} (\code{\link[minqa]{bobyqa}}), and \code{"L-BFGS-B"} (\code{\link[stats]{optim}}).
+#' Ignored for random effects modeling, which uses an alternative Template Model Builder (TMB) approach (\code{\link[glmmTMB]{glmmTMB}}).
+#' @param na.action How missing values should be handled. See \code{\link{na.action}}. Default is \code{\link{na.exclude}}.
+#' @param plot_heatmap Logical. If TRUE (default is FALSE), generate a heatmap of the (top \code{heatmap_first_n}) significant associations.
+#' @param plot_scatter Logical. If TRUE (default is FALSE), generate scatter/box plots of individual associations.
+#' @param heatmap_first_n In the heatmap, plot the top N features with significant associations (default is 50).
+#' @param reference For each categorical variable with more than two levels, the factor level to use as the reference,
+#' provided as a 'variable,reference' string (e.g., 'diagnosis,nonIBD'); use semicolons to delimit multiple variables. Default is NULL.
+#'
+#' @importFrom grDevices colorRampPalette dev.off jpeg pdf
+#' @importFrom stats coef fitted as.formula na.exclude p.adjust plogis relevel sd update
+#' @importFrom utils capture.output read.table type.convert write.table
+#' @importFrom dplyr %>% everything
+#' @importFrom parallel clusterExport
+#' @return A data frame containing coefficient estimates, p-values, and q-values (multiplicity-adjusted p-values)
+#' for the significant associations. The full set of per-feature results is also written to
+#' \code{all_results.tsv} in the \code{output} folder.
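+#'
+#' @details
+#' All base models treat each feature as Tweedie-distributed with a power
+#' mean-variance relationship, \eqn{Var(Y) = \phi \mu^{p}}, where an index
+#' parameter \eqn{p} in \eqn{(1, 2)} gives the compound Poisson-gamma (CPLM)
+#' special case that accommodates exact zeros alongside continuous positive
+#' values. The zero-inflated variants (ZICP, ZSCP, ZACP) add or adaptively
+#' select a separate zero-inflation component on top of this base model, as
+#' described for \code{base_model} above.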
+#'
+#' @author Himel Mallick, \email{him4004@@med.cornell.edu}
+#'
+#' @examples
+#'
+#' \dontrun{
+#'
+#' ##############################################################################
+#' # Example 1 - Differential Abundance Analysis of Synthetic Microbiome Counts #
+#' ##############################################################################
+#'
+#' #######################################
+#' # Install and Load Required Libraries #
+#' #######################################
+#'
+#' library(devtools)
+#' devtools::install_github('biobakery/sparseDOSSA@@varyLibSize')
+#' library(sparseDOSSA)
+#' library(stringi)
+#'
+#' ######################
+#' # Specify Parameters #
+#' ######################
+#'
+#' n.microbes <- 200 # Number of Features
+#' n.samples <- 100 # Number of Samples
+#' spike.perc <- 0.02 # Percentage of Spiked-in Bugs
+#' spikeStrength<-"20" # Effect Size
+#'
+#' ###########################
+#' # Specify Binary Metadata #
+#' ###########################
+#'
+#' n.metadata <- 1
+#' UserMetadata<-as.matrix(rep(c(0,1), each=n.samples/2))
+#' UserMetadata<-t(UserMetadata) # Transpose
+#'
+#' ###################################################
+#' # Spiked-in Metadata (Which Metadata to Spike-in) #
+#' ###################################################
+#'
+#' Metadatafrozenidx<-1
+#' spikeCount<-as.character(length(Metadatafrozenidx))
+#' significant_metadata<-paste('Metadata', Metadatafrozenidx, sep='')
+#'
+#' #############################################
+#' # Generate SparseDOSSA Synthetic Abundances #
+#' #############################################
+#'
+#' DD<-sparseDOSSA::sparseDOSSA(number_features = n.microbes,
+#' number_samples = n.samples,
+#' UserMetadata=UserMetadata,
+#' Metadatafrozenidx=Metadatafrozenidx,
+#' datasetCount = 1,
+#' spikeCount = spikeCount,
+#' spikeStrength = spikeStrength,
+#' noZeroInflate=TRUE,
+#' percent_spiked=spike.perc,
+#' seed = 1234)
+#'
+#' ##############################
+#' # Gather SparseDOSSA Outputs #
+#' ##############################
+#'
+#' sparsedossa_results <- as.data.frame(DD$OTU_count)
+#' rownames(sparsedossa_results)<-sparsedossa_results$X1
+#' sparsedossa_results<-sparsedossa_results[-1,-1]
+#' colnames(sparsedossa_results)<-paste('Sample', 1:ncol(sparsedossa_results), sep='')
+#' data<-as.matrix(sparsedossa_results[-c((n.metadata+1):(2*n.microbes+n.metadata)),])
+#' data<-data.matrix(data)
+#' class(data) <- "numeric"
+#' truth<-c(unlist(DD$truth))
+#' truth<-truth[!stri_detect_fixed(truth,":")]
+#' truth<-truth[(5+n.metadata):length(truth)]
+#' truth<-as.data.frame(truth)
+#' significant_features<-truth[seq(1,
+#' (as.numeric(spikeCount)+1)*(n.microbes*spike.perc), (as.numeric(spikeCount)+1)),]
+#' significant_features<-as.vector(significant_features)
+#'
+#' ####################
+#' # Extract Features #
+#' ####################
+#'
+#' features<-as.data.frame(t(data[-c(1:n.metadata),]))
+#'
+#' ####################
+#' # Extract Metadata #
+#' ####################
+#'
+#' metadata<-as.data.frame(data[1,])
+#' colnames(metadata)<-rownames(data)[1]
+#'
+#' ###############################
+#' # Mark True Positive Features #
+#' ###############################
+#'
+#' wh.TP = colnames(features) %in% significant_features
+#' colnames(features)<-paste("Feature", 1:n.microbes, sep = "")
+#' newname = paste0(colnames(features)[wh.TP], "_TP")
+#' colnames(features)[wh.TP] <- newname;
+#' colnames(features)[grep('TP', colnames(features))]
+#'
+#' ####################
+#' # Run Tweedieverse #
+#' ####################
+#'
+#' ###################
+#' # Default options #
+#' ###################
+#'
+#' CPLM <- Tweedieverse(
+#' features,
+#' metadata,
+#' output = './demo_output/CPLM') # Assuming demo_output exists
+#'
+#' ###############################################
+#' # User-defined prevalence-abundance filtering #
+#' ###############################################
+#'
+#' ZICP <- Tweedieverse(
+#' features,
+#' metadata,
+#' output = './demo_output/ZICP', # Assuming demo_output exists
+#' base_model = 'ZICP',
+#' abd_threshold = 0.0,
+#' prev_threshold = 0.2)
+#'
+#' ###################################
+#' # User-defined variance filtering #
+#' ###################################
+#'
+#' sds<-apply(features, 2, sd)
+#' var_threshold = median(sds)/2
+#' ZSCP <- Tweedieverse(
+#' features,
+#' metadata,
+#' output = './demo_output/ZSCP', # Assuming demo_output exists
+#' base_model = 'ZSCP',
+#' var_threshold = var_threshold)
+#'
+#' ##################
+#' # Multiple cores #
+#' ##################
+#'
+#' ZACP <- Tweedieverse(
+#' features,
+#' metadata,
+#' output = './demo_output/ZACP', # Assuming demo_output exists
+#' base_model = 'ZACP',
+#' cores = 4)
+#'
+#' ##########################################################################
+#' # Example 2 - Multivariable Association on HMP2 Longitudinal Microbiomes #
+#' ##########################################################################
+#'
+#' ######################
+#' # HMP2 Data Analysis #
+#' ######################
+#'
+#' #############
+#' # Load Data #
+#' #############
+#'
+#' library(data.table)
+#' input_features <- fread("https://raw.githubusercontent.com/biobakery/Maaslin2/master/inst/extdata/HMP2_taxonomy.tsv", sep ="\t")
+#' input_metadata <- fread("https://raw.githubusercontent.com/biobakery/Maaslin2/master/inst/extdata/HMP2_metadata.tsv", sep ="\t")
+#'
+#' ###############
+#' # Format data #
+#' ###############
+#'
+#' library(tibble)
+#' features <- column_to_rownames(input_features, 'ID')
+#' metadata <- column_to_rownames(input_metadata, 'ID')
+#'
+#' #############
+#' # Fit Model #
+#' #############
+#'
+#' library(Tweedieverse)
+#' HMP2 <- Tweedieverse(
+#' features,
+#' metadata,
+#' output = './demo_output/HMP2', # Assuming demo_output exists
+#' fixed_effects = c('diagnosis', 'dysbiosisnonIBD', 'dysbiosisUC', 'dysbiosisCD', 'antibiotics', 'age'),
+#' random_effects = c('site', 'subject'),
+#' base_model = 'CPLM',
+#' adjust_offset = FALSE, # No offset as the values are relative abundances
+#' cores = 8, # Make sure your computer has the capability
+#' standardize = FALSE,
+#' reference = c('diagnosis,nonIBD'))
+#'
+#' }
+#' @keywords microbiome, metagenomics, multiomics, scRNASeq, tweedie, singlecell
+#' @export
+Tweedieverse <- function(input_features,
+                         input_metadata = NULL,
+                         output,
+                         assay_name = "counts",
+                         abd_threshold = 0.0,
+                         prev_threshold = 0.1,
+                         var_threshold = 0.0,
+                         entropy_threshold = 0.0,
+                         base_model = "CPLM",
+                         link = "log",
+                         fixed_effects = NULL,
+                         random_effects = NULL,
+                         cutoff_ZSCP = 0.3,
+                         criteria_ZACP = "BIC",
+                         adjust_offset = TRUE,
+                         scale_factor = NULL,
+                         max_significance = 0.05,
+                         correction = "BH",
+                         standardize = TRUE,
+                         cores = 1,
+                         optimizer = "nlminb",
+                         na.action = na.exclude,
+                         plot_heatmap = FALSE,
+                         plot_scatter = FALSE,
+                         heatmap_first_n = 50,
+                         reference = NULL) {
+
+    #################################
+    # Specify all available options #
+    #################################
+
+    model_choices <- c("CPLM", "ZICP", "ZACP", "ZSCP")
+    link_choices <- c("log", "identity", "sqrt", "inverse")
"identity", "sqrt", "inverse") + criteria_ZACP_choices <- c("AIC", "BIC") + correction_choices <- + c("BH", "holm", "hochberg", "hommel", "bonferroni", "BY") + optimizer_choices <- c("nlminb", "bobyqa", "L-BFGS-B") + + ####################################################################### + #=====================================================================# + # Read in the data and metadata, create output folder, initialize log # + #=====================================================================# + ####################################################################### + + ######################################### + # Support multiple Bioconductor classes # + ######################################### + + valid_classes <- c("SummarizedExperiment", "RangedSummarizedExperiment", + "SingleCellExperiment", "TreeSummarizedExperiment") + + ############################################################## + # Extract features and metadata based on user-provided input # + ############################################################## + + input_class<-class(input_features) + if (input_class %in% valid_classes) { + data <- extractAssay(input_features, assay_name) + if(is.null(input_metadata)) { + metadata <- data.frame(colData(input_features)) + } else{ + metadata<-input_metadata + } + } else if (!(is.character(input_features)) & !(is.data.frame(input_features))) { + stop(cat(paste('Input data of class <', class(input_features), '> not supported. Please use SummarizedExperiment, SingleCellExperiment, RangedSummarizedExperiment, TreeSummarizedExperiment, or data.frame'))) + } else{ + + # if a character string then this is a file name, else it + # is a data frame + if (is.character(input_features)) { + data <- + data.frame( + read.delim::fread(input_features, header = TRUE, sep = '\t'), + header = TRUE, + fill = T, + comment.char = "" , + check.names = F, + row.names = 1 + ) + if (nrow(input_features) == 1) { + # read again to get row name + data <- read.delim( + input_features, + header = TRUE, + fill = T, + comment.char = "" , + check.names = F, + row.names = 1 + ) + } + } else { + data <- input_features + } + if (is.character(input_metadata)) { + input_metadata <- + data.frame( + read.delim::fread(input_metadata, header = TRUE, sep = '\t'), + header = TRUE, + fill = T, + comment.char = "" , + check.names = F, + row.names = 1 + ) + if (nrow(input_metadata) == 1) { + input_metadata <- read.delim( + input_metadata, + header = TRUE, + fill = T, + comment.char = "" , + check.names = F, + row.names = 1 + ) + } + } else { + metadata <- input_metadata + } + } + + # create an output folder and figures folder if it does not exist + if (!file.exists(output)) { + print("Creating output folder") + dir.create(output) + } + + #if (plot_heatmap || plot_scatter) { + figures_folder <- file.path(output, "figures") + if (!file.exists(figures_folder)) { + print("Creating output figures folder") + dir.create(figures_folder) + } + #} + + # Create log file (write info to stdout and debug level to log file) + # Set level to finest so all log levels are reviewed + log_file <- file.path(output, "Tweedieverse.log") + # Remove log file if already exists (to avoid append) + if (file.exists(log_file)) { + print(paste("Warning: Deleting existing log file:", log_file)) + unlink(log_file) + } + logging::basicConfig(level = 'FINEST') + logging::addHandler(logging::writeToFile, + file = log_file, level = "DEBUG") + logging::setLevel(20, logging::getHandler('basic.stdout')) + + ##################### + # Log the arguments # + 
+
+    logging::loginfo("Writing function arguments to log file")
+    logging::logdebug("Function arguments")
+    if (is.character(input_features)) {
+        logging::logdebug("Input data file: %s", input_features)
+    }
+    if (is.character(input_metadata)) {
+        logging::logdebug("Input metadata file: %s", input_metadata)
+    }
+    logging::logdebug("Output folder: %s", output)
+    logging::logdebug("Abundance threshold: %f", abd_threshold)
+    logging::logdebug("Prevalence threshold: %f", prev_threshold)
+    logging::logdebug("Variance threshold: %f", var_threshold)
+    logging::logdebug("Base model: %s", base_model)
+    logging::logdebug("Link function: %s", link)
+    logging::logdebug("Fixed effects: %s", fixed_effects)
+    logging::logdebug("Random effects: %s", random_effects)
+    logging::logdebug("ZSCP cutoff: %f", cutoff_ZSCP)
+    logging::logdebug("ZACP criteria: %s", criteria_ZACP)
+    logging::logdebug("Offset adjustment: %s", adjust_offset)
+    logging::logdebug("Scale factor: %s", scale_factor)
+    logging::logdebug("Max significance: %f", max_significance)
+    logging::logdebug("Correction method: %s", correction)
+    logging::logdebug("Standardize: %s", standardize)
+    logging::logdebug("Cores: %d", cores)
+    logging::logdebug("Optimization routine: %s", optimizer)
+
+    #######################################
+    # Check if valid options are selected #
+    #######################################
+
+    # Check if the selected link is valid
+    if (!link %in% link_choices) {
+        option_not_valid_error("Please select a link from the list of available options",
+                               toString(link_choices))
+    }
+
+    # Check if the selected base_model is valid
+    if (!base_model %in% model_choices) {
+        option_not_valid_error(
+            paste(
+                "Please select an analysis method",
+                "from the list of available options"
+            ),
+            toString(model_choices)
+        )
+    }
+
+    # Check if the selected criteria_ZACP is valid
+    if (!criteria_ZACP %in% criteria_ZACP_choices) {
+        option_not_valid_error(
+            paste(
+                "Please select a criterion",
+                "from the list of available options"
+            ),
+            toString(criteria_ZACP_choices)
+        )
+    }
+
+    # Check if the selected correction is valid
+    if (!correction %in% correction_choices) {
+        option_not_valid_error(
+            paste(
+                "Please select a correction method",
+                "from the list of available options"
+            ),
+            toString(correction_choices)
+        )
+    }
+
+    # Check if the selected optimizer is valid
+    if (!optimizer %in% optimizer_choices) {
+        option_not_valid_error(
+            paste(
+                "Please select an optimizer",
+                "from the list of available options"
+            ),
+            toString(optimizer_choices)
+        )
+    }
+
+    ############################################################
+    # Check if the selected numerical options are within range #
+    ############################################################
+
+    prop_options <- c(prev_threshold, cutoff_ZSCP, max_significance)
+    if (any(prop_options < 0) || any(prop_options > 1)) {
+        stop(
+            paste(
+                "One of the following is outside [0, 1]:",
+                "prev_threshold, cutoff_ZSCP, max_significance"
+            )
+        )
+    }
+
+    ###############################################################
+    # Determine orientation of data in input and reorder to match #
+    ###############################################################
+
+    logging::loginfo("Determining format of input files")
+    samples_row_row <- intersect(rownames(data), rownames(metadata))
+    if (length(samples_row_row) > 0) {
+        # this is the expected formatting so do not modify data frames
+        logging::loginfo(paste(
+            "Input format is data samples",
+            "as rows and metadata samples as rows"
+        ))
+    } else {
+        samples_column_row <- intersect(colnames(data), rownames(metadata))
+        if (length(samples_column_row) > 0) {
+            logging::loginfo(paste(
+                "Input format is data samples",
+                "as columns and metadata samples as rows"
+            ))
+            # transpose data frame so samples are rows
+            data <- type.convert(as.data.frame(t(data)), as.is = TRUE)
+            logging::logdebug("Transposed data so samples are rows")
+        } else {
+            samples_column_column <-
+                intersect(colnames(data), colnames(metadata))
+            if (length(samples_column_column) > 0) {
+                logging::loginfo(
+                    paste(
+                        "Input format is data samples",
+                        "as columns and metadata samples as columns"
+                    )
+                )
+                data <- type.convert(as.data.frame(t(data)), as.is = TRUE)
+                metadata <- type.convert(as.data.frame(t(metadata)), as.is = TRUE)
+                logging::logdebug("Transposed data and metadata so samples are rows")
+            } else {
+                samples_row_column <-
+                    intersect(rownames(data), colnames(metadata))
+                if (length(samples_row_column) > 0) {
+                    logging::loginfo(
+                        paste(
+                            "Input format is data samples",
+                            "as rows and metadata samples as columns"
+                        )
+                    )
+                    metadata <- type.convert(as.data.frame(t(metadata)), as.is = TRUE)
+                    logging::logdebug("Transposed metadata so samples are rows")
+                } else {
+                    logging::logerror(
+                        paste(
+                            "Unable to find samples in data and",
+                            "metadata files.",
+                            "Rows/columns do not match."
+                        )
+                    )
+                    logging::logdebug("input_features rows: %s",
+                                      paste(rownames(data), collapse = ","))
+                    logging::logdebug("input_features columns: %s",
+                                      paste(colnames(data), collapse = ","))
+                    logging::logdebug("Metadata rows: %s",
+                                      paste(rownames(metadata), collapse = ","))
+                    logging::logdebug("Metadata columns: %s",
+                                      paste(colnames(metadata), collapse = ","))
+                    stop("Unable to find matching samples in data and metadata.")
+                }
+            }
+        }
+    }
+
+    # Replace unexpected characters in feature names
+    # colnames(data) <- make.names(colnames(data))
+
+    # Check for samples without metadata
+    extra_feature_samples <-
+        setdiff(rownames(data), rownames(metadata))
+    if (length(extra_feature_samples) > 0)
+        logging::logdebug(
+            paste(
+                "The following samples were found",
+                "to have features but no metadata.",
+                "They will be removed. %s"
+            ),
+            paste(extra_feature_samples, collapse = ",")
+        )
+
+    # Check for metadata samples without features
+    extra_metadata_samples <-
+        setdiff(rownames(metadata), rownames(data))
+    if (length(extra_metadata_samples) > 0)
+        logging::logdebug(
+            paste(
+                "The following samples were found",
+                "to have metadata but no features.",
+                "They will be removed. %s"
+            ),
+            paste(extra_metadata_samples, collapse = ",")
+        )
%s" + ), + paste(extra_metadata_samples, collapse = ",") + ) + + # Get a set of the samples with both metadata and features + intersect_samples <- intersect(rownames(data), rownames(metadata)) + logging::logdebug( + "A total of %s samples were found in both the data and metadata", + length(intersect_samples) + ) + + # Now order both data and metadata with the same sample ordering + logging::logdebug("Reordering data/metadata to use same sample ordering") + data <- data[intersect_samples, , drop = FALSE] + metadata <- metadata[intersect_samples, , drop = FALSE] + + + ######################################################################## + # Assign reference values to categorical metadata (fixed effects only) # + ######################################################################## + + if (is.null(reference)) { + reference <- "," + } + split_reference <- unlist(strsplit(reference, "[,;]")) + + # for each fixed effect, check that a reference level has been set if necessary: number of levels > 2 and metadata isn't already an ordered factor + for (i in fixed_effects) { + # don't check for or require reference levels for numeric metadata + if (is.numeric(metadata[,i])) { + next + } + # respect ordering if a factor is explicitly passed in with no reference set + if (is.factor(metadata[,i]) && !(i %in% split_reference)) { + logging::loginfo(paste("Factor detected for categorial metadata '", + i, "'. Provide a reference argument or manually set factor ordering to change reference level.", sep="")) + next + } + + # set metadata as a factor (ordered alphabetically) + metadata[,i] <- as.factor(metadata[,i]) + mlevels <- levels(metadata[,i]) + + # get reference level for variable being considered, returns NA if not found + ref <- split_reference[match(i, split_reference)+1] + + # if metadata has 2 levels, allow but don't require setting reference level, otherwise require it + if ((length(mlevels) == 2)) { + if(!is.na(ref)) { + metadata[,i] = relevel(metadata[,i], ref = ref) + } + } else if (length(mlevels) > 2) { + if (!is.na(ref)) { + metadata[,i] = relevel(metadata[,i], ref = ref) + } else { + stop(paste("Please provide the reference for the variable '", + i, "' which includes more than 2 levels: ", + paste(as.character(mlevels), collapse=", "), ".", sep="")) + } + } else { + stop("Provided categorical metadata has fewer than 2 unique, non-NA values.") + } + } + + + ######################################################### + # Non-specific filtering based on user-provided options # + ######################################################### + + unfiltered_data <- data + unfiltered_metadata <- metadata + + # require at least total samples * min prevalence values + # for each feature to be greater than min abundance + logging::loginfo("Filter data based on min abundance and min prevalence") + total_samples <- nrow(unfiltered_data) + logging::loginfo("Total samples in data: %d", total_samples) + min_samples <- total_samples * prev_threshold + logging::loginfo( + paste( + "Min samples required with min abundance", + "for a feature not to be filtered: %f" + ), + min_samples + ) + + # Filter by abundance using zero as value for NAs + data_zeros <- unfiltered_data + data_zeros[is.na(data_zeros)] <- 0 + filtered_data <- + unfiltered_data[, + colSums(data_zeros > abd_threshold) > min_samples, + drop = FALSE] + total_filtered_features <- + ncol(unfiltered_data) - ncol(filtered_data) + logging::loginfo( + "Total filtered features with prevalence-abundance filtering: %d", + total_filtered_features + ) + 
+    filtered_feature_names <-
+        setdiff(names(unfiltered_data), names(filtered_data))
+    logging::loginfo("Filtered feature names: %s",
+                     toString(filtered_feature_names))
+
+    #################################
+    # Filter data based on variance #
+    #################################
+
+    sds <- apply(filtered_data, 2, sd, na.rm = TRUE)
+    final_features <-
+        filtered_data[, which(sds > var_threshold), drop = FALSE]
+    total_filtered_features_var <-
+        ncol(filtered_data) - ncol(final_features)
+    logging::loginfo("Total filtered features with variance filtering: %d",
+                     total_filtered_features_var)
+    filtered_feature_names_var <-
+        setdiff(names(filtered_data), names(final_features))
+    logging::loginfo("Filtered feature names: %s",
+                     toString(filtered_feature_names_var))
+
+    ########################################################################
+    # Set the scale factor to rowsum if not provided and create a modified #
+    # metadata table without the scale factor variable (if present)        #
+    ########################################################################
+
+    if (is.null(scale_factor)) {
+        offset <- rowSums(final_features)
+    } else {
+        if (!scale_factor %in% colnames(unfiltered_metadata)) {
+            stop(
+                paste(
+                    "The specified scale_factor variable is not present in the metadata table:\n",
+                    scale_factor
+                )
+            )
+        } else {
+            offset <- unfiltered_metadata[, scale_factor]
+            unfiltered_metadata <-
+                dplyr::select(unfiltered_metadata, -dplyr::all_of(scale_factor))
+        }
+    }
+
+    ####################################
+    # Filter metadata based on entropy #
+    ####################################
+
+    # Reduce metadata to only include variables that pass the entropy threshold
+    temp_filtered_metadata <- unfiltered_metadata[, apply(unfiltered_metadata, 2, entropy) > entropy_threshold, drop = F]
+    excluded_metadata <- setdiff(colnames(unfiltered_metadata), colnames(temp_filtered_metadata))
+    logging::loginfo(
+        paste(
+            "Excluded metadata with",
+            "entropy less than or equal to %s: %s"
+        ),
+        entropy_threshold, paste(excluded_metadata, collapse = ",")
+    )
+    filtered_metadata <- temp_filtered_metadata
+
+    ###############################################
+    # Compute the formula based on the user input #
+    ###############################################
+
+    #####################
+    # Determine formula #
+    #####################
+
+    random_effects_formula <- NULL
+    # Use all metadata if no fixed effects are provided
+    if (is.null(fixed_effects)) {
+        fixed_effects <- colnames(filtered_metadata)
+    } else {
+        fixed_effects <- unlist(strsplit(fixed_effects, ",", fixed = TRUE))
+        # remove any fixed effects not found in the metadata names
+        to_remove <- setdiff(fixed_effects, colnames(filtered_metadata))
+        if (length(to_remove) > 0)
+            logging::logwarn(
                paste(
+                    "Variable name not found in metadata",
+                    "so not applied to formula as fixed effect: %s"
+                ),
+                paste(to_remove, collapse = " , ")
+            )
+        fixed_effects <- setdiff(fixed_effects, to_remove)
+        if (length(fixed_effects) == 0) {
+            logging::logerror("No fixed effects included in formula.")
+            stop("No fixed effects included in formula.")
+        }
+    }
+
+    if (!is.null(random_effects)) {
+        random_effects <-
+            unlist(strsplit(random_effects, ",", fixed = TRUE))
+        # subtract random effects from fixed effects
+        fixed_effects <- setdiff(fixed_effects, random_effects)
+        # remove any random effects not found in the metadata
+        to_remove <-
+            setdiff(random_effects, colnames(filtered_metadata))
+        if (length(to_remove) > 0)
+            logging::logwarn(
+                paste(
+                    "Variable name not found in metadata",
+                    "so not applied to formula as random effect: %s"
+                ),
+                paste(to_remove, collapse = " , ")
+            )
+        random_effects <- setdiff(random_effects, to_remove)
+
+        # create the formula
+        if (length(random_effects) > 0) {
+            random_effects_formula_text <-
+                paste("expr ~ (1 | ",
+                      paste(
+                          random_effects,
+                          ")",
+                          sep = '',
+                          collapse = " + (1 | "
+                      ),
+                      sep = '')
+            logging::loginfo("Formula for random effects: %s",
+                             random_effects_formula_text)
+            random_effects_formula <-
+                tryCatch(
+                    as.formula(random_effects_formula_text),
+                    error = function(e)
+                        stop(
+                            paste(
+                                "Invalid formula for random effects: ",
+                                random_effects_formula_text
+                            )
+                        )
+                )
+        }
+    }
+
+    # Reduce metadata to only include fixed/random effects in the formula
+    effects_names <- union(fixed_effects, random_effects)
+    filtered_metadata <-
+        filtered_metadata[, effects_names, drop = FALSE]
+
+    # Create the fixed effects formula text
+    formula_text <-
+        paste("expr ~ ", paste(fixed_effects, collapse = " + "))
+    logging::loginfo("Formula for fixed effects: %s", formula_text)
+    formula <-
+        tryCatch(
+            as.formula(formula_text),
+            error = function(e)
+                stop(
+                    paste(
+                        "Invalid formula.",
+                        "Please provide a different formula: ",
+                        formula_text
+                    )
+                )
+        )
+
+    #############################################################
+    # Standardize metadata (except the offset variable), if set #
+    #############################################################
+
+    if (standardize) {
+        logging::loginfo("Applying z-score to standardize continuous metadata")
+        filtered_metadata <-
+            filtered_metadata %>% dplyr::mutate_if(is.numeric, scale)
+    } else {
+        logging::loginfo("Bypass z-score application to metadata")
+    }
+
+    ##################################
+    # Merge metadata and offset back #
+    ##################################
+
+    final_metadata <- as.data.frame(cbind.data.frame(filtered_metadata, offset))
+
+    ##############################################################
+    # Apply the base model to the filtered data with user inputs #
+    ##############################################################
+
+    logging::loginfo("Running selected analysis method: %s", base_model)
+
+    fit_data <- fit.Tweedieverse(
+        features = final_features,
+        metadata = final_metadata,
+        base_model = base_model,
+        link = link,
+        formula = formula,
+        random_effects_formula = random_effects_formula,
+        cutoff_ZSCP = cutoff_ZSCP,
+        criteria_ZACP = criteria_ZACP,
+        adjust_offset = adjust_offset,
+        correction = correction,
+        cores = cores,
+        optimizer = optimizer,
+        na.action = na.action
+    )
+
+    ###################################################
+    # Count the N and zero-inflation for each feature #
+    ###################################################
+
+    logging::loginfo("Counting prevalence for each feature")
+    try(fit_data$results$N <-
+            apply(
+                fit_data$results,
+                1,
+                FUN = function(x)
+                    length(final_features[, x[1]])
+            ))
+    try(fit_data$results$N.not.zero <-
+            apply(
+                fit_data$results,
+                1,
+                FUN = function(x)
+                    length(which(final_features[, x[1]] > 0))
+            ))
+    try(fit_data$results$percent.zero <-
+            apply(
+                fit_data$results,
+                1,
+                FUN = function(x)
+                    round(mean(final_features[, x[1]] == 0, na.rm = TRUE), 2) *
+                    100
+            ))
+
+    #########################
+    # Write out the results #
+    #########################
+
+    results_file <- file.path(output, "all_results.tsv")
+    logging::loginfo("Writing all results to file (ordered by increasing q-values): %s",
+                     results_file)
+    ordered_results <-
+        fit_data$results[order(fit_data$results$qval), ]
+    ordered_results <-
+        ordered_results[!is.na(ordered_results$qval), ] # Remove NAs
+    ordered_results <-
+        dplyr::select(
+            ordered_results,
+            c(
+                'feature',
+                'metadata',
+                'value',
+                "coef",
+                "stderr",
+                "pval",
+                "qval"
+            ),
+            everything()
+        )
+    write.table(
+        ordered_results,
+        file = results_file,
+        sep = "\t",
+        quote = FALSE,
+        row.names = FALSE
+    )
+
+    # Write results passing the threshold to file
+    # (removing any that are NA for the q-value)
+    significant_results <-
+        ordered_results[ordered_results$qval <= max_significance, ]
+    significant_results_file <-
+        file.path(output, "significant_results.tsv")
+    logging::loginfo(
+        paste(
+            "Writing the significant results",
+            "(those which are less than or equal to the threshold",
+            "of %f ) to file (ordered by increasing q-values): %s"
+        ),
+        max_significance,
+        significant_results_file
+    )
+    write.table(
+        significant_results,
+        file = significant_results_file,
+        sep = "\t",
+        quote = FALSE,
+        row.names = FALSE
+    )
+
+    #######################################################
+    # Create visualizations for results passing threshold #
+    #######################################################
+
+    logging::loginfo("Writing Tweedie index plot to figures folder: %s",
+                     figures_folder)
+    tryCatch({
+        tweedie_index_plot(ordered_results, figures_folder)
+    }, error = function(err) {
+        logging::logerror("Unable to make a Tweedie index plot of the results!")
+        logging::logerror(err)
+        # dev.off()
+    })
+
+    if (plot_heatmap && length(unique(significant_results$metadata)) > 1) {
+        heatmap_file <- file.path(output, "Tweedieverse_Heatmap.pdf")
+        logging::loginfo("Writing heatmap of significant results to file: %s",
+                         heatmap_file)
+        tryCatch({
+            save_heatmap(significant_results_file,
+                         heatmap_file,
+                         figures_folder,
+                         first_n = heatmap_first_n)
+        }, error = function(err) {
+            logging::logerror("Unable to make a heatmap of the results!")
+            logging::logerror(err)
+            # dev.off()
+        })
+    }
+
+    if (plot_scatter) {
+        logging::loginfo(
+            paste(
+                "Writing association plots",
+                "(one for each significant association)",
+                "to output folder: %s"
+            ),
+            output
+        )
+        association_plots(
+            metadata,
+            final_features / offset,
+            significant_results_file,
+            output,
+            figures_folder
+        )
+    }
+
+    return(significant_results)
+}
+
+option_not_valid_error <- function(message, valid_options) {
+    logging::logerror(paste(message, ": %s"), toString(valid_options))
+    stop("Option not valid", call. = FALSE)
+}
+
+## Quiets concerns of R CMD check
+utils::globalVariables(c("name"))
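+
+## ---------------------------------------------------------------------------
+## Minimal usage sketch (not run). `features` and `metadata` are hypothetical
+## placeholders: a samples-by-features table and a samples-by-variables table
+## with matching sample names (see the roxygen examples above for how to
+## construct them); 'Metadata1' stands in for a column of `metadata`.
+## ---------------------------------------------------------------------------
+# fit <- Tweedieverse(
+#     input_features = features,
+#     input_metadata = metadata,
+#     output = file.path(tempdir(), "Tweedieverse_demo"),
+#     fixed_effects = "Metadata1",
+#     base_model = "CPLM")           # default compound Poisson base model
+# head(fit)                          # significant associations ordered by q-value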