--- a
+++ b/R/Tweedieverse.R
@@ -0,0 +1,1050 @@
+#' Differential analysis of multi-omics data using Tweedie GLMs
+#'
+#' Fit a per-feature Tweedie generalized linear model to omics features.
+#'
+#' @param input_features A tab-delimited input file or an R data frame of features (features can be in rows or columns)
+#' and samples (or cells). Samples are expected to have matching names with \code{input_metadata}.
+#' \code{input_features} can also be an object of class \code{SummarizedExperiment} or \code{SingleCellExperiment}
+#' that contains the expression or abundance matrix and other metadata; the \code{assays}
+#' slot contains the expression or abundance matrix and is named \code{"counts"}.
+#' This matrix should have one row for each feature and one column for each sample.
+#' The \code{colData} slot should contain a data frame with one row per
+#' sample and columns that contain metadata for each sample.
+#' Additional information about the experiment can be contained in the
+#' \code{metadata} slot as a list.
+#' @param input_metadata A tab-delimited input file or an R data frame of metadata (rows/columns).
+#' Samples are expected to have matching sample names with \code{input_features}.
+#' This file is ignored when \code{input_features} is a \code{SummarizedExperiment}
+#' or a \code{SingleCellExperiment} object with \code{colData} containing the same information.
+#' @param output The output folder to write results.
+#' @param assay_name If the input is provided as one of the accepted Bioconductor objects
+#' (e.g., SummarizedExperiment, RangedSummarizedExperiment, SingleCellExperiment, or TreeSummarizedExperiment),
+#' this argument selects the name of the assay slot in the input object that contains the omics measurements.
+#' @param abd_threshold If prevalence-abundance filtering is desired, only features that are present (or detected)
+#' in at least a \code{prev_threshold} proportion of samples at \code{abd_threshold} minimum abundance (read count or proportion)
+#' are retained. Default value for \code{abd_threshold} is \code{0.0}.
+#' To disable prevalence-abundance filtering, set \code{abd_threshold = -Inf}.
+#' @param prev_threshold If prevalence-abundance filtering is desired, only features that are present (or detected)
+#' in at least a \code{prev_threshold} proportion of samples at \code{abd_threshold} minimum abundance (read count or proportion)
+#' are retained. Default value for \code{prev_threshold} is \code{0.1}.
+#' @param var_threshold If variance filtering is desired, only features that have variances greater than
+#' \code{var_threshold} are retained. This step is applied after the prevalence-abundance filtering.
+#' Default value for \code{var_threshold} is \code{0.0} (i.e., no variance filtering).
+#' @param entropy_threshold If entropy-based filtering of metadata is desired, only metadata variables that have entropy greater than
+#' \code{entropy_threshold} are retained. Default value for \code{entropy_threshold} is \code{0.0} (i.e., no entropy filtering).
+#' @param base_model The per-feature base model. Default is "CPLM". Must be one of "CPLM", "ZICP", "ZSCP", or "ZACP".
+#' @param link A specification of the GLM link function. Default is "log". Must be one of "log", "identity", "sqrt", or "inverse".
+#' @param fixed_effects Metadata variable(s) describing the fixed effects coefficients.
+#' @param random_effects Metadata variable(s) describing the random effects part of the model.
+#' @param cutoff_ZSCP For \code{base_model = "ZSCP"}, the cutoff to stratify features for
+#' adaptive ZI modeling based on sparsity (zero-inflation proportion). Default is 0.3. Must be between 0 and 1.
+#' @param criteria_ZACP For \code{base_model = "ZACP"}, the criterion to select the
+#' best-fitting model per feature. The possible options are 'AIC' and 'BIC' (default).
+#' More criteria will be supported in a future release.
+#' @param adjust_offset If TRUE (default), an offset term will be included as the logarithm of \code{scale_factor}.
+#' @param scale_factor Name of the numerical metadata variable containing the library size (for non-normalized data) or scale factor
+#' (for normalized data) across samples to be included as an offset in the base model (when \code{adjust_offset = TRUE}).
+#' If not found in the metadata, it defaults to the sample-wise total sums, unless \code{adjust_offset = FALSE}.
+#' @param max_significance The q-value threshold for significance. Default is 0.05.
+#' @param correction The correction method for computing the q-value (see \code{\link[stats]{p.adjust}} for options; default is 'BH').
+#' @param standardize Should continuous metadata be standardized? Default is TRUE. Bypassed for categorical variables.
+#' @param cores An integer that indicates the number of R processes to run in parallel. Default is 1.
+#' @param optimizer The optimization routine to be used for estimating the parameters of the Tweedie model.
+#' Possible choices are \code{"nlminb"} (the default, see \code{\link[stats]{nlminb}}),
+#' \code{"bobyqa"} (\code{\link[minqa]{bobyqa}}), and \code{"L-BFGS-B"} (\code{\link[stats]{optim}}).
+#' Ignored for random effects modeling, which uses an alternative Template Model Builder (TMB) approach (\code{\link[glmmTMB]{glmmTMB}}).
+#' @param na.action How missing values should be handled. See \code{\link{na.action}}. Default is \code{\link{na.exclude}}.
+#' @param plot_heatmap Logical. If TRUE (default is FALSE), generate a heatmap of the (top \code{heatmap_first_n}) significant associations.
+#' @param plot_scatter Logical. If TRUE (default is FALSE), generate scatter/box plots of individual associations.
+#' @param heatmap_first_n In the heatmap, plot the top N features with significant associations (default is 50).
+#' @param reference For each categorical variable with more than two levels, the factor level to use as the reference,
+#' provided as a 'variable,reference' string (e.g., 'diagnosis,nonIBD'); use semicolons to delimit multiple variables. Default is NULL.
+#'
+#' @importFrom grDevices colorRampPalette dev.off jpeg pdf
+#' @importFrom stats coef fitted as.formula na.exclude p.adjust plogis relevel sd update
+#' @importFrom utils capture.output read.table type.convert write.table
+#' @importFrom dplyr %>% everything
+#' @importFrom parallel clusterExport
+#' @return A data frame containing coefficient estimates, p-values, and q-values (multiplicity-adjusted p-values)
+#' for the significant associations. The full set of per-feature results is also written to
+#' \code{all_results.tsv} in the \code{output} folder.
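+#'
+#' @details
+#' All base models treat each feature as Tweedie-distributed with a power
+#' mean-variance relationship, \eqn{Var(Y) = \phi \mu^{p}}, where an index
+#' parameter \eqn{p} in \eqn{(1, 2)} gives the compound Poisson-gamma (CPLM)
+#' special case that accommodates exact zeros alongside continuous positive
+#' values. The zero-inflated variants (ZICP, ZSCP, ZACP) add or adaptively
+#' select a separate zero-inflation component on top of this base model, as
+#' described for \code{base_model} above.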
+#'
+#' @author Himel Mallick, \email{him4004@@med.cornell.edu}
+#'
+#' @examples
+#'
+#' \dontrun{
+#'
+#' ##############################################################################
+#' # Example 1 - Differential Abundance Analysis of Synthetic Microbiome Counts #
+#' ##############################################################################
+#'
+#' #######################################
+#' # Install and Load Required Libraries #
+#' #######################################
+#'
+#' library(devtools)
+#' devtools::install_github('biobakery/sparseDOSSA@@varyLibSize')
+#' library(sparseDOSSA)
+#' library(stringi)
+#'
+#' ######################
+#' # Specify Parameters #
+#' ######################
+#'
+#' n.microbes <- 200 # Number of Features
+#' n.samples <- 100 # Number of Samples
+#' spike.perc <- 0.02 # Percentage of Spiked-in Bugs
+#' spikeStrength<-"20" # Effect Size
+#'
+#' ###########################
+#' # Specify Binary Metadata #
+#' ###########################
+#'
+#' n.metadata <- 1
+#' UserMetadata<-as.matrix(rep(c(0,1), each=n.samples/2))
+#' UserMetadata<-t(UserMetadata) # Transpose
+#'
+#' ###################################################
+#' # Spiked-in Metadata (Which Metadata to Spike-in) #
+#' ###################################################
+#'
+#' Metadatafrozenidx<-1
+#' spikeCount<-as.character(length(Metadatafrozenidx))
+#' significant_metadata<-paste('Metadata', Metadatafrozenidx, sep='')
+#'
+#' #############################################
+#' # Generate SparseDOSSA Synthetic Abundances #
+#' #############################################
+#'
+#' DD<-sparseDOSSA::sparseDOSSA(number_features = n.microbes,
+#' number_samples = n.samples,
+#' UserMetadata=UserMetadata,
+#' Metadatafrozenidx=Metadatafrozenidx,
+#' datasetCount = 1,
+#' spikeCount = spikeCount,
+#' spikeStrength = spikeStrength,
+#' noZeroInflate=TRUE,
+#' percent_spiked=spike.perc,
+#' seed = 1234)
+#'
+#' ##############################
+#' # Gather SparseDOSSA Outputs #
+#' ##############################
+#'
+#' sparsedossa_results <- as.data.frame(DD$OTU_count)
+#' rownames(sparsedossa_results)<-sparsedossa_results$X1
+#' sparsedossa_results<-sparsedossa_results[-1,-1]
+#' colnames(sparsedossa_results)<-paste('Sample', 1:ncol(sparsedossa_results), sep='')
+#' data<-as.matrix(sparsedossa_results[-c((n.metadata+1):(2*n.microbes+n.metadata)),])
+#' data<-data.matrix(data)
+#' class(data) <- "numeric"
+#' truth<-c(unlist(DD$truth))
+#' truth<-truth[!stri_detect_fixed(truth,":")]
+#' truth<-truth[(5+n.metadata):length(truth)]
+#' truth<-as.data.frame(truth)
+#' significant_features<-truth[seq(1,
+#' (as.numeric(spikeCount)+1)*(n.microbes*spike.perc), (as.numeric(spikeCount)+1)),]
+#' significant_features<-as.vector(significant_features)
+#'
+#' ####################
+#' # Extract Features #
+#' ####################
+#'
+#' features<-as.data.frame(t(data[-c(1:n.metadata),]))
+#'
+#' ####################
+#' # Extract Metadata #
+#' ####################
+#'
+#' metadata<-as.data.frame(data[1,])
+#' colnames(metadata)<-rownames(data)[1]
+#'
+#' ###############################
+#' # Mark True Positive Features #
+#' ###############################
+#'
+#' wh.TP = colnames(features) %in% significant_features
+#' colnames(features)<-paste("Feature", 1:n.microbes, sep = "")
+#' newname = paste0(colnames(features)[wh.TP], "_TP")
+#' colnames(features)[wh.TP] <- newname;
+#' colnames(features)[grep('TP', colnames(features))]
+#'
+#' ####################
+#' # Run Tweedieverse #
+#' ####################
+#'
+#' ###################
+#' # Default options #
+#' ###################
+#'
+#' CPLM <- Tweedieverse(
+#' features,
+#' metadata,
+#' output = './demo_output/CPLM') # Assuming demo_output exists
+#'
+#' ###############################################
+#' # User-defined prevalence-abundance filtering #
+#' ###############################################
+#'
+#' ZICP <- Tweedieverse(
+#' features,
+#' metadata,
+#' output = './demo_output/ZICP', # Assuming demo_output exists
+#' base_model = 'ZICP',
+#' abd_threshold = 0.0,
+#' prev_threshold = 0.2)
+#'
+#' ###################################
+#' # User-defined variance filtering #
+#' ###################################
+#'
+#' sds<-apply(features, 2, sd)
+#' var_threshold = median(sds)/2
+#' ZSCP <- Tweedieverse(
+#' features,
+#' metadata,
+#' output = './demo_output/ZSCP', # Assuming demo_output exists
+#' base_model = 'ZSCP',
+#' var_threshold = var_threshold)
+#'
+#' ##################
+#' # Multiple cores #
+#' ##################
+#'
+#' ZACP <- Tweedieverse(
+#' features,
+#' metadata,
+#' output = './demo_output/ZACP', # Assuming demo_output exists
+#' base_model = 'ZACP',
+#' cores = 4)
+#'
+#' ##########################################################################
+#' # Example 2 - Multivariable Association on HMP2 Longitudinal Microbiomes #
+#' ##########################################################################
+#'
+#' ######################
+#' # HMP2 Data Analysis #
+#' ######################
+#'
+#' #############
+#' # Load Data #
+#' #############
+#'
+#' library(data.table)
+#' input_features <- fread("https://raw.githubusercontent.com/biobakery/Maaslin2/master/inst/extdata/HMP2_taxonomy.tsv", sep ="\t")
+#' input_metadata <- fread("https://raw.githubusercontent.com/biobakery/Maaslin2/master/inst/extdata/HMP2_metadata.tsv", sep ="\t")
+#'
+#' ###############
+#' # Format data #
+#' ###############
+#'
+#' library(tibble)
+#' features <- column_to_rownames(input_features, 'ID')
+#' metadata <- column_to_rownames(input_metadata, 'ID')
+#'
+#' #############
+#' # Fit Model #
+#' #############
+#'
+#' library(Tweedieverse)
+#' HMP2 <- Tweedieverse(
+#' features,
+#' metadata,
+#' output = './demo_output/HMP2', # Assuming demo_output exists
+#' fixed_effects = c('diagnosis', 'dysbiosisnonIBD', 'dysbiosisUC', 'dysbiosisCD', 'antibiotics', 'age'),
+#' random_effects = c('site', 'subject'),
+#' base_model = 'CPLM',
+#' adjust_offset = FALSE, # No offset as the values are relative abundances
+#' cores = 8, # Make sure your computer has the capability
+#' standardize = FALSE,
+#' reference = c('diagnosis,nonIBD'))
+#'
+#' }
+#' @keywords microbiome, metagenomics, multiomics, scRNASeq, tweedie, singlecell
+#' @export
+Tweedieverse <- function(input_features,
+                         input_metadata = NULL,
+                         output,
+                         assay_name = "counts",
+                         abd_threshold = 0.0,
+                         prev_threshold = 0.1,
+                         var_threshold = 0.0,
+                         entropy_threshold = 0.0,
+                         base_model = "CPLM",
+                         link = "log",
+                         fixed_effects = NULL,
+                         random_effects = NULL,
+                         cutoff_ZSCP = 0.3,
+                         criteria_ZACP = "BIC",
+                         adjust_offset = TRUE,
+                         scale_factor = NULL,
+                         max_significance = 0.05,
+                         correction = "BH",
+                         standardize = TRUE,
+                         cores = 1,
+                         optimizer = "nlminb",
+                         na.action = na.exclude,
+                         plot_heatmap = FALSE,
+                         plot_scatter = FALSE,
+                         heatmap_first_n = 50,
+                         reference = NULL) {
+
+    #################################
+    # Specify all available options #
+    #################################
+
+    model_choices <- c("CPLM", "ZICP", "ZACP", "ZSCP")
+    link_choices <- c("log", "identity", "sqrt", "inverse")
"identity", "sqrt", "inverse") + criteria_ZACP_choices <- c("AIC", "BIC") + correction_choices <- + c("BH", "holm", "hochberg", "hommel", "bonferroni", "BY") + optimizer_choices <- c("nlminb", "bobyqa", "L-BFGS-B") + + ####################################################################### + #=====================================================================# + # Read in the data and metadata, create output folder, initialize log # + #=====================================================================# + ####################################################################### + + ######################################### + # Support multiple Bioconductor classes # + ######################################### + + valid_classes <- c("SummarizedExperiment", "RangedSummarizedExperiment", + "SingleCellExperiment", "TreeSummarizedExperiment") + + ############################################################## + # Extract features and metadata based on user-provided input # + ############################################################## + + input_class<-class(input_features) + if (input_class %in% valid_classes) { + data <- extractAssay(input_features, assay_name) + if(is.null(input_metadata)) { + metadata <- data.frame(colData(input_features)) + } else{ + metadata<-input_metadata + } + } else if (!(is.character(input_features)) & !(is.data.frame(input_features))) { + stop(cat(paste('Input data of class <', class(input_features), '> not supported. Please use SummarizedExperiment, SingleCellExperiment, RangedSummarizedExperiment, TreeSummarizedExperiment, or data.frame'))) + } else{ + + # if a character string then this is a file name, else it + # is a data frame + if (is.character(input_features)) { + data <- + data.frame( + read.delim::fread(input_features, header = TRUE, sep = '\t'), + header = TRUE, + fill = T, + comment.char = "" , + check.names = F, + row.names = 1 + ) + if (nrow(input_features) == 1) { + # read again to get row name + data <- read.delim( + input_features, + header = TRUE, + fill = T, + comment.char = "" , + check.names = F, + row.names = 1 + ) + } + } else { + data <- input_features + } + if (is.character(input_metadata)) { + input_metadata <- + data.frame( + read.delim::fread(input_metadata, header = TRUE, sep = '\t'), + header = TRUE, + fill = T, + comment.char = "" , + check.names = F, + row.names = 1 + ) + if (nrow(input_metadata) == 1) { + input_metadata <- read.delim( + input_metadata, + header = TRUE, + fill = T, + comment.char = "" , + check.names = F, + row.names = 1 + ) + } + } else { + metadata <- input_metadata + } + } + + # create an output folder and figures folder if it does not exist + if (!file.exists(output)) { + print("Creating output folder") + dir.create(output) + } + + #if (plot_heatmap || plot_scatter) { + figures_folder <- file.path(output, "figures") + if (!file.exists(figures_folder)) { + print("Creating output figures folder") + dir.create(figures_folder) + } + #} + + # Create log file (write info to stdout and debug level to log file) + # Set level to finest so all log levels are reviewed + log_file <- file.path(output, "Tweedieverse.log") + # Remove log file if already exists (to avoid append) + if (file.exists(log_file)) { + print(paste("Warning: Deleting existing log file:", log_file)) + unlink(log_file) + } + logging::basicConfig(level = 'FINEST') + logging::addHandler(logging::writeToFile, + file = log_file, level = "DEBUG") + logging::setLevel(20, logging::getHandler('basic.stdout')) + + ##################### + # Log the arguments # + 
+
+    logging::loginfo("Writing function arguments to log file")
+    logging::logdebug("Function arguments")
+    if (is.character(input_features)) {
+        logging::logdebug("Input data file: %s", input_features)
+    }
+    if (is.character(input_metadata)) {
+        logging::logdebug("Input metadata file: %s", input_metadata)
+    }
+    logging::logdebug("Output folder: %s", output)
+    logging::logdebug("Abundance threshold: %f", abd_threshold)
+    logging::logdebug("Prevalence threshold: %f", prev_threshold)
+    logging::logdebug("Variance threshold: %f", var_threshold)
+    logging::logdebug("Base model: %s", base_model)
+    logging::logdebug("Link function: %s", link)
+    logging::logdebug("Fixed effects: %s", fixed_effects)
+    logging::logdebug("Random effects: %s", random_effects)
+    logging::logdebug("ZSCP cutoff: %f", cutoff_ZSCP)
+    logging::logdebug("ZACP criteria: %s", criteria_ZACP)
+    logging::logdebug("Offset adjustment: %s", adjust_offset)
+    logging::logdebug("Scale factor: %s", scale_factor)
+    logging::logdebug("Max significance: %f", max_significance)
+    logging::logdebug("Correction method: %s", correction)
+    logging::logdebug("Standardize: %s", standardize)
+    logging::logdebug("Cores: %d", cores)
+    logging::logdebug("Optimization routine: %s", optimizer)
+
+    #######################################
+    # Check if valid options are selected #
+    #######################################
+
+    # Check if the selected link is valid
+    if (!link %in% link_choices) {
+        option_not_valid_error("Please select a link from the list of available options",
+                               toString(link_choices))
+    }
+
+    # Check if the selected base_model is valid
+    if (!base_model %in% model_choices) {
+        option_not_valid_error(
+            paste(
+                "Please select an analysis method",
+                "from the list of available options"
+            ),
+            toString(model_choices)
+        )
+    }
+
+    # Check if the selected criteria_ZACP is valid
+    if (!criteria_ZACP %in% criteria_ZACP_choices) {
+        option_not_valid_error(
+            paste(
+                "Please select a criterion",
+                "from the list of available options"
+            ),
+            toString(criteria_ZACP_choices)
+        )
+    }
+
+    # Check if the selected correction is valid
+    if (!correction %in% correction_choices) {
+        option_not_valid_error(
+            paste(
+                "Please select a correction method",
+                "from the list of available options"
+            ),
+            toString(correction_choices)
+        )
+    }
+
+    # Check if the selected optimizer is valid
+    if (!optimizer %in% optimizer_choices) {
+        option_not_valid_error(
+            paste(
+                "Please select an optimizer",
+                "from the list of available options"
+            ),
+            toString(optimizer_choices)
+        )
+    }
+
+    ############################################################
+    # Check if the selected numerical options are within range #
+    ############################################################
+
+    prop_options <- c(prev_threshold, cutoff_ZSCP, max_significance)
+    if (any(prop_options < 0) || any(prop_options > 1)) {
+        stop(
+            paste(
+                "One of the following is outside [0, 1]:",
+                "prev_threshold, cutoff_ZSCP, max_significance"
+            )
+        )
+    }
+
+    ###############################################################
+    # Determine orientation of data in input and reorder to match #
+    ###############################################################
+
+    logging::loginfo("Determining format of input files")
+    samples_row_row <- intersect(rownames(data), rownames(metadata))
+    if (length(samples_row_row) > 0) {
+        # this is the expected formatting so do not modify data frames
+        logging::loginfo(paste(
+            "Input format is data samples",
+            "as rows and metadata samples as rows"
+        ))
+    } else {
+        samples_column_row <- intersect(colnames(data), rownames(metadata))
+        if (length(samples_column_row) > 0) {
+            logging::loginfo(paste(
+                "Input format is data samples",
+                "as columns and metadata samples as rows"
+            ))
+            # transpose data frame so samples are rows
+            data <- type.convert(as.data.frame(t(data)), as.is = TRUE)
+            logging::logdebug("Transposed data so samples are rows")
+        } else {
+            samples_column_column <-
+                intersect(colnames(data), colnames(metadata))
+            if (length(samples_column_column) > 0) {
+                logging::loginfo(
+                    paste(
+                        "Input format is data samples",
+                        "as columns and metadata samples as columns"
+                    )
+                )
+                data <- type.convert(as.data.frame(t(data)), as.is = TRUE)
+                metadata <- type.convert(as.data.frame(t(metadata)), as.is = TRUE)
+                logging::logdebug("Transposed data and metadata so samples are rows")
+            } else {
+                samples_row_column <-
+                    intersect(rownames(data), colnames(metadata))
+                if (length(samples_row_column) > 0) {
+                    logging::loginfo(
+                        paste(
+                            "Input format is data samples",
+                            "as rows and metadata samples as columns"
+                        )
+                    )
+                    metadata <- type.convert(as.data.frame(t(metadata)), as.is = TRUE)
+                    logging::logdebug("Transposed metadata so samples are rows")
+                } else {
+                    logging::logerror(
+                        paste(
+                            "Unable to find samples in data and",
+                            "metadata files.",
+                            "Rows/columns do not match."
+                        )
+                    )
+                    logging::logdebug("input_features rows: %s",
+                                      paste(rownames(data), collapse = ","))
+                    logging::logdebug("input_features columns: %s",
+                                      paste(colnames(data), collapse = ","))
+                    logging::logdebug("Metadata rows: %s",
+                                      paste(rownames(metadata), collapse = ","))
+                    logging::logdebug("Metadata columns: %s",
+                                      paste(colnames(metadata), collapse = ","))
+                    stop("Unable to find matching samples in data and metadata.")
+                }
+            }
+        }
+    }
+
+    # Replace unexpected characters in feature names
+    # colnames(data) <- make.names(colnames(data))
+
+    # Check for samples without metadata
+    extra_feature_samples <-
+        setdiff(rownames(data), rownames(metadata))
+    if (length(extra_feature_samples) > 0)
+        logging::logdebug(
+            paste(
+                "The following samples were found",
+                "to have features but no metadata.",
+                "They will be removed. %s"
+            ),
+            paste(extra_feature_samples, collapse = ",")
+        )
+
+    # Check for metadata samples without features
+    extra_metadata_samples <-
+        setdiff(rownames(metadata), rownames(data))
+    if (length(extra_metadata_samples) > 0)
+        logging::logdebug(
+            paste(
+                "The following samples were found",
+                "to have metadata but no features.",
+                "They will be removed. %s"
+            ),
+            paste(extra_metadata_samples, collapse = ",")
+        )
%s" + ), + paste(extra_metadata_samples, collapse = ",") + ) + + # Get a set of the samples with both metadata and features + intersect_samples <- intersect(rownames(data), rownames(metadata)) + logging::logdebug( + "A total of %s samples were found in both the data and metadata", + length(intersect_samples) + ) + + # Now order both data and metadata with the same sample ordering + logging::logdebug("Reordering data/metadata to use same sample ordering") + data <- data[intersect_samples, , drop = FALSE] + metadata <- metadata[intersect_samples, , drop = FALSE] + + + ######################################################################## + # Assign reference values to categorical metadata (fixed effects only) # + ######################################################################## + + if (is.null(reference)) { + reference <- "," + } + split_reference <- unlist(strsplit(reference, "[,;]")) + + # for each fixed effect, check that a reference level has been set if necessary: number of levels > 2 and metadata isn't already an ordered factor + for (i in fixed_effects) { + # don't check for or require reference levels for numeric metadata + if (is.numeric(metadata[,i])) { + next + } + # respect ordering if a factor is explicitly passed in with no reference set + if (is.factor(metadata[,i]) && !(i %in% split_reference)) { + logging::loginfo(paste("Factor detected for categorial metadata '", + i, "'. Provide a reference argument or manually set factor ordering to change reference level.", sep="")) + next + } + + # set metadata as a factor (ordered alphabetically) + metadata[,i] <- as.factor(metadata[,i]) + mlevels <- levels(metadata[,i]) + + # get reference level for variable being considered, returns NA if not found + ref <- split_reference[match(i, split_reference)+1] + + # if metadata has 2 levels, allow but don't require setting reference level, otherwise require it + if ((length(mlevels) == 2)) { + if(!is.na(ref)) { + metadata[,i] = relevel(metadata[,i], ref = ref) + } + } else if (length(mlevels) > 2) { + if (!is.na(ref)) { + metadata[,i] = relevel(metadata[,i], ref = ref) + } else { + stop(paste("Please provide the reference for the variable '", + i, "' which includes more than 2 levels: ", + paste(as.character(mlevels), collapse=", "), ".", sep="")) + } + } else { + stop("Provided categorical metadata has fewer than 2 unique, non-NA values.") + } + } + + + ######################################################### + # Non-specific filtering based on user-provided options # + ######################################################### + + unfiltered_data <- data + unfiltered_metadata <- metadata + + # require at least total samples * min prevalence values + # for each feature to be greater than min abundance + logging::loginfo("Filter data based on min abundance and min prevalence") + total_samples <- nrow(unfiltered_data) + logging::loginfo("Total samples in data: %d", total_samples) + min_samples <- total_samples * prev_threshold + logging::loginfo( + paste( + "Min samples required with min abundance", + "for a feature not to be filtered: %f" + ), + min_samples + ) + + # Filter by abundance using zero as value for NAs + data_zeros <- unfiltered_data + data_zeros[is.na(data_zeros)] <- 0 + filtered_data <- + unfiltered_data[, + colSums(data_zeros > abd_threshold) > min_samples, + drop = FALSE] + total_filtered_features <- + ncol(unfiltered_data) - ncol(filtered_data) + logging::loginfo( + "Total filtered features with prevalence-abundance filtering: %d", + total_filtered_features + ) + 
+    filtered_feature_names <-
+        setdiff(names(unfiltered_data), names(filtered_data))
+    logging::loginfo("Filtered feature names: %s",
+                     toString(filtered_feature_names))
+
+    #################################
+    # Filter data based on variance #
+    #################################
+
+    sds <- apply(filtered_data, 2, sd, na.rm = TRUE)
+    final_features <-
+        filtered_data[, which(sds > var_threshold), drop = FALSE]
+    total_filtered_features_var <-
+        ncol(filtered_data) - ncol(final_features)
+    logging::loginfo("Total filtered features with variance filtering: %d",
+                     total_filtered_features_var)
+    filtered_feature_names_var <-
+        setdiff(names(filtered_data), names(final_features))
+    logging::loginfo("Filtered feature names: %s",
+                     toString(filtered_feature_names_var))
+
+    ########################################################################
+    # Set the scale factor to rowsum if not provided and create a modified #
+    # metadata table without the scale factor variable (if present)        #
+    ########################################################################
+
+    if (is.null(scale_factor)) {
+        offset <- rowSums(final_features)
+    } else {
+        if (!scale_factor %in% colnames(unfiltered_metadata)) {
+            stop(
+                paste(
+                    "The specified scale_factor variable is not present in the metadata table:\n",
+                    scale_factor
+                )
+            )
+        } else {
+            offset <- unfiltered_metadata[, scale_factor]
+            unfiltered_metadata <-
+                dplyr::select(unfiltered_metadata, -dplyr::all_of(scale_factor))
+        }
+    }
+
+    ####################################
+    # Filter metadata based on entropy #
+    ####################################
+
+    # Reduce metadata to only include variables that pass the entropy threshold
+    temp_filtered_metadata <- unfiltered_metadata[, apply(unfiltered_metadata, 2, entropy) > entropy_threshold, drop = F]
+    excluded_metadata <- setdiff(colnames(unfiltered_metadata), colnames(temp_filtered_metadata))
+    logging::loginfo(
+        paste(
+            "Excluded metadata with",
+            "entropy less than or equal to %s: %s"
+        ),
+        entropy_threshold, paste(excluded_metadata, collapse = ",")
+    )
+    filtered_metadata <- temp_filtered_metadata
+
+    ###############################################
+    # Compute the formula based on the user input #
+    ###############################################
+
+    #####################
+    # Determine formula #
+    #####################
+
+    random_effects_formula <- NULL
+    # Use all metadata if no fixed effects are provided
+    if (is.null(fixed_effects)) {
+        fixed_effects <- colnames(filtered_metadata)
+    } else {
+        fixed_effects <- unlist(strsplit(fixed_effects, ",", fixed = TRUE))
+        # remove any fixed effects not found in the metadata names
+        to_remove <- setdiff(fixed_effects, colnames(filtered_metadata))
+        if (length(to_remove) > 0)
+            logging::logwarn(
                paste(
+                    "Variable name not found in metadata",
+                    "so not applied to formula as fixed effect: %s"
+                ),
+                paste(to_remove, collapse = " , ")
+            )
+        fixed_effects <- setdiff(fixed_effects, to_remove)
+        if (length(fixed_effects) == 0) {
+            logging::logerror("No fixed effects included in formula.")
+            stop("No fixed effects included in formula.")
+        }
+    }
+
+    if (!is.null(random_effects)) {
+        random_effects <-
+            unlist(strsplit(random_effects, ",", fixed = TRUE))
+        # subtract random effects from fixed effects
+        fixed_effects <- setdiff(fixed_effects, random_effects)
+        # remove any random effects not found in the metadata
+        to_remove <-
+            setdiff(random_effects, colnames(filtered_metadata))
+        if (length(to_remove) > 0)
+            logging::logwarn(
+                paste(
+                    "Variable name not found in metadata",
+                    "so not applied to formula as random effect: %s"
+                ),
+                paste(to_remove, collapse = " , ")
+            )
+        random_effects <- setdiff(random_effects, to_remove)
+
+        # create the formula
+        if (length(random_effects) > 0) {
+            random_effects_formula_text <-
+                paste("expr ~ (1 | ",
+                      paste(
+                          random_effects,
+                          ")",
+                          sep = '',
+                          collapse = " + (1 | "
+                      ),
+                      sep = '')
+            logging::loginfo("Formula for random effects: %s",
+                             random_effects_formula_text)
+            random_effects_formula <-
+                tryCatch(
+                    as.formula(random_effects_formula_text),
+                    error = function(e)
+                        stop(
+                            paste(
+                                "Invalid formula for random effects: ",
+                                random_effects_formula_text
+                            )
+                        )
+                )
+        }
+    }
+
+    # Reduce metadata to only include fixed/random effects in the formula
+    effects_names <- union(fixed_effects, random_effects)
+    filtered_metadata <-
+        filtered_metadata[, effects_names, drop = FALSE]
+
+    # Create the fixed effects formula text
+    formula_text <-
+        paste("expr ~ ", paste(fixed_effects, collapse = " + "))
+    logging::loginfo("Formula for fixed effects: %s", formula_text)
+    formula <-
+        tryCatch(
+            as.formula(formula_text),
+            error = function(e)
+                stop(
+                    paste(
+                        "Invalid formula.",
+                        "Please provide a different formula: ",
+                        formula_text
+                    )
+                )
+        )
+
+    #############################################################
+    # Standardize metadata (except the offset variable), if set #
+    #############################################################
+
+    if (standardize) {
+        logging::loginfo("Applying z-score to standardize continuous metadata")
+        filtered_metadata <-
+            filtered_metadata %>% dplyr::mutate_if(is.numeric, scale)
+    } else {
+        logging::loginfo("Bypass z-score application to metadata")
+    }
+
+    ##################################
+    # Merge metadata and offset back #
+    ##################################
+
+    final_metadata <- as.data.frame(cbind.data.frame(filtered_metadata, offset))
+
+    ##############################################################
+    # Apply the base model to the filtered data with user inputs #
+    ##############################################################
+
+    logging::loginfo("Running selected analysis method: %s", base_model)
+
+    fit_data <- fit.Tweedieverse(
+        features = final_features,
+        metadata = final_metadata,
+        base_model = base_model,
+        link = link,
+        formula = formula,
+        random_effects_formula = random_effects_formula,
+        cutoff_ZSCP = cutoff_ZSCP,
+        criteria_ZACP = criteria_ZACP,
+        adjust_offset = adjust_offset,
+        correction = correction,
+        cores = cores,
+        optimizer = optimizer,
+        na.action = na.action
+    )
+
+    ###################################################
+    # Count the N and zero-inflation for each feature #
+    ###################################################
+
+    logging::loginfo("Counting prevalence for each feature")
+    try(fit_data$results$N <-
+            apply(
+                fit_data$results,
+                1,
+                FUN = function(x)
+                    length(final_features[, x[1]])
+            ))
+    try(fit_data$results$N.not.zero <-
+            apply(
+                fit_data$results,
+                1,
+                FUN = function(x)
+                    length(which(final_features[, x[1]] > 0))
+            ))
+    try(fit_data$results$percent.zero <-
+            apply(
+                fit_data$results,
+                1,
+                FUN = function(x)
+                    round(mean(final_features[, x[1]] == 0, na.rm = TRUE), 2) *
+                    100
+            ))
+
+    #########################
+    # Write out the results #
+    #########################
+
+    results_file <- file.path(output, "all_results.tsv")
+    logging::loginfo("Writing all results to file (ordered by increasing q-values): %s",
+                     results_file)
+    ordered_results <-
+        fit_data$results[order(fit_data$results$qval), ]
+    ordered_results <-
+        ordered_results[!is.na(ordered_results$qval), ] # Remove NAs
+    ordered_results <-
+        dplyr::select(
+            ordered_results,
+            c(
+                'feature',
+                'metadata',
+                'value',
+                "coef",
+                "stderr",
+                "pval",
+                "qval"
+            ),
+            everything()
+        )
+    write.table(
+        ordered_results,
+        file = results_file,
+        sep = "\t",
+        quote = FALSE,
+        row.names = FALSE
+    )
+
+    # Write results passing the threshold to file
+    # (removing any that are NA for the q-value)
+    significant_results <-
+        ordered_results[ordered_results$qval <= max_significance, ]
+    significant_results_file <-
+        file.path(output, "significant_results.tsv")
+    logging::loginfo(
+        paste(
+            "Writing the significant results",
+            "(those which are less than or equal to the threshold",
+            "of %f ) to file (ordered by increasing q-values): %s"
+        ),
+        max_significance,
+        significant_results_file
+    )
+    write.table(
+        significant_results,
+        file = significant_results_file,
+        sep = "\t",
+        quote = FALSE,
+        row.names = FALSE
+    )
+
+    #######################################################
+    # Create visualizations for results passing threshold #
+    #######################################################
+
+    logging::loginfo("Writing Tweedie index plot to figures folder: %s",
+                     figures_folder)
+    tryCatch({
+        tweedie_index_plot(ordered_results, figures_folder)
+    }, error = function(err) {
+        logging::logerror("Unable to make a Tweedie index plot of the results!")
+        logging::logerror(err)
+        # dev.off()
+    })
+
+    if (plot_heatmap && length(unique(significant_results$metadata)) > 1) {
+        heatmap_file <- file.path(output, "Tweedieverse_Heatmap.pdf")
+        logging::loginfo("Writing heatmap of significant results to file: %s",
+                         heatmap_file)
+        tryCatch({
+            save_heatmap(significant_results_file,
+                         heatmap_file,
+                         figures_folder,
+                         first_n = heatmap_first_n)
+        }, error = function(err) {
+            logging::logerror("Unable to make a heatmap of the results!")
+            logging::logerror(err)
+            # dev.off()
+        })
+    }
+
+    if (plot_scatter) {
+        logging::loginfo(
+            paste(
+                "Writing association plots",
+                "(one for each significant association)",
+                "to output folder: %s"
+            ),
+            output
+        )
+        association_plots(
+            metadata,
+            final_features / offset,
+            significant_results_file,
+            output,
+            figures_folder
+        )
+    }
+
+    return(significant_results)
+}
+
+option_not_valid_error <- function(message, valid_options) {
+    logging::logerror(paste(message, ": %s"), toString(valid_options))
+    stop("Option not valid", call. = FALSE)
+}
+
+## Quiets concerns of R CMD check
+utils::globalVariables(c("name"))
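+
+## ---------------------------------------------------------------------------
+## Minimal usage sketch (not run). `features` and `metadata` are hypothetical
+## placeholders: a samples-by-features table and a samples-by-variables table
+## with matching sample names (see the roxygen examples above for how to
+## construct them); 'Metadata1' stands in for a column of `metadata`.
+## ---------------------------------------------------------------------------
+# fit <- Tweedieverse(
+#     input_features = features,
+#     input_metadata = metadata,
+#     output = file.path(tempdir(), "Tweedieverse_demo"),
+#     fixed_effects = "Metadata1",
+#     base_model = "CPLM")           # default compound Poisson base model
+# head(fit)                          # significant associations ordered by q-value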