Diff of /R/Tweedieverse.R [000000] .. [727782]

#' Differential analysis of multi-omics data using Tweedie GLMs
#'
#' Fit a per-feature Tweedie generalized linear model to omics features.
#'
#' @param input_features A tab-delimited input file or an R data frame of features (can be in rows/columns)
#' and samples (or cells). Samples are expected to have matching names with \code{input_metadata}.
#' \code{input_features} can also be an object of class \code{SummarizedExperiment} or \code{SingleCellExperiment}
#' that contains the expression or abundance matrix and other metadata; the \code{assays}
#' slot contains the expression or abundance matrix and is named \code{"counts"}.
#' This matrix should have one row for each feature and one column for each sample.
#' The \code{colData} slot should contain a data frame with one row per
#' sample and columns that contain metadata for each sample.
#' Additional information about the experiment can be contained in the
#' \code{metadata} slot as a list.
#' @param input_metadata A tab-delimited input file or an R data frame of metadata (rows/columns).
#' Samples are expected to have matching sample names with \code{input_features}.
#' This file is ignored when \code{input_features} is a \code{SummarizedExperiment}
#' or a \code{SingleCellExperiment} object with \code{colData} containing the same information.
#' @param output The output folder to write results.
#' @param assay_name If the input is provided as one of the accepted Bioconductor objects
#' (e.g., SummarizedExperiment, RangedSummarizedExperiment, SingleCellExperiment, or TreeSummarizedExperiment),
#' this argument selects the name of the assay slot in the input object that contains the omics measurements.
#' @param abd_threshold If prevalence-abundance filtering is desired, only features that are present (or detected)
#' in at least \code{prev_threshold} percent of samples at \code{abd_threshold} minimum abundance (read count or proportion)
#' are retained. The default value for \code{abd_threshold} is \code{0.0}.
#' To disable prevalence-abundance filtering, set \code{abd_threshold = -Inf}.
#' @param prev_threshold If prevalence-abundance filtering is desired, only features that are present (or detected)
#' in at least \code{prev_threshold} percent of samples at \code{abd_threshold} minimum abundance (read count or proportion)
#' are retained. The default value for \code{prev_threshold} is \code{0.1}.
#' @param var_threshold If variance filtering is desired, only features that have variances greater than
#' \code{var_threshold} are retained. This step is applied after the prevalence-abundance filtering.
#' The default value for \code{var_threshold} is \code{0.0} (i.e., no variance filtering).
#' @param entropy_threshold If entropy-based filtering is desired for metadata, only metadata variables that have entropy greater than
#' \code{entropy_threshold} are retained. The default value for \code{entropy_threshold} is \code{0.0} (i.e., no entropy filtering).
#' @param base_model The per-feature base model. Default is "CPLM". Must be one of "CPLM", "ZICP", "ZSCP", or "ZACP".
#' @param link A specification of the GLM link function. Default is "log". Must be one of "log", "identity", "sqrt", or "inverse".
#' @param fixed_effects Metadata variable(s) describing the fixed effects coefficients.
#' @param random_effects Metadata variable(s) describing the random effects part of the model.
#' @param cutoff_ZSCP For \code{base_model = "ZSCP"}, the cutoff to stratify features for
#' adaptive ZI modeling based on sparsity (zero-inflation proportion). Default is 0.3. Must be between 0 and 1.
#' @param criteria_ZACP For \code{base_model = "ZACP"}, the criterion to select the
#' best fitting model per feature. The possible options are 'AIC' and 'BIC' (default).
#' More criteria will be supported in a future release.
#' @param adjust_offset If TRUE (default), an offset term will be included as the logarithm of \code{scale_factor}.
#' @param scale_factor Name of the numerical variable containing library size (for non-normalized data) or scale factor
#' (for normalized data) across samples to be included as an offset in the base model (when \code{adjust_offset = TRUE}).
#' If not found in metadata, defaults to the sample-wise total sums, unless \code{adjust_offset = FALSE}.
#' @param max_significance The q-value threshold for significance. Default is 0.05.
#' @param correction The correction method for computing the q-value (see \code{\link[stats]{p.adjust}} for options, default is 'BH').
#' @param standardize Should continuous metadata be standardized? Default is TRUE. Bypassed for categorical variables.
#' @param cores An integer that indicates the number of R processes to run in parallel. Default is 1.
#' @param optimizer The optimization routine to be used for estimating the parameters of the Tweedie model.
#' Possible choices are \code{"nlminb"} (the default, see \code{\link[stats]{nlminb}}),
#' \code{"bobyqa"} (\code{\link[minqa]{bobyqa}}), and \code{"L-BFGS-B"} (\code{\link[stats]{optim}}).
#' Ignored for random effects modeling, which uses an alternative Template Model Builder (TMB) approach (\code{\link[glmmTMB]{glmmTMB}}).
#' @param na.action How missing values should be handled. See \code{\link{na.action}}. Default is \code{\link{na.exclude}}.
#' @param plot_heatmap Logical. If TRUE (default is FALSE), generate a heatmap of the (top \code{heatmap_first_n}) significant associations.
#' @param plot_scatter Logical. If TRUE (default is FALSE), generate scatter/box plots of individual associations.
#' @param heatmap_first_n In the heatmap, plot the top N features with significant associations (default is 50).
#' @param reference The factor level to use as a reference for a variable with more than two levels,
#' provided as a string of the form 'variable,reference'; use semicolons to delimit multiple variables
#' (e.g., \code{'varA,refA;varB,refB'}, with placeholder names). Default is NULL.
#'
#' @importFrom grDevices colorRampPalette dev.off jpeg pdf
#' @importFrom stats coef fitted as.formula na.exclude p.adjust plogis sd update
#' @importFrom utils capture.output read.table write.table
#' @importFrom dplyr %>% everything
#' @importFrom parallel clusterExport
#' @return A data frame containing coefficient estimates, p-values, and q-values (multiplicity-adjusted p-values).
#'
#' @author Himel Mallick, \email{him4004@@med.cornell.edu}
#'
#' @examples
#'
#' \dontrun{
#'
#' ##############################################################################
#' # Example 1 - Differential Abundance Analysis of Synthetic Microbiome Counts #
#' ##############################################################################
#'
#' #######################################
#' # Install and Load Required Libraries #
#' #######################################
#'
#' library(devtools)
#' devtools::install_github('biobakery/sparseDOSSA@@varyLibSize')
#' library(sparseDOSSA)
#' library(stringi)
#'
#' ######################
#' # Specify Parameters #
#' ######################
#'
#' n.microbes <- 200 # Number of Features
#' n.samples <- 100 # Number of Samples
#' spike.perc <- 0.02 # Percentage of Spiked-in Bugs
#' spikeStrength<-"20" # Effect Size
#'
#' ###########################
#' # Specify Binary Metadata #
#' ###########################
#'
#' n.metadata <- 1
#' UserMetadata<-as.matrix(rep(c(0,1), each=n.samples/2))
#' UserMetadata<-t(UserMetadata) # Transpose
#'
#' ###################################################
#' # Spiked-in Metadata (Which Metadata to Spike-in) #
#' ###################################################
#'
#' Metadatafrozenidx<-1
#' spikeCount<-as.character(length(Metadatafrozenidx))
#' significant_metadata<-paste('Metadata', Metadatafrozenidx, sep='')
#'
#' #############################################
#' # Generate SparseDOSSA Synthetic Abundances #
#' #############################################
#'
#' DD<-sparseDOSSA::sparseDOSSA(number_features = n.microbes,
#' number_samples = n.samples,
#' UserMetadata=UserMetadata,
#' Metadatafrozenidx=Metadatafrozenidx,
#' datasetCount = 1,
#' spikeCount = spikeCount,
#' spikeStrength = spikeStrength,
#' noZeroInflate=TRUE,
#' percent_spiked=spike.perc,
#' seed = 1234)
#'
#' ##############################
#' # Gather SparseDOSSA Outputs #
#' ##############################
#'
#' sparsedossa_results <- as.data.frame(DD$OTU_count)
#' rownames(sparsedossa_results)<-sparsedossa_results$X1
#' sparsedossa_results<-sparsedossa_results[-1,-1]
#' colnames(sparsedossa_results)<-paste('Sample', 1:ncol(sparsedossa_results), sep='')
#' data<-as.matrix(sparsedossa_results[-c((n.metadata+1):(2*n.microbes+n.metadata)),])
#' data<-data.matrix(data)
#' class(data) <- "numeric"
#' truth<-c(unlist(DD$truth))
#' truth<-truth[!stri_detect_fixed(truth,":")]
#' truth<-truth[(5+n.metadata):length(truth)]
#' truth<-as.data.frame(truth)
#' significant_features<-truth[seq(1,
#' (as.numeric(spikeCount)+1)*(n.microbes*spike.perc), (as.numeric(spikeCount)+1)),]
#' significant_features<-as.vector(significant_features)
#'
#' ####################
#' # Extract Features #
#' ####################
#'
#' features<-as.data.frame(t(data[-c(1:n.metadata),]))
#'
#' ####################
#' # Extract Metadata #
#' ####################
#'
#' metadata<-as.data.frame(data[1,])
#' colnames(metadata)<-rownames(data)[1]
#'
#' ###############################
#' # Mark True Positive Features #
#' ###############################
#'
#' wh.TP = colnames(features) %in% significant_features
#' colnames(features)<-paste("Feature", 1:n.microbes, sep = "")
#' newname = paste0(colnames(features)[wh.TP], "_TP")
#' colnames(features)[wh.TP] <- newname;
#' colnames(features)[grep('TP', colnames(features))]
#'
#' ####################
#' # Run Tweedieverse #
#' ####################
#'
#' ###################
#' # Default options #
#' ###################
#'
#' CPLM <- Tweedieverse(
#' features,
#' metadata,
#' output = './demo_output/CPLM') # Assuming demo_output exists
#'
#' ###############################################
#' # User-defined prevalence-abundance filtering #
#' ###############################################
#'
#' ZICP<-Tweedieverse(
#' features,
#' metadata,
#' output = './demo_output/ZICP', # Assuming demo_output exists
#' base_model = 'ZICP',
#' abd_threshold = 0.0,
#' prev_threshold = 0.2)
#'
#' ####################################
#' # User-defined variance filtering  #
#' ####################################
#'
#' sds<-apply(features, 2, sd)
#' var_threshold = median(sds)/2
#' ZSCP<-Tweedieverse(
#' features,
#' metadata,
#' output = './demo_output/ZSCP', # Assuming demo_output exists
#' base_model = 'ZSCP',
#' var_threshold = var_threshold)
#'
#' ##################
#' # Multiple cores #
#' ##################
#'
#' ZACP<-Tweedieverse(
#' features,
#' metadata,
#' output = './demo_output/ZACP', # Assuming demo_output exists
#' base_model = 'ZACP',
#' cores = 4)
#'
#' ##########################################################################
#' # Example 2 - Multivariable Association on HMP2 Longitudinal Microbiomes #
#' ##########################################################################
#'
#' ######################
#' # HMP2 Data Analysis #
#' ######################
#'
#' #############
#' # Load Data #
#' #############
#'
#' library(data.table)
#' input_features <- fread("https://raw.githubusercontent.com/biobakery/Maaslin2/master/inst/extdata/HMP2_taxonomy.tsv", sep = "\t")
#' input_metadata <- fread("https://raw.githubusercontent.com/biobakery/Maaslin2/master/inst/extdata/HMP2_metadata.tsv", sep = "\t")
#'
#' ###############
#' # Format data #
#' ###############
#'
#' library(tibble)
#' features <- column_to_rownames(input_features, 'ID')
#' metadata <- column_to_rownames(input_metadata, 'ID')
#'
#' #############
#' # Fit Model #
#' #############
#'
#' library(Tweedieverse)
#' HMP2 <- Tweedieverse(
#' features,
#' metadata,
#' output = './demo_output/HMP2', # Assuming demo_output exists
#' fixed_effects = c('diagnosis', 'dysbiosisnonIBD','dysbiosisUC','dysbiosisCD', 'antibiotics', 'age'),
#' random_effects = c('site', 'subject'),
#' base_model = 'CPLM',
#' adjust_offset = FALSE, # No offset as the values are relative abundances
#' cores = 8, # Make sure your computer has the capability
#' standardize = FALSE,
#' reference = c('diagnosis,nonIBD'))
#'
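#' ########################################################
#' # Example 3 - Bioconductor Input (Hypothetical Sketch) #
#' ########################################################
#'
#' # A minimal, untested sketch: per the input_features documentation,
#' # a SummarizedExperiment with a features-by-samples "counts" assay and
#' # sample metadata in colData can be passed directly (input_metadata is
#' # then ignored). 'features' and 'metadata' are reused from Example 1;
#' # the transpose makes the assay features-by-samples.
#'
#' library(SummarizedExperiment)
#' se <- SummarizedExperiment(
#' assays = list(counts = t(as.matrix(features))),
#' colData = metadata)
#' SE <- Tweedieverse(
#' se,
#' output = './demo_output/SE', # Assuming demo_output exists
#' assay_name = 'counts')
#'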
#' }
#' @keywords microbiome, metagenomics, multiomics, scRNASeq, tweedie, singlecell
#' @export
Tweedieverse <- function(input_features,
                         input_metadata = NULL,
                         output,
                         assay_name = "counts",
                         abd_threshold = 0.0,
                         prev_threshold = 0.1,
                         var_threshold = 0.0,
                         entropy_threshold = 0.0,
                         base_model = "CPLM",
                         link = "log",
                         fixed_effects = NULL,
                         random_effects = NULL,
                         cutoff_ZSCP = 0.3,
                         criteria_ZACP = "BIC",
                         adjust_offset = TRUE,
                         scale_factor = NULL,
                         max_significance = 0.05,
                         correction = "BH",
                         standardize = TRUE,
                         cores = 1,
                         optimizer = "nlminb",
                         na.action = na.exclude,
                         plot_heatmap = FALSE,
                         plot_scatter = FALSE,
                         heatmap_first_n = 50,
                         reference = NULL) {

  #################################
  # Specify all available options #
  #################################

  model_choices <- c("CPLM", "ZICP", "ZACP", "ZSCP")
  link_choices <- c("log", "identity", "sqrt", "inverse")
  criteria_ZACP_choices <- c("AIC", "BIC")
  correction_choices <-
    c("BH", "holm", "hochberg", "hommel", "bonferroni", "BY")
  optimizer_choices <- c("nlminb", "bobyqa", "L-BFGS-B")

  #######################################################################
  #=====================================================================#
  # Read in the data and metadata, create output folder, initialize log #
  #=====================================================================#
  #######################################################################

  #########################################
  # Support multiple Bioconductor classes #
  #########################################

  valid_classes <- c("SummarizedExperiment", "RangedSummarizedExperiment",
                     "SingleCellExperiment", "TreeSummarizedExperiment")

  ##############################################################
  # Extract features and metadata based on user-provided input #
  ##############################################################

  input_class <- class(input_features)
  if (input_class %in% valid_classes) {
    data <- extractAssay(input_features, assay_name)
    if (is.null(input_metadata)) {
      metadata <- data.frame(colData(input_features))
    } else {
      metadata <- input_metadata
    }
  } else if (!(is.character(input_features)) && !(is.data.frame(input_features))) {
    stop(
      paste(
        'Input data of class <', class(input_features),
        '> not supported. Please use SummarizedExperiment, SingleCellExperiment,',
        'RangedSummarizedExperiment, TreeSummarizedExperiment, or data.frame'
      )
    )
  } else {
    # if a character string then this is a file name, else it
    # is a data frame
    if (is.character(input_features)) {
      data <-
        data.frame(
          data.table::fread(input_features, header = TRUE, sep = '\t'),
          check.names = FALSE,
          row.names = 1
        )
      if (nrow(data) == 1) {
        # read again to get row name
        data <- read.delim(
          input_features,
          header = TRUE,
          fill = TRUE,
          comment.char = "",
          check.names = FALSE,
          row.names = 1
        )
      }
    } else {
      data <- input_features
    }
    if (is.character(input_metadata)) {
      metadata <-
        data.frame(
          data.table::fread(input_metadata, header = TRUE, sep = '\t'),
          check.names = FALSE,
          row.names = 1
        )
      if (nrow(metadata) == 1) {
        metadata <- read.delim(
          input_metadata,
          header = TRUE,
          fill = TRUE,
          comment.char = "",
          check.names = FALSE,
          row.names = 1
        )
      }
    } else {
      metadata <- input_metadata
    }
  }

  # Create the output folder and figures folder if they do not exist
  if (!file.exists(output)) {
    print("Creating output folder")
    dir.create(output)
  }

  #if (plot_heatmap || plot_scatter) {
  figures_folder <- file.path(output, "figures")
  if (!file.exists(figures_folder)) {
    print("Creating output figures folder")
    dir.create(figures_folder)
  }
  #}

  # Create log file (write info to stdout and debug level to log file)
  # Set level to finest so all log levels are reviewed
  log_file <- file.path(output, "Tweedieverse.log")
  # Remove the log file if it already exists (to avoid appending)
  if (file.exists(log_file)) {
    print(paste("Warning: Deleting existing log file:", log_file))
    unlink(log_file)
  }
  logging::basicConfig(level = 'FINEST')
  logging::addHandler(logging::writeToFile,
                      file = log_file, level = "DEBUG")
  logging::setLevel(20, logging::getHandler('basic.stdout'))

  #####################
  # Log the arguments #
  #####################

  logging::loginfo("Writing function arguments to log file")
  logging::logdebug("Function arguments")
  if (is.character(input_features)) {
    logging::logdebug("Input data file: %s", input_features)
  }
  if (is.character(input_metadata)) {
    logging::logdebug("Input metadata file: %s", input_metadata)
  }
  logging::logdebug("Output folder: %s", output)
  logging::logdebug("Abundance threshold: %f", abd_threshold)
  logging::logdebug("Prevalence threshold: %f", prev_threshold)
  logging::logdebug("Variance threshold: %f", var_threshold)
  logging::logdebug("Base model: %s", base_model)
  logging::logdebug("Link function: %s", link)
  logging::logdebug("Fixed effects: %s", fixed_effects)
  logging::logdebug("Random effects: %s", random_effects)
  logging::logdebug("ZSCP cutoff: %f", cutoff_ZSCP)
  logging::logdebug("ZACP criteria: %s", criteria_ZACP)
  logging::logdebug("Offset adjustment: %s", adjust_offset)
  logging::logdebug("Scale factor: %s", scale_factor)
  logging::logdebug("Max significance: %f", max_significance)
  logging::logdebug("Correction method: %s", correction)
  logging::logdebug("Standardize: %s", standardize)
  logging::logdebug("Cores: %d", cores)
  logging::logdebug("Optimization routine: %s", optimizer)

  #######################################
  # Check if valid options are selected #
  #######################################

  # Check if the selected link is valid
  if (!link %in% link_choices) {
    option_not_valid_error("Please select a link from the list of available options",
                           toString(link_choices))
  }

  # Check if the selected base_model is valid
  if (!base_model %in% model_choices) {
    option_not_valid_error(
      paste(
        "Please select an analysis method",
        "from the list of available options"
      ),
      toString(model_choices)
    )
  }

  # Check if the selected criteria_ZACP is valid
  if (!criteria_ZACP %in% criteria_ZACP_choices) {
    option_not_valid_error(
      paste(
        "Please select a criterion",
        "from the list of available options"
      ),
      toString(criteria_ZACP_choices)
    )
  }

  # Check if the selected correction is valid
  if (!correction %in% correction_choices) {
    option_not_valid_error(
      paste(
        "Please select a correction method",
        "from the list of available options"
      ),
      toString(correction_choices)
    )
  }

  # Check if the selected optimizer is valid
  if (!optimizer %in% optimizer_choices) {
    option_not_valid_error(
      paste(
        "Please select an optimizer",
        "from the list of available options"
      ),
      toString(optimizer_choices)
    )
  }

  ############################################################
  # Check if the selected numerical options are within range #
  ############################################################

  prop_options <- c(prev_threshold, cutoff_ZSCP, max_significance)
  if (any(prop_options < 0) || any(prop_options > 1)) {
    stop(
      paste(
        "One of the following is outside [0, 1]:",
        "prev_threshold, cutoff_ZSCP, max_significance"
      )
    )
  }

  ###############################################################
  # Determine orientation of data in input and reorder to match #
  ###############################################################

  logging::loginfo("Determining format of input files")
  samples_row_row <- intersect(rownames(data), rownames(metadata))
  if (length(samples_row_row) > 0) {
    # this is the expected formatting so do not modify data frames
    logging::loginfo(paste(
      "Input format is data samples",
      "as rows and metadata samples as rows"
    ))
  } else {
    samples_column_row <- intersect(colnames(data), rownames(metadata))
    if (length(samples_column_row) > 0) {
      logging::loginfo(paste(
        "Input format is data samples",
        "as columns and metadata samples as rows"
      ))
      # transpose data frame so samples are rows
      data <- type.convert(as.data.frame(t(data)))
      logging::logdebug("Transposed data so samples are rows")
    } else {
      samples_column_column <-
        intersect(colnames(data), colnames(metadata))
      if (length(samples_column_column) > 0) {
        logging::loginfo(
          paste(
            "Input format is data samples",
            "as columns and metadata samples as columns"
          )
        )
        data <- type.convert(as.data.frame(t(data)))
        metadata <- type.convert(as.data.frame(t(metadata)))
        logging::logdebug("Transposed data and metadata so samples are rows")
      } else {
        samples_row_column <-
          intersect(rownames(data), colnames(metadata))
        if (length(samples_row_column) > 0) {
          logging::loginfo(
            paste(
              "Input format is data samples",
              "as rows and metadata samples as columns"
            )
          )
          metadata <- type.convert(as.data.frame(t(metadata)))
          logging::logdebug("Transposed metadata so samples are rows")
        } else {
          logging::logerror(
            paste(
              "Unable to find samples in data and",
              "metadata files.",
              "Rows/columns do not match."
            )
          )
          logging::logdebug("input_features rows: %s",
                            paste(rownames(data), collapse = ","))
          logging::logdebug("input_features columns: %s",
                            paste(colnames(data), collapse = ","))
          logging::logdebug("Metadata rows: %s",
                            paste(rownames(metadata), collapse = ","))
          logging::logdebug("Metadata columns: %s",
                            paste(colnames(metadata), collapse = ","))
          stop()
        }
      }
    }
  }

  # Replace unexpected characters in feature names
  # colnames(data) <- make.names(colnames(data))

  # Check for samples without metadata
  extra_feature_samples <-
    setdiff(rownames(data), rownames(metadata))
  if (length(extra_feature_samples) > 0)
    logging::logdebug(
      paste(
        "The following samples were found",
        "to have features but no metadata.",
        "They will be removed. %s"
      ),
      paste(extra_feature_samples, collapse = ",")
    )

  # Check for metadata samples without features
  extra_metadata_samples <-
    setdiff(rownames(metadata), rownames(data))
  if (length(extra_metadata_samples) > 0)
    logging::logdebug(
      paste(
        "The following samples were found",
        "to have metadata but no features.",
        "They will be removed. %s"
      ),
      paste(extra_metadata_samples, collapse = ",")
    )

  # Get a set of the samples with both metadata and features
  intersect_samples <- intersect(rownames(data), rownames(metadata))
  logging::logdebug(
    "A total of %s samples were found in both the data and metadata",
    length(intersect_samples)
  )

  # Now order both data and metadata with the same sample ordering
  logging::logdebug("Reordering data/metadata to use same sample ordering")
  data <- data[intersect_samples, , drop = FALSE]
  metadata <- metadata[intersect_samples, , drop = FALSE]

  ########################################################################
  # Assign reference values to categorical metadata (fixed effects only) #
  ########################################################################

  if (is.null(reference)) {
    reference <- ","
  }
  split_reference <- unlist(strsplit(reference, "[,;]"))
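  # e.g., reference = 'diagnosis,nonIBD;site,B' (hypothetical values) splits
  # into c('diagnosis', 'nonIBD', 'site', 'B'); match(variable, ...) + 1 then
  # looks up the requested reference level in the loop below.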

  # for each fixed effect, check that a reference level has been set if necessary: number of levels > 2 and metadata isn't already an ordered factor
  for (i in fixed_effects) {
    # don't check for or require reference levels for numeric metadata
    if (is.numeric(metadata[, i])) {
      next
    }
    # respect ordering if a factor is explicitly passed in with no reference set
    if (is.factor(metadata[, i]) && !(i %in% split_reference)) {
      logging::loginfo(paste("Factor detected for categorical metadata '",
                             i, "'. Provide a reference argument or manually set factor ordering to change the reference level.", sep = ""))
      next
    }

    # set metadata as a factor (ordered alphabetically)
    metadata[, i] <- as.factor(metadata[, i])
    mlevels <- levels(metadata[, i])

    # get reference level for variable being considered, returns NA if not found
    ref <- split_reference[match(i, split_reference) + 1]

    # if metadata has 2 levels, allow but don't require setting the reference level; otherwise require it
    if ((length(mlevels) == 2)) {
      if (!is.na(ref)) {
        metadata[, i] = relevel(metadata[, i], ref = ref)
      }
    } else if (length(mlevels) > 2) {
      if (!is.na(ref)) {
        metadata[, i] = relevel(metadata[, i], ref = ref)
      } else {
        stop(paste("Please provide the reference for the variable '",
                   i, "' which includes more than 2 levels: ",
                   paste(as.character(mlevels), collapse = ", "), ".", sep = ""))
      }
    } else {
      stop("Provided categorical metadata has fewer than 2 unique, non-NA values.")
    }
  }

  #########################################################
  # Non-specific filtering based on user-provided options #
  #########################################################

  unfiltered_data <- data
  unfiltered_metadata <- metadata

  # Require each feature to exceed the min abundance (abd_threshold) in
  # at least total samples * min prevalence (prev_threshold) samples
  logging::loginfo("Filter data based on min abundance and min prevalence")
  total_samples <- nrow(unfiltered_data)
  logging::loginfo("Total samples in data: %d", total_samples)
  min_samples <- total_samples * prev_threshold
  logging::loginfo(
    paste(
      "Min samples required with min abundance",
      "for a feature not to be filtered: %f"
    ),
    min_samples
  )

  # Filter by abundance using zero as the value for NAs
  data_zeros <- unfiltered_data
  data_zeros[is.na(data_zeros)] <- 0
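  # Keep a feature only if it exceeds abd_threshold in more than min_samples
  # samples; e.g., with abd_threshold = 0 and prev_threshold = 0.1, a feature
  # must be non-zero in more than 10% of samples to survive.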
  filtered_data <-
    unfiltered_data[,
                    colSums(data_zeros > abd_threshold) > min_samples,
                    drop = FALSE]
  total_filtered_features <-
    ncol(unfiltered_data) - ncol(filtered_data)
  logging::loginfo(
    "Total filtered features with prevalence-abundance filtering: %d",
    total_filtered_features
  )
  filtered_feature_names <-
    setdiff(names(unfiltered_data), names(filtered_data))
  logging::loginfo("Filtered feature names: %s",
                   toString(filtered_feature_names))

  #################################
  # Filter data based on variance #
  #################################

  sds <- apply(filtered_data, 2, sd, na.rm = TRUE)
  final_features <-
    filtered_data[, which(sds > var_threshold), drop = FALSE]
  total_filtered_features_var <-
    ncol(filtered_data) - ncol(final_features)
  logging::loginfo("Total filtered features with variance filtering: %d",
                   total_filtered_features_var)
  filtered_feature_names_var <-
    setdiff(names(filtered_data), names(final_features))
  logging::loginfo("Filtered feature names: %s",
                   toString(filtered_feature_names_var))

  
  ########################################################################
  # Set the scale factor to rowsum if not provided and create a modified #
  # metadata table without the scale factor variable (if present) ########
  ########################################################################

  if (is.null(scale_factor)) {
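    # Default: use per-sample totals (library sizes) of the filtered
    # feature table as the scale factor, per the scale_factor documentation.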
    offset <- rowSums(final_features)
  } else {
    if (!scale_factor %in% colnames(unfiltered_metadata)) {
      stop(
        paste(
          "The specified scale_factor variable is not present in the metadata table:\n",
          scale_factor
        )
      )
    } else {
      offset <- unfiltered_metadata[, scale_factor]
      unfiltered_metadata <-
        dplyr::select(unfiltered_metadata, -dplyr::all_of(scale_factor))
    }
  }

  ####################################
  # Filter metadata based on entropy #
  ####################################

  # Reduce metadata to only include variables that pass the entropy threshold
  temp_filtered_metadata <- unfiltered_metadata[, apply(unfiltered_metadata, 2, entropy) > entropy_threshold, drop = FALSE]
  excluded_metadata <- setdiff(colnames(unfiltered_metadata), colnames(temp_filtered_metadata))
  logging::loginfo(
    paste(
      "Excluded metadata with",
      "entropy less than or equal to %s: %s"
    ),
    entropy_threshold, paste(excluded_metadata, collapse = ",")
  )
  filtered_metadata <- temp_filtered_metadata

  ###############################################
  # Compute the formula based on the user input #
  ###############################################

  #####################
  # Determine formula #
  #####################

  random_effects_formula <- NULL
  # Use all metadata if no fixed effects are provided
  if (is.null(fixed_effects)) {
    fixed_effects <- colnames(filtered_metadata)
  } else {
    fixed_effects <- unlist(strsplit(fixed_effects, ",", fixed = TRUE))
    # remove any fixed effects not found in metadata names
    to_remove <- setdiff(fixed_effects, colnames(filtered_metadata))
    if (length(to_remove) > 0)
      logging::logwarn(
        paste(
          "Feature name not found in metadata",
          "so not applied to formula as fixed effect: %s"
        ),
        paste(to_remove, collapse = " , ")
      )
    fixed_effects <- setdiff(fixed_effects, to_remove)
    if (length(fixed_effects) == 0) {
      logging::logerror("No fixed effects included in formula.")
      stop()
    }
  }

  if (!is.null(random_effects)) {
    random_effects <-
      unlist(strsplit(random_effects, ",", fixed = TRUE))
    # subtract random effects from fixed effects
    fixed_effects <- setdiff(fixed_effects, random_effects)
    # remove any random effects not found in metadata
    to_remove <-
      setdiff(random_effects, colnames(filtered_metadata))
    if (length(to_remove) > 0)
      logging::logwarn(
        paste(
          "Feature name not found in metadata",
          "so not applied to formula as random effect: %s"
        ),
        paste(to_remove, collapse = " , ")
      )
    random_effects <- setdiff(random_effects, to_remove)

    # create formula
    if (length(random_effects) > 0) {
      random_effects_formula_text <-
        paste("expr ~ (1 | ",
              paste(
                random_effects,
                ")",
                sep = '',
                collapse = " + (1 | "
              ),
              sep = '')
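      # e.g., random_effects = c('site', 'subject') yields
      # "expr ~ (1 | site) + (1 | subject)"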
      logging::loginfo("Formula for random effects: %s",
                       random_effects_formula_text)
      random_effects_formula <-
        tryCatch(
          as.formula(random_effects_formula_text),
          error = function(e)
            stop(
              paste(
                "Invalid formula for random effects: ",
                random_effects_formula_text
              )
            )
        )
    }
  }

  # Reduce metadata to only include fixed/random effects in formula
  effects_names <- union(fixed_effects, random_effects)
  filtered_metadata <-
    filtered_metadata[, effects_names, drop = FALSE]

  # Create the fixed effects formula text
  formula_text <-
    paste("expr ~ ", paste(fixed_effects, collapse = " + "))
  logging::loginfo("Formula for fixed effects: %s", formula_text)
  formula <-
    tryCatch(
      as.formula(formula_text),
      error = function(e)
        stop(
          paste(
            "Invalid formula.",
            "Please provide a different formula: ",
            formula_text
          )
        )
    )

  #############################################################
  # Standardize metadata (except the offset variable), if set #
  #############################################################

  if (standardize) {
    logging::loginfo("Applying z-score to standardize continuous metadata")
    filtered_metadata <-
      filtered_metadata %>% dplyr::mutate_if(is.numeric, scale)
  } else {
    logging::loginfo("Bypass z-score application to metadata")
  }

  ##################################
  # Merge metadata and offset back #
  ##################################

  final_metadata <- as.data.frame(cbind.data.frame(filtered_metadata, offset))
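  # The appended column is literally named 'offset'; downstream,
  # fit.Tweedieverse presumably includes log(offset) in the linear
  # predictor when adjust_offset = TRUE (see the adjust_offset doc).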

  ##############################################################
  # Apply the base model to the filtered data with user inputs #
  ##############################################################

  logging::loginfo("Running selected analysis method: %s", base_model)

  fit_data <- fit.Tweedieverse(
    features = final_features,
    metadata = final_metadata,
    base_model = base_model,
    link = link,
    formula = formula,
    random_effects_formula = random_effects_formula,
    cutoff_ZSCP = cutoff_ZSCP,
    criteria_ZACP = criteria_ZACP,
    adjust_offset = adjust_offset,
    correction = correction,
    cores = cores,
    optimizer = optimizer,
    na.action = na.action
  )
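
  # fit_data$results is assumed to hold one row per feature-metadata
  # association, with 'feature' as its first column (x[1] below) and
  # at least the columns selected later: metadata, value, coef,
  # stderr, pval, and qval.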

  ###################################################
  # Count the N and Zero-inflation for each feature #
  ###################################################

  logging::loginfo("Counting prevalence for each feature")
  try(fit_data$results$N <-
        apply(
          fit_data$results,
          1,
          FUN = function(x)
            length(final_features[, x[1]])
        ))
  try(fit_data$results$N.not.zero <-
        apply(
          fit_data$results,
          1,
          FUN = function(x)
            length(which(final_features[, x[1]] > 0))
        ))
  try(fit_data$results$percent.zero <-
        apply(
          fit_data$results,
          1,
          FUN = function(x)
            round(mean(final_features[, x[1]] == 0, na.rm = TRUE), 2) *
            100
        ))

  #########################
  # Write out the results #
  #########################

  results_file <- file.path(output, "all_results.tsv")
  logging::loginfo("Writing all results to file (ordered by increasing q-values): %s",
                   results_file)
  ordered_results <-
    fit_data$results[order(fit_data$results$qval), ]
  ordered_results <-
    ordered_results[!is.na(ordered_results$qval), ] # Remove NA's
  ordered_results <-
    dplyr::select(
      ordered_results,
      c(
        'feature',
        'metadata',
        'value',
        "coef",
        "stderr",
        "pval",
        "qval"
      ),
      everything()
    )
  write.table(
    ordered_results,
    file = results_file,
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )

  # Write results passing the significance threshold to file
  # (removing any that are NA for the q-value)
  significant_results <-
    ordered_results[ordered_results$qval <= max_significance, ]
  significant_results_file <-
    file.path(output, "significant_results.tsv")
  logging::loginfo(
    paste(
      "Writing the significant results",
      "(those which are less than or equal to the threshold",
      "of %f) to file (ordered by increasing q-values): %s"
    ),
    max_significance,
    significant_results_file
  )
  write.table(
    significant_results,
    file = significant_results_file,
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )

  #######################################################
  # Create visualizations for results passing threshold #
  #######################################################

  logging::loginfo("Writing Tweedie index plot to file: %s",
                   output)
  tryCatch({
    tweedie_index_plot(ordered_results, figures_folder)
  }, error = function(err) {
    logging::logerror("Unable to make a Tweedie index plot of results!")
    logging::logerror(err)
    # dev.off()
  })

  if (plot_heatmap && length(unique(significant_results$metadata)) > 1) {
    heatmap_file <- file.path(output, "Tweedieverse_Heatmap.pdf")
    logging::loginfo("Writing heatmap of significant results to file: %s",
                     heatmap_file)
    tryCatch({
      save_heatmap(significant_results_file,
                   heatmap_file,
                   figures_folder,
                   first_n = heatmap_first_n)
    }, error = function(err) {
      logging::logerror("Unable to make a heatmap of results!")
      logging::logerror(err)
      # dev.off()
    })
  }

  if (plot_scatter) {
    logging::loginfo(
      paste(
        "Writing association plots",
        "(one for each significant association)",
        "to output folder: %s"
      ),
      output
    )
    association_plots(
      metadata,
      final_features / offset,
      significant_results_file,
      output,
      figures_folder
    )
  }

  return(significant_results)
}

option_not_valid_error <- function(message, valid_options) {
  logging::logerror(paste(message, ": %s"), toString(valid_options))
  stop("Option not valid", call. = FALSE)
}

## Quiets concerns of R CMD check
utils::globalVariables(c("name"))