Diff of /man/Tweedieverse.Rd [000000] .. [727782]

Switch to unified view

a b/man/Tweedieverse.Rd
1
% Generated by roxygen2: do not edit by hand
2
% Please edit documentation in R/Tweedieverse.R
3
\name{Tweedieverse}
4
\alias{Tweedieverse}
5
\title{Differential analysis of multi-omics data using Tweedie GLMs}
6
\usage{
7
Tweedieverse(
8
  input_features,
9
  input_metadata = NULL,
10
  output,
11
  assay_name = "counts",
12
  abd_threshold = 0,
13
  prev_threshold = 0.1,
14
  var_threshold = 0,
15
  entropy_threshold = 0,
16
  base_model = "CPLM",
17
  link = "log",
18
  fixed_effects = NULL,
19
  random_effects = NULL,
20
  cutoff_ZSCP = 0.3,
21
  criteria_ZACP = "BIC",
22
  adjust_offset = TRUE,
23
  scale_factor = NULL,
24
  max_significance = 0.05,
25
  correction = "BH",
26
  standardize = TRUE,
27
  cores = 1,
28
  optimizer = "nlminb",
29
  na.action = na.exclude,
30
  plot_heatmap = FALSE,
31
  plot_scatter = FALSE,
32
  heatmap_first_n = 50,
33
  reference = NULL
34
)
35
}
36
\arguments{
37
\item{input_features}{A tab-delimited input file or an R data frame of features (can be in rows/columns)
38
and samples (or cells). Samples are expected to have matching names with \code{input_metadata}. 
39
\code{input_features} can also be an object of class \code{SummarizedExperiment} or \code{SingleCellExperiment} 
40
that contains the expression or abundance matrix and other metadata; the \code{assays} 
41
slot contains the expression or abundance matrix and is named \code{"counts"}.  
42
This matrix should have one row for each feature and one sample for each column.  
43
The \code{colData} slot should contain a data frame with one row per 
44
sample and columns that contain metadata for each sample. 
45
Additional information about the experiment can be contained in the
46
\code{metadata} slot as a list.}
47
48
\item{input_metadata}{A tab-delimited input file or an R data frame of metadata (rows/columns).
49
Samples are expected to have matching sample names with \code{input_features}. 
50
This file is ignored when \code{input_features} is a \code{SummarizedExperiment} 
51
or a \code{SingleCellExperiment} object with \code{colData} containing the same information.}
52
53
\item{output}{The output folder to write results.}
54
55
\item{assay_name}{If the input is provided as one of the accepted Bioconductor objects 
56
(e.g., SummarizedExperiment, RangedSummarizedExperiment, SingleCellExperiment, or TreeSummarizedExperiment), 
57
this argument selects the name of the assay slot in the input object that contains the omics measurements.}
58
59
\item{abd_threshold}{If prevalence-abundance filtering is desired, only features that are present (or detected)
60
in at least \code{prev_threshold} percent of samples at \code{abd_threshold} minimum abundance (read count or proportion)
61
are retained. Default value for \code{abd_threshold} is \code{0.0}.
62
To disable prevalence-abundance filtering, set \code{abd_threshold = -Inf}.}
63
64
\item{prev_threshold}{If prevalence-abundance filtering is desired, only features that are present (or detected)
65
in at least \code{prev_threshold} percent of samples at \code{abd_threshold} minimum abundance (read count or proportion)
66
are retained. Default value for \code{prev_threshold} is \code{0.1}.}
67
68
\item{var_threshold}{If variance filtering is desired, only features that have variances greater than
69
\code{var_threshold} are retained. This step is done after the prevalence-abundance filtering.
70
Default value for \code{var_threshold} is \code{0.0} (i.e. no variance filtering).}
71
72
\item{entropy_threshold}{If entropy-based filtering is desired for metadata, only features that have entropy greater than
73
\code{entropy_threshold} are retained. Default value for \code{entropy_threshold} is \code{0.0} (i.e. no entropy filtering).}
74
75
\item{base_model}{The per-feature base model. Default is "CPLM". Must be one of "CPLM", "ZICP", "ZSCP", or "ZACP".}
76
77
\item{link}{A specification of the GLM link function. Default is "log". Must be one of "log", "identity", "sqrt", or "inverse".}
78
79
\item{fixed_effects}{Metadata variable(s) describing the fixed effects coefficients.}
80
81
\item{random_effects}{Metadata variable(s) describing the random effects part of the model.}
82
83
\item{cutoff_ZSCP}{For \code{base_model = "ZSCP"}, the cutoff to stratify features for
84
adaptive ZI modeling based on sparsity (zero-inflation proportion). Default is 0.3. Must be between 0 and 1.}
85
86
\item{criteria_ZACP}{For \code{base_model = "ZACP"}, the criteria to select the
87
best fitting model per feature.  The possible options are 'AIC' and BIC' (default).
88
More criteria will be supported in a future release.}
89
90
\item{adjust_offset}{If TRUE (default), an offset term will be included as the logarithm of \code{scale_factor}.}
91
92
\item{scale_factor}{Name of the numerical variable containing library size (for non-normalized data) or scale factor
93
(for normalized data) across samples to be included as an offset in the base model (when \code{adjust_offset = TRUE}).
94
If not found in metadata, defaults to the sample-wise total sums, unless \code{adjust_offset = FALSE}.}
95
96
\item{max_significance}{The q-value threshold for significance. Default is 0.05.}
97
98
\item{correction}{The correction method for computing the q-value (see \code{\link[stats]{p.adjust}} for options, default is 'BH').}
99
100
\item{standardize}{Should continuous metadata be standardized? Default is TRUE. Bypassed for categorical variables.}
101
102
\item{cores}{An integer that indicates the number of R processes to run in parallel. Default is 1.}
103
104
\item{optimizer}{The optimization routine to be used for estimating the parameters of the Tweedie model.
105
Possible choices are \code{"nlminb"} (the default, see \code{\link[stats]{nlminb}}),
106
\code{"bobyqa"} (\code{\link[minqa]{bobyqa}}), and \code{"L-BFGS-B"} (\code{\link[stats]{optim}}).
107
Ignored for random effects modeling which uses an alternative Template Model Builder (TMB) approach (\code{\link[glmmTMB]{glmmTMB}}).}
108
109
\item{na.action}{How to handle missing values? See \code{\link{na.action}}. Default is \code{\link{na.exclude}}.}
110
111
\item{plot_heatmap}{Logical. If TRUE (default is FALSE), generate a heatmap of the (top \code{heatmap_first_n}) significant associations.}
112
113
\item{plot_scatter}{Logical. If TRUE (default is FALSE), generate scatter/box plots of individual associations.}
114
115
\item{heatmap_first_n}{In heatmap, plot top N features with significant associations (default is 50).}
116
117
\item{reference}{The factor to use as a reference for a variable with more than two levels provided as a string of 'variable,reference' semi-colon delimited for multiple variables (default is NULL).}
118
}
119
\value{
120
A data frame containing coefficient estimates, p-values, and q-values (multiplicity-adjusted p-values) are returned.
121
}
122
\description{
123
Fit a per-feature Tweedie generalized linear model to omics features.
124
}
125
\examples{
126
127
\dontrun{
128
129
##############################################################################
130
# Example 1 - Differential Abundance Analysis of Synthetic Microbiome Counts #
131
##############################################################################
132
133
#######################################
134
# Install and Load Required Libraries #
135
#######################################
136
137
library(devtools)
138
devtools::install_github('biobakery/sparseDOSSA@varyLibSize')
139
library(sparseDOSSA)
140
library(stringi)
141
142
######################
143
# Specify Parameters #
144
######################
145
146
n.microbes <- 200 # Number of Features
147
n.samples <- 100 # Number of Samples
148
spike.perc <- 0.02 # Percentage of Spiked-in Bugs
149
spikeStrength<-"20" # Effect Size
150
151
###########################
152
# Specify Binary Metadata #
153
###########################
154
155
n.metadata <- 1
156
UserMetadata<-as.matrix(rep(c(0,1), each=n.samples/2))
157
UserMetadata<-t(UserMetadata) # Transpose
158
159
###################################################
160
# Spiked-in Metadata (Which Metadata to Spike-in) #
161
###################################################
162
163
Metadatafrozenidx<-1
164
spikeCount<-as.character(length(Metadatafrozenidx))
165
significant_metadata<-paste('Metadata', Metadatafrozenidx, sep='')
166
167
#############################################
168
# Generate SparseDOSSA Synthetic Abundances #
169
#############################################
170
171
DD<-sparseDOSSA::sparseDOSSA(number_features = n.microbes,
172
number_samples = n.samples,
173
UserMetadata=UserMetadata,
174
Metadatafrozenidx=Metadatafrozenidx,
175
datasetCount = 1,
176
spikeCount = spikeCount,
177
spikeStrength = spikeStrength,
178
noZeroInflate=TRUE,
179
percent_spiked=spike.perc,
180
seed = 1234)
181
182
##############################
183
# Gather SparseDOSSA Outputs #
184
##############################
185
186
sparsedossa_results <- as.data.frame(DD$OTU_count)
187
rownames(sparsedossa_results)<-sparsedossa_results$X1
188
sparsedossa_results<-sparsedossa_results[-1,-1]
189
colnames(sparsedossa_results)<-paste('Sample', 1:ncol(sparsedossa_results), sep='')
190
data<-as.matrix(sparsedossa_results[-c((n.metadata+1):(2*n.microbes+n.metadata)),])
191
data<-data.matrix(data)
192
class(data) <- "numeric"
193
truth<-c(unlist(DD$truth))
194
truth<-truth[!stri_detect_fixed(truth,":")]
195
truth<-truth[(5+n.metadata):length(truth)]
196
truth<-as.data.frame(truth)
197
significant_features<-truth[seq(1,
198
(as.numeric(spikeCount)+1)*(n.microbes*spike.perc), (as.numeric(spikeCount)+1)),]
199
significant_features<-as.vector(significant_features)
200
201
####################
202
# Extract Features #
203
####################
204
205
features<-as.data.frame(t(data[-c(1:n.metadata),]))
206
207
####################
208
# Extract Metadata #
209
####################
210
211
metadata<-as.data.frame(data[1,])
212
colnames(metadata)<-rownames(data)[1]
213
214
###############################
215
# Mark True Positive Features #
216
###############################
217
218
wh.TP = colnames(features) \%in\% significant_features
219
colnames(features)<-paste("Feature", 1:n.microbes, sep = "")
220
newname = paste0(colnames(features)[wh.TP], "_TP")
221
colnames(features)[wh.TP] <- newname;
222
colnames(features)[grep('TP', colnames(features))]
223
224
####################
225
# Run Tweedieverse #
226
###################
227
228
###################
229
# Default options #
230
###################
231
232
CPLM <-Tweedieverse(
233
features,
234
metadata,
235
output = './demo_output/CPLM') # Assuming demo_output exists
236
237
###############################################
238
# User-defined prevalence-abundance filtering #
239
###############################################
240
241
ZICP<-Tweedieverse(
242
features,
243
metadata,
244
output = './demo_output/ZICP', # Assuming demo_output exists
245
base_model = 'ZICP',
246
abd_threshold = 0.0,
247
prev_threshold = 0.2)
248
249
####################################
250
# User-defined variance filtering  #
251
####################################
252
253
sds<-apply(features, 2, sd)
254
var_threshold = median(sds)/2
255
ZSCP<-Tweedieverse(
256
features,
257
metadata,
258
output = './demo_output/ZSCP', # Assuming demo_output exists
259
base_model = 'ZSCP',
260
var_threshold = var_threshold)
261
262
##################
263
# Multiple cores #
264
##################
265
266
ZACP<-Tweedieverse(
267
features,
268
metadata,
269
output = './demo_output/ZACP', # Assuming demo_output exists
270
base_model = 'ZACP',
271
cores = 4)
272
273
##########################################################################
274
# Example 2 - Multivariable Association on HMP2 Longitudinal Microbiomes #
275
##########################################################################
276
277
######################
278
# HMP2 input_features Analysis #
279
######################
280
281
#############
282
# Load input_features #
283
#############
284
285
library(data.table)
286
input_features <- fread("https://raw.githubusercontent.com/biobakery/Maaslin2/master/inst/extdata/HMP2_taxonomy.tsv", sep ="\t")
287
input_metadata <-fread("https://raw.githubusercontent.com/biobakery/Maaslin2/master/inst/extdata/HMP2_metadata.tsv", sep ="\t")
288
289
###############
290
# Format data #
291
###############
292
293
library(tibble)
294
features<- column_to_rownames(input_features, 'ID')
295
metadata<- column_to_rownames(input_metadata, 'ID')
296
297
#############
298
# Fit Model #
299
#############
300
301
library(Tweedieverse)
302
HMP2 <- Tweedieverse(
303
features,
304
metadata,
305
output = './demo_output/HMP2', # Assuming demo_output exists
306
fixed_effects = c('diagnosis', 'dysbiosisnonIBD','dysbiosisUC','dysbiosisCD', 'antibiotics', 'age'),
307
random_effects = c('site', 'subject'),
308
base_model = 'CPLM',
309
adjust_offset = FALSE, # No offset as the values are relative abundances
310
cores = 8, # Make sure your computer has the capability
311
standardize = FALSE,
312
reference = c('diagnosis,nonIBD'))
313
314
}
315
}
316
\author{
317
Himel Mallick, \email{him4004@med.cornell.edu}
318
}
319
\keyword{metagenomics,}
320
\keyword{microbiome,}
321
\keyword{multiomics,}
322
\keyword{scRNASeq,}
323
\keyword{singlecell}
324
\keyword{tweedie,}