|
a |
|
b/man/Tweedieverse.Rd |
|
|
1 |
% Generated by roxygen2: do not edit by hand |
|
|
2 |
% Please edit documentation in R/Tweedieverse.R |
|
|
3 |
\name{Tweedieverse} |
|
|
4 |
\alias{Tweedieverse} |
|
|
5 |
\title{Differential analysis of multi-omics data using Tweedie GLMs} |
|
|
6 |
\usage{ |
|
|
7 |
Tweedieverse( |
|
|
8 |
input_features, |
|
|
9 |
input_metadata = NULL, |
|
|
10 |
output, |
|
|
11 |
assay_name = "counts", |
|
|
12 |
abd_threshold = 0, |
|
|
13 |
prev_threshold = 0.1, |
|
|
14 |
var_threshold = 0, |
|
|
15 |
entropy_threshold = 0, |
|
|
16 |
base_model = "CPLM", |
|
|
17 |
link = "log", |
|
|
18 |
fixed_effects = NULL, |
|
|
19 |
random_effects = NULL, |
|
|
20 |
cutoff_ZSCP = 0.3, |
|
|
21 |
criteria_ZACP = "BIC", |
|
|
22 |
adjust_offset = TRUE, |
|
|
23 |
scale_factor = NULL, |
|
|
24 |
max_significance = 0.05, |
|
|
25 |
correction = "BH", |
|
|
26 |
standardize = TRUE, |
|
|
27 |
cores = 1, |
|
|
28 |
optimizer = "nlminb", |
|
|
29 |
na.action = na.exclude, |
|
|
30 |
plot_heatmap = FALSE, |
|
|
31 |
plot_scatter = FALSE, |
|
|
32 |
heatmap_first_n = 50, |
|
|
33 |
reference = NULL |
|
|
34 |
) |
|
|
35 |
} |
|
|
36 |
\arguments{ |
|
|
37 |
\item{input_features}{A tab-delimited input file or an R data frame of features (can be in rows/columns) |
|
|
38 |
and samples (or cells). Samples are expected to have matching names with \code{input_metadata}. |
|
|
39 |
\code{input_features} can also be an object of class \code{SummarizedExperiment} or \code{SingleCellExperiment} |
|
|
40 |
that contains the expression or abundance matrix and other metadata; the \code{assays} |
|
|
41 |
slot contains the expression or abundance matrix and is named \code{"counts"}. |
|
|
42 |
This matrix should have one row for each feature and one sample for each column. |
|
|
43 |
The \code{colData} slot should contain a data frame with one row per |
|
|
44 |
sample and columns that contain metadata for each sample. |
|
|
45 |
Additional information about the experiment can be contained in the |
|
|
46 |
\code{metadata} slot as a list.} |
|
|
47 |
|
|
|
48 |
\item{input_metadata}{A tab-delimited input file or an R data frame of metadata (rows/columns). |
|
|
49 |
Samples are expected to have matching sample names with \code{input_features}. |
|
|
50 |
This file is ignored when \code{input_features} is a \code{SummarizedExperiment} |
|
|
51 |
or a \code{SingleCellExperiment} object with \code{colData} containing the same information.} |
|
|
52 |
|
|
|
53 |
\item{output}{The output folder to write results.} |
|
|
54 |
|
|
|
55 |
\item{assay_name}{If the input is provided as one of the accepted Bioconductor objects |
|
|
56 |
(e.g., SummarizedExperiment, RangedSummarizedExperiment, SingleCellExperiment, or TreeSummarizedExperiment), |
|
|
57 |
this argument selects the name of the assay slot in the input object that contains the omics measurements.} |
|
|
58 |
|
|
|
59 |
\item{abd_threshold}{If prevalence-abundance filtering is desired, only features that are present (or detected) |
|
|
60 |
in at least \code{prev_threshold} percent of samples at \code{abd_threshold} minimum abundance (read count or proportion) |
|
|
61 |
are retained. Default value for \code{abd_threshold} is \code{0.0}. |
|
|
62 |
To disable prevalence-abundance filtering, set \code{abd_threshold = -Inf}.} |
|
|
63 |
|
|
|
64 |
\item{prev_threshold}{If prevalence-abundance filtering is desired, only features that are present (or detected) |
|
|
65 |
in at least \code{prev_threshold} percent of samples at \code{abd_threshold} minimum abundance (read count or proportion) |
|
|
66 |
are retained. Default value for \code{prev_threshold} is \code{0.1}.} |
|
|
67 |
|
|
|
68 |
\item{var_threshold}{If variance filtering is desired, only features that have variances greater than |
|
|
69 |
\code{var_threshold} are retained. This step is done after the prevalence-abundance filtering. |
|
|
70 |
Default value for \code{var_threshold} is \code{0.0} (i.e. no variance filtering).} |
|
|
71 |
|
|
|
72 |
\item{entropy_threshold}{If entropy-based filtering is desired for metadata, only features that have entropy greater than |
|
|
73 |
\code{entropy_threshold} are retained. Default value for \code{entropy_threshold} is \code{0.0} (i.e. no entropy filtering).} |
|
|
74 |
|
|
|
75 |
\item{base_model}{The per-feature base model. Default is "CPLM". Must be one of "CPLM", "ZICP", "ZSCP", or "ZACP".} |
|
|
76 |
|
|
|
77 |
\item{link}{A specification of the GLM link function. Default is "log". Must be one of "log", "identity", "sqrt", or "inverse".} |
|
|
78 |
|
|
|
79 |
\item{fixed_effects}{Metadata variable(s) describing the fixed effects coefficients.} |
|
|
80 |
|
|
|
81 |
\item{random_effects}{Metadata variable(s) describing the random effects part of the model.} |
|
|
82 |
|
|
|
83 |
\item{cutoff_ZSCP}{For \code{base_model = "ZSCP"}, the cutoff to stratify features for |
|
|
84 |
adaptive ZI modeling based on sparsity (zero-inflation proportion). Default is 0.3. Must be between 0 and 1.} |
|
|
85 |
|
|
|
86 |
\item{criteria_ZACP}{For \code{base_model = "ZACP"}, the criteria to select the |
|
|
87 |
best fitting model per feature. The possible options are 'AIC' and BIC' (default). |
|
|
88 |
More criteria will be supported in a future release.} |
|
|
89 |
|
|
|
90 |
\item{adjust_offset}{If TRUE (default), an offset term will be included as the logarithm of \code{scale_factor}.} |
|
|
91 |
|
|
|
92 |
\item{scale_factor}{Name of the numerical variable containing library size (for non-normalized data) or scale factor |
|
|
93 |
(for normalized data) across samples to be included as an offset in the base model (when \code{adjust_offset = TRUE}). |
|
|
94 |
If not found in metadata, defaults to the sample-wise total sums, unless \code{adjust_offset = FALSE}.} |
|
|
95 |
|
|
|
96 |
\item{max_significance}{The q-value threshold for significance. Default is 0.05.} |
|
|
97 |
|
|
|
98 |
\item{correction}{The correction method for computing the q-value (see \code{\link[stats]{p.adjust}} for options, default is 'BH').} |
|
|
99 |
|
|
|
100 |
\item{standardize}{Should continuous metadata be standardized? Default is TRUE. Bypassed for categorical variables.} |
|
|
101 |
|
|
|
102 |
\item{cores}{An integer that indicates the number of R processes to run in parallel. Default is 1.} |
|
|
103 |
|
|
|
104 |
\item{optimizer}{The optimization routine to be used for estimating the parameters of the Tweedie model. |
|
|
105 |
Possible choices are \code{"nlminb"} (the default, see \code{\link[stats]{nlminb}}), |
|
|
106 |
\code{"bobyqa"} (\code{\link[minqa]{bobyqa}}), and \code{"L-BFGS-B"} (\code{\link[stats]{optim}}). |
|
|
107 |
Ignored for random effects modeling which uses an alternative Template Model Builder (TMB) approach (\code{\link[glmmTMB]{glmmTMB}}).} |
|
|
108 |
|
|
|
109 |
\item{na.action}{How to handle missing values? See \code{\link{na.action}}. Default is \code{\link{na.exclude}}.} |
|
|
110 |
|
|
|
111 |
\item{plot_heatmap}{Logical. If TRUE (default is FALSE), generate a heatmap of the (top \code{heatmap_first_n}) significant associations.} |
|
|
112 |
|
|
|
113 |
\item{plot_scatter}{Logical. If TRUE (default is FALSE), generate scatter/box plots of individual associations.} |
|
|
114 |
|
|
|
115 |
\item{heatmap_first_n}{In heatmap, plot top N features with significant associations (default is 50).} |
|
|
116 |
|
|
|
117 |
\item{reference}{The factor to use as a reference for a variable with more than two levels provided as a string of 'variable,reference' semi-colon delimited for multiple variables (default is NULL).} |
|
|
118 |
} |
|
|
119 |
\value{ |
|
|
120 |
A data frame containing coefficient estimates, p-values, and q-values (multiplicity-adjusted p-values) are returned. |
|
|
121 |
} |
|
|
122 |
\description{ |
|
|
123 |
Fit a per-feature Tweedie generalized linear model to omics features. |
|
|
124 |
} |
|
|
125 |
\examples{ |
|
|
126 |
|
|
|
127 |
\dontrun{ |
|
|
128 |
|
|
|
129 |
############################################################################## |
|
|
130 |
# Example 1 - Differential Abundance Analysis of Synthetic Microbiome Counts # |
|
|
131 |
############################################################################## |
|
|
132 |
|
|
|
133 |
####################################### |
|
|
134 |
# Install and Load Required Libraries # |
|
|
135 |
####################################### |
|
|
136 |
|
|
|
137 |
library(devtools) |
|
|
138 |
devtools::install_github('biobakery/sparseDOSSA@varyLibSize') |
|
|
139 |
library(sparseDOSSA) |
|
|
140 |
library(stringi) |
|
|
141 |
|
|
|
142 |
###################### |
|
|
143 |
# Specify Parameters # |
|
|
144 |
###################### |
|
|
145 |
|
|
|
146 |
n.microbes <- 200 # Number of Features |
|
|
147 |
n.samples <- 100 # Number of Samples |
|
|
148 |
spike.perc <- 0.02 # Percentage of Spiked-in Bugs |
|
|
149 |
spikeStrength<-"20" # Effect Size |
|
|
150 |
|
|
|
151 |
########################### |
|
|
152 |
# Specify Binary Metadata # |
|
|
153 |
########################### |
|
|
154 |
|
|
|
155 |
n.metadata <- 1 |
|
|
156 |
UserMetadata<-as.matrix(rep(c(0,1), each=n.samples/2)) |
|
|
157 |
UserMetadata<-t(UserMetadata) # Transpose |
|
|
158 |
|
|
|
159 |
################################################### |
|
|
160 |
# Spiked-in Metadata (Which Metadata to Spike-in) # |
|
|
161 |
################################################### |
|
|
162 |
|
|
|
163 |
Metadatafrozenidx<-1 |
|
|
164 |
spikeCount<-as.character(length(Metadatafrozenidx)) |
|
|
165 |
significant_metadata<-paste('Metadata', Metadatafrozenidx, sep='') |
|
|
166 |
|
|
|
167 |
############################################# |
|
|
168 |
# Generate SparseDOSSA Synthetic Abundances # |
|
|
169 |
############################################# |
|
|
170 |
|
|
|
171 |
DD<-sparseDOSSA::sparseDOSSA(number_features = n.microbes, |
|
|
172 |
number_samples = n.samples, |
|
|
173 |
UserMetadata=UserMetadata, |
|
|
174 |
Metadatafrozenidx=Metadatafrozenidx, |
|
|
175 |
datasetCount = 1, |
|
|
176 |
spikeCount = spikeCount, |
|
|
177 |
spikeStrength = spikeStrength, |
|
|
178 |
noZeroInflate=TRUE, |
|
|
179 |
percent_spiked=spike.perc, |
|
|
180 |
seed = 1234) |
|
|
181 |
|
|
|
182 |
############################## |
|
|
183 |
# Gather SparseDOSSA Outputs # |
|
|
184 |
############################## |
|
|
185 |
|
|
|
186 |
sparsedossa_results <- as.data.frame(DD$OTU_count) |
|
|
187 |
rownames(sparsedossa_results)<-sparsedossa_results$X1 |
|
|
188 |
sparsedossa_results<-sparsedossa_results[-1,-1] |
|
|
189 |
colnames(sparsedossa_results)<-paste('Sample', 1:ncol(sparsedossa_results), sep='') |
|
|
190 |
data<-as.matrix(sparsedossa_results[-c((n.metadata+1):(2*n.microbes+n.metadata)),]) |
|
|
191 |
data<-data.matrix(data) |
|
|
192 |
class(data) <- "numeric" |
|
|
193 |
truth<-c(unlist(DD$truth)) |
|
|
194 |
truth<-truth[!stri_detect_fixed(truth,":")] |
|
|
195 |
truth<-truth[(5+n.metadata):length(truth)] |
|
|
196 |
truth<-as.data.frame(truth) |
|
|
197 |
significant_features<-truth[seq(1, |
|
|
198 |
(as.numeric(spikeCount)+1)*(n.microbes*spike.perc), (as.numeric(spikeCount)+1)),] |
|
|
199 |
significant_features<-as.vector(significant_features) |
|
|
200 |
|
|
|
201 |
#################### |
|
|
202 |
# Extract Features # |
|
|
203 |
#################### |
|
|
204 |
|
|
|
205 |
features<-as.data.frame(t(data[-c(1:n.metadata),])) |
|
|
206 |
|
|
|
207 |
#################### |
|
|
208 |
# Extract Metadata # |
|
|
209 |
#################### |
|
|
210 |
|
|
|
211 |
metadata<-as.data.frame(data[1,]) |
|
|
212 |
colnames(metadata)<-rownames(data)[1] |
|
|
213 |
|
|
|
214 |
############################### |
|
|
215 |
# Mark True Positive Features # |
|
|
216 |
############################### |
|
|
217 |
|
|
|
218 |
wh.TP = colnames(features) \%in\% significant_features |
|
|
219 |
colnames(features)<-paste("Feature", 1:n.microbes, sep = "") |
|
|
220 |
newname = paste0(colnames(features)[wh.TP], "_TP") |
|
|
221 |
colnames(features)[wh.TP] <- newname; |
|
|
222 |
colnames(features)[grep('TP', colnames(features))] |
|
|
223 |
|
|
|
224 |
#################### |
|
|
225 |
# Run Tweedieverse # |
|
|
226 |
################### |
|
|
227 |
|
|
|
228 |
################### |
|
|
229 |
# Default options # |
|
|
230 |
################### |
|
|
231 |
|
|
|
232 |
CPLM <-Tweedieverse( |
|
|
233 |
features, |
|
|
234 |
metadata, |
|
|
235 |
output = './demo_output/CPLM') # Assuming demo_output exists |
|
|
236 |
|
|
|
237 |
############################################### |
|
|
238 |
# User-defined prevalence-abundance filtering # |
|
|
239 |
############################################### |
|
|
240 |
|
|
|
241 |
ZICP<-Tweedieverse( |
|
|
242 |
features, |
|
|
243 |
metadata, |
|
|
244 |
output = './demo_output/ZICP', # Assuming demo_output exists |
|
|
245 |
base_model = 'ZICP', |
|
|
246 |
abd_threshold = 0.0, |
|
|
247 |
prev_threshold = 0.2) |
|
|
248 |
|
|
|
249 |
#################################### |
|
|
250 |
# User-defined variance filtering # |
|
|
251 |
#################################### |
|
|
252 |
|
|
|
253 |
sds<-apply(features, 2, sd) |
|
|
254 |
var_threshold = median(sds)/2 |
|
|
255 |
ZSCP<-Tweedieverse( |
|
|
256 |
features, |
|
|
257 |
metadata, |
|
|
258 |
output = './demo_output/ZSCP', # Assuming demo_output exists |
|
|
259 |
base_model = 'ZSCP', |
|
|
260 |
var_threshold = var_threshold) |
|
|
261 |
|
|
|
262 |
################## |
|
|
263 |
# Multiple cores # |
|
|
264 |
################## |
|
|
265 |
|
|
|
266 |
ZACP<-Tweedieverse( |
|
|
267 |
features, |
|
|
268 |
metadata, |
|
|
269 |
output = './demo_output/ZACP', # Assuming demo_output exists |
|
|
270 |
base_model = 'ZACP', |
|
|
271 |
cores = 4) |
|
|
272 |
|
|
|
273 |
########################################################################## |
|
|
274 |
# Example 2 - Multivariable Association on HMP2 Longitudinal Microbiomes # |
|
|
275 |
########################################################################## |
|
|
276 |
|
|
|
277 |
###################### |
|
|
278 |
# HMP2 input_features Analysis # |
|
|
279 |
###################### |
|
|
280 |
|
|
|
281 |
############# |
|
|
282 |
# Load input_features # |
|
|
283 |
############# |
|
|
284 |
|
|
|
285 |
library(data.table) |
|
|
286 |
input_features <- fread("https://raw.githubusercontent.com/biobakery/Maaslin2/master/inst/extdata/HMP2_taxonomy.tsv", sep ="\t") |
|
|
287 |
input_metadata <-fread("https://raw.githubusercontent.com/biobakery/Maaslin2/master/inst/extdata/HMP2_metadata.tsv", sep ="\t") |
|
|
288 |
|
|
|
289 |
############### |
|
|
290 |
# Format data # |
|
|
291 |
############### |
|
|
292 |
|
|
|
293 |
library(tibble) |
|
|
294 |
features<- column_to_rownames(input_features, 'ID') |
|
|
295 |
metadata<- column_to_rownames(input_metadata, 'ID') |
|
|
296 |
|
|
|
297 |
############# |
|
|
298 |
# Fit Model # |
|
|
299 |
############# |
|
|
300 |
|
|
|
301 |
library(Tweedieverse) |
|
|
302 |
HMP2 <- Tweedieverse( |
|
|
303 |
features, |
|
|
304 |
metadata, |
|
|
305 |
output = './demo_output/HMP2', # Assuming demo_output exists |
|
|
306 |
fixed_effects = c('diagnosis', 'dysbiosisnonIBD','dysbiosisUC','dysbiosisCD', 'antibiotics', 'age'), |
|
|
307 |
random_effects = c('site', 'subject'), |
|
|
308 |
base_model = 'CPLM', |
|
|
309 |
adjust_offset = FALSE, # No offset as the values are relative abundances |
|
|
310 |
cores = 8, # Make sure your computer has the capability |
|
|
311 |
standardize = FALSE, |
|
|
312 |
reference = c('diagnosis,nonIBD')) |
|
|
313 |
|
|
|
314 |
} |
|
|
315 |
} |
|
|
316 |
\author{ |
|
|
317 |
Himel Mallick, \email{him4004@med.cornell.edu} |
|
|
318 |
} |
|
|
319 |
\keyword{metagenomics,} |
|
|
320 |
\keyword{microbiome,} |
|
|
321 |
\keyword{multiomics,} |
|
|
322 |
\keyword{scRNASeq,} |
|
|
323 |
\keyword{singlecell} |
|
|
324 |
\keyword{tweedie,} |