#' Differential analysis of multi-omics data using Tweedie GLMs
#'
#' Fit a per-feature Tweedie generalized linear model to omics features.
#'
#' @param input_features A tab-delimited input file or an R data frame of features (can be in rows/columns)
#' and samples (or cells). Samples are expected to have matching names with \code{input_metadata}.
#' \code{input_features} can also be an object of class \code{SummarizedExperiment} or \code{SingleCellExperiment}
#' that contains the expression or abundance matrix and other metadata; the \code{assays}
#' slot contains the expression or abundance matrix and is named \code{"counts"}.
#' This matrix should have one row for each feature and one column for each sample.
#' The \code{colData} slot should contain a data frame with one row per
#' sample and columns that contain metadata for each sample.
#' Additional information about the experiment can be contained in the
#' \code{metadata} slot as a list.
#' @param input_metadata A tab-delimited input file or an R data frame of metadata (rows/columns).
#' Samples are expected to have matching sample names with \code{input_features}.
#' This file is ignored when \code{input_features} is a \code{SummarizedExperiment}
#' or a \code{SingleCellExperiment} object with \code{colData} containing the same information.
#' @param output The output folder to write results.
#' @param assay_name If the input is provided as one of the accepted Bioconductor objects
#' (e.g., SummarizedExperiment, RangedSummarizedExperiment, SingleCellExperiment, or TreeSummarizedExperiment),
#' this argument selects the name of the assay slot in the input object that contains the omics measurements.
#' @param abd_threshold If prevalence-abundance filtering is desired, only features that are present (or detected)
#' in at least \code{prev_threshold} percent of samples at \code{abd_threshold} minimum abundance (read count or proportion)
#' are retained. The default value for \code{abd_threshold} is \code{0.0}.
#' To disable prevalence-abundance filtering, set \code{abd_threshold = -Inf}.
#' @param prev_threshold If prevalence-abundance filtering is desired, only features that are present (or detected)
#' in at least \code{prev_threshold} percent of samples at \code{abd_threshold} minimum abundance (read count or proportion)
#' are retained. The default value for \code{prev_threshold} is \code{0.1}.
#' @param var_threshold If variance filtering is desired, only features whose variance is greater than
#' \code{var_threshold} are retained. This step is applied after the prevalence-abundance filtering.
#' The default value for \code{var_threshold} is \code{0.0} (i.e., no variance filtering).
#' @param entropy_threshold If entropy-based filtering is desired, only metadata variables whose entropy is greater than
#' \code{entropy_threshold} are retained. The default value for \code{entropy_threshold} is \code{0.0} (i.e., no entropy filtering).
#' @param base_model The per-feature base model. Default is "CPLM". Must be one of "CPLM", "ZICP", "ZSCP", or "ZACP".
#' @param link A specification of the GLM link function. Default is "log". Must be one of "log", "identity", "sqrt", or "inverse".
#' @param fixed_effects Metadata variable(s) describing the fixed effects coefficients.
#' @param random_effects Metadata variable(s) describing the random effects part of the model.
#' @param cutoff_ZSCP For \code{base_model = "ZSCP"}, the cutoff to stratify features for
#' adaptive ZI modeling based on sparsity (zero-inflation proportion). Default is 0.3. Must be between 0 and 1.
#' @param criteria_ZACP For \code{base_model = "ZACP"}, the criterion used to select the
#' best-fitting model per feature. The possible options are 'AIC' and 'BIC' (default).
#' More criteria will be supported in a future release.
#' @param adjust_offset If TRUE (default), an offset term will be included as the logarithm of \code{scale_factor}.
#' @param scale_factor Name of the numerical metadata variable containing the library size (for non-normalized data) or scale factor
#' (for normalized data) across samples, to be included as an offset in the base model (when \code{adjust_offset = TRUE}).
#' If not found in the metadata, defaults to the sample-wise total sums, unless \code{adjust_offset = FALSE}.
#' @param max_significance The q-value threshold for significance. Default is 0.05.
#' @param correction The correction method for computing the q-value (see \code{\link[stats]{p.adjust}} for options; default is 'BH').
#' @param standardize Should continuous metadata be standardized? Default is TRUE. Bypassed for categorical variables.
#' @param cores An integer that indicates the number of R processes to run in parallel. Default is 1.
#' @param optimizer The optimization routine to be used for estimating the parameters of the Tweedie model.
#' Possible choices are \code{"nlminb"} (the default, see \code{\link[stats]{nlminb}}),
#' \code{"bobyqa"} (\code{\link[minqa]{bobyqa}}), and \code{"L-BFGS-B"} (\code{\link[stats]{optim}}).
#' Ignored for random effects modeling, which uses an alternative Template Model Builder (TMB) approach (\code{\link[glmmTMB]{glmmTMB}}).
#' @param na.action How to handle missing values. See \code{\link{na.action}}. Default is \code{\link{na.exclude}}.
#' @param plot_heatmap Logical. If TRUE (default is FALSE), generate a heatmap of the (top \code{heatmap_first_n}) significant associations.
#' @param plot_scatter Logical. If TRUE (default is FALSE), generate scatter/box plots of individual associations.
#' @param heatmap_first_n In the heatmap, plot the top N features with significant associations (default is 50).
#' @param reference For each categorical variable with more than two levels, the level to use as the reference,
#' provided as a string of the form 'variable,reference'; pairs for multiple variables are semicolon-delimited
#' (e.g., 'variable1,reference1;variable2,reference2'). Default is NULL.
#'
#' @importFrom grDevices colorRampPalette dev.off jpeg pdf
#' @importFrom stats coef fitted as.formula na.exclude p.adjust plogis relevel sd update
#' @importFrom utils capture.output read.table type.convert write.table
#' @importFrom dplyr %>% everything
#' @importFrom parallel clusterExport
#' @return A data frame containing coefficient estimates, p-values, and q-values (multiplicity-adjusted p-values).
#'
#' @author Himel Mallick, \email{him4004@@med.cornell.edu}
#'
#' @examples
#'
#' \dontrun{
#'
#' ##############################################################################
#' # Example 1 - Differential Abundance Analysis of Synthetic Microbiome Counts #
#' ##############################################################################
#'
#' #######################################
#' # Install and Load Required Libraries #
#' #######################################
#'
#' library(devtools)
#' devtools::install_github('biobakery/sparseDOSSA@@varyLibSize')
#' library(sparseDOSSA)
#' library(stringi)
#'
#' ######################
#' # Specify Parameters #
#' ######################
#'
#' n.microbes <- 200 # Number of Features
#' n.samples <- 100 # Number of Samples
#' spike.perc <- 0.02 # Percentage of Spiked-in Bugs
#' spikeStrength <- "20" # Effect Size
#'
#' ###########################
#' # Specify Binary Metadata #
#' ###########################
#'
#' n.metadata <- 1
#' UserMetadata <- as.matrix(rep(c(0, 1), each = n.samples / 2))
#' UserMetadata <- t(UserMetadata) # Transpose
#'
#' ###################################################
#' # Spiked-in Metadata (Which Metadata to Spike-in) #
#' ###################################################
#'
#' Metadatafrozenidx <- 1
#' spikeCount <- as.character(length(Metadatafrozenidx))
#' significant_metadata <- paste('Metadata', Metadatafrozenidx, sep = '')
#'
#' #############################################
#' # Generate SparseDOSSA Synthetic Abundances #
#' #############################################
#'
#' DD <- sparseDOSSA::sparseDOSSA(number_features = n.microbes,
#'                                number_samples = n.samples,
#'                                UserMetadata = UserMetadata,
#'                                Metadatafrozenidx = Metadatafrozenidx,
#'                                datasetCount = 1,
#'                                spikeCount = spikeCount,
#'                                spikeStrength = spikeStrength,
#'                                noZeroInflate = TRUE,
#'                                percent_spiked = spike.perc,
#'                                seed = 1234)
#'
#' ##############################
#' # Gather SparseDOSSA Outputs #
#' ##############################
#'
#' sparsedossa_results <- as.data.frame(DD$OTU_count)
#' rownames(sparsedossa_results) <- sparsedossa_results$X1
#' sparsedossa_results <- sparsedossa_results[-1, -1]
#' colnames(sparsedossa_results) <- paste('Sample', 1:ncol(sparsedossa_results), sep = '')
#' data <- as.matrix(sparsedossa_results[-c((n.metadata + 1):(2 * n.microbes + n.metadata)), ])
#' data <- data.matrix(data)
#' class(data) <- "numeric"
#' truth <- c(unlist(DD$truth))
#' truth <- truth[!stri_detect_fixed(truth, ":")]
#' truth <- truth[(5 + n.metadata):length(truth)]
#' truth <- as.data.frame(truth)
#' significant_features <- truth[seq(1,
#'   (as.numeric(spikeCount) + 1) * (n.microbes * spike.perc), (as.numeric(spikeCount) + 1)), ]
#' significant_features <- as.vector(significant_features)
#'
#' ####################
#' # Extract Features #
#' ####################
#'
#' features <- as.data.frame(t(data[-c(1:n.metadata), ]))
#'
#' ####################
#' # Extract Metadata #
#' ####################
#'
#' metadata <- as.data.frame(data[1, ])
#' colnames(metadata) <- rownames(data)[1]
#'
#' ###############################
#' # Mark True Positive Features #
#' ###############################
#'
#' wh.TP <- colnames(features) %in% significant_features
#' colnames(features) <- paste("Feature", 1:n.microbes, sep = "")
#' newname <- paste0(colnames(features)[wh.TP], "_TP")
#' colnames(features)[wh.TP] <- newname
#' colnames(features)[grep('TP', colnames(features))]
#'
#' ####################
#' # Run Tweedieverse #
#' ####################
#'
#' ###################
#' # Default options #
#' ###################
#'
#' CPLM <- Tweedieverse(
#'   features,
#'   metadata,
#'   output = './demo_output/CPLM') # Assuming demo_output exists
#'
#' ###############################################
#' # User-defined prevalence-abundance filtering #
#' ###############################################
#'
#' ZICP <- Tweedieverse(
#'   features,
#'   metadata,
#'   output = './demo_output/ZICP', # Assuming demo_output exists
#'   base_model = 'ZICP',
#'   abd_threshold = 0.0,
#'   prev_threshold = 0.2)
#'
#' ###################################
#' # User-defined variance filtering #
#' ###################################
#'
#' sds <- apply(features, 2, sd)
#' var_threshold <- median(sds) / 2
#' ZSCP <- Tweedieverse(
#'   features,
#'   metadata,
#'   output = './demo_output/ZSCP', # Assuming demo_output exists
#'   base_model = 'ZSCP',
#'   var_threshold = var_threshold)
#'
#' ##################
#' # Multiple cores #
#' ##################
#'
#' ZACP <- Tweedieverse(
#'   features,
#'   metadata,
#'   output = './demo_output/ZACP', # Assuming demo_output exists
#'   base_model = 'ZACP',
#'   cores = 4)
#'
#' ##########################################################################
#' # Example 2 - Multivariable Association on HMP2 Longitudinal Microbiomes #
#' ##########################################################################
#'
#' ######################
#' # HMP2 Data Analysis #
#' ######################
#'
#' #############
#' # Load Data #
#' #############
#'
#' library(data.table)
#' input_features <- fread("https://raw.githubusercontent.com/biobakery/Maaslin2/master/inst/extdata/HMP2_taxonomy.tsv", sep = "\t")
#' input_metadata <- fread("https://raw.githubusercontent.com/biobakery/Maaslin2/master/inst/extdata/HMP2_metadata.tsv", sep = "\t")
#'
#' ###############
#' # Format data #
#' ###############
#'
#' library(tibble)
#' features <- column_to_rownames(input_features, 'ID')
#' metadata <- column_to_rownames(input_metadata, 'ID')
#'
#' #############
#' # Fit Model #
#' #############
#'
#' library(Tweedieverse)
#' HMP2 <- Tweedieverse(
#'   features,
#'   metadata,
#'   output = './demo_output/HMP2', # Assuming demo_output exists
#'   fixed_effects = c('diagnosis', 'dysbiosisnonIBD', 'dysbiosisUC', 'dysbiosisCD', 'antibiotics', 'age'),
#'   random_effects = c('site', 'subject'),
#'   base_model = 'CPLM',
#'   adjust_offset = FALSE, # No offset as the values are relative abundances
#'   cores = 8, # Make sure your computer has the capability
#'   standardize = FALSE,
#'   reference = c('diagnosis,nonIBD'))
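#'
#' #########################################################
#' # Example 3 - Bioconductor Input (SummarizedExperiment) #
#' #########################################################
#'
#' # A minimal sketch (not part of the original examples), assuming the
#' # `features` and `metadata` objects from Example 2 above. The assay is
#' # named "counts" and colData holds the per-sample metadata, so
#' # input_metadata can be omitted.
#' library(SummarizedExperiment)
#' mat <- t(as.matrix(features)) # features x samples
#' se <- SummarizedExperiment(
#'   assays = list(counts = mat),
#'   colData = S4Vectors::DataFrame(metadata[colnames(mat), , drop = FALSE]))
#' SE_fit <- Tweedieverse(
#'   se,
#'   output = './demo_output/SE', # Assuming demo_output exists
#'   assay_name = 'counts')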
#'
#' }
#' @keywords microbiome, metagenomics, multiomics, scRNASeq, tweedie, singlecell
#' @export
Tweedieverse <- function(input_features,
                         input_metadata = NULL,
                         output,
                         assay_name = "counts",
                         abd_threshold = 0.0,
                         prev_threshold = 0.1,
                         var_threshold = 0.0,
                         entropy_threshold = 0.0,
                         base_model = "CPLM",
                         link = "log",
                         fixed_effects = NULL,
                         random_effects = NULL,
                         cutoff_ZSCP = 0.3,
                         criteria_ZACP = "BIC",
                         adjust_offset = TRUE,
                         scale_factor = NULL,
                         max_significance = 0.05,
                         correction = "BH",
                         standardize = TRUE,
                         cores = 1,
                         optimizer = "nlminb",
                         na.action = na.exclude,
                         plot_heatmap = FALSE,
                         plot_scatter = FALSE,
                         heatmap_first_n = 50,
                         reference = NULL) {

  #################################
  # Specify all available options #
  #################################

  model_choices <- c("CPLM", "ZICP", "ZACP", "ZSCP")
  link_choices <- c("log", "identity", "sqrt", "inverse")
  criteria_ZACP_choices <- c("AIC", "BIC")
  correction_choices <-
    c("BH", "holm", "hochberg", "hommel", "bonferroni", "BY")
  optimizer_choices <- c("nlminb", "bobyqa", "L-BFGS-B")

  #######################################################################
  #=====================================================================#
  # Read in the data and metadata, create output folder, initialize log #
  #=====================================================================#
  #######################################################################

  #########################################
  # Support multiple Bioconductor classes #
  #########################################

  valid_classes <- c("SummarizedExperiment", "RangedSummarizedExperiment",
                     "SingleCellExperiment", "TreeSummarizedExperiment")

  ##############################################################
  # Extract features and metadata based on user-provided input #
  ##############################################################

  input_class <- class(input_features)
  if (any(input_class %in% valid_classes)) {
    data <- extractAssay(input_features, assay_name)
    if (is.null(input_metadata)) {
      metadata <- data.frame(colData(input_features))
    } else {
      metadata <- input_metadata
    }
  } else if (!is.character(input_features) && !is.data.frame(input_features)) {
    stop(
      paste(
        'Input data of class <', class(input_features),
        '> not supported. Please use SummarizedExperiment, SingleCellExperiment,',
        'RangedSummarizedExperiment, TreeSummarizedExperiment, or data.frame'
      )
    )
  } else {
    # if a character string then this is a file name, else it
    # is a data frame
    if (is.character(input_features)) {
      data <-
        data.frame(
          data.table::fread(input_features, header = TRUE, sep = '\t'),
          check.names = FALSE,
          row.names = 1
        )
      if (nrow(data) == 1) {
        # read again to get the row name
        data <- read.table(
          input_features,
          header = TRUE,
          fill = TRUE,
          comment.char = "",
          check.names = FALSE,
          row.names = 1
        )
      }
    } else {
      data <- input_features
    }
    if (is.character(input_metadata)) {
      metadata <-
        data.frame(
          data.table::fread(input_metadata, header = TRUE, sep = '\t'),
          check.names = FALSE,
          row.names = 1
        )
      if (nrow(metadata) == 1) {
        metadata <- read.table(
          input_metadata,
          header = TRUE,
          fill = TRUE,
          comment.char = "",
          check.names = FALSE,
          row.names = 1
        )
      }
    } else {
      metadata <- input_metadata
    }
  }

  # create an output folder and figures folder if it does not exist
  if (!file.exists(output)) {
    print("Creating output folder")
    dir.create(output)
  }

  #if (plot_heatmap || plot_scatter) {
  figures_folder <- file.path(output, "figures")
  if (!file.exists(figures_folder)) {
    print("Creating output figures folder")
    dir.create(figures_folder)
  }
  #}

  # Create log file (write info to stdout and debug level to log file)
  # Set level to finest so all log levels are reviewed
  log_file <- file.path(output, "Tweedieverse.log")
  # Remove log file if it already exists (to avoid appending)
  if (file.exists(log_file)) {
    print(paste("Warning: Deleting existing log file:", log_file))
    unlink(log_file)
  }
  logging::basicConfig(level = 'FINEST')
  logging::addHandler(logging::writeToFile,
                      file = log_file, level = "DEBUG")
  logging::setLevel(20, logging::getHandler('basic.stdout'))

  #####################
  # Log the arguments #
  #####################

  logging::loginfo("Writing function arguments to log file")
  logging::logdebug("Function arguments")
  if (is.character(input_features)) {
    logging::logdebug("Input data file: %s", input_features)
  }
  if (is.character(input_metadata)) {
    logging::logdebug("Input metadata file: %s", input_metadata)
  }
  logging::logdebug("Output folder: %s", output)
  logging::logdebug("Abundance threshold: %f", abd_threshold)
  logging::logdebug("Prevalence threshold: %f", prev_threshold)
  logging::logdebug("Variance threshold: %f", var_threshold)
  logging::logdebug("Base model: %s", base_model)
  logging::logdebug("Link function: %s", link)
  logging::logdebug("Fixed effects: %s", fixed_effects)
  logging::logdebug("Random effects: %s", random_effects)
  logging::logdebug("ZSCP cutoff: %f", cutoff_ZSCP)
  logging::logdebug("ZACP criteria: %s", criteria_ZACP)
  logging::logdebug("Offset adjustment: %s", adjust_offset)
  logging::logdebug("Scale factor: %s", scale_factor)
  logging::logdebug("Max significance: %f", max_significance)
  logging::logdebug("Correction method: %s", correction)
  logging::logdebug("Standardize: %s", standardize)
  logging::logdebug("Cores: %d", cores)
  logging::logdebug("Optimization routine: %s", optimizer)

  #######################################
  # Check if valid options are selected #
  #######################################

  # Check if the selected link is valid
  if (!link %in% link_choices) {
    option_not_valid_error("Please select a link from the list of available options",
                           toString(link_choices))
  }

  # Check if the selected base_model is valid
  if (!base_model %in% model_choices) {
    option_not_valid_error(
      paste(
        "Please select an analysis method",
        "from the list of available options"
      ),
      toString(model_choices)
    )
  }

  # Check if the selected criteria_ZACP is valid
  if (!criteria_ZACP %in% criteria_ZACP_choices) {
    option_not_valid_error(
      paste(
        "Please select a criterion",
        "from the list of available options"
      ),
      toString(criteria_ZACP_choices)
    )
  }

  # Check if the selected correction is valid
  if (!correction %in% correction_choices) {
    option_not_valid_error(
      paste(
        "Please select a correction method",
        "from the list of available options"
      ),
      toString(correction_choices)
    )
  }

  # Check if the selected optimizer is valid
  if (!optimizer %in% optimizer_choices) {
    option_not_valid_error(
      paste(
        "Please select an optimizer",
        "from the list of available options"
      ),
      toString(optimizer_choices)
    )
  }

  ############################################################
  # Check if the selected numerical options are within range #
  ############################################################

  prop_options <- c(prev_threshold, cutoff_ZSCP, max_significance)
  if (any(prop_options < 0) || any(prop_options > 1)) {
    stop(
      paste(
        "One of the following is outside [0, 1]:",
        "prev_threshold, cutoff_ZSCP, max_significance"
      )
    )
  }

  ###############################################################
  # Determine orientation of data in input and reorder to match #
  ###############################################################

  logging::loginfo("Determining format of input files")
  samples_row_row <- intersect(rownames(data), rownames(metadata))
  if (length(samples_row_row) > 0) {
    # this is the expected formatting so do not modify data frames
    logging::loginfo(paste(
      "Input format is data samples",
      "as rows and metadata samples as rows"
    ))
  } else {
    samples_column_row <- intersect(colnames(data), rownames(metadata))
    if (length(samples_column_row) > 0) {
      logging::loginfo(paste(
        "Input format is data samples",
        "as columns and metadata samples as rows"
      ))
      # transpose data frame so samples are rows
      data <- type.convert(as.data.frame(t(data)))
      logging::logdebug("Transposed data so samples are rows")
    } else {
      samples_column_column <-
        intersect(colnames(data), colnames(metadata))
      if (length(samples_column_column) > 0) {
        logging::loginfo(
          paste(
            "Input format is data samples",
            "as columns and metadata samples as columns"
          )
        )
        data <- type.convert(as.data.frame(t(data)))
        metadata <- type.convert(as.data.frame(t(metadata)))
        logging::logdebug("Transposed data and metadata so samples are rows")
      } else {
        samples_row_column <-
          intersect(rownames(data), colnames(metadata))
        if (length(samples_row_column) > 0) {
          logging::loginfo(
            paste(
              "Input format is data samples",
              "as rows and metadata samples as columns"
            )
          )
          metadata <- type.convert(as.data.frame(t(metadata)))
          logging::logdebug("Transposed metadata so samples are rows")
        } else {
          logging::logerror(
            paste(
              "Unable to find samples in data and",
              "metadata files.",
              "Rows/columns do not match."
            )
          )
          logging::logdebug("input_features rows: %s",
                            paste(rownames(data), collapse = ","))
          logging::logdebug("input_features columns: %s",
                            paste(colnames(data), collapse = ","))
          logging::logdebug("Metadata rows: %s",
                            paste(rownames(metadata), collapse = ","))
          logging::logdebug("Metadata columns: %s",
                            paste(colnames(metadata), collapse = ","))
          stop()
        }
      }
    }
  }
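
  # From here on, `data` is samples x features and `metadata` is samples x
  # variables, whichever orientation the inputs arrived in.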

  # Replace unexpected characters in feature names
  # colnames(data) <- make.names(colnames(data))

  # Check for samples without metadata
  extra_feature_samples <-
    setdiff(rownames(data), rownames(metadata))
  if (length(extra_feature_samples) > 0)
    logging::logdebug(
      paste(
        "The following samples were found",
        "to have features but no metadata.",
        "They will be removed. %s"
      ),
      paste(extra_feature_samples, collapse = ",")
    )

  # Check for metadata samples without features
  extra_metadata_samples <-
    setdiff(rownames(metadata), rownames(data))
  if (length(extra_metadata_samples) > 0)
    logging::logdebug(
      paste(
        "The following samples were found",
        "to have metadata but no features.",
        "They will be removed. %s"
      ),
      paste(extra_metadata_samples, collapse = ",")
    )

  # Get a set of the samples with both metadata and features
  intersect_samples <- intersect(rownames(data), rownames(metadata))
  logging::logdebug(
    "A total of %s samples were found in both the data and metadata",
    length(intersect_samples)
  )

  # Now order both data and metadata with the same sample ordering
  logging::logdebug("Reordering data/metadata to use same sample ordering")
  data <- data[intersect_samples, , drop = FALSE]
  metadata <- metadata[intersect_samples, , drop = FALSE]


  ########################################################################
  # Assign reference values to categorical metadata (fixed effects only) #
  ########################################################################

  if (is.null(reference)) {
    reference <- ","
  }
  split_reference <- unlist(strsplit(reference, "[,;]"))

  # For each fixed effect, check that a reference level has been set if
  # necessary: number of levels > 2 and the metadata isn't already an ordered factor
  for (i in fixed_effects) {
    # don't check for or require reference levels for numeric metadata
    if (is.numeric(metadata[, i])) {
      next
    }
    # respect ordering if a factor is explicitly passed in with no reference set
    if (is.factor(metadata[, i]) && !(i %in% split_reference)) {
      logging::loginfo(paste("Factor detected for categorical metadata '",
        i, "'. Provide a reference argument or manually set factor ordering to change the reference level.", sep = ""))
      next
    }

    # set metadata as a factor (ordered alphabetically)
    metadata[, i] <- as.factor(metadata[, i])
    mlevels <- levels(metadata[, i])

    # get the reference level for the variable being considered (NA if not found)
    ref <- split_reference[match(i, split_reference) + 1]

    # if metadata has 2 levels, allow but don't require setting the reference level; otherwise require it
    if (length(mlevels) == 2) {
      if (!is.na(ref)) {
        metadata[, i] <- relevel(metadata[, i], ref = ref)
      }
    } else if (length(mlevels) > 2) {
      if (!is.na(ref)) {
        metadata[, i] <- relevel(metadata[, i], ref = ref)
      } else {
        stop(paste("Please provide the reference for the variable '",
                   i, "' which includes more than 2 levels: ",
                   paste(as.character(mlevels), collapse = ", "), ".", sep = ""))
      }
    } else {
      stop("Provided categorical metadata has fewer than 2 unique, non-NA values.")
    }
  }


  #########################################################
  # Non-specific filtering based on user-provided options #
  #########################################################

  unfiltered_data <- data
  unfiltered_metadata <- metadata

  # require at least total samples * min prevalence values
  # for each feature to be greater than min abundance
  logging::loginfo("Filter data based on min abundance and min prevalence")
  total_samples <- nrow(unfiltered_data)
  logging::loginfo("Total samples in data: %d", total_samples)
  min_samples <- total_samples * prev_threshold
  logging::loginfo(
    paste(
      "Min samples required with min abundance",
      "for a feature not to be filtered: %f"
    ),
    min_samples
  )

  # Filter by abundance using zero as value for NAs
  data_zeros <- unfiltered_data
  data_zeros[is.na(data_zeros)] <- 0
  filtered_data <-
    unfiltered_data[,
                    colSums(data_zeros > abd_threshold) > min_samples,
                    drop = FALSE]
  total_filtered_features <-
    ncol(unfiltered_data) - ncol(filtered_data)
  logging::loginfo(
    "Total filtered features with prevalence-abundance filtering: %d",
    total_filtered_features
  )
  filtered_feature_names <-
    setdiff(names(unfiltered_data), names(filtered_data))
  logging::loginfo("Filtered feature names: %s",
                   toString(filtered_feature_names))
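
  # Worked example of the rule above: with 100 samples and the default
  # prev_threshold = 0.1, a feature survives only if it exceeds abd_threshold
  # in more than 100 * 0.1 = 10 samples.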

  #################################
  # Filter data based on variance #
  #################################

  sds <- apply(filtered_data, 2, sd, na.rm = TRUE)
  final_features <-
    filtered_data[, which(sds > var_threshold), drop = FALSE]
  total_filtered_features_var <-
    ncol(filtered_data) - ncol(final_features)
  logging::loginfo("Total filtered features with variance filtering: %d",
                   total_filtered_features_var)
  filtered_feature_names_var <-
    setdiff(names(filtered_data), names(final_features))
  logging::loginfo("Filtered feature names: %s",
                   toString(filtered_feature_names_var))

  ########################################################################
  # Set the scale factor to the row sum if not provided and create a     #
  # modified metadata table without the scale factor variable if present #
  ########################################################################

  if (is.null(scale_factor)) {
    offset <- rowSums(final_features)
  } else {
    if (!scale_factor %in% colnames(unfiltered_metadata)) {
      stop(
        paste(
          "The specified scale_factor variable is not present in the metadata table:\n",
          scale_factor
        )
      )
    } else {
      offset <- unfiltered_metadata[, scale_factor]
      unfiltered_metadata <-
        dplyr::select(unfiltered_metadata, -dplyr::all_of(scale_factor))
    }
  }
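
  # When adjust_offset = TRUE, this offset enters the per-feature models
  # downstream as log(scale_factor), as documented above.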

  ####################################
  # Filter metadata based on entropy #
  ####################################

  # Reduce metadata to only include variables that pass the entropy threshold
  temp_filtered_metadata <-
    unfiltered_metadata[, apply(unfiltered_metadata, 2, entropy) > entropy_threshold, drop = FALSE]
  excluded_metadata <-
    setdiff(colnames(unfiltered_metadata), colnames(temp_filtered_metadata))
  logging::loginfo(
    paste(
      "Excluded metadata with",
      "entropy less than or equal to %s: %s"
    ),
    entropy_threshold, paste(excluded_metadata, collapse = ",")
  )
  filtered_metadata <- temp_filtered_metadata
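
  # entropy() is an internal package helper (defined elsewhere); metadata
  # variables at or below the threshold (e.g., constant columns at the
  # default of 0.0) carry no contrast for modeling and are dropped here.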

  ###############################################
  # Compute the formula based on the user input #
  ###############################################

  #####################
  # Determine formula #
  #####################

  random_effects_formula <- NULL
  # Use all metadata if no fixed effects are provided
  if (is.null(fixed_effects)) {
    fixed_effects <- colnames(filtered_metadata)
  } else {
    fixed_effects <- unlist(strsplit(fixed_effects, ",", fixed = TRUE))
    # remove any fixed effects not found in metadata names
    to_remove <- setdiff(fixed_effects, colnames(filtered_metadata))
    if (length(to_remove) > 0)
      logging::logwarn(
        paste(
          "Feature name not found in metadata",
          "so not applied to formula as fixed effect: %s"
        ),
        paste(to_remove, collapse = " , ")
      )
    fixed_effects <- setdiff(fixed_effects, to_remove)
    if (length(fixed_effects) == 0) {
      logging::logerror("No fixed effects included in formula.")
      stop()
    }
  }

  if (!is.null(random_effects)) {
    random_effects <-
      unlist(strsplit(random_effects, ",", fixed = TRUE))
    # subtract random effects from fixed effects
    fixed_effects <- setdiff(fixed_effects, random_effects)
    # remove any random effects not found in metadata
    to_remove <-
      setdiff(random_effects, colnames(filtered_metadata))
    if (length(to_remove) > 0)
      logging::logwarn(
        paste(
          "Feature name not found in metadata",
          "so not applied to formula as random effect: %s"
        ),
        paste(to_remove, collapse = " , ")
      )
    random_effects <- setdiff(random_effects, to_remove)

    # create the random effects formula
    if (length(random_effects) > 0) {
      random_effects_formula_text <-
        paste("expr ~ (1 | ",
              paste(
                random_effects,
                ")",
                sep = '',
                collapse = " + (1 | "
              ),
              sep = '')
      logging::loginfo("Formula for random effects: %s",
                       random_effects_formula_text)
      random_effects_formula <-
        tryCatch(
          as.formula(random_effects_formula_text),
          error = function(e)
            stop(
              paste(
                "Invalid formula for random effects: ",
                random_effects_formula_text
              )
            )
        )
    }
  }
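
  # For instance, random_effects = c('site', 'subject') yields the formula
  # text "expr ~ (1 | site) + (1 | subject)" for the glmmTMB-based fit.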

  # Reduce metadata to only include fixed/random effects in formula
  effects_names <- union(fixed_effects, random_effects)
  filtered_metadata <-
    filtered_metadata[, effects_names, drop = FALSE]

  # Create the fixed effects formula text
  formula_text <-
    paste("expr ~ ", paste(fixed_effects, collapse = " + "))
  logging::loginfo("Formula for fixed effects: %s", formula_text)
  formula <-
    tryCatch(
      as.formula(formula_text),
      error = function(e)
        stop(
          paste(
            "Invalid formula.",
            "Please provide a different formula: ",
            formula_text
          )
        )
    )

  #############################################################
  # Standardize metadata (except the offset variable), if set #
  #############################################################

  if (standardize) {
    logging::loginfo("Applying z-score to standardize continuous metadata")
    filtered_metadata <-
      filtered_metadata %>% dplyr::mutate_if(is.numeric, scale)
  } else {
    logging::loginfo("Bypass z-score application to metadata")
  }
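
  # dplyr::mutate_if(is.numeric, scale) z-scores numeric columns only, so
  # categorical variables pass through untouched, matching the documented
  # behavior of `standardize`.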

  ##################################
  # Merge metadata and offset back #
  ##################################

  final_metadata <- as.data.frame(cbind.data.frame(filtered_metadata, offset))

  ##############################################################
  # Apply the base model to the filtered data with user inputs #
  ##############################################################

  logging::loginfo("Running selected analysis method: %s", base_model)

  fit_data <- fit.Tweedieverse(
    features = final_features,
    metadata = final_metadata,
    base_model = base_model,
    link = link,
    formula = formula,
    random_effects_formula = random_effects_formula,
    cutoff_ZSCP = cutoff_ZSCP,
    criteria_ZACP = criteria_ZACP,
    adjust_offset = adjust_offset,
    correction = correction,
    cores = cores,
    optimizer = optimizer,
    na.action = na.action
  )
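
  # fit.Tweedieverse() is the internal per-feature fitting engine; the code
  # below assumes its $results data frame carries the columns feature,
  # metadata, value, coef, stderr, pval, and qval.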

  ###################################################
  # Count the N and zero-inflation for each feature #
  ###################################################

  logging::loginfo("Counting prevalence for each feature")
  try(fit_data$results$N <-
        apply(
          fit_data$results,
          1,
          FUN = function(x)
            length(final_features[, x[1]])
        ))
  try(fit_data$results$N.not.zero <-
        apply(
          fit_data$results,
          1,
          FUN = function(x)
            length(which(final_features[, x[1]] > 0))
        ))
  try(fit_data$results$percent.zero <-
        apply(
          fit_data$results,
          1,
          FUN = function(x)
            round(mean(final_features[, x[1]] == 0, na.rm = TRUE), 2) * 100
        ))

  #########################
  # Write out the results #
  #########################

  results_file <- file.path(output, "all_results.tsv")
  logging::loginfo("Writing all results to file (ordered by increasing q-values): %s",
                   results_file)
  ordered_results <-
    fit_data$results[order(fit_data$results$qval), ]
  ordered_results <-
    ordered_results[!is.na(ordered_results$qval), ] # Remove NAs
  ordered_results <-
    dplyr::select(
      ordered_results,
      c(
        'feature',
        'metadata',
        'value',
        "coef",
        "stderr",
        "pval",
        "qval"
      ),
      everything()
    )
  write.table(
    ordered_results,
    file = results_file,
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )

  # Write results passing the threshold to file
  # (removing any that are NA for the q-value)
  significant_results <-
    ordered_results[ordered_results$qval <= max_significance, ]
  significant_results_file <-
    file.path(output, "significant_results.tsv")
  logging::loginfo(
    paste(
      "Writing the significant results",
      "(those which are less than or equal to the threshold",
      "of %f ) to file (ordered by increasing q-values): %s"
    ),
    max_significance,
    significant_results_file
  )
  write.table(
    significant_results,
    file = significant_results_file,
    sep = "\t",
    quote = FALSE,
    row.names = FALSE
  )


  #######################################################
  # Create visualizations for results passing threshold #
  #######################################################

  logging::loginfo("Writing Tweedie index plot to file: %s",
                   output)
  tryCatch({
    tweedie_index_plot(ordered_results, figures_folder)
  }, error = function(err) {
    logging::logerror("Unable to make a Tweedie index plot of results!")
    logging::logerror(err)
    # dev.off()
  })

  if (plot_heatmap && length(unique(significant_results$metadata)) > 1) {
    heatmap_file <- file.path(output, "Tweedieverse_Heatmap.pdf")
    logging::loginfo("Writing heatmap of significant results to file: %s",
                     heatmap_file)
    tryCatch({
      save_heatmap(significant_results_file,
                   heatmap_file,
                   figures_folder,
                   first_n = heatmap_first_n)
    }, error = function(err) {
      logging::logerror("Unable to make a heatmap of results!")
      logging::logerror(err)
      # dev.off()
    })
  }

  if (plot_scatter) {
    logging::loginfo(
      paste(
        "Writing association plots",
        "(one for each significant association)",
        "to output folder: %s"
      ),
      output
    )
    association_plots(
      metadata,
      final_features / offset,
      significant_results_file,
      output,
      figures_folder
    )
  }

  return(significant_results)
}


option_not_valid_error <- function(message, valid_options) {
  logging::logerror(paste(message, ": %s"), toString(valid_options))
  stop("Option not valid", call. = FALSE)
}

## Quiets concerns of R CMD check
utils::globalVariables(c("name"))