[e26484]: / OmicsFold / nextflow_pipeline / 1_find_ncomp.R

Download this file

62 lines (44 with data), 2.1 kB

 1
 2
 3
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
#!~/.conda/envs/OmicsFold/bin/Rscript
#tune optimal number of components
args = commandArgs(trailingOnly=TRUE)
#args[1]: data
#args[2]: data labels
library(OmicsFold)
library(mixOmics)
# Read data
data<-readRDS(args[1])
data.labels<-readRDS(args[2])
#define design matrix
#values closer to 1 will identify more correlated features, but the model will be less discriminative
design <- matrix(0.1, ncol = length(data), nrow = length(data), dimnames = list(names(data), names(data)))
diag(design) = 0
# Define parameters
mfold.folds <- 10
nrepeat.find.ncomp <- 50
# Open a letter sized PDF to capture plots from this stage
pdf(file = file.path("1_find_ncomp_plots.pdf"),
width = 11, # The width of the plots in inches
height = 8.5) # The height of the plots in inches
# Tuning #1 ---------------------------------------------------------------
# Tune the optimum number of components up to a specified limit. Max components shouldn't be more
# than ten or so.
diablo.untrained <- mixOmics::block.splsda(X = data, Y = data.labels, ncomp = 5, design = design)
perf.untrained <- mixOmics::perf(diablo.untrained, validation = 'Mfold', folds = mfold.folds,
nrepeat = nrepeat.find.ncomp, progressBar = TRUE, cpus = 20)
saveRDS(perf.untrained, "1_find_ncomp_perf_result.rds")
# Pick the optimum number of components according to balanced error rate and mahalanobis distance
# If only a single component is selected, manual adjustment may be useful. Prior analysis of single omics independently may inform the optimum
# number of components to choose here.
plot(perf.untrained)
# Output the error rates for each number of components
sink("1_find_ncomp_output.txt")
cat("Dimensions of the input data:\n")
for (block.name in names(data)) {
dimensions = dim(data[[block.name]])
cat(sprintf("%s: %i obs x %i features\n", block.name, dimensions[1], dimensions[2]))
}
cat("\nAdvised number of components based on BER:\n")
perf.untrained$choice.ncomp$WeightedVote["Overall.BER", "mahalanobis.dist"]
sink()
# Close the PDF file
dev.off()