[e26484]: / OmicsFold / nextflow_pipeline / 2_find_keepx.R

Download this file

140 lines (102 with data), 4.6 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
#!~/.conda/envs/OmicsFold/bin/Rscript
#tune optimal number of features per component
args = commandArgs(trailingOnly=TRUE)
#args[1]: data
#args[2]: data labels
#args[3]: perf untrained
library(OmicsFold)
library(mixOmics)
library(BiocParallel)
library(batchtools)
# Read data
data<-readRDS(args[1])
data.labels<-readRDS(args[2])
perf.untrained <- readRDS(args[3])
#define design matrix
#values closer to 1 will identify more correlated features, but the model will be less discriminative
design <- matrix(0.1, ncol = length(data), nrow = length(data), dimnames = list(names(data), names(data)))
diag(design) = 0
# Define parameters
mfold.folds <- 10
nrepeat.keepX <- 50
#determine optimal number of components to mahalanobis distance
ncomp.keepX <- perf.untrained$choice.ncomp$WeightedVote["Overall.BER", "mahalanobis.dist"]
#overwrite to ncomp to 2 if 1
if (ncomp.keepX == 1){
ncomp.keepX = 2
}
# Set to test a range of variables to test
#test 10%, 20%, 30%, 40% of features
keepx_grid <- function(i) {
block_length <- ncol(data[[names(data[i])]])
list.keepX <- c(round(block_length/10, digits=0), round(block_length/5, digits=0), round(block_length/3.3, digits=0), round(block_length/2.5, digits=0))
return(list.keepX)
}
test.keepX <- lapply(names(data),keepx_grid)
names(test.keepX) <- names(data)
#loop over each block, and if number of features is greater than max available, use 50% of available features
for (i in names(test.keepX)) {
max_features <- ncol(data[[i]])
for(x in 1:length(test.keepX[[i]])){
print(test.keepX[[i]][[x]])
if (test.keepX[[i]][[x]] > max_features){
test.keepX[[i]][[x]] <- round(max_features/2)
}
}
}
# Open a letter sized PDF to capture plots from this stage
pdf(file = "2_find_keepX_plots.pdf",
width = 11, # The width of the plots in inches
height = 8.5) # The height of the plots in inches
# Tuning #2 ---------------------------------------------------------------
# Establish the optimum number of parameters to retain in the sparse model. DIABLO uses lasso penalization to select variables, but the mixOmics
# implementation allows the user to specify a number of variables rather than lamdba parameters (which can be difficult to estimate).
# The optimum number of parameters to retain has similar considerations to sPLS-DA - aim for 1/3 of total variables, and ensure the low range is well covered
# to allow elimination of noisy variable as much as possible.
# Every component of every block can have a different penalization parameter, so this step can be very computationally intensive to run. As for component
# numbers, prior analysis of single omics can inform the range of keepX values to test here and so cut down on the number that have to be processed.
if (!exists("already.tested.X")) {
already.tested.X <- list()
for (block.name in names(data)) {
already.tested.X[[block.name]] <- vector()
}
}
tune.diablo <- tune.block.splsda(X = data, Y = data.labels, ncomp = ncomp.keepX, test.keepX = test.keepX, design = design,
validation = 'Mfold', folds = mfold.folds, nrepeat = nrepeat.keepX,
dist = "mahalanobis.dist", progressBar = FALSE, BPPARAM = BatchtoolsParam(workers = 20, cluster="slurm"))
saveRDS(tune.diablo, "2_find_keepX_tune_result.rds")
# Find the error rate for the tuning at different parameter values, the optimal number of components, and the number of selected variables for each component
sink("2_find_keepX_output.txt")
cat("\n\n")
cat("keepX values tested:\n")
test.keepX
cat("\n\n")
cat("Advised counts for keepX:\n")
tune.diablo$choice.keepX
ber.per.component <- function(keepX.results) {
ber.per.component <- vector()
last.comp <- length(keepX.results$choice.keepX[[1]])
first.comp <- last.comp - ncol(keepX.results$error.rate) + 1
for (comp in first.comp:last.comp) {
keepX.values <- array()
for (block.index in 1:length(keepX.results$choice.keepX)) {
keepX.values[block.index] <- keepX.results$choice.keepX[[block.index]][comp]
}
row.key <- paste(keepX.values, collapse="_")
col.key <- sprintf("comp%i", comp)
ber.per.component[col.key] <- keepX.results$error.rate[row.key, col.key]
}
return(ber.per.component)
}
cat("\n")
cat("Error rates for selected keepX values:\n")
print(ber.per.component(tune.diablo))
cat("\n")
cat("Advised number of components:\n")
print(tune.diablo$choice.ncomp)
cat("Error rates per component / keepX value combination:\n")
tune.diablo$error.rate
sink()
plot(tune.diablo, col = color.jet(ncomp.keepX))
# Close plots PDF
dev.off()