Omicsfold / Git / [e26484] /OmicsFold/nextflow_pipeline/2_find

Models:
AlyssaS/
Omicsfold
Downloads: 1
[e26484]: / OmicsFold / nextflow_pipeline / 2_find_keepx.R
History
Download this file
140 lines (102 with data), 4.6 kB

#!~/.conda/envs/OmicsFold/bin/Rscript

#tune optimal number of features per component

args = commandArgs(trailingOnly=TRUE)
#args[1]: data
#args[2]: data labels
#args[3]: perf untrained

library(OmicsFold)
library(mixOmics)
library(BiocParallel)
library(batchtools)

# Read data
data<-readRDS(args[1])
data.labels<-readRDS(args[2])
perf.untrained <- readRDS(args[3])

#define design matrix
#values closer to 1 will identify more correlated features, but the model will be less discriminative
design <- matrix(0.1, ncol = length(data), nrow = length(data), dimnames = list(names(data), names(data)))
diag(design) = 0

# Define parameters

mfold.folds <- 10
nrepeat.keepX <- 50

#determine optimal number of components to mahalanobis distance
ncomp.keepX <- perf.untrained$choice.ncomp$WeightedVote["Overall.BER", "mahalanobis.dist"]

#overwrite to ncomp to 2 if 1
if (ncomp.keepX == 1){
  ncomp.keepX = 2
}

# Set to test a range of variables to test
#test 10%, 20%, 30%, 40% of features
keepx_grid <- function(i) {
 block_length <- ncol(data[[names(data[i])]])
 list.keepX <- c(round(block_length/10, digits=0),  round(block_length/5, digits=0),  round(block_length/3.3, digits=0), round(block_length/2.5, digits=0))
 return(list.keepX)
}

test.keepX <- lapply(names(data),keepx_grid)
names(test.keepX) <- names(data)

#loop over each block, and if number of features is greater than max available, use 50% of available features
for (i in names(test.keepX)) {
  max_features <- ncol(data[[i]])
  for(x in 1:length(test.keepX[[i]])){
    print(test.keepX[[i]][[x]])
    if (test.keepX[[i]][[x]] > max_features){
    test.keepX[[i]][[x]] <- round(max_features/2)
    }
  }  
}


# Open a letter sized PDF to capture plots from this stage
pdf(file = "2_find_keepX_plots.pdf",
    width = 11, # The width of the plots in inches
    height = 8.5) # The height of the plots in inches


# Tuning #2 ---------------------------------------------------------------

# Establish the optimum number of parameters to retain in the sparse model. DIABLO uses lasso penalization to select variables, but the mixOmics
# implementation allows the user to specify a number of variables rather than lamdba parameters (which can be difficult to estimate).
# The optimum number of parameters to retain has similar considerations to sPLS-DA - aim for 1/3 of total variables, and ensure the low range is well covered
# to allow elimination of noisy variable as much as possible.
# Every component of every block can have a different penalization parameter, so this step can be very computationally intensive to run. As for component
# numbers, prior analysis of single omics can inform the range of keepX values to test here and so cut down on the number that have to be processed.

if (!exists("already.tested.X")) {
  already.tested.X <- list()
  for (block.name in names(data)) {
    already.tested.X[[block.name]] <- vector()
  }
}


tune.diablo <- tune.block.splsda(X = data, Y = data.labels, ncomp = ncomp.keepX, test.keepX = test.keepX, design = design,
                                 validation = 'Mfold', folds = mfold.folds, nrepeat = nrepeat.keepX,
                                dist = "mahalanobis.dist", progressBar = FALSE, BPPARAM = BatchtoolsParam(workers = 20, cluster="slurm"))

saveRDS(tune.diablo, "2_find_keepX_tune_result.rds")

# Find the error rate for the tuning at different parameter values, the optimal number of components, and the number of selected variables for each component
sink("2_find_keepX_output.txt")

cat("\n\n")
cat("keepX values tested:\n")
test.keepX

cat("\n\n")
cat("Advised counts for keepX:\n")
tune.diablo$choice.keepX

ber.per.component <- function(keepX.results) {
  ber.per.component <- vector()

  last.comp <- length(keepX.results$choice.keepX[[1]])
  first.comp <- last.comp - ncol(keepX.results$error.rate) + 1

  for (comp in first.comp:last.comp) {
    keepX.values <- array()
    for (block.index in 1:length(keepX.results$choice.keepX)) {
      keepX.values[block.index] <- keepX.results$choice.keepX[[block.index]][comp]
    }

    row.key <- paste(keepX.values, collapse="_")
    col.key <- sprintf("comp%i", comp)

    ber.per.component[col.key] <- keepX.results$error.rate[row.key, col.key]
  }

  return(ber.per.component)
}

cat("\n")
cat("Error rates for selected keepX values:\n")
print(ber.per.component(tune.diablo))

cat("\n")
cat("Advised number of components:\n")
print(tune.diablo$choice.ncomp)

cat("Error rates per component / keepX value combination:\n")
tune.diablo$error.rate

sink()

plot(tune.diablo, col = color.jet(ncomp.keepX))

# Close plots PDF
dev.off()