Omicsfold / Git / [e26484] /OmicsFold/R/feature

Models:
AlyssaS/
Omicsfold
Downloads: 1
[e26484]: / OmicsFold / R / feature_loadings.R
History
Download this file
425 lines (387 with data), 15.2 kB

#' Extract sPLS-DA feature selection stability
#'
#' @description
#' Extract feature selection stabilities for a given component from a
#' performance validated mixOmics sPLS-DA model Feature stabilities are an
#' important indicator of the confidence that a selected feature is predictive
#' for the outcome classes in the model, and hence (in combination with the
#' loading weight) is likely to be of biological significance.
#'
#' @param splsda.model Trained mixOmics sPLS-DA model.
#' @param splsda.perf Performance evaluation of the sPLS-DA model generated by
#' mixOmics perf.
#' @param comp Component number of which to retrieve feature selection
#' stabilities.
#'
#' @return A data frame containing the features selected for the specificed
#' component and their relative stability, as a proportion of trained models
#' 0-1.
#' @export
#'
#' @examples
#' \dontrun{
#' feature.selection.stability(splsda.analysis, perf.splsda.analysis, comp=1)
#' }
feature.selection.stability <- function(splsda.model, splsda.perf, comp) {
  ind.match = match(selectVar(splsda.model, comp = comp)$name,
                    names(splsda.perf$features$stable[[comp]]))
  freq = as.numeric(splsda.perf$features$stable[[comp]][ind.match])
  stability <- data.frame(selectVar(splsda.model, comp = comp)$value, freq)
  colnames(stability) <- c("value", "stability")

  return(stability)
}


#' Get an sPLS-DA loadings table
#'
#' @description
#' Get a dataframe indicating the loading factors for a trained mixOmics sPLS-DA
#' model. Limited to two components.
#'
#' @param splsda.analysis Trained mixOmics sPLS-DA model
#'
#' @return Data frame including ordered loadings over the first two components
#' of the model. Selected features are ordered by component one followed by
#' component two, and then from highest to lowest (in absolute terms).
#' @export
#'
#' @examples
#' \dontrun{
#' get.loadings.table(splsda.analysis)
#' }
get.loadings.table <- function(splsda.analysis) {
  rna.loadings <- splsda.analysis$loadings$X
  rna.loadings <- rna.loadings[rowSums(rna.loadings) != 0,]
  rna.loadings[rna.loadings == 0] <- NA
  rna.loadings <- as.data.frame(rna.loadings)

  # Sort by components columns in reverse order
  for (column.id in rev(colnames(rna.loadings))) {
    rna.loadings <-
        rna.loadings[order(abs(rna.loadings[, column.id]), decreasing = TRUE), ,
                     drop = FALSE]
  }

  return(rna.loadings)
}


#' Extract DIABLO feature selection stability
#'
#' @description
#' Extract feature selection stabilities for a given component from a
#' performance validated mixOmics multi-omics DIABLO model. Feature stabilities
#' are an important indicator of the confidence that a selected feature is
#' predictive for the outcome classes in the model, and hence (in combination
#' with the loading weight) is likely to be of biological significance. Features
#' are retrieved for a single specified block and component.
#'
#' @param diablo.perf Performance evaluation of the DIABLO model generated by
#' mixOmics perf.
#' @param comp Component number for which to retrieve feature selection
#' stabilities.
#' @param block Block number for which to retrieve feature selection
#' stabilities.
#'
#' @return A data frame containing the  the features selected for the specificed
#' component and their relative stability, as a proportion of trained models
#' 0-1.
#' @export
#'
#' @examples
#' \dontrun{
#' diablo.selection.stability(perf.diablo.analysis, comp=1, block=1)
#' }
diablo.selection.stability <- function(diablo.perf, comp, block) {
  stable <- diablo.perf$features$stable
  stable.wms <- lapply(stable, "[[", block)
  stable.wms <- lapply(stable.wms, "[[", comp)

  melted <- reshape2::melt(stable.wms)
  stability.mean <- melted %>%
    group_by(Var1) %>%
    summarise(stability = mean(value)) %>%
    arrange(desc(stability)) %>%
    dplyr::filter(!is.na(Var1)) %>%
    as.data.frame()

  colnames(stability.mean) <- c('feature', 'stability')
  rownames(stability.mean) <- stability.mean$feature

  return(stability.mean)
}


#' Get a DIABLO loadings table
#'
#' @description
#' Get a dataframe indicating the top ranking loading factors for a trained
#' mixOmics multi-omics DIABLO model for a provided block from the model.
#'
#' @param diablo.loadings Loadings from the trained model.
#' @param feature.count The number of top loadings from each component to
#' include.
#'
#' @return A data frame containing the top (by absolute magnitude) factor
#' loadings for the first, then second component, ordered by absolute weight.
#' @export
#'
#' @examples
#' \dontrun{
#' get.diablo.top.loadings(diablo.tuned.analysis$loadings$metab)
#' }
get.diablo.top.loadings <- function(diablo.loadings, feature.count = 20) {
  rna.loadings <- diablo.loadings
  rna.loadings <- rna.loadings[rowSums(rna.loadings) != 0,]
  rna.loadings[rna.loadings == 0] <- NA
  rna.loadings <- as.data.frame(rna.loadings)

  # Construct new data frame with each component's top features included
  loadings <- data.frame()
  for (col.index in 1:ncol(rna.loadings)) {
    rna.loadings <-
        rna.loadings[order(abs(rna.loadings[, col.index]), decreasing = TRUE), ,
                     drop = FALSE]
    col.name <- colnames(rna.loadings)[col.index]
    rows.to.copy <- min(feature.count, nrow(rna.loadings))
    for (row.index in 1:rows.to.copy) {
      row.name <- rownames(rna.loadings)[row.index]
      loadings[row.name, col.name] <- rna.loadings[row.name, col.name]
    }
  }

  # Sort the columns, going from right to left
  for (column.id in rev(colnames(loadings))) {
    loadings <-
        loadings[order(abs(loadings[, column.id]), decreasing = TRUE), ,
                 drop = FALSE]
  }

  return(loadings)
}


#' Get top loadings for a single component DIABLO model
#'
#' @description
#' Get a dataframe indicating the top ranking loading factors for a trained
#' mixOmics multi-omics DIABLO model for a provided block from the model.
#'
#' @param diablo.loadings DIABLO loadings for a single component model.
#' @param feature.count The number of top loadings from each component to
#' include.
#'
#' @return A data frame containing the top (by absolute magnitude) factor
#' loadings for the single component, ordered by absolute weight.
#' @export
#'
#' @examples
#' \dontrun{
#' get.diablo.top.loadings.1comp(diablo.tuned.analysis$loadings$metab)
#' }
get.diablo.top.loadings.1comp <- function(diablo.loadings, feature.count = 20) {
  return(get.diablo.top.loadings(diablo.loadings, feature.count))
}


#' Get DIABLO top loadings and stabilities
#'
#' @description
#' Get a dataframe showing the top ranked loading factors for a trained mixOmics
#' multi-omics DIABLO model along with selection stability values.  Where a
#' feature appears in more than one component, the selection stability value
#' shown is for the lowest numbered component.
#'
#' @param trained.model A multiomics DIABLO model.
#' @param perf.result The results from a perf test on the above model.
#' @param block The name or index of the block to get the data for.
#' @param feature.count The number of top loadings from each component to
#' include.
#'
#' @return A dataframe containing the top (by absolute magnitude) factor
#' loadings for each component and selection stability values for those
#' features.
#' @export
#'
#' @examples
#' \dontrun{
#' get.diablo.top.loadings.with.stability(diablo.model, diablo.perf.result, 'species', 100)
#' }
get.diablo.top.loadings.with.stability <- function(trained.model, perf.result,
                                                   block, feature.count = 20) {
  loadings <- get.diablo.top.loadings(trained.model$loadings[[block]],
                                      feature.count = feature.count)

  # Bind on stability values in reverse component order
  for (comp in rev(colnames(loadings))) {
    selection.stability <-
        diablo.selection.stability(perf.result, comp = comp, block = block)
    component.features <-
        rownames(loadings[!is.na(loadings[comp]), comp, drop = FALSE])
    loadings[component.features, "stability"] <-
        selection.stability[component.features, "stability"]
  }

  # Remove any features with only NA values
  loadings <- loadings[apply(loadings, 1, function(x) { !all(is.na(x)) }) ,]

  return(loadings)
}


#' Merge DIABLO stability onto a data frame of features
#'
#' @description
#' Utility function to combine top loading factors extracted from a trained
#' multi-omics mixOmics (DIABLO) model with the feature stability, previously
#' retrieved from the performance evaluation using e.g.
#' diablo.selection.stability and create a nicely formatted data frame.
#'
#' @param features Data frame of top features, e.g. provided by
#' get.diablo.top.loadings().
#' @param stability Data frame of feature stability, e.g. provided by
#' diablo.selection.stability().
#'
#' @return Data frame combining top features with their measured stability.
#' @export merge.feature.stability
#'
#' @examples
#' \dontrun{
#' merge.feature.stability(get.diablo.top.loadings(diablo.tuned.analysis$loadings$metab), diablo.selection.stability(perf.diablo.analysis, comp=1, block=1))
#' }
merge.feature.stability <- function(features, stability) {
  merged <- merge(x = features, y = stability[, "stability", drop = FALSE],
                  by.x = "row.names", by.y = "row.names", all.x = TRUE)
  for (column.id in rev(colnames(features))) {
    merged <- merged[order(abs(merged[,column.id]), decreasing = TRUE),]
  }

  merged <- as.data.frame(merged)
  colnames(merged)[1] <- "feature"
  return(merged)
}


#' Merge stability onto a data frame of features in a single component DIABLO
#' model
#'
#' @description
#' Utility function to combine top loading factors extracted from a trained
#' multi-omics mixOmics (DIABLO) model with the feature stability, previously
#' retrieved from the performance evaluation using e.g.
#' diablo.selection.stability and create a nicely formatted data frame.
#' Specialised for DIABLO models with a single component, which otherwise
#' behaves differently in R's data structures.
#'
#' @param features Data frame of top features, e.g. provided by
#' get.diablo.top.loadings.1comp().
#' @param stability Data frame of feature stability, e.g. provided by
#' diablo.selection.stability().
#'
#' @return Data frame combining top features with their measured stability.
#' @export merge.feature.stability.1comp
#'
#' @examples
#' \dontrun{
#' merge.feature.stability.1comp(get.diablo.top.loadings.1comp(diablo.tuned.analysis$loadings$metab), diablo.selection.stability(perf.diablo.analysis, comp=1, block=1))
#' }
merge.feature.stability.1comp <- function(features, stability) {
  return(
    merge.feature.stability(features, stability)
  )
}

#' Plot feature stability
#'
#' @description
#' Plot feature stability for a summary overview.
#'
#' @param selection.stability Stability data for features selected for a
#' component.
#'
#' @return Plotted stability data.
#' @export plot.feature.stability
#'
#' @examples
#' \dontrun{
#' plot.feature.stability(diablo.selection.stability(perf.diablo.analysis, comp=1, block=1))
#' }
plot.feature.stability <- function(selection.stability) {
  return(
    plot(selection.stability, type = 'h', ylab = 'Stability', xlab = 'Features',
         main = 'Selection stability')
  )
}

#' Export a covariance matrix as a network CSV file
#'
#' @description
#' Export a covariance matrix, e.g. one generated using
#' `find.feature.associations` as data describing the association network and
#' suitable for e.g. loading into Cytoscape. The method can optionally take a
#' cutoff value, which is used to threshold (absolute) association scores. The
#' output is written to disk as a CSV file using the filename given.
#'
#' @param m Symmetrical matrix of association scores.
#' @param filename Filename to which to write network output.
#' @param cutoff Optional value of which to filter associations.
#' @param block.association If the matrix does not contain equal number of
#' variables from each block, then a custom list of block associations can be
#' passed in.
#' @param block.feature.count The number of features selected from each block,
#' defaulting to 20.
#'
#' @return The long-form network data that is also written to disk
#' @export
#'
#' @examples
#' \dontrun{
#' export.matrix.as.network(mat, "network.csv")
#' export.matrix.as.network(find.feature.associations(diablo.trained.analysis, 3), filename="network.csv", cutoff=0.7)
#' }
export.matrix.as.network <- function(m, filename, cutoff=NA,
                                     block.association=NULL,
                                     block.feature.count=20) {
  block.labels <- as.data.frame(rownames(m))

  if (is.null(block.association)) {
    num.blocks <- length(rownames(m)) / block.feature.count
    block.association <- rep(1:num.blocks, each=block.feature.count)
  }
  block.labels$block <- block.association
  colnames(block.labels) <- c("feature", "block")

  melted <- reshape2::melt(replace(m, lower.tri(m, TRUE), NA), na.rm = TRUE)
  melted <- merge(x = melted, y = block.labels,
                  by.x = "Var1", by.y = "feature", all.x = TRUE)
  colnames(melted) <- c("feature.1", "feature.2", "value", "block")

  if (!is.na(cutoff)) {
    melted <- melted[abs(melted$value) >= cutoff,]
  }

  write.csv(melted, filename, row.names=FALSE, quote=FALSE)

  return(melted)
}

#' Get model variance
#'
#' @description
#' Get percent of variance explained by each component for a tuned model. Works
#' with sPLS-DA models or DIABLO models as input, but the block parameter must
#' be specified for DIABLO models.
#'
#' @param tuned.model Either an sPLS-DA model from singleomics or a DIABLO model
#' from multiomics.
#' @param block Where the model type is DIABLO, the block to get model variance
#' for.
#'
#' @return A data frame with the model variance for the specified block.
#' @export
#'
#' @examples
#' \dontrun{
#' get.model.variance(splsda.model)
#' get.model.variance(diablo.model, block = 'transcriptomics')
#' }
get.model.variance <- function(tuned.model, block = "X"){
  model.var <- cor(as.numeric(as.factor(tuned.model$Y)),
                   tuned.model$variates[[block]][, 1:tuned.model$ncomp[block]])
  model.var <- apply(model.var^2, 2, sum)
  model.var <- cbind(model.var, cumsum(model.var))
  colnames(model.var) <- c("Proportion", "Cumulative")

  return(model.var)
}

#' Center truncate long strings
#'
#' @description
#' Truncate a string to no longer than 43 characters. Where a string is longer
#' than 43 characters, the first and last 20 characters sandwich three periods.
#' This can be useful for shortening long feature names before plotting.
#'
#' @param x A string to truncate.
#'
#' @return A string where each has been reduced to 43 characters or left as is.
#' @export
#'
#' @examples
#' \dontrun{
#' center.truncate('A string to truncate if long enough')
#' }
center.truncate <- function(x) {
  string.length <- nchar(x)
  if (string.length < 43) {
    return(x)
  } else {
    return(paste0(substr(x, 1, 20),
                  "...",
                  substr(x, string.length - 20, string.length)))
  }
}