Omicsfold / Git / Diff of /OmicsFold/R/feature

Models:
AlyssaS/
Omicsfold
Downloads: 1
Diff of /OmicsFold/R/feature_loadings.R [000000] .. [e26484]
Switch to side-by-side view

--- a
+++ b/OmicsFold/R/feature_loadings.R
@@ -0,0 +1,424 @@
+#' Extract sPLS-DA feature selection stability
+#'
+#' @description
+#' Extract feature selection stabilities for a given component from a
+#' performance validated mixOmics sPLS-DA model Feature stabilities are an
+#' important indicator of the confidence that a selected feature is predictive
+#' for the outcome classes in the model, and hence (in combination with the
+#' loading weight) is likely to be of biological significance.
+#'
+#' @param splsda.model Trained mixOmics sPLS-DA model.
+#' @param splsda.perf Performance evaluation of the sPLS-DA model generated by
+#' mixOmics perf.
+#' @param comp Component number of which to retrieve feature selection
+#' stabilities.
+#'
+#' @return A data frame containing the features selected for the specificed
+#' component and their relative stability, as a proportion of trained models
+#' 0-1.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' feature.selection.stability(splsda.analysis, perf.splsda.analysis, comp=1)
+#' }
+feature.selection.stability <- function(splsda.model, splsda.perf, comp) {
+  ind.match = match(selectVar(splsda.model, comp = comp)$name,
+                    names(splsda.perf$features$stable[[comp]]))
+  freq = as.numeric(splsda.perf$features$stable[[comp]][ind.match])
+  stability <- data.frame(selectVar(splsda.model, comp = comp)$value, freq)
+  colnames(stability) <- c("value", "stability")
+
+  return(stability)
+}
+
+
+#' Get an sPLS-DA loadings table
+#'
+#' @description
+#' Get a dataframe indicating the loading factors for a trained mixOmics sPLS-DA
+#' model. Limited to two components.
+#'
+#' @param splsda.analysis Trained mixOmics sPLS-DA model
+#'
+#' @return Data frame including ordered loadings over the first two components
+#' of the model. Selected features are ordered by component one followed by
+#' component two, and then from highest to lowest (in absolute terms).
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' get.loadings.table(splsda.analysis)
+#' }
+get.loadings.table <- function(splsda.analysis) {
+  rna.loadings <- splsda.analysis$loadings$X
+  rna.loadings <- rna.loadings[rowSums(rna.loadings) != 0,]
+  rna.loadings[rna.loadings == 0] <- NA
+  rna.loadings <- as.data.frame(rna.loadings)
+
+  # Sort by components columns in reverse order
+  for (column.id in rev(colnames(rna.loadings))) {
+    rna.loadings <-
+        rna.loadings[order(abs(rna.loadings[, column.id]), decreasing = TRUE), ,
+                     drop = FALSE]
+  }
+
+  return(rna.loadings)
+}
+
+
+#' Extract DIABLO feature selection stability
+#'
+#' @description
+#' Extract feature selection stabilities for a given component from a
+#' performance validated mixOmics multi-omics DIABLO model. Feature stabilities
+#' are an important indicator of the confidence that a selected feature is
+#' predictive for the outcome classes in the model, and hence (in combination
+#' with the loading weight) is likely to be of biological significance. Features
+#' are retrieved for a single specified block and component.
+#'
+#' @param diablo.perf Performance evaluation of the DIABLO model generated by
+#' mixOmics perf.
+#' @param comp Component number for which to retrieve feature selection
+#' stabilities.
+#' @param block Block number for which to retrieve feature selection
+#' stabilities.
+#'
+#' @return A data frame containing the  the features selected for the specificed
+#' component and their relative stability, as a proportion of trained models
+#' 0-1.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' diablo.selection.stability(perf.diablo.analysis, comp=1, block=1)
+#' }
+diablo.selection.stability <- function(diablo.perf, comp, block) {
+  stable <- diablo.perf$features$stable
+  stable.wms <- lapply(stable, "[[", block)
+  stable.wms <- lapply(stable.wms, "[[", comp)
+
+  melted <- reshape2::melt(stable.wms)
+  stability.mean <- melted %>%
+    group_by(Var1) %>%
+    summarise(stability = mean(value)) %>%
+    arrange(desc(stability)) %>%
+    dplyr::filter(!is.na(Var1)) %>%
+    as.data.frame()
+
+  colnames(stability.mean) <- c('feature', 'stability')
+  rownames(stability.mean) <- stability.mean$feature
+
+  return(stability.mean)
+}
+
+
+#' Get a DIABLO loadings table
+#'
+#' @description
+#' Get a dataframe indicating the top ranking loading factors for a trained
+#' mixOmics multi-omics DIABLO model for a provided block from the model.
+#'
+#' @param diablo.loadings Loadings from the trained model.
+#' @param feature.count The number of top loadings from each component to
+#' include.
+#'
+#' @return A data frame containing the top (by absolute magnitude) factor
+#' loadings for the first, then second component, ordered by absolute weight.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' get.diablo.top.loadings(diablo.tuned.analysis$loadings$metab)
+#' }
+get.diablo.top.loadings <- function(diablo.loadings, feature.count = 20) {
+  rna.loadings <- diablo.loadings
+  rna.loadings <- rna.loadings[rowSums(rna.loadings) != 0,]
+  rna.loadings[rna.loadings == 0] <- NA
+  rna.loadings <- as.data.frame(rna.loadings)
+
+  # Construct new data frame with each component's top features included
+  loadings <- data.frame()
+  for (col.index in 1:ncol(rna.loadings)) {
+    rna.loadings <-
+        rna.loadings[order(abs(rna.loadings[, col.index]), decreasing = TRUE), ,
+                     drop = FALSE]
+    col.name <- colnames(rna.loadings)[col.index]
+    rows.to.copy <- min(feature.count, nrow(rna.loadings))
+    for (row.index in 1:rows.to.copy) {
+      row.name <- rownames(rna.loadings)[row.index]
+      loadings[row.name, col.name] <- rna.loadings[row.name, col.name]
+    }
+  }
+
+  # Sort the columns, going from right to left
+  for (column.id in rev(colnames(loadings))) {
+    loadings <-
+        loadings[order(abs(loadings[, column.id]), decreasing = TRUE), ,
+                 drop = FALSE]
+  }
+
+  return(loadings)
+}
+
+
+#' Get top loadings for a single component DIABLO model
+#'
+#' @description
+#' Get a dataframe indicating the top ranking loading factors for a trained
+#' mixOmics multi-omics DIABLO model for a provided block from the model.
+#'
+#' @param diablo.loadings DIABLO loadings for a single component model.
+#' @param feature.count The number of top loadings from each component to
+#' include.
+#'
+#' @return A data frame containing the top (by absolute magnitude) factor
+#' loadings for the single component, ordered by absolute weight.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' get.diablo.top.loadings.1comp(diablo.tuned.analysis$loadings$metab)
+#' }
+get.diablo.top.loadings.1comp <- function(diablo.loadings, feature.count = 20) {
+  return(get.diablo.top.loadings(diablo.loadings, feature.count))
+}
+
+
+#' Get DIABLO top loadings and stabilities
+#'
+#' @description
+#' Get a dataframe showing the top ranked loading factors for a trained mixOmics
+#' multi-omics DIABLO model along with selection stability values.  Where a
+#' feature appears in more than one component, the selection stability value
+#' shown is for the lowest numbered component.
+#'
+#' @param trained.model A multiomics DIABLO model.
+#' @param perf.result The results from a perf test on the above model.
+#' @param block The name or index of the block to get the data for.
+#' @param feature.count The number of top loadings from each component to
+#' include.
+#'
+#' @return A dataframe containing the top (by absolute magnitude) factor
+#' loadings for each component and selection stability values for those
+#' features.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' get.diablo.top.loadings.with.stability(diablo.model, diablo.perf.result, 'species', 100)
+#' }
+get.diablo.top.loadings.with.stability <- function(trained.model, perf.result,
+                                                   block, feature.count = 20) {
+  loadings <- get.diablo.top.loadings(trained.model$loadings[[block]],
+                                      feature.count = feature.count)
+
+  # Bind on stability values in reverse component order
+  for (comp in rev(colnames(loadings))) {
+    selection.stability <-
+        diablo.selection.stability(perf.result, comp = comp, block = block)
+    component.features <-
+        rownames(loadings[!is.na(loadings[comp]), comp, drop = FALSE])
+    loadings[component.features, "stability"] <-
+        selection.stability[component.features, "stability"]
+  }
+
+  # Remove any features with only NA values
+  loadings <- loadings[apply(loadings, 1, function(x) { !all(is.na(x)) }) ,]
+
+  return(loadings)
+}
+
+
+#' Merge DIABLO stability onto a data frame of features
+#'
+#' @description
+#' Utility function to combine top loading factors extracted from a trained
+#' multi-omics mixOmics (DIABLO) model with the feature stability, previously
+#' retrieved from the performance evaluation using e.g.
+#' diablo.selection.stability and create a nicely formatted data frame.
+#'
+#' @param features Data frame of top features, e.g. provided by
+#' get.diablo.top.loadings().
+#' @param stability Data frame of feature stability, e.g. provided by
+#' diablo.selection.stability().
+#'
+#' @return Data frame combining top features with their measured stability.
+#' @export merge.feature.stability
+#'
+#' @examples
+#' \dontrun{
+#' merge.feature.stability(get.diablo.top.loadings(diablo.tuned.analysis$loadings$metab), diablo.selection.stability(perf.diablo.analysis, comp=1, block=1))
+#' }
+merge.feature.stability <- function(features, stability) {
+  merged <- merge(x = features, y = stability[, "stability", drop = FALSE],
+                  by.x = "row.names", by.y = "row.names", all.x = TRUE)
+  for (column.id in rev(colnames(features))) {
+    merged <- merged[order(abs(merged[,column.id]), decreasing = TRUE),]
+  }
+
+  merged <- as.data.frame(merged)
+  colnames(merged)[1] <- "feature"
+  return(merged)
+}
+
+
+#' Merge stability onto a data frame of features in a single component DIABLO
+#' model
+#'
+#' @description
+#' Utility function to combine top loading factors extracted from a trained
+#' multi-omics mixOmics (DIABLO) model with the feature stability, previously
+#' retrieved from the performance evaluation using e.g.
+#' diablo.selection.stability and create a nicely formatted data frame.
+#' Specialised for DIABLO models with a single component, which otherwise
+#' behaves differently in R's data structures.
+#'
+#' @param features Data frame of top features, e.g. provided by
+#' get.diablo.top.loadings.1comp().
+#' @param stability Data frame of feature stability, e.g. provided by
+#' diablo.selection.stability().
+#'
+#' @return Data frame combining top features with their measured stability.
+#' @export merge.feature.stability.1comp
+#'
+#' @examples
+#' \dontrun{
+#' merge.feature.stability.1comp(get.diablo.top.loadings.1comp(diablo.tuned.analysis$loadings$metab), diablo.selection.stability(perf.diablo.analysis, comp=1, block=1))
+#' }
+merge.feature.stability.1comp <- function(features, stability) {
+  return(
+    merge.feature.stability(features, stability)
+  )
+}
+
+#' Plot feature stability
+#'
+#' @description
+#' Plot feature stability for a summary overview.
+#'
+#' @param selection.stability Stability data for features selected for a
+#' component.
+#'
+#' @return Plotted stability data.
+#' @export plot.feature.stability
+#'
+#' @examples
+#' \dontrun{
+#' plot.feature.stability(diablo.selection.stability(perf.diablo.analysis, comp=1, block=1))
+#' }
+plot.feature.stability <- function(selection.stability) {
+  return(
+    plot(selection.stability, type = 'h', ylab = 'Stability', xlab = 'Features',
+         main = 'Selection stability')
+  )
+}
+
+#' Export a covariance matrix as a network CSV file
+#'
+#' @description
+#' Export a covariance matrix, e.g. one generated using
+#' `find.feature.associations` as data describing the association network and
+#' suitable for e.g. loading into Cytoscape. The method can optionally take a
+#' cutoff value, which is used to threshold (absolute) association scores. The
+#' output is written to disk as a CSV file using the filename given.
+#'
+#' @param m Symmetrical matrix of association scores.
+#' @param filename Filename to which to write network output.
+#' @param cutoff Optional value of which to filter associations.
+#' @param block.association If the matrix does not contain equal number of
+#' variables from each block, then a custom list of block associations can be
+#' passed in.
+#' @param block.feature.count The number of features selected from each block,
+#' defaulting to 20.
+#'
+#' @return The long-form network data that is also written to disk
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' export.matrix.as.network(mat, "network.csv")
+#' export.matrix.as.network(find.feature.associations(diablo.trained.analysis, 3), filename="network.csv", cutoff=0.7)
+#' }
+export.matrix.as.network <- function(m, filename, cutoff=NA,
+                                     block.association=NULL,
+                                     block.feature.count=20) {
+  block.labels <- as.data.frame(rownames(m))
+
+  if (is.null(block.association)) {
+    num.blocks <- length(rownames(m)) / block.feature.count
+    block.association <- rep(1:num.blocks, each=block.feature.count)
+  }
+  block.labels$block <- block.association
+  colnames(block.labels) <- c("feature", "block")
+
+  melted <- reshape2::melt(replace(m, lower.tri(m, TRUE), NA), na.rm = TRUE)
+  melted <- merge(x = melted, y = block.labels,
+                  by.x = "Var1", by.y = "feature", all.x = TRUE)
+  colnames(melted) <- c("feature.1", "feature.2", "value", "block")
+
+  if (!is.na(cutoff)) {
+    melted <- melted[abs(melted$value) >= cutoff,]
+  }
+
+  write.csv(melted, filename, row.names=FALSE, quote=FALSE)
+
+  return(melted)
+}
+
+#' Get model variance
+#'
+#' @description
+#' Get percent of variance explained by each component for a tuned model. Works
+#' with sPLS-DA models or DIABLO models as input, but the block parameter must
+#' be specified for DIABLO models.
+#'
+#' @param tuned.model Either an sPLS-DA model from singleomics or a DIABLO model
+#' from multiomics.
+#' @param block Where the model type is DIABLO, the block to get model variance
+#' for.
+#'
+#' @return A data frame with the model variance for the specified block.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' get.model.variance(splsda.model)
+#' get.model.variance(diablo.model, block = 'transcriptomics')
+#' }
+get.model.variance <- function(tuned.model, block = "X"){
+  model.var <- cor(as.numeric(as.factor(tuned.model$Y)),
+                   tuned.model$variates[[block]][, 1:tuned.model$ncomp[block]])
+  model.var <- apply(model.var^2, 2, sum)
+  model.var <- cbind(model.var, cumsum(model.var))
+  colnames(model.var) <- c("Proportion", "Cumulative")
+
+  return(model.var)
+}
+
+#' Center truncate long strings
+#'
+#' @description
+#' Truncate a string to no longer than 43 characters. Where a string is longer
+#' than 43 characters, the first and last 20 characters sandwich three periods.
+#' This can be useful for shortening long feature names before plotting.
+#'
+#' @param x A string to truncate.
+#'
+#' @return A string where each has been reduced to 43 characters or left as is.
+#' @export
+#'
+#' @examples
+#' \dontrun{
+#' center.truncate('A string to truncate if long enough')
+#' }
+center.truncate <- function(x) {
+  string.length <- nchar(x)
+  if (string.length < 43) {
+    return(x)
+  } else {
+    return(paste0(substr(x, 1, 20),
+                  "...",
+                  substr(x, string.length - 20, string.length)))
+  }
+}