--- a +++ b/OmicsFold/R/feature_loadings.R @@ -0,0 +1,424 @@ +#' Extract sPLS-DA feature selection stability +#' +#' @description +#' Extract feature selection stabilities for a given component from a +#' performance validated mixOmics sPLS-DA model Feature stabilities are an +#' important indicator of the confidence that a selected feature is predictive +#' for the outcome classes in the model, and hence (in combination with the +#' loading weight) is likely to be of biological significance. +#' +#' @param splsda.model Trained mixOmics sPLS-DA model. +#' @param splsda.perf Performance evaluation of the sPLS-DA model generated by +#' mixOmics perf. +#' @param comp Component number of which to retrieve feature selection +#' stabilities. +#' +#' @return A data frame containing the features selected for the specificed +#' component and their relative stability, as a proportion of trained models +#' 0-1. +#' @export +#' +#' @examples +#' \dontrun{ +#' feature.selection.stability(splsda.analysis, perf.splsda.analysis, comp=1) +#' } +feature.selection.stability <- function(splsda.model, splsda.perf, comp) { + ind.match = match(selectVar(splsda.model, comp = comp)$name, + names(splsda.perf$features$stable[[comp]])) + freq = as.numeric(splsda.perf$features$stable[[comp]][ind.match]) + stability <- data.frame(selectVar(splsda.model, comp = comp)$value, freq) + colnames(stability) <- c("value", "stability") + + return(stability) +} + + +#' Get an sPLS-DA loadings table +#' +#' @description +#' Get a dataframe indicating the loading factors for a trained mixOmics sPLS-DA +#' model. Limited to two components. +#' +#' @param splsda.analysis Trained mixOmics sPLS-DA model +#' +#' @return Data frame including ordered loadings over the first two components +#' of the model. Selected features are ordered by component one followed by +#' component two, and then from highest to lowest (in absolute terms). +#' @export +#' +#' @examples +#' \dontrun{ +#' get.loadings.table(splsda.analysis) +#' } +get.loadings.table <- function(splsda.analysis) { + rna.loadings <- splsda.analysis$loadings$X + rna.loadings <- rna.loadings[rowSums(rna.loadings) != 0,] + rna.loadings[rna.loadings == 0] <- NA + rna.loadings <- as.data.frame(rna.loadings) + + # Sort by components columns in reverse order + for (column.id in rev(colnames(rna.loadings))) { + rna.loadings <- + rna.loadings[order(abs(rna.loadings[, column.id]), decreasing = TRUE), , + drop = FALSE] + } + + return(rna.loadings) +} + + +#' Extract DIABLO feature selection stability +#' +#' @description +#' Extract feature selection stabilities for a given component from a +#' performance validated mixOmics multi-omics DIABLO model. Feature stabilities +#' are an important indicator of the confidence that a selected feature is +#' predictive for the outcome classes in the model, and hence (in combination +#' with the loading weight) is likely to be of biological significance. Features +#' are retrieved for a single specified block and component. +#' +#' @param diablo.perf Performance evaluation of the DIABLO model generated by +#' mixOmics perf. +#' @param comp Component number for which to retrieve feature selection +#' stabilities. +#' @param block Block number for which to retrieve feature selection +#' stabilities. +#' +#' @return A data frame containing the the features selected for the specificed +#' component and their relative stability, as a proportion of trained models +#' 0-1. +#' @export +#' +#' @examples +#' \dontrun{ +#' diablo.selection.stability(perf.diablo.analysis, comp=1, block=1) +#' } +diablo.selection.stability <- function(diablo.perf, comp, block) { + stable <- diablo.perf$features$stable + stable.wms <- lapply(stable, "[[", block) + stable.wms <- lapply(stable.wms, "[[", comp) + + melted <- reshape2::melt(stable.wms) + stability.mean <- melted %>% + group_by(Var1) %>% + summarise(stability = mean(value)) %>% + arrange(desc(stability)) %>% + dplyr::filter(!is.na(Var1)) %>% + as.data.frame() + + colnames(stability.mean) <- c('feature', 'stability') + rownames(stability.mean) <- stability.mean$feature + + return(stability.mean) +} + + +#' Get a DIABLO loadings table +#' +#' @description +#' Get a dataframe indicating the top ranking loading factors for a trained +#' mixOmics multi-omics DIABLO model for a provided block from the model. +#' +#' @param diablo.loadings Loadings from the trained model. +#' @param feature.count The number of top loadings from each component to +#' include. +#' +#' @return A data frame containing the top (by absolute magnitude) factor +#' loadings for the first, then second component, ordered by absolute weight. +#' @export +#' +#' @examples +#' \dontrun{ +#' get.diablo.top.loadings(diablo.tuned.analysis$loadings$metab) +#' } +get.diablo.top.loadings <- function(diablo.loadings, feature.count = 20) { + rna.loadings <- diablo.loadings + rna.loadings <- rna.loadings[rowSums(rna.loadings) != 0,] + rna.loadings[rna.loadings == 0] <- NA + rna.loadings <- as.data.frame(rna.loadings) + + # Construct new data frame with each component's top features included + loadings <- data.frame() + for (col.index in 1:ncol(rna.loadings)) { + rna.loadings <- + rna.loadings[order(abs(rna.loadings[, col.index]), decreasing = TRUE), , + drop = FALSE] + col.name <- colnames(rna.loadings)[col.index] + rows.to.copy <- min(feature.count, nrow(rna.loadings)) + for (row.index in 1:rows.to.copy) { + row.name <- rownames(rna.loadings)[row.index] + loadings[row.name, col.name] <- rna.loadings[row.name, col.name] + } + } + + # Sort the columns, going from right to left + for (column.id in rev(colnames(loadings))) { + loadings <- + loadings[order(abs(loadings[, column.id]), decreasing = TRUE), , + drop = FALSE] + } + + return(loadings) +} + + +#' Get top loadings for a single component DIABLO model +#' +#' @description +#' Get a dataframe indicating the top ranking loading factors for a trained +#' mixOmics multi-omics DIABLO model for a provided block from the model. +#' +#' @param diablo.loadings DIABLO loadings for a single component model. +#' @param feature.count The number of top loadings from each component to +#' include. +#' +#' @return A data frame containing the top (by absolute magnitude) factor +#' loadings for the single component, ordered by absolute weight. +#' @export +#' +#' @examples +#' \dontrun{ +#' get.diablo.top.loadings.1comp(diablo.tuned.analysis$loadings$metab) +#' } +get.diablo.top.loadings.1comp <- function(diablo.loadings, feature.count = 20) { + return(get.diablo.top.loadings(diablo.loadings, feature.count)) +} + + +#' Get DIABLO top loadings and stabilities +#' +#' @description +#' Get a dataframe showing the top ranked loading factors for a trained mixOmics +#' multi-omics DIABLO model along with selection stability values. Where a +#' feature appears in more than one component, the selection stability value +#' shown is for the lowest numbered component. +#' +#' @param trained.model A multiomics DIABLO model. +#' @param perf.result The results from a perf test on the above model. +#' @param block The name or index of the block to get the data for. +#' @param feature.count The number of top loadings from each component to +#' include. +#' +#' @return A dataframe containing the top (by absolute magnitude) factor +#' loadings for each component and selection stability values for those +#' features. +#' @export +#' +#' @examples +#' \dontrun{ +#' get.diablo.top.loadings.with.stability(diablo.model, diablo.perf.result, 'species', 100) +#' } +get.diablo.top.loadings.with.stability <- function(trained.model, perf.result, + block, feature.count = 20) { + loadings <- get.diablo.top.loadings(trained.model$loadings[[block]], + feature.count = feature.count) + + # Bind on stability values in reverse component order + for (comp in rev(colnames(loadings))) { + selection.stability <- + diablo.selection.stability(perf.result, comp = comp, block = block) + component.features <- + rownames(loadings[!is.na(loadings[comp]), comp, drop = FALSE]) + loadings[component.features, "stability"] <- + selection.stability[component.features, "stability"] + } + + # Remove any features with only NA values + loadings <- loadings[apply(loadings, 1, function(x) { !all(is.na(x)) }) ,] + + return(loadings) +} + + +#' Merge DIABLO stability onto a data frame of features +#' +#' @description +#' Utility function to combine top loading factors extracted from a trained +#' multi-omics mixOmics (DIABLO) model with the feature stability, previously +#' retrieved from the performance evaluation using e.g. +#' diablo.selection.stability and create a nicely formatted data frame. +#' +#' @param features Data frame of top features, e.g. provided by +#' get.diablo.top.loadings(). +#' @param stability Data frame of feature stability, e.g. provided by +#' diablo.selection.stability(). +#' +#' @return Data frame combining top features with their measured stability. +#' @export merge.feature.stability +#' +#' @examples +#' \dontrun{ +#' merge.feature.stability(get.diablo.top.loadings(diablo.tuned.analysis$loadings$metab), diablo.selection.stability(perf.diablo.analysis, comp=1, block=1)) +#' } +merge.feature.stability <- function(features, stability) { + merged <- merge(x = features, y = stability[, "stability", drop = FALSE], + by.x = "row.names", by.y = "row.names", all.x = TRUE) + for (column.id in rev(colnames(features))) { + merged <- merged[order(abs(merged[,column.id]), decreasing = TRUE),] + } + + merged <- as.data.frame(merged) + colnames(merged)[1] <- "feature" + return(merged) +} + + +#' Merge stability onto a data frame of features in a single component DIABLO +#' model +#' +#' @description +#' Utility function to combine top loading factors extracted from a trained +#' multi-omics mixOmics (DIABLO) model with the feature stability, previously +#' retrieved from the performance evaluation using e.g. +#' diablo.selection.stability and create a nicely formatted data frame. +#' Specialised for DIABLO models with a single component, which otherwise +#' behaves differently in R's data structures. +#' +#' @param features Data frame of top features, e.g. provided by +#' get.diablo.top.loadings.1comp(). +#' @param stability Data frame of feature stability, e.g. provided by +#' diablo.selection.stability(). +#' +#' @return Data frame combining top features with their measured stability. +#' @export merge.feature.stability.1comp +#' +#' @examples +#' \dontrun{ +#' merge.feature.stability.1comp(get.diablo.top.loadings.1comp(diablo.tuned.analysis$loadings$metab), diablo.selection.stability(perf.diablo.analysis, comp=1, block=1)) +#' } +merge.feature.stability.1comp <- function(features, stability) { + return( + merge.feature.stability(features, stability) + ) +} + +#' Plot feature stability +#' +#' @description +#' Plot feature stability for a summary overview. +#' +#' @param selection.stability Stability data for features selected for a +#' component. +#' +#' @return Plotted stability data. +#' @export plot.feature.stability +#' +#' @examples +#' \dontrun{ +#' plot.feature.stability(diablo.selection.stability(perf.diablo.analysis, comp=1, block=1)) +#' } +plot.feature.stability <- function(selection.stability) { + return( + plot(selection.stability, type = 'h', ylab = 'Stability', xlab = 'Features', + main = 'Selection stability') + ) +} + +#' Export a covariance matrix as a network CSV file +#' +#' @description +#' Export a covariance matrix, e.g. one generated using +#' `find.feature.associations` as data describing the association network and +#' suitable for e.g. loading into Cytoscape. The method can optionally take a +#' cutoff value, which is used to threshold (absolute) association scores. The +#' output is written to disk as a CSV file using the filename given. +#' +#' @param m Symmetrical matrix of association scores. +#' @param filename Filename to which to write network output. +#' @param cutoff Optional value of which to filter associations. +#' @param block.association If the matrix does not contain equal number of +#' variables from each block, then a custom list of block associations can be +#' passed in. +#' @param block.feature.count The number of features selected from each block, +#' defaulting to 20. +#' +#' @return The long-form network data that is also written to disk +#' @export +#' +#' @examples +#' \dontrun{ +#' export.matrix.as.network(mat, "network.csv") +#' export.matrix.as.network(find.feature.associations(diablo.trained.analysis, 3), filename="network.csv", cutoff=0.7) +#' } +export.matrix.as.network <- function(m, filename, cutoff=NA, + block.association=NULL, + block.feature.count=20) { + block.labels <- as.data.frame(rownames(m)) + + if (is.null(block.association)) { + num.blocks <- length(rownames(m)) / block.feature.count + block.association <- rep(1:num.blocks, each=block.feature.count) + } + block.labels$block <- block.association + colnames(block.labels) <- c("feature", "block") + + melted <- reshape2::melt(replace(m, lower.tri(m, TRUE), NA), na.rm = TRUE) + melted <- merge(x = melted, y = block.labels, + by.x = "Var1", by.y = "feature", all.x = TRUE) + colnames(melted) <- c("feature.1", "feature.2", "value", "block") + + if (!is.na(cutoff)) { + melted <- melted[abs(melted$value) >= cutoff,] + } + + write.csv(melted, filename, row.names=FALSE, quote=FALSE) + + return(melted) +} + +#' Get model variance +#' +#' @description +#' Get percent of variance explained by each component for a tuned model. Works +#' with sPLS-DA models or DIABLO models as input, but the block parameter must +#' be specified for DIABLO models. +#' +#' @param tuned.model Either an sPLS-DA model from singleomics or a DIABLO model +#' from multiomics. +#' @param block Where the model type is DIABLO, the block to get model variance +#' for. +#' +#' @return A data frame with the model variance for the specified block. +#' @export +#' +#' @examples +#' \dontrun{ +#' get.model.variance(splsda.model) +#' get.model.variance(diablo.model, block = 'transcriptomics') +#' } +get.model.variance <- function(tuned.model, block = "X"){ + model.var <- cor(as.numeric(as.factor(tuned.model$Y)), + tuned.model$variates[[block]][, 1:tuned.model$ncomp[block]]) + model.var <- apply(model.var^2, 2, sum) + model.var <- cbind(model.var, cumsum(model.var)) + colnames(model.var) <- c("Proportion", "Cumulative") + + return(model.var) +} + +#' Center truncate long strings +#' +#' @description +#' Truncate a string to no longer than 43 characters. Where a string is longer +#' than 43 characters, the first and last 20 characters sandwich three periods. +#' This can be useful for shortening long feature names before plotting. +#' +#' @param x A string to truncate. +#' +#' @return A string where each has been reduced to 43 characters or left as is. +#' @export +#' +#' @examples +#' \dontrun{ +#' center.truncate('A string to truncate if long enough') +#' } +center.truncate <- function(x) { + string.length <- nchar(x) + if (string.length < 43) { + return(x) + } else { + return(paste0(substr(x, 1, 20), + "...", + substr(x, string.length - 20, string.length))) + } +}