--- a +++ b/OmicsFold/R/sample_analysis.R @@ -0,0 +1,419 @@ +#' A null device for running plot functions without outputting graphics. +.nulldev <- function(file = nullfile(), ...) { + type <- getOption("R.devices.nulldev", "pdf") + do.call(type, args = list(nullfile())) +} + +#' Get block centroids for a DIABLO model +#' +#' @description +#' Function to get the centroids of samples transformed into the same model +#' space across all blocks for a trained multi-omics mixOmics (DIABLO) model. +#' Prepares plots and raw data of consensus sample classification, which (for +#' some reason) isn't provided in the base package. +#' +#' @param diablo.trained Trained multi-omics mixOmics DIABLO model. +#' +#' @return Simple object containing consensus data (as `consensus.summary`) and +#' a ggplot plot of the distribution in the first two components (as `plot`). +#' @export +#' +#' @examples +#' \dontrun{ +#' consensus.centroids <- get.block.centroids(diablo.trained.analysis, 20, 3) +#' print(consensus.centroids$plot) +#' } +get.block.centroids <- function(diablo.trained) { + # Establish dimensionality + block.count <- length(diablo.trained$X) + sample.count <- length(diablo.trained$Y) + comp.count <- diablo.trained$ncomp[1] + + # Perform the same analysis as the arrow plot -- sending graphics output to + # the null device + .nulldev() + + # Set up the initial columns + arrow <- mixOmics::plotArrow(diablo.trained) + consensus <- data.frame(arrow$Block, arrow$group) + consensus$sample <- rep(1:sample.count, block.count + 1) + + # Add component columns 1 at a time using arrow plots + comp.col.names <- vector() + for (comp in 1:comp.count) { + comp.col.name <- sprintf("comp%i", comp) + comp.col.names[comp] <- comp.col.name + arrow <- mixOmics::plotArrow(diablo.trained, comp = c(1, comp)) + consensus[,comp.col.name] <- arrow$y + } + + # Remove the outcome block + consensus <- subset(consensus, !(arrow.Block == "Block: Y")) + + # Perform a group by on each sample, finding the centroid of each + consensus.summary <- consensus %>% + dplyr::group_by(sample, arrow.group) %>% + dplyr::summarise_at(comp.col.names, mean) + colnames(consensus.summary)[2] <- "label" + + dev.off() + + # Plot the consensus plot + p <- ggplot2::ggplot(consensus.summary, + ggplot2::aes(comp1, comp2, color = label)) + p <- p + ggplot2::geom_point() + ggplot2::stat_ellipse() + + # Return the consensus values + return(list("consensus" = consensus.summary, "plot" = p)) +} + + +#' Plot projections for a prediction result +#' +#' @description +#' Function to plot a single-omics sPLS-DA prediction projected into the model +#' space. While confusion matrices of predicted data can easily be obtained, +#' there is no built-in function to plot the projection into the model space and +#' hence give a visualisation of the quality of the prediction. The plot will +#' show the centroids of the classes as large points, surrounded by the sample +#' projection as small points, coloured according to class. A good prediction +#' will cluster each class around the appropriate centroid. +#' +#' @param prediction mixOmics sPLS-DA prediction object, generated by the +#' `predict` method. +#' @param classes.new Factor indicating the classes of the new data. +#' +#' @return ggplot plot of the predicted data in the sPLS-DA model space. +#' @export plot.predicted.projection +#' +#' @examples +#' \dontrun{ +#' prediction.projection <- plot.predicted.projection(prediction.replicate.data, replicate.data.classes) +#' print(prediction.projection) +#' } +plot.predicted.projection <- function(prediction, classes.new) { + variates <- as.data.frame(prediction$variates[,1:2]) + variates$label <- classes.new + colnames(variates) <- c("Component1", "Component2", "label") + + p <- ggplot2::ggplot(variates, ggplot2::aes(x=Component1, y=Component2)) + + ggplot2::geom_point(ggplot2::aes(color=label)) + + ggplot2::theme_bw() + + classes.count <- length(levels(classes.new)) + pal <- scales::hue_pal()(classes.count) + for (i in seq(1, classes.count)) { + p <- p + annotate("point", x = prediction$centroids[i,1], + y = prediction$centroids[i,2], color = pal[[i]], cex = 6) + } + return(p) +} + +#' Extract feature associations from a multi-omics DIABLO model +#' +#' @description +#' Extract feature vs. feature association (mutual information) data from a +#' multi-omics mixOmics (DIABLO) model. This is the +#' same data used to create the circos and network plots, but includes all +#' interactions for the top features according to either blockrank or loading scores. +#' +#' @param diablo.model Trained mixOmics multi-omics (DIABLO) model. +#' @param feature_number Number of top features to select (default set to 20). If using loading scores, +#' top features will be evenly divided across the number of blocks. +#' @param score_type Type of score to select top features by. Accepted options: "blockrank" or "loading" +#' (default set to "blockrank") +#' +#' @return Matrix including the associations of top selected features in DIABLO model according +#' to either blockrank scores or loading values +#' +#' @export +#' +#' @examples +#' \dontrun{ +#' find.feature.associations(diablo.model, feature_number=50, score_type="blockrank") +#' } + +find.feature.associations <- function(diablo.model, feature_number=20, score_type="blockrank") { + + #Extract the covariance matrix from the circosPlot function. Disable + #graphical output to avoid it being overwhelmed + circos <- mixOmics::circosPlot(diablo.model, cutoff=0.7, + line = TRUE, size.labels = 1.5) + dev.off() + if (score_type == "blockrank"){ + #find top features across all blocks by ranking blockrank scores high to low + + #run diablo blockrank + blockrank.i <- blockrank.diablo(diablo.model) + + #compile blockrank results + plot.data <- vector(mode = "list", length = length(blockrank.i)) + for (q in seq_along(blockrank.i)){ + block <- names(blockrank.i)[q] + feature <- names(blockrank.i[[q]]) + blockrank.score <- blockrank.i[[q]] + plot.data[[q]] <- data.frame(block, feature, blockrank.score) + } + plot.data <- bind_rows(plot.data) + plot.data <- arrange(plot.data, desc(blockrank.score)) + + #filter to desired number of features + plot.data <- head(plot.data,n=feature_number) + + #Find the top factors across all blocks + selected.features <- plot.data$feature + + } else if (score_type == "loading") { + #Find the top n features across all blocks according to loading score. + #Divide by feature_number by number of blocks to evenly attain number of scores per block + blocks <- length(diablo.model$loadings)-1 + feature_number_per_block <- round(feature_number/blocks, digits = 0) + loading_scores <- vector(mode = "list", length = length(blocks)) + for (i in 1:blocks){ + dev.new(width = 3000, height = 3000, unit = "px") + loadings <- mixOmics::plotLoadings(diablo.model, block=i, comp = 1, + contrib = 'max', method = 'median', + ndisplay=feature_number_per_block) + loading_scores[[i]] <- data.frame(rownames(loadings)) + dev.off() + } + loading_scores <- bind_rows(loading_scores) + selected.features <- loading_scores$rownames.loadings. + + } else{ + stop("score_type must be one of the following: blockrank, loading", call.=FALSE) + + } + + + #Filter the covariance matrix for the top features based on blockrank scores + circos.selected <- circos[rownames(circos) %in% selected.features, + colnames(circos) %in% selected.features] + diag(circos.selected) <- 1 + + # Return the covariance matrix + return(circos.selected) +} + + +#' Filter and reformat feature association matrix +#' +#' @description +#' Reformats association matrix into network dataframe. +#' Filters feature network to remove associations that do not meet a correlation cut off. +#' Filters feature network to remove associations between features from the same block. +#' Filters feature network to only include associations to a list of selected features. +#' +#' +#' @param diablo.model Trained mixOmics multi-omics (DIABLO) model +#' @param associations Matrix consisting the associations of top discriminative features from find.feature.associations() +#' @param feature_list List of features of interest to filter network by. If argument not supplied, network will not be filtered +#' @param cutoff Correlation cut off to filter feature network. (default set to 0.7) +#' @param remove_intrablock If set to TRUE, removes associations between features from the same block (default set to FALSE) +#' +#' @return Dataframe describing associations between features and their blocks +#' +#' @export +#' +#' @examples +#' \dontrun{ +#' filter.network(diablo.model=diablo.model, associations=associations, cutoff=0.7, feature_list= feature_list, remove_intrablock = FALSE) +#' } + +filter.network <- function(diablo.model, associations, feature_list=NULL, cutoff=0.7, remove_intrablock=FALSE){ + +'%!in%' <- function(x,y)!('%in%'(x,y)) + +#generate block names for all features in model +feature_block_names <- data.frame() +number_of_blocks <- length(diablo.model$loadings)-1 +for (i in 1:number_of_blocks) { + active.block <- diablo.model$loadings[[i]] + active.block.names <- data.frame(rownames(active.block), names(diablo.model$loadings[i])) + feature_block_names <- rbind(feature_block_names, active.block.names) +} + +#identify blocks for top features +block_association <- feature_block_names[feature_block_names$rownames.active.block. %in% rownames(associations),]$names.diablo.model.loadings.i.. + +#export network file with correlation cut off +network <- export.matrix.as.network(associations, cutoff= cutoff, + filename = "network.csv", block.association = block_association) +if (is.null(feature_list)){ + + #not filtering network by top features + #merge in block names for nodes + network_subset_merged <- merge(network, feature_block_names, by.x="feature.2", by.y="rownames.active.block.", all.x = TRUE) + names(network_subset_merged)[4] <- "feature.1_block" + names(network_subset_merged)[5] <- "feature.2_block" + network <- network_subset_merged %>% select("feature.1", "feature.2", "value", "feature.1_block", "feature.2_block") + } + + else{ + + #subset network to only include associations with features of interest + network_subset_switch <- network[network$feature.1 %in% feature_list | network$feature.2 %in% feature_list,] + + #conditionally switching features where the feature of interest node is in second column + + #identify which rows need to be switched and do not have the correct source node + i <- which(network_subset_switch$feature.2 %in% feature_list) + #create vectors with new row values + new_feature_1 <- as.character(network_subset_switch$feature.2[i]) + new_feature_2 <- as.character(network_subset_switch$feature.1[i]) + #change row values for the rows that need to be switched + network_subset_switch$feature.1[i] <- new_feature_1 + network_subset_switch$feature.2[i] <- new_feature_2 + #merge in block names for nodes + network_subset_merged <- merge(network_subset_switch, feature_block_names, by.x="feature.2", by.y="rownames.active.block.", all.x = TRUE) + names(network_subset_merged)[4] <- "feature.1_block" + names(network_subset_merged)[5] <- "feature.2_block" + network <- network_subset_merged %>% select("feature.1", "feature.2", "value", "feature.1_block", "feature.2_block") + + #add error message if a feature of interest is not in top n features by blockrank score or loading score + feature_found <- FALSE + + for (i in seq_along(feature_list)){ + if (feature_list[i] %in% network$feature.1) { + feature_found <- TRUE + } + else if (feature_list[i] %!in% network$feature.1) { + message(paste0(feature_list[i], " is not present in top features selected")) + } + } + + if(feature_found == FALSE) { + stop("None of the entered features are in the top selected features", call.=FALSE) + } + + } + + +#remove intrablock connections if TRUE +if (remove_intrablock == TRUE){ + + #create primary key for each id + network$id <- rownames(network) + #identify intrablock connections to remove + network_to_remove <- network[network$feature.1_block == network$feature.2_block,] + #remove the unwanted intrablock connections + network <- network[network$id %!in% network_to_remove$id,] + #remove primary key + network$id <- NULL + + +} +#stop function is network has been filtered to to 0 associations +if (nrow(network) < 1){ + stop("Final network is empty. Please try again with different parameters.", call.=FALSE) +} + +return(network) + +} + +#' Visualize feature interaction network +#' +#' @description +#' Plots feature interaction network output dataframe from filter.network(). Nodes represent biological features and edges +#' represents correlations between nodes. Nodes are colored by block and edges are colored by correlation value. +#' Uses Kamada-Kawai layout algorithm to position nodes. More strongly connected nodes will be pulled together, +#' while more weakly connected nodes will pushed away from other nodes. +#' +#' +#' @param omicsfold_network Network dataframe generated by filter.network() that describes +#' associations between features and their associated blocks +#' @param diablo.model Trained mixOmics multi-omics (DIABLO) model +#' +#' @return Visualization of feature interaction network +#' +#' @export +#' +#' @examples +#' \dontrun{ +#' plot.network(omicsfold_network=omicsfold_network, diablo.model=diablo.model) +#' } + + +#visualize network +plot.network <- function(omicsfold_network, diablo.model){ + + #plot network + thm <- theme_minimal() + + theme( + axis.title = element_blank(), + axis.text = element_blank(), + panel.grid = element_blank(), + panel.grid.major = element_blank(), + ) + + theme_set(thm) + + final_network_graph <- as_tbl_graph(omicsfold_network) + + final_network_graph <- final_network_graph %>% + activate(nodes) %>% + mutate( + title = str_to_title(name), + label = str_replace_all(title, " ", "\n") + ) + + #generate block names for all features in model + feature_block_names <- data.frame() + number_of_blocks <- length(diablo.model$loadings)-1 + for (i in 1:number_of_blocks) { + active.block <- diablo.model$loadings[[i]] + active.block.names <- data.frame(rownames(active.block), names(diablo.model$loadings[i])) + feature_block_names <- rbind(feature_block_names, active.block.names) + } + names(feature_block_names)[2] <- "block" + + #merge in block data + final_network_graph <- left_join(final_network_graph %>% + activate(nodes), feature_block_names, by= c("name" = "rownames.active.block.")) + + network_plot <- final_network_graph %>% + ggraph(layout = "kk") + + geom_edge_link(aes(color = value), edge_width=3, alpha = 0.7) + + scale_edge_colour_continuous( + low = "orange", + high = "red", + space = "Lab", + na.value = "grey50", + guide = "edge_colourbar")+ + geom_node_point(size=6,show.legend = FALSE)+ + geom_node_label(aes(label=name, fill=block), repel = T, show.legend = TRUE, alpha=0.9, + color = "white", # text + size = 5, # font size + label.r = unit(20, "pt"), # corner radius of label box + label.size = .1, # label border size + label.padding = unit(1, "lines")) + return(network_plot) +} + +.order.circos.selected <- function(circos.selected, circos.order) { + circos.order.vars <- rownames(circos.selected)[circos.order] + circos.ordered <- + circos.selected[match(rownames(circos.selected), circos.order.vars), + match(colnames(circos.selected), circos.order.vars)] + return(circos.ordered) +} + +#' Perform a permanova signficance test +#' +#' @description +#' Perform a permanova significance testing using the adonis function (from +#' vegan) upon consensus classes, as returned from the get.block.centroids +#' method. Limited to first two components. +#' +#' @param consensus.summary Consensus summary as generated by +#' get.block.centroids(). +#' +#' @return Measurement of p-value for separation of clusters. +#' @export +permanova.clusters <- function(consensus.summary) { + consensus.points <- data.frame(consensus.summary$comp1, + consensus.summary$comp2) + return(vegan::adonis(consensus.points ~ sample(consensus.summary$arrow.group), + method='euclidean')) +}