Omicsfold / Git / [e26484] /OmicsFold/R/sample

Models:
AlyssaS/
Omicsfold
Downloads: 1
[e26484]: / OmicsFold / R / sample_analysis.R
History
Download this file
420 lines (358 with data), 16.6 kB

#' A null device for running plot functions without outputting graphics.
.nulldev <- function(file = nullfile(), ...) {
  type <- getOption("R.devices.nulldev", "pdf")
  do.call(type, args = list(nullfile()))
}

#' Get block centroids for a DIABLO model
#'
#' @description
#' Function to get the centroids of samples transformed into the same model
#' space across all blocks for a trained multi-omics mixOmics (DIABLO) model.
#' Prepares plots and raw data of consensus sample classification, which (for
#' some reason) isn't provided in the base package.
#'
#' @param diablo.trained Trained multi-omics mixOmics DIABLO model.
#'
#' @return Simple object containing consensus data (as `consensus.summary`) and
#' a ggplot plot of the distribution in the first two components (as `plot`).
#' @export
#'
#' @examples
#' \dontrun{
#' consensus.centroids <- get.block.centroids(diablo.trained.analysis, 20, 3)
#' print(consensus.centroids$plot)
#' }
get.block.centroids <- function(diablo.trained) {
  # Establish dimensionality
  block.count <- length(diablo.trained$X)
  sample.count <- length(diablo.trained$Y)
  comp.count <- diablo.trained$ncomp[1]

  # Perform the same analysis as the arrow plot -- sending graphics output to
  # the null device
  .nulldev()

  # Set up the initial columns
  arrow <- mixOmics::plotArrow(diablo.trained)
  consensus <- data.frame(arrow$Block, arrow$group)
  consensus$sample <- rep(1:sample.count, block.count + 1)

  # Add component columns 1 at a time using arrow plots
  comp.col.names <- vector()
  for (comp in 1:comp.count) {
    comp.col.name <- sprintf("comp%i", comp)
    comp.col.names[comp] <- comp.col.name
    arrow <- mixOmics::plotArrow(diablo.trained, comp = c(1, comp))
    consensus[,comp.col.name] <- arrow$y
  }

  # Remove the outcome block
  consensus <- subset(consensus, !(arrow.Block == "Block: Y"))

  # Perform a group by on each sample, finding the centroid of each
  consensus.summary <- consensus %>%
      dplyr::group_by(sample, arrow.group) %>%
      dplyr::summarise_at(comp.col.names, mean)
  colnames(consensus.summary)[2] <- "label"

  dev.off()

  # Plot the consensus plot
  p <- ggplot2::ggplot(consensus.summary,
                       ggplot2::aes(comp1, comp2, color = label))
  p <- p + ggplot2::geom_point() + ggplot2::stat_ellipse()

  # Return the consensus values
  return(list("consensus" = consensus.summary, "plot" = p))
}


#' Plot projections for a prediction result
#'
#' @description
#' Function to plot a single-omics sPLS-DA prediction projected into the model
#' space. While confusion matrices of predicted data can easily be obtained,
#' there is no built-in function to plot the projection into the model space and
#' hence give a visualisation of the quality of the prediction. The plot will
#' show the centroids of the classes as large points, surrounded by the sample
#' projection as small points, coloured according to class. A good prediction
#' will cluster each class around the appropriate centroid.
#'
#' @param prediction mixOmics sPLS-DA prediction object, generated by the
#' `predict` method.
#' @param classes.new Factor indicating the classes of the new data.
#'
#' @return ggplot plot of the predicted data in the sPLS-DA model space.
#' @export plot.predicted.projection
#'
#' @examples
#' \dontrun{
#' prediction.projection <- plot.predicted.projection(prediction.replicate.data, replicate.data.classes)
#' print(prediction.projection)
#' }
plot.predicted.projection <- function(prediction, classes.new) {
  variates <- as.data.frame(prediction$variates[,1:2])
  variates$label <- classes.new
  colnames(variates) <- c("Component1", "Component2", "label")

  p <- ggplot2::ggplot(variates, ggplot2::aes(x=Component1, y=Component2)) +
       ggplot2::geom_point(ggplot2::aes(color=label)) +
       ggplot2::theme_bw()

  classes.count <- length(levels(classes.new))
  pal <- scales::hue_pal()(classes.count)
  for (i in seq(1, classes.count)) {
    p <- p + annotate("point", x = prediction$centroids[i,1],
                      y = prediction$centroids[i,2], color = pal[[i]], cex = 6)
  }
  return(p)
}

#' Extract feature associations from a multi-omics DIABLO model
#'
#' @description
#' Extract feature vs. feature association (mutual information) data from a
#' multi-omics mixOmics (DIABLO) model. This is the
#' same data used to create the circos and network plots, but includes all
#' interactions for the top features according to either blockrank or loading scores.
#'
#' @param diablo.model Trained mixOmics multi-omics (DIABLO) model.
#' @param feature_number Number of top features to select (default set to 20). If using loading scores,
#' top features will be evenly divided across the number of blocks.
#' @param score_type Type of score to select top features by. Accepted options: "blockrank" or "loading" 
#' (default set to "blockrank")
#'
#' @return Matrix including the associations of top selected features in DIABLO model according 
#' to either blockrank scores or loading values
#'
#' @export
#'
#' @examples
#' \dontrun{
#' find.feature.associations(diablo.model, feature_number=50, score_type="blockrank")
#' }

find.feature.associations <- function(diablo.model, feature_number=20, score_type="blockrank") {
  
   #Extract the covariance matrix from the circosPlot function. Disable
   #graphical output to avoid it being overwhelmed
   circos <- mixOmics::circosPlot(diablo.model, cutoff=0.7,
                                 line = TRUE, size.labels = 1.5)
   dev.off()
   if (score_type == "blockrank"){
   #find top features across all blocks by ranking blockrank scores high to low
  
   #run diablo blockrank
   blockrank.i <- blockrank.diablo(diablo.model)
    
   #compile blockrank results
   plot.data <- vector(mode = "list", length = length(blockrank.i))
    for (q in seq_along(blockrank.i)){
      block <- names(blockrank.i)[q]
      feature <- names(blockrank.i[[q]])
      blockrank.score <- blockrank.i[[q]]
      plot.data[[q]] <- data.frame(block, feature, blockrank.score)
    }
   plot.data <- bind_rows(plot.data)
   plot.data <- arrange(plot.data, desc(blockrank.score))
    
   #filter to desired number of features
   plot.data <- head(plot.data,n=feature_number)
    
   #Find the top factors across all blocks
   selected.features <- plot.data$feature
   
   } else if (score_type == "loading") {
     #Find the top n features across all blocks according to loading score. 
     #Divide by feature_number by number of blocks to evenly attain number of scores per block 
     blocks <- length(diablo.model$loadings)-1
     feature_number_per_block <- round(feature_number/blocks, digits = 0)
     loading_scores <- vector(mode = "list", length = length(blocks))
     for (i in 1:blocks){
       dev.new(width = 3000, height = 3000, unit = "px")
       loadings <- mixOmics::plotLoadings(diablo.model, block=i, comp = 1,
                                          contrib = 'max', method = 'median',
                                          ndisplay=feature_number_per_block)
       loading_scores[[i]] <- data.frame(rownames(loadings))
       dev.off()
       }
     loading_scores <- bind_rows(loading_scores)
     selected.features <- loading_scores$rownames.loadings.

   } else{
     stop("score_type must be one of the following: blockrank, loading", call.=FALSE)
     
   }
  
  
   #Filter the covariance matrix for the top features based on blockrank scores
   circos.selected <- circos[rownames(circos) %in% selected.features,
                            colnames(circos) %in% selected.features]
   diag(circos.selected) <- 1

   # Return the covariance matrix
   return(circos.selected)
}


#' Filter and reformat feature association matrix 
#'
#' @description
#' Reformats association matrix into network dataframe.
#' Filters feature network to remove associations that do not meet a correlation cut off.
#' Filters feature network to remove associations between features from the same block.
#' Filters feature network to only include associations to a list of selected features. 
#' 
#'
#' @param diablo.model Trained mixOmics multi-omics (DIABLO) model
#' @param associations Matrix consisting the associations of top discriminative features from find.feature.associations()
#' @param feature_list List of features of interest to filter network by. If argument not supplied, network will not be filtered
#' @param cutoff Correlation cut off to filter feature network. (default set to 0.7)
#' @param remove_intrablock If set to TRUE, removes associations between features from the same block (default set to FALSE)
#'
#' @return Dataframe describing associations between features and their blocks
#'
#' @export
#'
#' @examples
#' \dontrun{
#' filter.network(diablo.model=diablo.model, associations=associations, cutoff=0.7, feature_list= feature_list, remove_intrablock = FALSE)
#' }

filter.network <- function(diablo.model, associations, feature_list=NULL, cutoff=0.7, remove_intrablock=FALSE){

'%!in%' <- function(x,y)!('%in%'(x,y))
  
#generate block names for all features in model
feature_block_names <- data.frame()
number_of_blocks <- length(diablo.model$loadings)-1
for (i in 1:number_of_blocks) {
  active.block <- diablo.model$loadings[[i]]
  active.block.names <- data.frame(rownames(active.block), names(diablo.model$loadings[i]))
  feature_block_names <- rbind(feature_block_names, active.block.names)
}

#identify blocks for top features
block_association <- feature_block_names[feature_block_names$rownames.active.block. %in% rownames(associations),]$names.diablo.model.loadings.i..

#export network file with correlation cut off
network <- export.matrix.as.network(associations, cutoff= cutoff, 
                                    filename = "network.csv", block.association = block_association)
if (is.null(feature_list)){

  #not filtering network by top features
  #merge in block names for nodes
  network_subset_merged <- merge(network, feature_block_names, by.x="feature.2", by.y="rownames.active.block.", all.x = TRUE)
  names(network_subset_merged)[4] <- "feature.1_block"
  names(network_subset_merged)[5] <- "feature.2_block"
  network <- network_subset_merged %>% select("feature.1", "feature.2", "value", "feature.1_block", "feature.2_block")
  }
  
  else{

  #subset network to only include associations with features of interest
  network_subset_switch <- network[network$feature.1 %in% feature_list | network$feature.2 %in% feature_list,]
  
  #conditionally switching features where the feature of interest node is in second column
   
  #identify which rows need to be switched and do not have the correct source node
  i <- which(network_subset_switch$feature.2 %in% feature_list)
  #create vectors with new row values
  new_feature_1 <- as.character(network_subset_switch$feature.2[i])
  new_feature_2 <- as.character(network_subset_switch$feature.1[i])
  #change row values for the rows that need to be switched
  network_subset_switch$feature.1[i] <- new_feature_1
  network_subset_switch$feature.2[i] <- new_feature_2
  #merge in block names for  nodes
  network_subset_merged <- merge(network_subset_switch, feature_block_names, by.x="feature.2", by.y="rownames.active.block.", all.x = TRUE)
  names(network_subset_merged)[4] <- "feature.1_block"
  names(network_subset_merged)[5] <- "feature.2_block"
  network <- network_subset_merged %>% select("feature.1", "feature.2", "value", "feature.1_block", "feature.2_block")
  
  #add error message if a feature of interest is not in top n features by blockrank score or loading score
  feature_found <- FALSE

  for (i in seq_along(feature_list)){
    if (feature_list[i] %in% network$feature.1)  {
      feature_found <- TRUE
    }
    else if (feature_list[i] %!in% network$feature.1) {
      message(paste0(feature_list[i], " is not present in top features selected"))
    }
  }
  
  if(feature_found == FALSE) {
    stop("None of the entered features are in the top selected features", call.=FALSE)
  }

  }
  

#remove intrablock connections if TRUE
if (remove_intrablock == TRUE){

  #create primary key for each id
  network$id <- rownames(network)
  #identify intrablock connections to remove
  network_to_remove <- network[network$feature.1_block == network$feature.2_block,]
  #remove the unwanted intrablock connections
  network <- network[network$id %!in% network_to_remove$id,]
  #remove primary key
  network$id <- NULL

  
}
#stop function is network has been filtered to to 0 associations
if (nrow(network) < 1){
  stop("Final network is empty. Please try again with different parameters.", call.=FALSE)
}

return(network)

}

#' Visualize feature interaction network 
#'
#' @description
#' Plots feature interaction network output dataframe from filter.network(). Nodes represent biological features and edges
#' represents correlations between nodes. Nodes are colored by block and edges are colored by correlation value.
#' Uses Kamada-Kawai layout algorithm to position nodes. More strongly connected nodes will be pulled together,
#' while more weakly connected nodes will pushed away from other nodes.
#' 
#'
#' @param omicsfold_network Network dataframe generated by filter.network() that describes
#' associations between features and their associated blocks
#' @param diablo.model Trained mixOmics multi-omics (DIABLO) model
#'
#' @return  Visualization of feature interaction network
#'
#' @export
#'
#' @examples
#' \dontrun{
#' plot.network(omicsfold_network=omicsfold_network, diablo.model=diablo.model)
#' }


#visualize network
plot.network <- function(omicsfold_network, diablo.model){
  
  #plot network
  thm <- theme_minimal() +
    theme(
      axis.title = element_blank(),
      axis.text = element_blank(),
      panel.grid = element_blank(),
      panel.grid.major = element_blank(),
    ) 
  
  theme_set(thm)
  
  final_network_graph <- as_tbl_graph(omicsfold_network)
  
  final_network_graph <- final_network_graph %>%
    activate(nodes) %>%
    mutate(
      title = str_to_title(name),
      label = str_replace_all(title, " ", "\n")
    )
  
  #generate block names for all features in model
  feature_block_names <- data.frame()
  number_of_blocks <- length(diablo.model$loadings)-1
  for (i in 1:number_of_blocks) {
    active.block <- diablo.model$loadings[[i]]
    active.block.names <- data.frame(rownames(active.block), names(diablo.model$loadings[i]))
    feature_block_names <- rbind(feature_block_names, active.block.names)
  }
  names(feature_block_names)[2] <- "block"
  
  #merge in block data
  final_network_graph <-  left_join(final_network_graph %>%
                                      activate(nodes), feature_block_names, by= c("name" = "rownames.active.block."))
  
  network_plot <- final_network_graph %>%
      ggraph(layout = "kk") +
      geom_edge_link(aes(color = value), edge_width=3, alpha = 0.7) +
      scale_edge_colour_continuous(
        low = "orange",
        high = "red",
        space = "Lab",
        na.value = "grey50",
        guide = "edge_colourbar")+
      geom_node_point(size=6,show.legend = FALSE)+
      geom_node_label(aes(label=name, fill=block), repel = T, show.legend = TRUE, alpha=0.9,
                      color = "white",          # text
                      size = 5,                 # font size
                      label.r = unit(20, "pt"), # corner radius of label box
                      label.size = .1,          # label border size
                      label.padding = unit(1, "lines"))
  return(network_plot)
}

.order.circos.selected <- function(circos.selected, circos.order) {
  circos.order.vars <- rownames(circos.selected)[circos.order]
  circos.ordered <-
      circos.selected[match(rownames(circos.selected), circos.order.vars),
                      match(colnames(circos.selected), circos.order.vars)]
  return(circos.ordered)
}

#' Perform a permanova signficance test
#'
#' @description
#' Perform a permanova significance testing using the adonis function (from
#' vegan) upon consensus classes, as returned from the get.block.centroids
#' method. Limited to first two components.
#'
#' @param consensus.summary Consensus summary as generated by
#' get.block.centroids().
#'
#' @return Measurement of p-value for separation of clusters.
#' @export
permanova.clusters <- function(consensus.summary) {
  consensus.points <- data.frame(consensus.summary$comp1,
                                 consensus.summary$comp2)
  return(vegan::adonis(consensus.points ~ sample(consensus.summary$arrow.group),
                       method='euclidean'))
}