Diff of /R/bipartites.R [000000] .. [9abfcf]

Switch to side-by-side view

--- a
+++ b/R/bipartites.R
@@ -0,0 +1,410 @@
+#' Compute links between TFs and DNA regions (ATAC peaks)
+#'
+#' Compute and add bipartite between TFs and DNA regions to hummus object.
+#' Links are computed based on the binding motifs of TFs and their locations
+#' on a reference genome.
+#' Currently based on Signac AddMotifs function (--> motifmachR, itself based on
+#' MOODs algorithm).
+#'
+#' @param hummus_object (hummus_object) - Hummus object.
+#' @param tf_expr_assay (character) - Name of assay containing the TF expression
+#' data. If NULL, all TFs with a motif are used. Default: "RNA".
+#' @param peak_assay (character) - Name of the assay containing the DNA regions
+#' (ATAC peaks). Default: "peaks".
+#' @param tf_multiplex_name (character) - Name of multiplex containing the TFs.
+#' If NULL, the name of the TF assay is used.
+#' @param peak_multiplex_name (character) - Name of the multiplex containing the
+#' DNA regions (ATAC peaks). If NULL, the name of the peak assay is used.
+#' @param genome (BSgenome object) - Reference genome.
+#' @param store_network (bool) - Save the bipartite directly
+#' (\code{TRUE}, default) or return without saving on disk (\code{FALSE}).
+#' @param output_file (character) - Name of the output_file
+#' (if store_bipartite == \code{TRUE}). Default: NULL.
+#' @param verbose (integer) Display function messages.
+#' Set to 0 for no message displayed, >= 1 for more details. Default: 1.
+#' @param bipartite_name (character) - Name of bipartite. Default: "tf_peak".
+#'
+#' @return hummus_object (hummus_object) - Hummus object with TF-peak bipartite
+#' added to the multilayer slot
+#' @export
+#'
+#' @examples hummus <- bipartite_tfs2peaks(
+#'                      hummus_object = hummus,
+#'                      tf_expr_assay = "RNA",
+#'                      peak_assay = "peaks",
+#'                      tf_multiplex_name = "TF",
+#'                      peak_multiplex_name = "peaks",
+#'           genome = BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,
+#'                      store_network = FALSE,
+#'                      verbose = 1,
+#'                      bipartite_name = "tf_peak")
+
+bipartite_tfs2peaks <- function(
+  hummus_object,
+  tf_expr_assay = "RNA",
+  peak_assay = "peaks",
+  tf_multiplex_name = NULL,
+  peak_multiplex_name = NULL,
+  genome,
+  store_network = FALSE,
+  output_file = NULL,
+  verbose = 1,
+  bipartite_name = "tf_peak"
+  ) {
+
+  if (verbose > 0) {
+    cat("Computing TF-peak bipartite\n")
+  }
+  # Cck if tf_gene_assay is NULL
+  if (!is.null(tf_expr_assay)) {
+    # Check if the gene assay is present in the seurat object
+    if (!tf_expr_assay %in% names(hummus_object@assays)) {
+      stop("The gene assay is not present in the seurat object")
+    }
+    # Get TFs expressed in  assay AND having known binding motifs
+    tfs_use <- get_tfs(hummus_object,
+                       assay = tf_expr_assay,
+                       store_tfs = FALSE,
+                       verbose = verbose)
+  } else { # No filtering on expression assay, use all TFs with a motif
+    if (verbose > 0) {
+      cat("No filtering on expression assay, using all TFs with a motif.\n")
+    }
+    tfs_use <- unique(hummus_object@motifs_db@tf2motifs$tf)
+  }
+
+  # Check if the peak assay is present in the seurat object
+  if (!peak_assay %in% names(hummus_object@assays)) {
+    stop("The peak assay is not present in the seurat object")
+  }
+  # Check if the peak assay is a ChromatinAssay object
+  if (!inherits(hummus_object@assays[[peak_assay]],
+                     "ChromatinAssay")) {
+    stop("The peak assay is not a ChromatinAssay object 
+    or does not have annotations (gene.range object))")
+  }
+  # Check if the peak assay has gene.range annotations
+  if (is.null(Signac::Annotation(hummus_object[[peak_assay]]))) {
+      stop("The peak assay does not have annotations (gene.range object)")
+  }
+
+  # Add motifs to the peaks
+  motif_pos <- Signac::AddMotifs(
+    object = hummus_object[[peak_assay]],
+    genome = genome,
+    pfm = hummus_object@motifs_db@motifs #add verbose options
+  )
+
+  ## The 17 following lines are inspired from the Pando package :
+  # https://github.com/quadbiolab/Pando/blob/main/R/regions.R
+  # Add TF info for motifs
+  if (verbose > 0) {
+    cat("\tAdding TF info\n")
+  }
+
+  # Spread dataframe to sparse matrix
+  tf2motifs <- hummus_object@motifs_db@tf2motifs
+  # Select motif and tf columns
+  tf2motifs <- dplyr::"%>%"(tf2motifs, dplyr::select("motif" = 1, "tf" = 2))
+  tf2motifs <- dplyr::"%>%"(tf2motifs, dplyr::distinct()) # Remove duplicates
+  # Add value column
+  tf2motifs <- dplyr::"%>%"(tf2motifs, dplyr::mutate(val = 1))
+  tf2motifs <- dplyr::"%>%"(tf2motifs, # Spread TFs
+                  tidyr::pivot_wider(names_from = "tf",
+                                     values_from = val,
+                                     values_fill = 0)
+                            )
+  # Set motif as rownames
+  tf2motifs <- dplyr::"%>%"(tf2motifs, tibble::column_to_rownames("motif"))
+  tf2motifs <- dplyr::"%>%"(tf2motifs, as.matrix()) # Convert to matrix
+
+  # Convert to sparse matrix
+  tf2motifs <- dplyr::"%>%"(tf2motifs, Matrix::Matrix(sparse = TRUE))
+
+  if (length(tfs_use) == 0) { # If no TFs are found in the dataset
+    stop("None of the provided TFs were found in the dataset.
+    Consider providing a custom motif-to-TF map as `motif_tfs`")
+  }
+
+  # Get TF peak links
+  # Keep only the TFs that are in our tf list
+  TFs_Peaks <- motif_pos@motifs@data %*% tf2motifs[, tfs_use]
+
+  # Remove values equal to 0
+  tfs2peaks <- expand.grid(rownames(TFs_Peaks),
+                           colnames(TFs_Peaks))[as.vector(TFs_Peaks > 0), ]
+                          # TF-peak links
+  colnames(tfs2peaks) <- c("peak", "TF")     # set column names
+
+  # Save TF-peak links
+  store_network(network = tfs2peaks,
+                store_network = store_network,
+                output_file = output_file,
+                verbose = verbose)
+
+  if (verbose > 0) {
+    cat("\tReturning TF-peak links as bipartite object\n")
+  }
+
+  # Set default names for the networks if not provided
+  if (is.null(tf_multiplex_name)) {
+    cat("no TF layer name provided, using tf_expr_assay name\n")
+    tf_multiplex_name <- tf_expr_assay
+  }
+  if (is.null(peak_multiplex_name)) {
+    cat("no peak layer name provided, using peak_assay name\n")
+    peak_multiplex_name <- peak_assay
+  }
+
+  # Return tf-peak bipartite
+  hummus_object@multilayer@bipartites[[bipartite_name]] <- new("bipartite",
+                           "network" = tfs2peaks,
+                           "multiplex_left" = peak_multiplex_name,
+                           "multiplex_right" = tf_multiplex_name)
+  return(hummus_object) # Return TF-peak bipartite object
+}
+
+
+
+
+#' Compute links between DNA regions and genenames
+#'
+#' Compute and add bipartite between DNA regions and genenames to hummus object.
+#' Links are computed based on the distance between peaks and gene's TSS
+#' location from gene.range annotations.
+#' Call find_peaks_near_genes function, that can use different methods.
+#'
+#' @param hummus_object (hummus_object) - Hummus object.
+#' @param gene_assay (character) - Name of assay containing the gene expression
+#' data. Default: "RNA".
+#' @param peak_assay (character) - Name of the assay containing the DNA regions
+#' (ATAC peaks). Default: "peaks".
+#' @param gene_multiplex_name (character) - Name of the multiplex containing the
+#' genes.
+#' If NULL, the name of the gene assay is used.
+#' @param peak_multiplex_name (character) - Name of the multiplex containing the
+#' DNA regions (ATAC peaks). If NULL, the name of the peak assay is used.
+#' @param peak_to_gene_method (character) - Method to use to compute the links
+#' between peaks and genes. Default: "Signac".
+#' * \code{'Signac'} - Use Signac::Extend to extend genes.
+#' * \code{'GREAT'} - Not implemented yet.
+#' @param upstream (int) - Upstream distance from TSS
+#' to consider as potential promoter.
+#' @param downstream (int) - Downstream distance from TSS
+#' to consider as potential promoter.
+#' @param only_tss (logical) - If TRUE, only TSS will be considered.
+#' @param store_network (bool) - Save the bipartite directly
+#' (\code{TRUE}, default) or return without saving on disk (\code{FALSE}).
+#' @param output_file (character) - Name of the output_file
+#' (if store_bipartite == \code{TRUE}). Default: NULL.
+#' @param verbose (integer) Display function messages.
+#' Set to 0 for no message displayed, >= 1 for more details. Default: 1.
+#' @param bipartite_name (character) - Name of bipartite. Default: "atac_rna".
+#' 
+#' @return hummus_object (hummus_object) - Hummus object w/ atac-rna bipartite
+#' added to the multilayer slot
+#' @export
+#'
+#' @examples hummus <- bipartite_peaks2genes(
+#'                        hummus_object = hummus,
+#'                        gene_assay = "RNA",
+#'                        peak_assay = "peaks",
+#'                        gene_multiplex_name = "RNA",
+#'                        peak_multiplex_name = "peaks",
+#'                        peak_to_gene_method = "Signac",
+#'                        upstream = 500,
+#'                        downstream = 500,
+#'                        only_tss = TRUE,
+#'                        store_network = FALSE,
+#'                        bipartite_name = "atac_rna")
+
+bipartite_peaks2genes <- function(
+  hummus_object,
+  gene_assay = "RNA",
+  peak_assay = "peaks",
+  gene_multiplex_name = NULL,
+  peak_multiplex_name = NULL,
+  peak_to_gene_method = "Signac",
+  upstream = 500,
+  downstream = 500,
+  only_tss = TRUE,
+  store_network = FALSE,
+  output_file = NULL,
+  bipartite_name = "atac_rna"
+  ) {
+  # Check if the gene assay is present in the hummus object
+  if (!gene_assay %in% names(hummus_object@assays)) {
+      stop("The gene assay is not present in the hummus object")
+  } else if (!peak_assay %in% names(hummus_object@assays)) {
+      # Check if the peak assay is present in the hummus object
+      stop("The peak assay is not present in the hummus object")
+  } else if (!inherits(hummus_object@assays[[peak_assay]],
+                      "ChromatinAssay")) {
+      # Check if the peak assay is a ChromatinAssay object
+      stop("The peak assay is not a ChromatinAssay object 
+      or does not have annotations (gene.range object))")
+  } else if (is.null(Signac::Annotation(hummus_object[[peak_assay]]))) {
+      # Check if the peak assay has gene.range annotations
+      stop("The peak assay does not have annotations (gene.range object)")
+  }
+
+  # Find candidate regions near gene bodies
+  peaks_near_genes <- find_peaks_near_genes(
+                        peaks = hummus_object[[peak_assay]]@ranges,
+                        method = peak_to_gene_method,
+                        genes = Signac::Annotation(hummus_object[[peak_assay]]),
+                        upstream = upstream,
+                        downstream = downstream,
+                        only_tss = only_tss)
+  # Aggregate candidate regions to gene bodies (peak to gene matrix)
+  peaks2genes <- aggregate_matrix(Matrix::t(peaks_near_genes),
+                                 groups = colnames(peaks_near_genes),
+                                 fun = "sum")
+  # Keep only the genes that are in our scRNA-seq dataset
+  peaks2genes <- peaks2genes[rownames(peaks2genes)
+                 %in% rownames(hummus_object@assays[[gene_assay]]), ]
+  # Remove rows/cols with only zeros
+  peaks2genes <- peaks2genes[Matrix::rowSums(peaks2genes) != 0,
+                             Matrix::colSums(peaks2genes) != 0]
+  # peak-gene links
+  peaks2genes <- expand.grid(rownames(peaks2genes),
+                          colnames(peaks2genes))[as.vector(peaks2genes > 0), ]
+  colnames(peaks2genes) <- c("gene", "peak") # set column names
+
+
+  # Save peak-gene links
+  store_network(network = peaks2genes,
+                store_network = store_network,
+                output_file = output_file,
+                verbose = 1)
+
+  # Set default names for the networks if not provided
+  if (is.null(gene_multiplex_name)) {
+    gene_multiplex_name <- gene_assay
+  }
+  if (is.null(peak_multiplex_name)) {
+    peak_multiplex_name <- peak_assay
+  }
+
+  # Return atac-rna bipartite
+  hummus_object@multilayer@bipartites[[bipartite_name]] <- new("bipartite",
+                           "network" = peaks2genes,
+                           "multiplex_left" = gene_multiplex_name,
+                           "multiplex_right" = peak_multiplex_name)
+  return(hummus_object)
+}
+
+#' @title Associate peaks to genes based on distance to TSS (or gene body)
+#'
+#' @param peaks vector(character) - List of peaks.
+#' @param genes vector(character) - List of genes.
+#' @param sep vector(character) - Separator between chromosome,
+#'            start and end position. Default: c('-', '-').
+#' @param method (character) - Method to use. Default: "Signac".
+#' * \code{'Signac'} - Use Signac::Extend to extend genes.
+#' * \code{'GREAT'} - Not implemented yet.
+#' @param upstream (int) - Upstream distance from TSS
+#' to consider as potential promoter.
+#' @param downstream (int) - Downstream distance from TSS
+#' to consider as potential promoter.
+#' @param extend (int) - Integer defining the distance from the upstream
+#' and downstream of the basal regulatory region. Used only by method 'GREAT'.
+#' @param only_tss (logical) - If TRUE, only TSS will be considered.
+#' @param verbose (logical) - If TRUE, print progress messages.
+#'
+#' @return (matrix) - Matrix of peaks x genes with 1 if peak is near gene.
+#' @export
+#'
+#' @examples TODO
+find_peaks_near_genes <- function(
+  peaks,
+  genes,
+  sep = c("-", "-"),
+  method = c("Signac", "GREAT"),
+  upstream = 100000,
+  downstream = 0,
+  extend = 1000000,
+  only_tss = FALSE,
+  verbose = TRUE
+  ) {
+  # Match arg
+  method <- match.arg(method)
+
+  if (method == "Signac") {
+
+    if (only_tss) {
+      genes <- IRanges::resize(x = genes, width = 1, fix = "start")
+    }
+    genes_extended <- suppressWarnings(
+      expr = Signac::Extend(
+        genes, upstream = upstream, downstream = downstream
+      )
+    )
+    overlaps <- IRanges::findOverlaps(
+      query = peaks,
+      subject = genes_extended,
+      type = "any",
+      select = "all"
+    )
+    hit_matrix <- Matrix::sparseMatrix(
+      i = S4Vectors::queryHits(overlaps),
+      j = S4Vectors::subjectHits(overlaps),
+      x = 1,
+      dims = c(length(peaks), length(genes_extended))
+    )
+    rownames(hit_matrix) <- Signac::GRangesToString(grange = peaks, sep = sep)
+    colnames(hit_matrix) <- genes_extended$gene_name
+
+  } else {
+    stop("method must be either 'Signac' or 'GREAT' ; 
+          please check that current version of HuMMuS
+          already accepts GREAT as a method.")
+  }
+  return(hit_matrix)
+}
+
+
+#' @title Filter peaks to those overlapping specific (regulatory) elements
+#' @description Function to reduce list of "Peaks" to the ones overlapping with
+#' list of "RegEl", e.g. regulatory elements, evolutionary conserved regions
+#'
+#' @param Peaks (character) vector of genomic coordinates of peaks
+#' @param RegEl (character) vector of genomic coordinates of regulatory elements
+#' @param sep_Peak1 (character) separator between chromosome and
+#'                              start position of peak
+#' @param sep_Peak2 (character) separator between start position
+#'                              and end position of peak
+#' @param sep_RegEl1 (character) separator between chromosome and
+#'                               start position of regulatory element
+#' @param sep_RegEl2 (character) separator between start position and
+#'                               end position of regulatory element
+#'
+#' @return (character) vector of genomic coordinates of peaks overlapping
+#' @export
+#'
+#' @examples peaks_in_regulatory_elements(peaks, RegEl)
+peaks_in_regulatory_elements <- function(
+  Peaks,
+  RegEl,
+  sep_Peak1 = "-",
+  sep_Peak2 = "-",
+  sep_RegEl1 = "-",
+  sep_RegEl2 = "-"
+  ) {
+  # Make sure Peaks and RegEl are unique
+  Peaks <- unique(Peaks)
+  RegEl <- unique(RegEl)
+
+  # convert genomic corrdinate string to GRanges object
+  Peak_GRangesObj <- Signac::StringToGRanges(Peaks, 
+                                             sep = c(sep_Peak1, sep_Peak2))
+  RegEl_GRangesObj <- Signac::StringToGRanges(RegEl,
+                                              sep = c(sep_RegEl1, sep_RegEl2))
+
+  # find overlap between peaks and regulatory elements
+  PeakOverlaps <- IRanges::findOverlaps(query = RegEl_GRangesObj,
+                                        subject = Peak_GRangesObj)
+
+  # return peaks that overlapped with regulatory element
+  return(Peaks[unique(as.matrix(PeakOverlaps)[, 2])])
+}
\ No newline at end of file