--- a +++ b/R/bipartites.R @@ -0,0 +1,410 @@ +#' Compute links between TFs and DNA regions (ATAC peaks) +#' +#' Compute and add bipartite between TFs and DNA regions to hummus object. +#' Links are computed based on the binding motifs of TFs and their locations +#' on a reference genome. +#' Currently based on Signac AddMotifs function (--> motifmachR, itself based on +#' MOODs algorithm). +#' +#' @param hummus_object (hummus_object) - Hummus object. +#' @param tf_expr_assay (character) - Name of assay containing the TF expression +#' data. If NULL, all TFs with a motif are used. Default: "RNA". +#' @param peak_assay (character) - Name of the assay containing the DNA regions +#' (ATAC peaks). Default: "peaks". +#' @param tf_multiplex_name (character) - Name of multiplex containing the TFs. +#' If NULL, the name of the TF assay is used. +#' @param peak_multiplex_name (character) - Name of the multiplex containing the +#' DNA regions (ATAC peaks). If NULL, the name of the peak assay is used. +#' @param genome (BSgenome object) - Reference genome. +#' @param store_network (bool) - Save the bipartite directly +#' (\code{TRUE}, default) or return without saving on disk (\code{FALSE}). +#' @param output_file (character) - Name of the output_file +#' (if store_bipartite == \code{TRUE}). Default: NULL. +#' @param verbose (integer) Display function messages. +#' Set to 0 for no message displayed, >= 1 for more details. Default: 1. +#' @param bipartite_name (character) - Name of bipartite. Default: "tf_peak". +#' +#' @return hummus_object (hummus_object) - Hummus object with TF-peak bipartite +#' added to the multilayer slot +#' @export +#' +#' @examples hummus <- bipartite_tfs2peaks( +#' hummus_object = hummus, +#' tf_expr_assay = "RNA", +#' peak_assay = "peaks", +#' tf_multiplex_name = "TF", +#' peak_multiplex_name = "peaks", +#' genome = BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38, +#' store_network = FALSE, +#' verbose = 1, +#' bipartite_name = "tf_peak") + +bipartite_tfs2peaks <- function( + hummus_object, + tf_expr_assay = "RNA", + peak_assay = "peaks", + tf_multiplex_name = NULL, + peak_multiplex_name = NULL, + genome, + store_network = FALSE, + output_file = NULL, + verbose = 1, + bipartite_name = "tf_peak" + ) { + + if (verbose > 0) { + cat("Computing TF-peak bipartite\n") + } + # Cck if tf_gene_assay is NULL + if (!is.null(tf_expr_assay)) { + # Check if the gene assay is present in the seurat object + if (!tf_expr_assay %in% names(hummus_object@assays)) { + stop("The gene assay is not present in the seurat object") + } + # Get TFs expressed in assay AND having known binding motifs + tfs_use <- get_tfs(hummus_object, + assay = tf_expr_assay, + store_tfs = FALSE, + verbose = verbose) + } else { # No filtering on expression assay, use all TFs with a motif + if (verbose > 0) { + cat("No filtering on expression assay, using all TFs with a motif.\n") + } + tfs_use <- unique(hummus_object@motifs_db@tf2motifs$tf) + } + + # Check if the peak assay is present in the seurat object + if (!peak_assay %in% names(hummus_object@assays)) { + stop("The peak assay is not present in the seurat object") + } + # Check if the peak assay is a ChromatinAssay object + if (!inherits(hummus_object@assays[[peak_assay]], + "ChromatinAssay")) { + stop("The peak assay is not a ChromatinAssay object + or does not have annotations (gene.range object))") + } + # Check if the peak assay has gene.range annotations + if (is.null(Signac::Annotation(hummus_object[[peak_assay]]))) { + stop("The peak assay does not have annotations (gene.range object)") + } + + # Add motifs to the peaks + motif_pos <- Signac::AddMotifs( + object = hummus_object[[peak_assay]], + genome = genome, + pfm = hummus_object@motifs_db@motifs #add verbose options + ) + + ## The 17 following lines are inspired from the Pando package : + # https://github.com/quadbiolab/Pando/blob/main/R/regions.R + # Add TF info for motifs + if (verbose > 0) { + cat("\tAdding TF info\n") + } + + # Spread dataframe to sparse matrix + tf2motifs <- hummus_object@motifs_db@tf2motifs + # Select motif and tf columns + tf2motifs <- dplyr::"%>%"(tf2motifs, dplyr::select("motif" = 1, "tf" = 2)) + tf2motifs <- dplyr::"%>%"(tf2motifs, dplyr::distinct()) # Remove duplicates + # Add value column + tf2motifs <- dplyr::"%>%"(tf2motifs, dplyr::mutate(val = 1)) + tf2motifs <- dplyr::"%>%"(tf2motifs, # Spread TFs + tidyr::pivot_wider(names_from = "tf", + values_from = val, + values_fill = 0) + ) + # Set motif as rownames + tf2motifs <- dplyr::"%>%"(tf2motifs, tibble::column_to_rownames("motif")) + tf2motifs <- dplyr::"%>%"(tf2motifs, as.matrix()) # Convert to matrix + + # Convert to sparse matrix + tf2motifs <- dplyr::"%>%"(tf2motifs, Matrix::Matrix(sparse = TRUE)) + + if (length(tfs_use) == 0) { # If no TFs are found in the dataset + stop("None of the provided TFs were found in the dataset. + Consider providing a custom motif-to-TF map as `motif_tfs`") + } + + # Get TF peak links + # Keep only the TFs that are in our tf list + TFs_Peaks <- motif_pos@motifs@data %*% tf2motifs[, tfs_use] + + # Remove values equal to 0 + tfs2peaks <- expand.grid(rownames(TFs_Peaks), + colnames(TFs_Peaks))[as.vector(TFs_Peaks > 0), ] + # TF-peak links + colnames(tfs2peaks) <- c("peak", "TF") # set column names + + # Save TF-peak links + store_network(network = tfs2peaks, + store_network = store_network, + output_file = output_file, + verbose = verbose) + + if (verbose > 0) { + cat("\tReturning TF-peak links as bipartite object\n") + } + + # Set default names for the networks if not provided + if (is.null(tf_multiplex_name)) { + cat("no TF layer name provided, using tf_expr_assay name\n") + tf_multiplex_name <- tf_expr_assay + } + if (is.null(peak_multiplex_name)) { + cat("no peak layer name provided, using peak_assay name\n") + peak_multiplex_name <- peak_assay + } + + # Return tf-peak bipartite + hummus_object@multilayer@bipartites[[bipartite_name]] <- new("bipartite", + "network" = tfs2peaks, + "multiplex_left" = peak_multiplex_name, + "multiplex_right" = tf_multiplex_name) + return(hummus_object) # Return TF-peak bipartite object +} + + + + +#' Compute links between DNA regions and genenames +#' +#' Compute and add bipartite between DNA regions and genenames to hummus object. +#' Links are computed based on the distance between peaks and gene's TSS +#' location from gene.range annotations. +#' Call find_peaks_near_genes function, that can use different methods. +#' +#' @param hummus_object (hummus_object) - Hummus object. +#' @param gene_assay (character) - Name of assay containing the gene expression +#' data. Default: "RNA". +#' @param peak_assay (character) - Name of the assay containing the DNA regions +#' (ATAC peaks). Default: "peaks". +#' @param gene_multiplex_name (character) - Name of the multiplex containing the +#' genes. +#' If NULL, the name of the gene assay is used. +#' @param peak_multiplex_name (character) - Name of the multiplex containing the +#' DNA regions (ATAC peaks). If NULL, the name of the peak assay is used. +#' @param peak_to_gene_method (character) - Method to use to compute the links +#' between peaks and genes. Default: "Signac". +#' * \code{'Signac'} - Use Signac::Extend to extend genes. +#' * \code{'GREAT'} - Not implemented yet. +#' @param upstream (int) - Upstream distance from TSS +#' to consider as potential promoter. +#' @param downstream (int) - Downstream distance from TSS +#' to consider as potential promoter. +#' @param only_tss (logical) - If TRUE, only TSS will be considered. +#' @param store_network (bool) - Save the bipartite directly +#' (\code{TRUE}, default) or return without saving on disk (\code{FALSE}). +#' @param output_file (character) - Name of the output_file +#' (if store_bipartite == \code{TRUE}). Default: NULL. +#' @param verbose (integer) Display function messages. +#' Set to 0 for no message displayed, >= 1 for more details. Default: 1. +#' @param bipartite_name (character) - Name of bipartite. Default: "atac_rna". +#' +#' @return hummus_object (hummus_object) - Hummus object w/ atac-rna bipartite +#' added to the multilayer slot +#' @export +#' +#' @examples hummus <- bipartite_peaks2genes( +#' hummus_object = hummus, +#' gene_assay = "RNA", +#' peak_assay = "peaks", +#' gene_multiplex_name = "RNA", +#' peak_multiplex_name = "peaks", +#' peak_to_gene_method = "Signac", +#' upstream = 500, +#' downstream = 500, +#' only_tss = TRUE, +#' store_network = FALSE, +#' bipartite_name = "atac_rna") + +bipartite_peaks2genes <- function( + hummus_object, + gene_assay = "RNA", + peak_assay = "peaks", + gene_multiplex_name = NULL, + peak_multiplex_name = NULL, + peak_to_gene_method = "Signac", + upstream = 500, + downstream = 500, + only_tss = TRUE, + store_network = FALSE, + output_file = NULL, + bipartite_name = "atac_rna" + ) { + # Check if the gene assay is present in the hummus object + if (!gene_assay %in% names(hummus_object@assays)) { + stop("The gene assay is not present in the hummus object") + } else if (!peak_assay %in% names(hummus_object@assays)) { + # Check if the peak assay is present in the hummus object + stop("The peak assay is not present in the hummus object") + } else if (!inherits(hummus_object@assays[[peak_assay]], + "ChromatinAssay")) { + # Check if the peak assay is a ChromatinAssay object + stop("The peak assay is not a ChromatinAssay object + or does not have annotations (gene.range object))") + } else if (is.null(Signac::Annotation(hummus_object[[peak_assay]]))) { + # Check if the peak assay has gene.range annotations + stop("The peak assay does not have annotations (gene.range object)") + } + + # Find candidate regions near gene bodies + peaks_near_genes <- find_peaks_near_genes( + peaks = hummus_object[[peak_assay]]@ranges, + method = peak_to_gene_method, + genes = Signac::Annotation(hummus_object[[peak_assay]]), + upstream = upstream, + downstream = downstream, + only_tss = only_tss) + # Aggregate candidate regions to gene bodies (peak to gene matrix) + peaks2genes <- aggregate_matrix(Matrix::t(peaks_near_genes), + groups = colnames(peaks_near_genes), + fun = "sum") + # Keep only the genes that are in our scRNA-seq dataset + peaks2genes <- peaks2genes[rownames(peaks2genes) + %in% rownames(hummus_object@assays[[gene_assay]]), ] + # Remove rows/cols with only zeros + peaks2genes <- peaks2genes[Matrix::rowSums(peaks2genes) != 0, + Matrix::colSums(peaks2genes) != 0] + # peak-gene links + peaks2genes <- expand.grid(rownames(peaks2genes), + colnames(peaks2genes))[as.vector(peaks2genes > 0), ] + colnames(peaks2genes) <- c("gene", "peak") # set column names + + + # Save peak-gene links + store_network(network = peaks2genes, + store_network = store_network, + output_file = output_file, + verbose = 1) + + # Set default names for the networks if not provided + if (is.null(gene_multiplex_name)) { + gene_multiplex_name <- gene_assay + } + if (is.null(peak_multiplex_name)) { + peak_multiplex_name <- peak_assay + } + + # Return atac-rna bipartite + hummus_object@multilayer@bipartites[[bipartite_name]] <- new("bipartite", + "network" = peaks2genes, + "multiplex_left" = gene_multiplex_name, + "multiplex_right" = peak_multiplex_name) + return(hummus_object) +} + +#' @title Associate peaks to genes based on distance to TSS (or gene body) +#' +#' @param peaks vector(character) - List of peaks. +#' @param genes vector(character) - List of genes. +#' @param sep vector(character) - Separator between chromosome, +#' start and end position. Default: c('-', '-'). +#' @param method (character) - Method to use. Default: "Signac". +#' * \code{'Signac'} - Use Signac::Extend to extend genes. +#' * \code{'GREAT'} - Not implemented yet. +#' @param upstream (int) - Upstream distance from TSS +#' to consider as potential promoter. +#' @param downstream (int) - Downstream distance from TSS +#' to consider as potential promoter. +#' @param extend (int) - Integer defining the distance from the upstream +#' and downstream of the basal regulatory region. Used only by method 'GREAT'. +#' @param only_tss (logical) - If TRUE, only TSS will be considered. +#' @param verbose (logical) - If TRUE, print progress messages. +#' +#' @return (matrix) - Matrix of peaks x genes with 1 if peak is near gene. +#' @export +#' +#' @examples TODO +find_peaks_near_genes <- function( + peaks, + genes, + sep = c("-", "-"), + method = c("Signac", "GREAT"), + upstream = 100000, + downstream = 0, + extend = 1000000, + only_tss = FALSE, + verbose = TRUE + ) { + # Match arg + method <- match.arg(method) + + if (method == "Signac") { + + if (only_tss) { + genes <- IRanges::resize(x = genes, width = 1, fix = "start") + } + genes_extended <- suppressWarnings( + expr = Signac::Extend( + genes, upstream = upstream, downstream = downstream + ) + ) + overlaps <- IRanges::findOverlaps( + query = peaks, + subject = genes_extended, + type = "any", + select = "all" + ) + hit_matrix <- Matrix::sparseMatrix( + i = S4Vectors::queryHits(overlaps), + j = S4Vectors::subjectHits(overlaps), + x = 1, + dims = c(length(peaks), length(genes_extended)) + ) + rownames(hit_matrix) <- Signac::GRangesToString(grange = peaks, sep = sep) + colnames(hit_matrix) <- genes_extended$gene_name + + } else { + stop("method must be either 'Signac' or 'GREAT' ; + please check that current version of HuMMuS + already accepts GREAT as a method.") + } + return(hit_matrix) +} + + +#' @title Filter peaks to those overlapping specific (regulatory) elements +#' @description Function to reduce list of "Peaks" to the ones overlapping with +#' list of "RegEl", e.g. regulatory elements, evolutionary conserved regions +#' +#' @param Peaks (character) vector of genomic coordinates of peaks +#' @param RegEl (character) vector of genomic coordinates of regulatory elements +#' @param sep_Peak1 (character) separator between chromosome and +#' start position of peak +#' @param sep_Peak2 (character) separator between start position +#' and end position of peak +#' @param sep_RegEl1 (character) separator between chromosome and +#' start position of regulatory element +#' @param sep_RegEl2 (character) separator between start position and +#' end position of regulatory element +#' +#' @return (character) vector of genomic coordinates of peaks overlapping +#' @export +#' +#' @examples peaks_in_regulatory_elements(peaks, RegEl) +peaks_in_regulatory_elements <- function( + Peaks, + RegEl, + sep_Peak1 = "-", + sep_Peak2 = "-", + sep_RegEl1 = "-", + sep_RegEl2 = "-" + ) { + # Make sure Peaks and RegEl are unique + Peaks <- unique(Peaks) + RegEl <- unique(RegEl) + + # convert genomic corrdinate string to GRanges object + Peak_GRangesObj <- Signac::StringToGRanges(Peaks, + sep = c(sep_Peak1, sep_Peak2)) + RegEl_GRangesObj <- Signac::StringToGRanges(RegEl, + sep = c(sep_RegEl1, sep_RegEl2)) + + # find overlap between peaks and regulatory elements + PeakOverlaps <- IRanges::findOverlaps(query = RegEl_GRangesObj, + subject = Peak_GRangesObj) + + # return peaks that overlapped with regulatory element + return(Peaks[unique(as.matrix(PeakOverlaps)[, 2])]) +} \ No newline at end of file