Diff of /R/bipartites.R [000000] .. [9abfcf]

Switch to unified view

a b/R/bipartites.R
1
#' Compute links between TFs and DNA regions (ATAC peaks)
2
#'
3
#' Compute and add bipartite between TFs and DNA regions to hummus object.
4
#' Links are computed based on the binding motifs of TFs and their locations
5
#' on a reference genome.
6
#' Currently based on Signac AddMotifs function (--> motifmachR, itself based on
7
#' MOODs algorithm).
8
#'
9
#' @param hummus_object (hummus_object) - Hummus object.
10
#' @param tf_expr_assay (character) - Name of assay containing the TF expression
11
#' data. If NULL, all TFs with a motif are used. Default: "RNA".
12
#' @param peak_assay (character) - Name of the assay containing the DNA regions
13
#' (ATAC peaks). Default: "peaks".
14
#' @param tf_multiplex_name (character) - Name of multiplex containing the TFs.
15
#' If NULL, the name of the TF assay is used.
16
#' @param peak_multiplex_name (character) - Name of the multiplex containing the
17
#' DNA regions (ATAC peaks). If NULL, the name of the peak assay is used.
18
#' @param genome (BSgenome object) - Reference genome.
19
#' @param store_network (bool) - Save the bipartite directly
20
#' (\code{TRUE}, default) or return without saving on disk (\code{FALSE}).
21
#' @param output_file (character) - Name of the output_file
22
#' (if store_bipartite == \code{TRUE}). Default: NULL.
23
#' @param verbose (integer) Display function messages.
24
#' Set to 0 for no message displayed, >= 1 for more details. Default: 1.
25
#' @param bipartite_name (character) - Name of bipartite. Default: "tf_peak".
26
#'
27
#' @return hummus_object (hummus_object) - Hummus object with TF-peak bipartite
28
#' added to the multilayer slot
29
#' @export
30
#'
31
#' @examples hummus <- bipartite_tfs2peaks(
32
#'                      hummus_object = hummus,
33
#'                      tf_expr_assay = "RNA",
34
#'                      peak_assay = "peaks",
35
#'                      tf_multiplex_name = "TF",
36
#'                      peak_multiplex_name = "peaks",
37
#'           genome = BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38,
38
#'                      store_network = FALSE,
39
#'                      verbose = 1,
40
#'                      bipartite_name = "tf_peak")
41
42
bipartite_tfs2peaks <- function(
43
  hummus_object,
44
  tf_expr_assay = "RNA",
45
  peak_assay = "peaks",
46
  tf_multiplex_name = NULL,
47
  peak_multiplex_name = NULL,
48
  genome,
49
  store_network = FALSE,
50
  output_file = NULL,
51
  verbose = 1,
52
  bipartite_name = "tf_peak"
53
  ) {
54
55
  if (verbose > 0) {
56
    cat("Computing TF-peak bipartite\n")
57
  }
58
  # Cck if tf_gene_assay is NULL
59
  if (!is.null(tf_expr_assay)) {
60
    # Check if the gene assay is present in the seurat object
61
    if (!tf_expr_assay %in% names(hummus_object@assays)) {
62
      stop("The gene assay is not present in the seurat object")
63
    }
64
    # Get TFs expressed in  assay AND having known binding motifs
65
    tfs_use <- get_tfs(hummus_object,
66
                       assay = tf_expr_assay,
67
                       store_tfs = FALSE,
68
                       verbose = verbose)
69
  } else { # No filtering on expression assay, use all TFs with a motif
70
    if (verbose > 0) {
71
      cat("No filtering on expression assay, using all TFs with a motif.\n")
72
    }
73
    tfs_use <- unique(hummus_object@motifs_db@tf2motifs$tf)
74
  }
75
76
  # Check if the peak assay is present in the seurat object
77
  if (!peak_assay %in% names(hummus_object@assays)) {
78
    stop("The peak assay is not present in the seurat object")
79
  }
80
  # Check if the peak assay is a ChromatinAssay object
81
  if (!inherits(hummus_object@assays[[peak_assay]],
82
                     "ChromatinAssay")) {
83
    stop("The peak assay is not a ChromatinAssay object 
84
    or does not have annotations (gene.range object))")
85
  }
86
  # Check if the peak assay has gene.range annotations
87
  if (is.null(Signac::Annotation(hummus_object[[peak_assay]]))) {
88
      stop("The peak assay does not have annotations (gene.range object)")
89
  }
90
91
  # Add motifs to the peaks
92
  motif_pos <- Signac::AddMotifs(
93
    object = hummus_object[[peak_assay]],
94
    genome = genome,
95
    pfm = hummus_object@motifs_db@motifs #add verbose options
96
  )
97
98
  ## The 17 following lines are inspired from the Pando package :
99
  # https://github.com/quadbiolab/Pando/blob/main/R/regions.R
100
  # Add TF info for motifs
101
  if (verbose > 0) {
102
    cat("\tAdding TF info\n")
103
  }
104
105
  # Spread dataframe to sparse matrix
106
  tf2motifs <- hummus_object@motifs_db@tf2motifs
107
  # Select motif and tf columns
108
  tf2motifs <- dplyr::"%>%"(tf2motifs, dplyr::select("motif" = 1, "tf" = 2))
109
  tf2motifs <- dplyr::"%>%"(tf2motifs, dplyr::distinct()) # Remove duplicates
110
  # Add value column
111
  tf2motifs <- dplyr::"%>%"(tf2motifs, dplyr::mutate(val = 1))
112
  tf2motifs <- dplyr::"%>%"(tf2motifs, # Spread TFs
113
                  tidyr::pivot_wider(names_from = "tf",
114
                                     values_from = val,
115
                                     values_fill = 0)
116
                            )
117
  # Set motif as rownames
118
  tf2motifs <- dplyr::"%>%"(tf2motifs, tibble::column_to_rownames("motif"))
119
  tf2motifs <- dplyr::"%>%"(tf2motifs, as.matrix()) # Convert to matrix
120
121
  # Convert to sparse matrix
122
  tf2motifs <- dplyr::"%>%"(tf2motifs, Matrix::Matrix(sparse = TRUE))
123
124
  if (length(tfs_use) == 0) { # If no TFs are found in the dataset
125
    stop("None of the provided TFs were found in the dataset.
126
    Consider providing a custom motif-to-TF map as `motif_tfs`")
127
  }
128
129
  # Get TF peak links
130
  # Keep only the TFs that are in our tf list
131
  TFs_Peaks <- motif_pos@motifs@data %*% tf2motifs[, tfs_use]
132
133
  # Remove values equal to 0
134
  tfs2peaks <- expand.grid(rownames(TFs_Peaks),
135
                           colnames(TFs_Peaks))[as.vector(TFs_Peaks > 0), ]
136
                          # TF-peak links
137
  colnames(tfs2peaks) <- c("peak", "TF")     # set column names
138
139
  # Save TF-peak links
140
  store_network(network = tfs2peaks,
141
                store_network = store_network,
142
                output_file = output_file,
143
                verbose = verbose)
144
145
  if (verbose > 0) {
146
    cat("\tReturning TF-peak links as bipartite object\n")
147
  }
148
149
  # Set default names for the networks if not provided
150
  if (is.null(tf_multiplex_name)) {
151
    cat("no TF layer name provided, using tf_expr_assay name\n")
152
    tf_multiplex_name <- tf_expr_assay
153
  }
154
  if (is.null(peak_multiplex_name)) {
155
    cat("no peak layer name provided, using peak_assay name\n")
156
    peak_multiplex_name <- peak_assay
157
  }
158
159
  # Return tf-peak bipartite
160
  hummus_object@multilayer@bipartites[[bipartite_name]] <- new("bipartite",
161
                           "network" = tfs2peaks,
162
                           "multiplex_left" = peak_multiplex_name,
163
                           "multiplex_right" = tf_multiplex_name)
164
  return(hummus_object) # Return TF-peak bipartite object
165
}
166
167
168
169
170
#' Compute links between DNA regions and genenames
171
#'
172
#' Compute and add bipartite between DNA regions and genenames to hummus object.
173
#' Links are computed based on the distance between peaks and gene's TSS
174
#' location from gene.range annotations.
175
#' Call find_peaks_near_genes function, that can use different methods.
176
#'
177
#' @param hummus_object (hummus_object) - Hummus object.
178
#' @param gene_assay (character) - Name of assay containing the gene expression
179
#' data. Default: "RNA".
180
#' @param peak_assay (character) - Name of the assay containing the DNA regions
181
#' (ATAC peaks). Default: "peaks".
182
#' @param gene_multiplex_name (character) - Name of the multiplex containing the
183
#' genes.
184
#' If NULL, the name of the gene assay is used.
185
#' @param peak_multiplex_name (character) - Name of the multiplex containing the
186
#' DNA regions (ATAC peaks). If NULL, the name of the peak assay is used.
187
#' @param peak_to_gene_method (character) - Method to use to compute the links
188
#' between peaks and genes. Default: "Signac".
189
#' * \code{'Signac'} - Use Signac::Extend to extend genes.
190
#' * \code{'GREAT'} - Not implemented yet.
191
#' @param upstream (int) - Upstream distance from TSS
192
#' to consider as potential promoter.
193
#' @param downstream (int) - Downstream distance from TSS
194
#' to consider as potential promoter.
195
#' @param only_tss (logical) - If TRUE, only TSS will be considered.
196
#' @param store_network (bool) - Save the bipartite directly
197
#' (\code{TRUE}, default) or return without saving on disk (\code{FALSE}).
198
#' @param output_file (character) - Name of the output_file
199
#' (if store_bipartite == \code{TRUE}). Default: NULL.
200
#' @param verbose (integer) Display function messages.
201
#' Set to 0 for no message displayed, >= 1 for more details. Default: 1.
202
#' @param bipartite_name (character) - Name of bipartite. Default: "atac_rna".
203
#' 
204
#' @return hummus_object (hummus_object) - Hummus object w/ atac-rna bipartite
205
#' added to the multilayer slot
206
#' @export
207
#'
208
#' @examples hummus <- bipartite_peaks2genes(
209
#'                        hummus_object = hummus,
210
#'                        gene_assay = "RNA",
211
#'                        peak_assay = "peaks",
212
#'                        gene_multiplex_name = "RNA",
213
#'                        peak_multiplex_name = "peaks",
214
#'                        peak_to_gene_method = "Signac",
215
#'                        upstream = 500,
216
#'                        downstream = 500,
217
#'                        only_tss = TRUE,
218
#'                        store_network = FALSE,
219
#'                        bipartite_name = "atac_rna")
220
221
bipartite_peaks2genes <- function(
222
  hummus_object,
223
  gene_assay = "RNA",
224
  peak_assay = "peaks",
225
  gene_multiplex_name = NULL,
226
  peak_multiplex_name = NULL,
227
  peak_to_gene_method = "Signac",
228
  upstream = 500,
229
  downstream = 500,
230
  only_tss = TRUE,
231
  store_network = FALSE,
232
  output_file = NULL,
233
  bipartite_name = "atac_rna"
234
  ) {
235
  # Check if the gene assay is present in the hummus object
236
  if (!gene_assay %in% names(hummus_object@assays)) {
237
      stop("The gene assay is not present in the hummus object")
238
  } else if (!peak_assay %in% names(hummus_object@assays)) {
239
      # Check if the peak assay is present in the hummus object
240
      stop("The peak assay is not present in the hummus object")
241
  } else if (!inherits(hummus_object@assays[[peak_assay]],
242
                      "ChromatinAssay")) {
243
      # Check if the peak assay is a ChromatinAssay object
244
      stop("The peak assay is not a ChromatinAssay object 
245
      or does not have annotations (gene.range object))")
246
  } else if (is.null(Signac::Annotation(hummus_object[[peak_assay]]))) {
247
      # Check if the peak assay has gene.range annotations
248
      stop("The peak assay does not have annotations (gene.range object)")
249
  }
250
251
  # Find candidate regions near gene bodies
252
  peaks_near_genes <- find_peaks_near_genes(
253
                        peaks = hummus_object[[peak_assay]]@ranges,
254
                        method = peak_to_gene_method,
255
                        genes = Signac::Annotation(hummus_object[[peak_assay]]),
256
                        upstream = upstream,
257
                        downstream = downstream,
258
                        only_tss = only_tss)
259
  # Aggregate candidate regions to gene bodies (peak to gene matrix)
260
  peaks2genes <- aggregate_matrix(Matrix::t(peaks_near_genes),
261
                                 groups = colnames(peaks_near_genes),
262
                                 fun = "sum")
263
  # Keep only the genes that are in our scRNA-seq dataset
264
  peaks2genes <- peaks2genes[rownames(peaks2genes)
265
                 %in% rownames(hummus_object@assays[[gene_assay]]), ]
266
  # Remove rows/cols with only zeros
267
  peaks2genes <- peaks2genes[Matrix::rowSums(peaks2genes) != 0,
268
                             Matrix::colSums(peaks2genes) != 0]
269
  # peak-gene links
270
  peaks2genes <- expand.grid(rownames(peaks2genes),
271
                          colnames(peaks2genes))[as.vector(peaks2genes > 0), ]
272
  colnames(peaks2genes) <- c("gene", "peak") # set column names
273
274
275
  # Save peak-gene links
276
  store_network(network = peaks2genes,
277
                store_network = store_network,
278
                output_file = output_file,
279
                verbose = 1)
280
281
  # Set default names for the networks if not provided
282
  if (is.null(gene_multiplex_name)) {
283
    gene_multiplex_name <- gene_assay
284
  }
285
  if (is.null(peak_multiplex_name)) {
286
    peak_multiplex_name <- peak_assay
287
  }
288
289
  # Return atac-rna bipartite
290
  hummus_object@multilayer@bipartites[[bipartite_name]] <- new("bipartite",
291
                           "network" = peaks2genes,
292
                           "multiplex_left" = gene_multiplex_name,
293
                           "multiplex_right" = peak_multiplex_name)
294
  return(hummus_object)
295
}
296
297
#' @title Associate peaks to genes based on distance to TSS (or gene body)
298
#'
299
#' @param peaks vector(character) - List of peaks.
300
#' @param genes vector(character) - List of genes.
301
#' @param sep vector(character) - Separator between chromosome,
302
#'            start and end position. Default: c('-', '-').
303
#' @param method (character) - Method to use. Default: "Signac".
304
#' * \code{'Signac'} - Use Signac::Extend to extend genes.
305
#' * \code{'GREAT'} - Not implemented yet.
306
#' @param upstream (int) - Upstream distance from TSS
307
#' to consider as potential promoter.
308
#' @param downstream (int) - Downstream distance from TSS
309
#' to consider as potential promoter.
310
#' @param extend (int) - Integer defining the distance from the upstream
311
#' and downstream of the basal regulatory region. Used only by method 'GREAT'.
312
#' @param only_tss (logical) - If TRUE, only TSS will be considered.
313
#' @param verbose (logical) - If TRUE, print progress messages.
314
#'
315
#' @return (matrix) - Matrix of peaks x genes with 1 if peak is near gene.
316
#' @export
317
#'
318
#' @examples TODO
319
find_peaks_near_genes <- function(
320
  peaks,
321
  genes,
322
  sep = c("-", "-"),
323
  method = c("Signac", "GREAT"),
324
  upstream = 100000,
325
  downstream = 0,
326
  extend = 1000000,
327
  only_tss = FALSE,
328
  verbose = TRUE
329
  ) {
330
  # Match arg
331
  method <- match.arg(method)
332
333
  if (method == "Signac") {
334
335
    if (only_tss) {
336
      genes <- IRanges::resize(x = genes, width = 1, fix = "start")
337
    }
338
    genes_extended <- suppressWarnings(
339
      expr = Signac::Extend(
340
        genes, upstream = upstream, downstream = downstream
341
      )
342
    )
343
    overlaps <- IRanges::findOverlaps(
344
      query = peaks,
345
      subject = genes_extended,
346
      type = "any",
347
      select = "all"
348
    )
349
    hit_matrix <- Matrix::sparseMatrix(
350
      i = S4Vectors::queryHits(overlaps),
351
      j = S4Vectors::subjectHits(overlaps),
352
      x = 1,
353
      dims = c(length(peaks), length(genes_extended))
354
    )
355
    rownames(hit_matrix) <- Signac::GRangesToString(grange = peaks, sep = sep)
356
    colnames(hit_matrix) <- genes_extended$gene_name
357
358
  } else {
359
    stop("method must be either 'Signac' or 'GREAT' ; 
360
          please check that current version of HuMMuS
361
          already accepts GREAT as a method.")
362
  }
363
  return(hit_matrix)
364
}
365
366
367
#' @title Filter peaks to those overlapping specific (regulatory) elements
368
#' @description Function to reduce list of "Peaks" to the ones overlapping with
369
#' list of "RegEl", e.g. regulatory elements, evolutionary conserved regions
370
#'
371
#' @param Peaks (character) vector of genomic coordinates of peaks
372
#' @param RegEl (character) vector of genomic coordinates of regulatory elements
373
#' @param sep_Peak1 (character) separator between chromosome and
374
#'                              start position of peak
375
#' @param sep_Peak2 (character) separator between start position
376
#'                              and end position of peak
377
#' @param sep_RegEl1 (character) separator between chromosome and
378
#'                               start position of regulatory element
379
#' @param sep_RegEl2 (character) separator between start position and
380
#'                               end position of regulatory element
381
#'
382
#' @return (character) vector of genomic coordinates of peaks overlapping
383
#' @export
384
#'
385
#' @examples peaks_in_regulatory_elements(peaks, RegEl)
386
peaks_in_regulatory_elements <- function(
387
  Peaks,
388
  RegEl,
389
  sep_Peak1 = "-",
390
  sep_Peak2 = "-",
391
  sep_RegEl1 = "-",
392
  sep_RegEl2 = "-"
393
  ) {
394
  # Make sure Peaks and RegEl are unique
395
  Peaks <- unique(Peaks)
396
  RegEl <- unique(RegEl)
397
398
  # convert genomic corrdinate string to GRanges object
399
  Peak_GRangesObj <- Signac::StringToGRanges(Peaks, 
400
                                             sep = c(sep_Peak1, sep_Peak2))
401
  RegEl_GRangesObj <- Signac::StringToGRanges(RegEl,
402
                                              sep = c(sep_RegEl1, sep_RegEl2))
403
404
  # find overlap between peaks and regulatory elements
405
  PeakOverlaps <- IRanges::findOverlaps(query = RegEl_GRangesObj,
406
                                        subject = Peak_GRangesObj)
407
408
  # return peaks that overlapped with regulatory element
409
  return(Peaks[unique(as.matrix(PeakOverlaps)[, 2])])
410
}