|
a |
|
b/R/bipartites.R |
|
|
1 |
#' Compute links between TFs and DNA regions (ATAC peaks) |
|
|
2 |
#' |
|
|
3 |
#' Compute and add bipartite between TFs and DNA regions to hummus object. |
|
|
4 |
#' Links are computed based on the binding motifs of TFs and their locations |
|
|
5 |
#' on a reference genome. |
|
|
6 |
#' Currently based on Signac AddMotifs function (--> motifmachR, itself based on |
|
|
7 |
#' MOODs algorithm). |
|
|
8 |
#' |
|
|
9 |
#' @param hummus_object (hummus_object) - Hummus object. |
|
|
10 |
#' @param tf_expr_assay (character) - Name of assay containing the TF expression |
|
|
11 |
#' data. If NULL, all TFs with a motif are used. Default: "RNA". |
|
|
12 |
#' @param peak_assay (character) - Name of the assay containing the DNA regions |
|
|
13 |
#' (ATAC peaks). Default: "peaks". |
|
|
14 |
#' @param tf_multiplex_name (character) - Name of multiplex containing the TFs. |
|
|
15 |
#' If NULL, the name of the TF assay is used. |
|
|
16 |
#' @param peak_multiplex_name (character) - Name of the multiplex containing the |
|
|
17 |
#' DNA regions (ATAC peaks). If NULL, the name of the peak assay is used. |
|
|
18 |
#' @param genome (BSgenome object) - Reference genome. |
|
|
19 |
#' @param store_network (bool) - Save the bipartite directly |
|
|
20 |
#' (\code{TRUE}, default) or return without saving on disk (\code{FALSE}). |
|
|
21 |
#' @param output_file (character) - Name of the output_file |
|
|
22 |
#' (if store_bipartite == \code{TRUE}). Default: NULL. |
|
|
23 |
#' @param verbose (integer) Display function messages. |
|
|
24 |
#' Set to 0 for no message displayed, >= 1 for more details. Default: 1. |
|
|
25 |
#' @param bipartite_name (character) - Name of bipartite. Default: "tf_peak". |
|
|
26 |
#' |
|
|
27 |
#' @return hummus_object (hummus_object) - Hummus object with TF-peak bipartite |
|
|
28 |
#' added to the multilayer slot |
|
|
29 |
#' @export |
|
|
30 |
#' |
|
|
31 |
#' @examples hummus <- bipartite_tfs2peaks( |
|
|
32 |
#' hummus_object = hummus, |
|
|
33 |
#' tf_expr_assay = "RNA", |
|
|
34 |
#' peak_assay = "peaks", |
|
|
35 |
#' tf_multiplex_name = "TF", |
|
|
36 |
#' peak_multiplex_name = "peaks", |
|
|
37 |
#' genome = BSgenome.Hsapiens.UCSC.hg38::BSgenome.Hsapiens.UCSC.hg38, |
|
|
38 |
#' store_network = FALSE, |
|
|
39 |
#' verbose = 1, |
|
|
40 |
#' bipartite_name = "tf_peak") |
|
|
41 |
|
|
|
42 |
bipartite_tfs2peaks <- function( |
|
|
43 |
hummus_object, |
|
|
44 |
tf_expr_assay = "RNA", |
|
|
45 |
peak_assay = "peaks", |
|
|
46 |
tf_multiplex_name = NULL, |
|
|
47 |
peak_multiplex_name = NULL, |
|
|
48 |
genome, |
|
|
49 |
store_network = FALSE, |
|
|
50 |
output_file = NULL, |
|
|
51 |
verbose = 1, |
|
|
52 |
bipartite_name = "tf_peak" |
|
|
53 |
) { |
|
|
54 |
|
|
|
55 |
if (verbose > 0) { |
|
|
56 |
cat("Computing TF-peak bipartite\n") |
|
|
57 |
} |
|
|
58 |
# Cck if tf_gene_assay is NULL |
|
|
59 |
if (!is.null(tf_expr_assay)) { |
|
|
60 |
# Check if the gene assay is present in the seurat object |
|
|
61 |
if (!tf_expr_assay %in% names(hummus_object@assays)) { |
|
|
62 |
stop("The gene assay is not present in the seurat object") |
|
|
63 |
} |
|
|
64 |
# Get TFs expressed in assay AND having known binding motifs |
|
|
65 |
tfs_use <- get_tfs(hummus_object, |
|
|
66 |
assay = tf_expr_assay, |
|
|
67 |
store_tfs = FALSE, |
|
|
68 |
verbose = verbose) |
|
|
69 |
} else { # No filtering on expression assay, use all TFs with a motif |
|
|
70 |
if (verbose > 0) { |
|
|
71 |
cat("No filtering on expression assay, using all TFs with a motif.\n") |
|
|
72 |
} |
|
|
73 |
tfs_use <- unique(hummus_object@motifs_db@tf2motifs$tf) |
|
|
74 |
} |
|
|
75 |
|
|
|
76 |
# Check if the peak assay is present in the seurat object |
|
|
77 |
if (!peak_assay %in% names(hummus_object@assays)) { |
|
|
78 |
stop("The peak assay is not present in the seurat object") |
|
|
79 |
} |
|
|
80 |
# Check if the peak assay is a ChromatinAssay object |
|
|
81 |
if (!inherits(hummus_object@assays[[peak_assay]], |
|
|
82 |
"ChromatinAssay")) { |
|
|
83 |
stop("The peak assay is not a ChromatinAssay object |
|
|
84 |
or does not have annotations (gene.range object))") |
|
|
85 |
} |
|
|
86 |
# Check if the peak assay has gene.range annotations |
|
|
87 |
if (is.null(Signac::Annotation(hummus_object[[peak_assay]]))) { |
|
|
88 |
stop("The peak assay does not have annotations (gene.range object)") |
|
|
89 |
} |
|
|
90 |
|
|
|
91 |
# Add motifs to the peaks |
|
|
92 |
motif_pos <- Signac::AddMotifs( |
|
|
93 |
object = hummus_object[[peak_assay]], |
|
|
94 |
genome = genome, |
|
|
95 |
pfm = hummus_object@motifs_db@motifs #add verbose options |
|
|
96 |
) |
|
|
97 |
|
|
|
98 |
## The 17 following lines are inspired from the Pando package : |
|
|
99 |
# https://github.com/quadbiolab/Pando/blob/main/R/regions.R |
|
|
100 |
# Add TF info for motifs |
|
|
101 |
if (verbose > 0) { |
|
|
102 |
cat("\tAdding TF info\n") |
|
|
103 |
} |
|
|
104 |
|
|
|
105 |
# Spread dataframe to sparse matrix |
|
|
106 |
tf2motifs <- hummus_object@motifs_db@tf2motifs |
|
|
107 |
# Select motif and tf columns |
|
|
108 |
tf2motifs <- dplyr::"%>%"(tf2motifs, dplyr::select("motif" = 1, "tf" = 2)) |
|
|
109 |
tf2motifs <- dplyr::"%>%"(tf2motifs, dplyr::distinct()) # Remove duplicates |
|
|
110 |
# Add value column |
|
|
111 |
tf2motifs <- dplyr::"%>%"(tf2motifs, dplyr::mutate(val = 1)) |
|
|
112 |
tf2motifs <- dplyr::"%>%"(tf2motifs, # Spread TFs |
|
|
113 |
tidyr::pivot_wider(names_from = "tf", |
|
|
114 |
values_from = val, |
|
|
115 |
values_fill = 0) |
|
|
116 |
) |
|
|
117 |
# Set motif as rownames |
|
|
118 |
tf2motifs <- dplyr::"%>%"(tf2motifs, tibble::column_to_rownames("motif")) |
|
|
119 |
tf2motifs <- dplyr::"%>%"(tf2motifs, as.matrix()) # Convert to matrix |
|
|
120 |
|
|
|
121 |
# Convert to sparse matrix |
|
|
122 |
tf2motifs <- dplyr::"%>%"(tf2motifs, Matrix::Matrix(sparse = TRUE)) |
|
|
123 |
|
|
|
124 |
if (length(tfs_use) == 0) { # If no TFs are found in the dataset |
|
|
125 |
stop("None of the provided TFs were found in the dataset. |
|
|
126 |
Consider providing a custom motif-to-TF map as `motif_tfs`") |
|
|
127 |
} |
|
|
128 |
|
|
|
129 |
# Get TF peak links |
|
|
130 |
# Keep only the TFs that are in our tf list |
|
|
131 |
TFs_Peaks <- motif_pos@motifs@data %*% tf2motifs[, tfs_use] |
|
|
132 |
|
|
|
133 |
# Remove values equal to 0 |
|
|
134 |
tfs2peaks <- expand.grid(rownames(TFs_Peaks), |
|
|
135 |
colnames(TFs_Peaks))[as.vector(TFs_Peaks > 0), ] |
|
|
136 |
# TF-peak links |
|
|
137 |
colnames(tfs2peaks) <- c("peak", "TF") # set column names |
|
|
138 |
|
|
|
139 |
# Save TF-peak links |
|
|
140 |
store_network(network = tfs2peaks, |
|
|
141 |
store_network = store_network, |
|
|
142 |
output_file = output_file, |
|
|
143 |
verbose = verbose) |
|
|
144 |
|
|
|
145 |
if (verbose > 0) { |
|
|
146 |
cat("\tReturning TF-peak links as bipartite object\n") |
|
|
147 |
} |
|
|
148 |
|
|
|
149 |
# Set default names for the networks if not provided |
|
|
150 |
if (is.null(tf_multiplex_name)) { |
|
|
151 |
cat("no TF layer name provided, using tf_expr_assay name\n") |
|
|
152 |
tf_multiplex_name <- tf_expr_assay |
|
|
153 |
} |
|
|
154 |
if (is.null(peak_multiplex_name)) { |
|
|
155 |
cat("no peak layer name provided, using peak_assay name\n") |
|
|
156 |
peak_multiplex_name <- peak_assay |
|
|
157 |
} |
|
|
158 |
|
|
|
159 |
# Return tf-peak bipartite |
|
|
160 |
hummus_object@multilayer@bipartites[[bipartite_name]] <- new("bipartite", |
|
|
161 |
"network" = tfs2peaks, |
|
|
162 |
"multiplex_left" = peak_multiplex_name, |
|
|
163 |
"multiplex_right" = tf_multiplex_name) |
|
|
164 |
return(hummus_object) # Return TF-peak bipartite object |
|
|
165 |
} |
|
|
166 |
|
|
|
167 |
|
|
|
168 |
|
|
|
169 |
|
|
|
170 |
#' Compute links between DNA regions and genenames |
|
|
171 |
#' |
|
|
172 |
#' Compute and add bipartite between DNA regions and genenames to hummus object. |
|
|
173 |
#' Links are computed based on the distance between peaks and gene's TSS |
|
|
174 |
#' location from gene.range annotations. |
|
|
175 |
#' Call find_peaks_near_genes function, that can use different methods. |
|
|
176 |
#' |
|
|
177 |
#' @param hummus_object (hummus_object) - Hummus object. |
|
|
178 |
#' @param gene_assay (character) - Name of assay containing the gene expression |
|
|
179 |
#' data. Default: "RNA". |
|
|
180 |
#' @param peak_assay (character) - Name of the assay containing the DNA regions |
|
|
181 |
#' (ATAC peaks). Default: "peaks". |
|
|
182 |
#' @param gene_multiplex_name (character) - Name of the multiplex containing the |
|
|
183 |
#' genes. |
|
|
184 |
#' If NULL, the name of the gene assay is used. |
|
|
185 |
#' @param peak_multiplex_name (character) - Name of the multiplex containing the |
|
|
186 |
#' DNA regions (ATAC peaks). If NULL, the name of the peak assay is used. |
|
|
187 |
#' @param peak_to_gene_method (character) - Method to use to compute the links |
|
|
188 |
#' between peaks and genes. Default: "Signac". |
|
|
189 |
#' * \code{'Signac'} - Use Signac::Extend to extend genes. |
|
|
190 |
#' * \code{'GREAT'} - Not implemented yet. |
|
|
191 |
#' @param upstream (int) - Upstream distance from TSS |
|
|
192 |
#' to consider as potential promoter. |
|
|
193 |
#' @param downstream (int) - Downstream distance from TSS |
|
|
194 |
#' to consider as potential promoter. |
|
|
195 |
#' @param only_tss (logical) - If TRUE, only TSS will be considered. |
|
|
196 |
#' @param store_network (bool) - Save the bipartite directly |
|
|
197 |
#' (\code{TRUE}, default) or return without saving on disk (\code{FALSE}). |
|
|
198 |
#' @param output_file (character) - Name of the output_file |
|
|
199 |
#' (if store_bipartite == \code{TRUE}). Default: NULL. |
|
|
200 |
#' @param verbose (integer) Display function messages. |
|
|
201 |
#' Set to 0 for no message displayed, >= 1 for more details. Default: 1. |
|
|
202 |
#' @param bipartite_name (character) - Name of bipartite. Default: "atac_rna". |
|
|
203 |
#' |
|
|
204 |
#' @return hummus_object (hummus_object) - Hummus object w/ atac-rna bipartite |
|
|
205 |
#' added to the multilayer slot |
|
|
206 |
#' @export |
|
|
207 |
#' |
|
|
208 |
#' @examples hummus <- bipartite_peaks2genes( |
|
|
209 |
#' hummus_object = hummus, |
|
|
210 |
#' gene_assay = "RNA", |
|
|
211 |
#' peak_assay = "peaks", |
|
|
212 |
#' gene_multiplex_name = "RNA", |
|
|
213 |
#' peak_multiplex_name = "peaks", |
|
|
214 |
#' peak_to_gene_method = "Signac", |
|
|
215 |
#' upstream = 500, |
|
|
216 |
#' downstream = 500, |
|
|
217 |
#' only_tss = TRUE, |
|
|
218 |
#' store_network = FALSE, |
|
|
219 |
#' bipartite_name = "atac_rna") |
|
|
220 |
|
|
|
221 |
bipartite_peaks2genes <- function( |
|
|
222 |
hummus_object, |
|
|
223 |
gene_assay = "RNA", |
|
|
224 |
peak_assay = "peaks", |
|
|
225 |
gene_multiplex_name = NULL, |
|
|
226 |
peak_multiplex_name = NULL, |
|
|
227 |
peak_to_gene_method = "Signac", |
|
|
228 |
upstream = 500, |
|
|
229 |
downstream = 500, |
|
|
230 |
only_tss = TRUE, |
|
|
231 |
store_network = FALSE, |
|
|
232 |
output_file = NULL, |
|
|
233 |
bipartite_name = "atac_rna" |
|
|
234 |
) { |
|
|
235 |
# Check if the gene assay is present in the hummus object |
|
|
236 |
if (!gene_assay %in% names(hummus_object@assays)) { |
|
|
237 |
stop("The gene assay is not present in the hummus object") |
|
|
238 |
} else if (!peak_assay %in% names(hummus_object@assays)) { |
|
|
239 |
# Check if the peak assay is present in the hummus object |
|
|
240 |
stop("The peak assay is not present in the hummus object") |
|
|
241 |
} else if (!inherits(hummus_object@assays[[peak_assay]], |
|
|
242 |
"ChromatinAssay")) { |
|
|
243 |
# Check if the peak assay is a ChromatinAssay object |
|
|
244 |
stop("The peak assay is not a ChromatinAssay object |
|
|
245 |
or does not have annotations (gene.range object))") |
|
|
246 |
} else if (is.null(Signac::Annotation(hummus_object[[peak_assay]]))) { |
|
|
247 |
# Check if the peak assay has gene.range annotations |
|
|
248 |
stop("The peak assay does not have annotations (gene.range object)") |
|
|
249 |
} |
|
|
250 |
|
|
|
251 |
# Find candidate regions near gene bodies |
|
|
252 |
peaks_near_genes <- find_peaks_near_genes( |
|
|
253 |
peaks = hummus_object[[peak_assay]]@ranges, |
|
|
254 |
method = peak_to_gene_method, |
|
|
255 |
genes = Signac::Annotation(hummus_object[[peak_assay]]), |
|
|
256 |
upstream = upstream, |
|
|
257 |
downstream = downstream, |
|
|
258 |
only_tss = only_tss) |
|
|
259 |
# Aggregate candidate regions to gene bodies (peak to gene matrix) |
|
|
260 |
peaks2genes <- aggregate_matrix(Matrix::t(peaks_near_genes), |
|
|
261 |
groups = colnames(peaks_near_genes), |
|
|
262 |
fun = "sum") |
|
|
263 |
# Keep only the genes that are in our scRNA-seq dataset |
|
|
264 |
peaks2genes <- peaks2genes[rownames(peaks2genes) |
|
|
265 |
%in% rownames(hummus_object@assays[[gene_assay]]), ] |
|
|
266 |
# Remove rows/cols with only zeros |
|
|
267 |
peaks2genes <- peaks2genes[Matrix::rowSums(peaks2genes) != 0, |
|
|
268 |
Matrix::colSums(peaks2genes) != 0] |
|
|
269 |
# peak-gene links |
|
|
270 |
peaks2genes <- expand.grid(rownames(peaks2genes), |
|
|
271 |
colnames(peaks2genes))[as.vector(peaks2genes > 0), ] |
|
|
272 |
colnames(peaks2genes) <- c("gene", "peak") # set column names |
|
|
273 |
|
|
|
274 |
|
|
|
275 |
# Save peak-gene links |
|
|
276 |
store_network(network = peaks2genes, |
|
|
277 |
store_network = store_network, |
|
|
278 |
output_file = output_file, |
|
|
279 |
verbose = 1) |
|
|
280 |
|
|
|
281 |
# Set default names for the networks if not provided |
|
|
282 |
if (is.null(gene_multiplex_name)) { |
|
|
283 |
gene_multiplex_name <- gene_assay |
|
|
284 |
} |
|
|
285 |
if (is.null(peak_multiplex_name)) { |
|
|
286 |
peak_multiplex_name <- peak_assay |
|
|
287 |
} |
|
|
288 |
|
|
|
289 |
# Return atac-rna bipartite |
|
|
290 |
hummus_object@multilayer@bipartites[[bipartite_name]] <- new("bipartite", |
|
|
291 |
"network" = peaks2genes, |
|
|
292 |
"multiplex_left" = gene_multiplex_name, |
|
|
293 |
"multiplex_right" = peak_multiplex_name) |
|
|
294 |
return(hummus_object) |
|
|
295 |
} |
|
|
296 |
|
|
|
297 |
#' @title Associate peaks to genes based on distance to TSS (or gene body) |
|
|
298 |
#' |
|
|
299 |
#' @param peaks vector(character) - List of peaks. |
|
|
300 |
#' @param genes vector(character) - List of genes. |
|
|
301 |
#' @param sep vector(character) - Separator between chromosome, |
|
|
302 |
#' start and end position. Default: c('-', '-'). |
|
|
303 |
#' @param method (character) - Method to use. Default: "Signac". |
|
|
304 |
#' * \code{'Signac'} - Use Signac::Extend to extend genes. |
|
|
305 |
#' * \code{'GREAT'} - Not implemented yet. |
|
|
306 |
#' @param upstream (int) - Upstream distance from TSS |
|
|
307 |
#' to consider as potential promoter. |
|
|
308 |
#' @param downstream (int) - Downstream distance from TSS |
|
|
309 |
#' to consider as potential promoter. |
|
|
310 |
#' @param extend (int) - Integer defining the distance from the upstream |
|
|
311 |
#' and downstream of the basal regulatory region. Used only by method 'GREAT'. |
|
|
312 |
#' @param only_tss (logical) - If TRUE, only TSS will be considered. |
|
|
313 |
#' @param verbose (logical) - If TRUE, print progress messages. |
|
|
314 |
#' |
|
|
315 |
#' @return (matrix) - Matrix of peaks x genes with 1 if peak is near gene. |
|
|
316 |
#' @export |
|
|
317 |
#' |
|
|
318 |
#' @examples TODO |
|
|
319 |
find_peaks_near_genes <- function( |
|
|
320 |
peaks, |
|
|
321 |
genes, |
|
|
322 |
sep = c("-", "-"), |
|
|
323 |
method = c("Signac", "GREAT"), |
|
|
324 |
upstream = 100000, |
|
|
325 |
downstream = 0, |
|
|
326 |
extend = 1000000, |
|
|
327 |
only_tss = FALSE, |
|
|
328 |
verbose = TRUE |
|
|
329 |
) { |
|
|
330 |
# Match arg |
|
|
331 |
method <- match.arg(method) |
|
|
332 |
|
|
|
333 |
if (method == "Signac") { |
|
|
334 |
|
|
|
335 |
if (only_tss) { |
|
|
336 |
genes <- IRanges::resize(x = genes, width = 1, fix = "start") |
|
|
337 |
} |
|
|
338 |
genes_extended <- suppressWarnings( |
|
|
339 |
expr = Signac::Extend( |
|
|
340 |
genes, upstream = upstream, downstream = downstream |
|
|
341 |
) |
|
|
342 |
) |
|
|
343 |
overlaps <- IRanges::findOverlaps( |
|
|
344 |
query = peaks, |
|
|
345 |
subject = genes_extended, |
|
|
346 |
type = "any", |
|
|
347 |
select = "all" |
|
|
348 |
) |
|
|
349 |
hit_matrix <- Matrix::sparseMatrix( |
|
|
350 |
i = S4Vectors::queryHits(overlaps), |
|
|
351 |
j = S4Vectors::subjectHits(overlaps), |
|
|
352 |
x = 1, |
|
|
353 |
dims = c(length(peaks), length(genes_extended)) |
|
|
354 |
) |
|
|
355 |
rownames(hit_matrix) <- Signac::GRangesToString(grange = peaks, sep = sep) |
|
|
356 |
colnames(hit_matrix) <- genes_extended$gene_name |
|
|
357 |
|
|
|
358 |
} else { |
|
|
359 |
stop("method must be either 'Signac' or 'GREAT' ; |
|
|
360 |
please check that current version of HuMMuS |
|
|
361 |
already accepts GREAT as a method.") |
|
|
362 |
} |
|
|
363 |
return(hit_matrix) |
|
|
364 |
} |
|
|
365 |
|
|
|
366 |
|
|
|
367 |
#' @title Filter peaks to those overlapping specific (regulatory) elements |
|
|
368 |
#' @description Function to reduce list of "Peaks" to the ones overlapping with |
|
|
369 |
#' list of "RegEl", e.g. regulatory elements, evolutionary conserved regions |
|
|
370 |
#' |
|
|
371 |
#' @param Peaks (character) vector of genomic coordinates of peaks |
|
|
372 |
#' @param RegEl (character) vector of genomic coordinates of regulatory elements |
|
|
373 |
#' @param sep_Peak1 (character) separator between chromosome and |
|
|
374 |
#' start position of peak |
|
|
375 |
#' @param sep_Peak2 (character) separator between start position |
|
|
376 |
#' and end position of peak |
|
|
377 |
#' @param sep_RegEl1 (character) separator between chromosome and |
|
|
378 |
#' start position of regulatory element |
|
|
379 |
#' @param sep_RegEl2 (character) separator between start position and |
|
|
380 |
#' end position of regulatory element |
|
|
381 |
#' |
|
|
382 |
#' @return (character) vector of genomic coordinates of peaks overlapping |
|
|
383 |
#' @export |
|
|
384 |
#' |
|
|
385 |
#' @examples peaks_in_regulatory_elements(peaks, RegEl) |
|
|
386 |
peaks_in_regulatory_elements <- function( |
|
|
387 |
Peaks, |
|
|
388 |
RegEl, |
|
|
389 |
sep_Peak1 = "-", |
|
|
390 |
sep_Peak2 = "-", |
|
|
391 |
sep_RegEl1 = "-", |
|
|
392 |
sep_RegEl2 = "-" |
|
|
393 |
) { |
|
|
394 |
# Make sure Peaks and RegEl are unique |
|
|
395 |
Peaks <- unique(Peaks) |
|
|
396 |
RegEl <- unique(RegEl) |
|
|
397 |
|
|
|
398 |
# convert genomic corrdinate string to GRanges object |
|
|
399 |
Peak_GRangesObj <- Signac::StringToGRanges(Peaks, |
|
|
400 |
sep = c(sep_Peak1, sep_Peak2)) |
|
|
401 |
RegEl_GRangesObj <- Signac::StringToGRanges(RegEl, |
|
|
402 |
sep = c(sep_RegEl1, sep_RegEl2)) |
|
|
403 |
|
|
|
404 |
# find overlap between peaks and regulatory elements |
|
|
405 |
PeakOverlaps <- IRanges::findOverlaps(query = RegEl_GRangesObj, |
|
|
406 |
subject = Peak_GRangesObj) |
|
|
407 |
|
|
|
408 |
# return peaks that overlapped with regulatory element |
|
|
409 |
return(Peaks[unique(as.matrix(PeakOverlaps)[, 2])]) |
|
|
410 |
} |