|
a |
|
b/R/method_wrappers.R |
|
|
1 |
# Description: This file contains the wrapper functions for the methods that |
|
|
2 |
# are used to compute the different layers of the multilayer network. The |
|
|
3 |
# functions are called from the compute_*_network functions in layers.R |
|
|
4 |
# For now, only the compute_atac_peak_network function has wrapper functions |
|
|
5 |
# for the different methods. The other methods are still directly implemented |
|
|
6 |
# in the compute_*_network functions in layers.R |
|
|
7 |
|
|
|
8 |
#' @title Cicero wrapper function for the compute_atac_peak_network function |
|
|
9 |
#' |
|
|
10 |
#' @description This function is a wrapper for the compute_atac_peak_network |
|
|
11 |
#' function in layers.R. It computes the peak network from scATAC-seq data |
|
|
12 |
#' using Cicero. It returns a data frame with the peak network. The data frame |
|
|
13 |
#' also contains the coaccess score for each edge. The coaccess score is the |
|
|
14 |
#' probability that two peaks are accessible in the same cell. The coaccess |
|
|
15 |
#' score is computed by Cicero. Edges are filtered based on the coaccess score. |
|
|
16 |
#' Only edges with a coaccess score > threshold are kept. |
|
|
17 |
#' |
|
|
18 |
#' @param hummus A hummus object |
|
|
19 |
#' @param atac_assay The name of the assay containing the scATAC-seq data |
|
|
20 |
#' @param genome The genome object |
|
|
21 |
#' @param window The window size used by Cicero to compute the coaccess score |
|
|
22 |
#' @param number_cells_per_clusters The number of cells per cluster used by |
|
|
23 |
#' Cicero to compute the coaccess score |
|
|
24 |
#' @param sample_num The number of samples used by Cicero to compute the |
|
|
25 |
#' coaccess score |
|
|
26 |
#' @param seed The seed used by Cicero to compute the coaccess score |
|
|
27 |
#' @param verbose The verbosity level |
|
|
28 |
#' @param threshold The threshold used to filter edges based on the coaccess |
|
|
29 |
#' score |
|
|
30 |
#' @param reduction_method The method used by monocle3 to reduce the dimension |
|
|
31 |
#' of the scATAC-seq data before defining the pseudocells. The default is UMAP. |
|
|
32 |
#' |
|
|
33 |
#' @return A data frame containing the peak network |
|
|
34 |
#' @export |
|
|
35 |
#' |
|
|
36 |
run_cicero_wrapper <- function( |
|
|
37 |
hummus, |
|
|
38 |
atac_assay, |
|
|
39 |
genome, |
|
|
40 |
window, |
|
|
41 |
number_cells_per_clusters, |
|
|
42 |
sample_num, |
|
|
43 |
seed, |
|
|
44 |
verbose, |
|
|
45 |
threshold, |
|
|
46 |
reduction_method = "UMAP" |
|
|
47 |
) { |
|
|
48 |
|
|
|
49 |
# functions that need to be renamed |
|
|
50 |
int_elementMetadata <- SingleCellExperiment::int_elementMetadata |
|
|
51 |
counts <- SingleCellExperiment::counts |
|
|
52 |
|
|
|
53 |
# obtain chromosome sizes |
|
|
54 |
chromosome_sizes <- data.frame(V1 = genome@seqinfo@seqnames, |
|
|
55 |
V2 = genome@seqinfo@seqlengths) |
|
|
56 |
|
|
|
57 |
# Get scATAC-seq data |
|
|
58 |
scATAC <- as.matrix(hummus@assays[[atac_assay]]@counts) |
|
|
59 |
# Matrix to edgelist |
|
|
60 |
acc <- reshape2::melt(scATAC) |
|
|
61 |
colnames(acc) <- c("V1", "V2", "V3") |
|
|
62 |
|
|
|
63 |
# Prepare cicero input |
|
|
64 |
input_cds <- cicero::make_atac_cds(acc, binarize = TRUE) # Create CDS object |
|
|
65 |
set.seed(seed) |
|
|
66 |
# It is required that there is no empty cell |
|
|
67 |
if (length(which(colSums(as.matrix(monocle3::exprs(input_cds))) == 0)) == 0 |
|
|
68 |
) { |
|
|
69 |
# Calculating size factors using default method = mean-geometric-mean-total |
|
|
70 |
input_cds <- monocle3::estimate_size_factors(input_cds) |
|
|
71 |
# Preprocessing using LSI |
|
|
72 |
input_cds <- monocle3::preprocess_cds(input_cds, method = "LSI") |
|
|
73 |
# Dimensionality reduction using UMAP |
|
|
74 |
input_cds <- monocle3::reduce_dimension( |
|
|
75 |
input_cds, |
|
|
76 |
reduction_method = reduction_method, |
|
|
77 |
preprocess_method = "LSI") |
|
|
78 |
} else { |
|
|
79 |
print("Error: there is at least one cell with no signal.") |
|
|
80 |
} |
|
|
81 |
# Get reduced (UMAP) coordinates |
|
|
82 |
umap_coords <- SingleCellExperiment::reducedDims(input_cds)$UMAP |
|
|
83 |
# Compute pseudocells |
|
|
84 |
cicero_cds <- cicero::make_cicero_cds( |
|
|
85 |
input_cds, # Create a Cicero CDS object |
|
|
86 |
reduced_coordinates = umap_coords, |
|
|
87 |
k = number_cells_per_clusters, #number neighbors/ Default = 50 |
|
|
88 |
summary_stats = NULL, # Default |
|
|
89 |
size_factor_normalize = TRUE, # Default |
|
|
90 |
silent = FALSE) # Default |
|
|
91 |
|
|
|
92 |
cicero <- cicero::run_cicero( |
|
|
93 |
cds = cicero_cds, # Infer peak-links |
|
|
94 |
genomic_coords = chromosome_sizes, |
|
|
95 |
window = window, # Default = 5e+05 |
|
|
96 |
silent = FALSE, # Default = FALSE |
|
|
97 |
sample_num = sample_num) # Default = 100 |
|
|
98 |
|
|
|
99 |
# Remove NAs, double edges, and edges with coaccess score <=0 |
|
|
100 |
# Check for coaccess = NA |
|
|
101 |
if (length(which(is.na(cicero$coaccess))) > threshold) { |
|
|
102 |
cicero <- cicero[which(!is.na(cicero$coaccess)), ] # Remove NAs |
|
|
103 |
} |
|
|
104 |
cicero$temp <- NA # Helper column to check and remove double edges |
|
|
105 |
my_cols <- which(as.character(cicero$Peak1) <= as.character(cicero$Peak2)) |
|
|
106 |
cicero$temp[my_cols] <- paste(cicero$Peak1[my_cols], |
|
|
107 |
cicero$Peak2[my_cols], |
|
|
108 |
sep = ";") |
|
|
109 |
|
|
|
110 |
my_cols <- which(as.character(cicero$Peak1) > as.character(cicero$Peak2)) |
|
|
111 |
cicero$temp[my_cols] <- paste(cicero$Peak2[my_cols], |
|
|
112 |
cicero$Peak1[my_cols], |
|
|
113 |
sep = ";") |
|
|
114 |
|
|
|
115 |
# Sort table according to temp-column (each entry appears double) |
|
|
116 |
cicero <- cicero[with(cicero, order(temp, decreasing = TRUE)), ] |
|
|
117 |
rownames(cicero) <- c(1:dim(cicero)[1]) |
|
|
118 |
A <- as.character(cicero$Peak1[seq(1, dim(cicero)[1], 2)]) |
|
|
119 |
Anum <- round(cicero$coaccess[seq(1, dim(cicero)[1], 2)], 10) |
|
|
120 |
B <- as.character(cicero$Peak2[seq(2, dim(cicero)[1], 2)]) |
|
|
121 |
Bnum <- round(cicero$coaccess[seq(2, dim(cicero)[1], 2)], 10) |
|
|
122 |
# length(which(A==B & Anum==Bnum)) |
|
|
123 |
# Each edge appears twice with same coaccess score (rounded to 10 digits) |
|
|
124 |
cicero <- cicero[seq(1, dim(cicero)[1], 2), ] # Remove double edges |
|
|
125 |
cicero$temp <- NULL # Remove helper column |
|
|
126 |
cicero <- cicero[with(cicero, order(cicero$coaccess, |
|
|
127 |
decreasing = TRUE)), ] # Sort |
|
|
128 |
rownames(cicero) <- c(1:dim(cicero)[1]) |
|
|
129 |
cicero$Peak1 <- gsub("_", "-", cicero$Peak1) |
|
|
130 |
# Peak names 2x"-" to match bipartites |
|
|
131 |
cicero$Peak2 <- gsub("_", "-", cicero$Peak2) |
|
|
132 |
# Peak names 2x"-" to match bipartites ? 2x"-" or 2x"_" |
|
|
133 |
|
|
|
134 |
peak_network <- cicero[which(cicero$coaccess > threshold), ] |
|
|
135 |
# Remove edges with coaccess score <= threshold |
|
|
136 |
|
|
|
137 |
if (verbose > 0) { |
|
|
138 |
cat("\n", dim(peak_network)[1], "peak edges with a coaccess score >", |
|
|
139 |
threshold, "were found.\n") |
|
|
140 |
} |
|
|
141 |
|
|
|
142 |
# Return peak network including edges with positive coaccess score |
|
|
143 |
return(peak_network) |
|
|
144 |
} |
|
|
145 |
|
|
|
146 |
|
|
|
147 |
#' @title Omnipath wrapper function for the compute_tf_network function |
|
|
148 |
#' @description This function is a wrapper for the compute_tf_network function |
|
|
149 |
#' in layers.R. It computes the TF network from using Omnipath database. |
|
|
150 |
#' It returns a data frame with the TF network. The data frame is not weighted |
|
|
151 |
#' and does not contain scores for the edges. |
|
|
152 |
#' @param hummus A hummus object |
|
|
153 |
# Get TF-TF interactions from Omnipath |
|
|
154 |
run_omnipath_wrapper <- function( |
|
|
155 |
hummus = hummus, |
|
|
156 |
organism = organism, |
|
|
157 |
tfs = tfs, |
|
|
158 |
gene_assay = gene_assay, |
|
|
159 |
source_target = source_target, |
|
|
160 |
verbose = 1) { |
|
|
161 |
|
|
|
162 |
TF_PPI <- OmnipathR::import_post_translational_interactions( |
|
|
163 |
organism = organism, partners = tfs, source_target = source_target |
|
|
164 |
) |
|
|
165 |
|
|
|
166 |
if (verbose > 0) { |
|
|
167 |
cat("\tNumber of edges from Omnipath:", nrow(TF_PPI), |
|
|
168 |
"\nWill now be filtered to only those corresponding to specified tfs") |
|
|
169 |
} |
|
|
170 |
|
|
|
171 |
if (is.na(tfs)) { |
|
|
172 |
# Get tfs list |
|
|
173 |
tfs <- get_tfs(hummus = hummus, |
|
|
174 |
assay = gene_assay, |
|
|
175 |
store_tfs = FALSE, |
|
|
176 |
output_file = NULL, |
|
|
177 |
verbose = verbose) |
|
|
178 |
} else if (typeof(tfs) != "character") { |
|
|
179 |
stop("'tfs' argument needs to be a vector of characters |
|
|
180 |
(e.g.: c('MYC', 'JAK1')).") |
|
|
181 |
} |
|
|
182 |
|
|
|
183 |
# add filtering if element is not a TF expressed in the dataset |
|
|
184 |
if (source_target == "AND") { |
|
|
185 |
TF_PPI <- TF_PPI[which(TF_PPI$source_genesymbol %in% tfs & |
|
|
186 |
TF_PPI$target_genesymbol %in% tfs), ] |
|
|
187 |
} else if (source_target == "OR") { |
|
|
188 |
TF_PPI <- TF_PPI[which(TF_PPI$source_genesymbol %in% tfs | |
|
|
189 |
TF_PPI$target_genesymbol %in% tfs), ] |
|
|
190 |
} |
|
|
191 |
# Get only source and target columns |
|
|
192 |
tf_network <- TF_PPI[, c(3, 4)] |
|
|
193 |
|
|
|
194 |
# Convert to data.frame from tibble |
|
|
195 |
tf_network <- as.data.frame(tf_network) |
|
|
196 |
|
|
|
197 |
# Check if there is any TF-TF edges otherwise add a fake node |
|
|
198 |
# and connect all TFs to it (to allow HuMMuS to run without impacting result) |
|
|
199 |
if (nrow(tf_network) == 0) { |
|
|
200 |
cat("No TF-TF edges from Omnipath for the given parameters. |
|
|
201 |
You can try to change the source_target parameter to 'OR' to get |
|
|
202 |
TF-other protein interactions. Or try to import a network |
|
|
203 |
computed externally. Right now, a network with all TFs connected |
|
|
204 |
to a fake node is created, for HuMMuS analysis.\n It has no biological |
|
|
205 |
meaning but will allow to run the pipeline as if no edges were present. |
|
|
206 |
\n") |
|
|
207 |
tf_network <- run_tf_null_wrapper( |
|
|
208 |
hummus = hummus, |
|
|
209 |
organism = organism, |
|
|
210 |
tfs = tfs, |
|
|
211 |
gene_assay = gene_assay, |
|
|
212 |
verbose = ) |
|
|
213 |
|
|
|
214 |
} |
|
|
215 |
return(tf_network) |
|
|
216 |
} |
|
|
217 |
|
|
|
218 |
#' @title tf_null wrapper function for the tf_network function |
|
|
219 |
#' @description This function is a wrapper for the tf_network function |
|
|
220 |
#' |
|
|
221 |
#' @param hummus A hummus object |
|
|
222 |
#' |
|
|
223 |
#' |
|
|
224 |
run_tf_null_wrapper <- function( |
|
|
225 |
hummus = hummus, |
|
|
226 |
organism = organism, |
|
|
227 |
tfs = tfs, |
|
|
228 |
gene_assay = gene_assay, |
|
|
229 |
verbose = 1) { |
|
|
230 |
|
|
|
231 |
if (verbose > 0) { |
|
|
232 |
cat("Creating a fake TF network with all TFs connected to a fake node.\n") |
|
|
233 |
} |
|
|
234 |
|
|
|
235 |
if (is.na(tfs)) { |
|
|
236 |
# Get tfs list |
|
|
237 |
tfs <- get_tfs(hummus = hummus, |
|
|
238 |
assay = gene_assay, |
|
|
239 |
store_tfs = FALSE, |
|
|
240 |
output_file = NULL, |
|
|
241 |
verbose = verbose) |
|
|
242 |
} else if (typeof(tfs) != "character") { |
|
|
243 |
stop("'tfs' argument needs to be a vector of characters |
|
|
244 |
(e.g.: c('MYC', 'JAK1')).") |
|
|
245 |
} |
|
|
246 |
|
|
|
247 |
|
|
|
248 |
FAKE_NODE <- "fake_node" |
|
|
249 |
tf_network <- data.frame(source = c(), target = c()) |
|
|
250 |
for (tf in tfs) { |
|
|
251 |
tf_network <- rbind(tf_network, data.frame(source = tf, target = FAKE_NODE)) |
|
|
252 |
} |
|
|
253 |
return(tf_network) |
|
|
254 |
} |