[9987e3]: / R / method_wrappers.R

Download this file

254 lines (228 with data), 10.0 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
# Description: This file contains the wrapper functions for the methods that
# are used to compute the different layers of the multilayer network. The
# functions are called from the compute_*_network functions in layers.R
# For now, only the compute_atac_peak_network function has wrapper functions
# for the different methods. The other methods are still directly implemented
# in the compute_*_network functions in layers.R
#' @title Cicero wrapper function for the compute_atac_peak_network function
#'
#' @description This function is a wrapper for the compute_atac_peak_network
#' function in layers.R. It computes the peak network from scATAC-seq data
#' using Cicero. It returns a data frame with the peak network. The data frame
#' also contains the coaccess score for each edge. The coaccess score is the
#' probability that two peaks are accessible in the same cell. The coaccess
#' score is computed by Cicero. Edges are filtered based on the coaccess score.
#' Only edges with a coaccess score > threshold are kept.
#'
#' @param hummus A hummus object
#' @param atac_assay The name of the assay containing the scATAC-seq data
#' @param genome The genome object
#' @param window The window size used by Cicero to compute the coaccess score
#' @param number_cells_per_clusters The number of cells per cluster used by
#' Cicero to compute the coaccess score
#' @param sample_num The number of samples used by Cicero to compute the
#' coaccess score
#' @param seed The seed used by Cicero to compute the coaccess score
#' @param verbose The verbosity level
#' @param threshold The threshold used to filter edges based on the coaccess
#' score
#' @param reduction_method The method used by monocle3 to reduce the dimension
#' of the scATAC-seq data before defining the pseudocells. The default is UMAP.
#'
#' @return A data frame containing the peak network
#' @export
#'
run_cicero_wrapper <- function(
hummus,
atac_assay,
genome,
window,
number_cells_per_clusters,
sample_num,
seed,
verbose,
threshold,
reduction_method = "UMAP"
) {
# functions that need to be renamed
int_elementMetadata <- SingleCellExperiment::int_elementMetadata
counts <- SingleCellExperiment::counts
# obtain chromosome sizes
chromosome_sizes <- data.frame(V1 = genome@seqinfo@seqnames,
V2 = genome@seqinfo@seqlengths)
# Get scATAC-seq data
scATAC <- as.matrix(hummus@assays[[atac_assay]]@counts)
# Matrix to edgelist
acc <- reshape2::melt(scATAC)
colnames(acc) <- c("V1", "V2", "V3")
# Prepare cicero input
input_cds <- cicero::make_atac_cds(acc, binarize = TRUE) # Create CDS object
set.seed(seed)
# It is required that there is no empty cell
if (length(which(colSums(as.matrix(monocle3::exprs(input_cds))) == 0)) == 0
) {
# Calculating size factors using default method = mean-geometric-mean-total
input_cds <- monocle3::estimate_size_factors(input_cds)
# Preprocessing using LSI
input_cds <- monocle3::preprocess_cds(input_cds, method = "LSI")
# Dimensionality reduction using UMAP
input_cds <- monocle3::reduce_dimension(
input_cds,
reduction_method = reduction_method,
preprocess_method = "LSI")
} else {
print("Error: there is at least one cell with no signal.")
}
# Get reduced (UMAP) coordinates
umap_coords <- SingleCellExperiment::reducedDims(input_cds)$UMAP
# Compute pseudocells
cicero_cds <- cicero::make_cicero_cds(
input_cds, # Create a Cicero CDS object
reduced_coordinates = umap_coords,
k = number_cells_per_clusters, #number neighbors/ Default = 50
summary_stats = NULL, # Default
size_factor_normalize = TRUE, # Default
silent = FALSE) # Default
cicero <- cicero::run_cicero(
cds = cicero_cds, # Infer peak-links
genomic_coords = chromosome_sizes,
window = window, # Default = 5e+05
silent = FALSE, # Default = FALSE
sample_num = sample_num) # Default = 100
# Remove NAs, double edges, and edges with coaccess score <=0
# Check for coaccess = NA
if (length(which(is.na(cicero$coaccess))) > threshold) {
cicero <- cicero[which(!is.na(cicero$coaccess)), ] # Remove NAs
}
cicero$temp <- NA # Helper column to check and remove double edges
my_cols <- which(as.character(cicero$Peak1) <= as.character(cicero$Peak2))
cicero$temp[my_cols] <- paste(cicero$Peak1[my_cols],
cicero$Peak2[my_cols],
sep = ";")
my_cols <- which(as.character(cicero$Peak1) > as.character(cicero$Peak2))
cicero$temp[my_cols] <- paste(cicero$Peak2[my_cols],
cicero$Peak1[my_cols],
sep = ";")
# Sort table according to temp-column (each entry appears double)
cicero <- cicero[with(cicero, order(temp, decreasing = TRUE)), ]
rownames(cicero) <- c(1:dim(cicero)[1])
A <- as.character(cicero$Peak1[seq(1, dim(cicero)[1], 2)])
Anum <- round(cicero$coaccess[seq(1, dim(cicero)[1], 2)], 10)
B <- as.character(cicero$Peak2[seq(2, dim(cicero)[1], 2)])
Bnum <- round(cicero$coaccess[seq(2, dim(cicero)[1], 2)], 10)
# length(which(A==B & Anum==Bnum))
# Each edge appears twice with same coaccess score (rounded to 10 digits)
cicero <- cicero[seq(1, dim(cicero)[1], 2), ] # Remove double edges
cicero$temp <- NULL # Remove helper column
cicero <- cicero[with(cicero, order(cicero$coaccess,
decreasing = TRUE)), ] # Sort
rownames(cicero) <- c(1:dim(cicero)[1])
cicero$Peak1 <- gsub("_", "-", cicero$Peak1)
# Peak names 2x"-" to match bipartites
cicero$Peak2 <- gsub("_", "-", cicero$Peak2)
# Peak names 2x"-" to match bipartites ? 2x"-" or 2x"_"
peak_network <- cicero[which(cicero$coaccess > threshold), ]
# Remove edges with coaccess score <= threshold
if (verbose > 0) {
cat("\n", dim(peak_network)[1], "peak edges with a coaccess score >",
threshold, "were found.\n")
}
# Return peak network including edges with positive coaccess score
return(peak_network)
}
#' @title Omnipath wrapper function for the compute_tf_network function
#' @description This function is a wrapper for the compute_tf_network function
#' in layers.R. It computes the TF network from using Omnipath database.
#' It returns a data frame with the TF network. The data frame is not weighted
#' and does not contain scores for the edges.
#' @param hummus A hummus object
# Get TF-TF interactions from Omnipath
run_omnipath_wrapper <- function(
hummus = hummus,
organism = organism,
tfs = tfs,
gene_assay = gene_assay,
source_target = source_target,
verbose = 1) {
TF_PPI <- OmnipathR::import_post_translational_interactions(
organism = organism, partners = tfs, source_target = source_target
)
if (verbose > 0) {
cat("\tNumber of edges from Omnipath:", nrow(TF_PPI),
"\nWill now be filtered to only those corresponding to specified tfs")
}
if (is.na(tfs)) {
# Get tfs list
tfs <- get_tfs(hummus = hummus,
assay = gene_assay,
store_tfs = FALSE,
output_file = NULL,
verbose = verbose)
} else if (typeof(tfs) != "character") {
stop("'tfs' argument needs to be a vector of characters
(e.g.: c('MYC', 'JAK1')).")
}
# add filtering if element is not a TF expressed in the dataset
if (source_target == "AND") {
TF_PPI <- TF_PPI[which(TF_PPI$source_genesymbol %in% tfs &
TF_PPI$target_genesymbol %in% tfs), ]
} else if (source_target == "OR") {
TF_PPI <- TF_PPI[which(TF_PPI$source_genesymbol %in% tfs |
TF_PPI$target_genesymbol %in% tfs), ]
}
# Get only source and target columns
tf_network <- TF_PPI[, c(3, 4)]
# Convert to data.frame from tibble
tf_network <- as.data.frame(tf_network)
# Check if there is any TF-TF edges otherwise add a fake node
# and connect all TFs to it (to allow HuMMuS to run without impacting result)
if (nrow(tf_network) == 0) {
cat("No TF-TF edges from Omnipath for the given parameters.
You can try to change the source_target parameter to 'OR' to get
TF-other protein interactions. Or try to import a network
computed externally. Right now, a network with all TFs connected
to a fake node is created, for HuMMuS analysis.\n It has no biological
meaning but will allow to run the pipeline as if no edges were present.
\n")
tf_network <- run_tf_null_wrapper(
hummus = hummus,
organism = organism,
tfs = tfs,
gene_assay = gene_assay,
verbose = )
}
return(tf_network)
}
#' @title tf_null wrapper function for the tf_network function
#' @description This function is a wrapper for the tf_network function
#'
#' @param hummus A hummus object
#'
#'
run_tf_null_wrapper <- function(
hummus = hummus,
organism = organism,
tfs = tfs,
gene_assay = gene_assay,
verbose = 1) {
if (verbose > 0) {
cat("Creating a fake TF network with all TFs connected to a fake node.\n")
}
if (is.na(tfs)) {
# Get tfs list
tfs <- get_tfs(hummus = hummus,
assay = gene_assay,
store_tfs = FALSE,
output_file = NULL,
verbose = verbose)
} else if (typeof(tfs) != "character") {
stop("'tfs' argument needs to be a vector of characters
(e.g.: c('MYC', 'JAK1')).")
}
FAKE_NODE <- "fake_node"
tf_network <- data.frame(source = c(), target = c())
for (tf in tfs) {
tf_network <- rbind(tf_network, data.frame(source = tf, target = FAKE_NODE))
}
return(tf_network)
}