|
a |
|
b/utils.R |
|
|
1 |
############################# |
|
|
2 |
## Commonly-used functions ## |
|
|
3 |
############################# |
|
|
4 |
|
|
|
5 |
load_SingleCellExperiment <- function(file, normalise = FALSE, features = NULL, cells = NULL, remove_non_expressed_genes = FALSE) { |
|
|
6 |
library(SingleCellExperiment); library(scran); library(scater); |
|
|
7 |
sce <- readRDS(file) |
|
|
8 |
if (!is.null(cells)) sce <- sce[,cells] |
|
|
9 |
if (!is.null(features)) sce <- sce[features,] |
|
|
10 |
if (remove_non_expressed_genes) sce <- sce[which(Matrix::rowSums(counts(sce))>15),] |
|
|
11 |
if (normalise) sce <- logNormCounts(sce) |
|
|
12 |
|
|
|
13 |
return(sce) |
|
|
14 |
} |
|
|
15 |
|
|
|
16 |
load_Seurat <- function(file, assay = "RNA", normalise = FALSE, features = NULL, cells = NULL, remove_non_expressed_genes = FALSE, ...) { |
|
|
17 |
library(Seurat) |
|
|
18 |
seurat <- readRDS(file) |
|
|
19 |
# if (assay%in%Seurat::Assays(seurat)) seurat <- seurat[[assay]] |
|
|
20 |
if (!is.null(cells)) seurat <- seurat[,cells] |
|
|
21 |
if (!is.null(features)) seurat <- seurat[features,] |
|
|
22 |
if (normalise) { |
|
|
23 |
seurat <- NormalizeData(seurat, normalization.method = "LogNormalize") |
|
|
24 |
seurat <- ScaleData(seurat, ...) |
|
|
25 |
} |
|
|
26 |
if (remove_non_expressed_genes) seurat <- seurat[which(Matrix::rowMeans(seurat@assays[[assay]]@counts)>1e-4),] |
|
|
27 |
return(seurat) |
|
|
28 |
} |
|
|
29 |
|
|
|
30 |
matrix.please<-function(x) { |
|
|
31 |
m<-as.matrix(x[,-1]) |
|
|
32 |
rownames(m)<-x[[1]] |
|
|
33 |
m |
|
|
34 |
} |
|
|
35 |
|
|
|
36 |
#' method=1: The TF-IDF implementation used by Stuart & Butler et al. 2019. This computes \eqn{\log(TF \times IDF)}. |
|
|
37 |
#' method=2: The TF-IDF implementation used by Cusanovich & Hill et al. 2018. This computes \eqn{TF \times (\log(IDF))}. |
|
|
38 |
#' method=3: The log-TF method used by Andrew Hill. This computes \eqn{\log(TF) \times \log(IDF)}. |
|
|
39 |
tfidf <- function(mtx, method = 1, scale.factor = 1e4) { |
|
|
40 |
npeaks <- colSums(mtx) |
|
|
41 |
if (any(npeaks == 0)) { |
|
|
42 |
warning("Some cells contain 0 total counts") |
|
|
43 |
} |
|
|
44 |
|
|
|
45 |
tf <- Matrix::tcrossprod(mtx, y = Matrix::Diagonal(x=1/npeaks)) |
|
|
46 |
|
|
|
47 |
rsums <- rowSums(mtx) |
|
|
48 |
if (any(rsums == 0)) { |
|
|
49 |
warning("Some features contain 0 total counts") |
|
|
50 |
} |
|
|
51 |
idf <- ncol(mtx) / rsums |
|
|
52 |
|
|
|
53 |
if (method == 2) { |
|
|
54 |
idf <- log(1 + idf) |
|
|
55 |
} else if (method == 3) { |
|
|
56 |
tf <- log1p(tf * scale.factor) |
|
|
57 |
idf <- log(1 + idf) |
|
|
58 |
} |
|
|
59 |
mtx.tfidf <- Matrix::Diagonal(n = length(idf), x = idf) %*% tf |
|
|
60 |
|
|
|
61 |
if (method == 1) { |
|
|
62 |
mtx.tfidf <- log1p(mtx.tfidf * scale.factor) |
|
|
63 |
} |
|
|
64 |
colnames(mtx.tfidf) <- colnames(mtx) |
|
|
65 |
rownames(mtx.tfidf) <- rownames(mtx) |
|
|
66 |
|
|
|
67 |
# set NA values to 0 |
|
|
68 |
mtx.tfidf[is.na(mtx.tfidf)] <- 0 |
|
|
69 |
|
|
|
70 |
return(mtx.tfidf) |
|
|
71 |
} |
|
|
72 |
|
|
|
73 |
pdist <- function(tmat){ |
|
|
74 |
# @param tmat A non-negative matrix with samples by features |
|
|
75 |
# @reference http://r.789695.n4.nabble.com/dist-function-in-R-is-very-slow-td4738317.html |
|
|
76 |
mtm <- Matrix::tcrossprod(tmat) |
|
|
77 |
sq <- rowSums(tmat^2) |
|
|
78 |
out0 <- outer(sq, sq, "+") - 2 * mtm |
|
|
79 |
out0[out0 < 0] <- 0 |
|
|
80 |
|
|
|
81 |
sqrt(out0) |
|
|
82 |
} |
|
|
83 |
|
|
|
84 |
smoother_aggregate_nearest_nb <- function(mat, D, k){ |
|
|
85 |
# @param mat A matrix in a shape of #genes x #samples. |
|
|
86 |
# @param D A predefined distance matrix in a shape of #samples x #samples. |
|
|
87 |
# @param k An integer to choose \code{k} nearest samples (self-inclusive) to |
|
|
88 |
# aggregate based on the distance matrix \code{D}. |
|
|
89 |
denoised_mat <- sapply(seq_len(ncol(mat)), function(cid){ |
|
|
90 |
nb_cid <- head(order(D[cid, ]), k) |
|
|
91 |
closest_mat <- mat[, nb_cid, drop=FALSE] |
|
|
92 |
# return(Matrix::rowSums(closest_mat)) |
|
|
93 |
return(Matrix::rowMeans(closest_mat)) |
|
|
94 |
}) |
|
|
95 |
dimnames(denoised_mat) <- dimnames(mat) |
|
|
96 |
return(denoised_mat) |
|
|
97 |
} |
|
|
98 |
|
|
|
99 |
|
|
|
100 |
# TO-FINISH..... |
|
|
101 |
smoother_aggregate_nearest_nb_parallel <- function(mat, D, k, cores=1){ |
|
|
102 |
# @param mat A matrix in a shape of #genes x #samples. |
|
|
103 |
# @param D A predefined distance matrix in a shape of #samples x #samples. |
|
|
104 |
# @param k An integer to choose \code{k} nearest samples (self-inclusive) to |
|
|
105 |
# aggregate based on the distance matrix \code{D}. |
|
|
106 |
|
|
|
107 |
# library(future) |
|
|
108 |
library(future.apply) |
|
|
109 |
plan("multiprocess", workers = ncores) |
|
|
110 |
|
|
|
111 |
|
|
|
112 |
# sapply(seq_len(ncol(mat)), function(cid){ |
|
|
113 |
future_sapply(seq_len(ncol(mat)), function(cid){ |
|
|
114 |
nb_cid <- head(order(D[cid, ]), k) |
|
|
115 |
closest_mat <- mat[, nb_cid, drop=FALSE] |
|
|
116 |
# return(Matrix::rowSums(closest_mat)) |
|
|
117 |
return(Matrix::rowMeans(closest_mat)) |
|
|
118 |
}) |
|
|
119 |
} |
|
|
120 |
|
|
|
121 |
# regress_covariates <- function(mtx, vars.to.regress) { |
|
|
122 |
# data <- scale(t(logcounts(sce_filt)), center = T, scale = F) |
|
|
123 |
# data_regressed <- apply(data, 2, function(x) { |
|
|
124 |
# lm.out <- lm(formula=expr~covariate, data=data.frame(expr=x, covariate=factor(sce_filt$stage))); |
|
|
125 |
# residuals <- lm.out[["residuals"]]+lm.out[["coefficients"]][1] |
|
|
126 |
# }) |
|
|
127 |
# } |
|
|
128 |
|
|
|
129 |
# Remove unwanted effects from a matrix |
|
|
130 |
# |
|
|
131 |
# @parm mtx An expression matrix to regress the effects of covariates out |
|
|
132 |
# of should be the complete expression matrix in genes x cells |
|
|
133 |
# @param covariates A matrix or data.frame of latent variables, should be cells |
|
|
134 |
# x covariates, the colnames should be the variables to regress |
|
|
135 |
# @param features_idx An integer vector representing the indices of the |
|
|
136 |
# genes to run regression on |
|
|
137 |
# @param model.use Model to use, one of 'linear', 'poisson', or 'negbinom'; pass |
|
|
138 |
# NULL to simply return mtx |
|
|
139 |
# @param verbose Display a progress bar |
|
|
140 |
#' @importFrom stats as.formula lm |
|
|
141 |
#' @importFrom utils txtProgressBar setTxtProgressBar |
|
|
142 |
# |
|
|
143 |
RegressOutMatrix_parallel <- function(mtx, covariates = NULL, features_idx = NULL, split.by = NULL, block.size = 1000, min.cells.to.block = 3000, ncores = 1, verbose = TRUE) { |
|
|
144 |
|
|
|
145 |
library(future) |
|
|
146 |
library(future.apply) |
|
|
147 |
plan("multiprocess", workers = ncores) |
|
|
148 |
|
|
|
149 |
# Check features_idx |
|
|
150 |
if (is.null(features_idx)) { |
|
|
151 |
features_idx <- 1:nrow(mtx) |
|
|
152 |
} |
|
|
153 |
if (is.character(features_idx)) { |
|
|
154 |
features_idx <- intersect(features_idx, rownames(mtx)) |
|
|
155 |
if (length(features_idx) == 0) { |
|
|
156 |
stop("Cannot use features that are beyond the scope of mtx") |
|
|
157 |
} |
|
|
158 |
} else if (max(features_idx) > nrow(mtx)) { |
|
|
159 |
stop("Cannot use features that are beyond the scope of mtx") |
|
|
160 |
} |
|
|
161 |
|
|
|
162 |
# Check dataset dimensions |
|
|
163 |
if (nrow(covariates) != ncol(mtx)) { |
|
|
164 |
stop("Uneven number of cells between latent data and expression data") |
|
|
165 |
} |
|
|
166 |
|
|
|
167 |
# Subset |
|
|
168 |
mtx <- mtx[features_idx,] |
|
|
169 |
mtx.dimnames <- dimnames(mtx) |
|
|
170 |
|
|
|
171 |
# Define chunck points |
|
|
172 |
chunk.points <- ChunkPoints(dsize = nrow(mtx), csize = block.size) |
|
|
173 |
|
|
|
174 |
# Define cell splitting |
|
|
175 |
split.cells <- split(colnames(mtx), f = split.by %||% TRUE) |
|
|
176 |
|
|
|
177 |
if (nbrOfWorkers() > 1) { |
|
|
178 |
|
|
|
179 |
# Define chuncks |
|
|
180 |
chunks <- expand.grid( |
|
|
181 |
names(split.cells), |
|
|
182 |
1:ncol(chunk.points), |
|
|
183 |
stringsAsFactors = FALSE |
|
|
184 |
) |
|
|
185 |
|
|
|
186 |
# Run RegressOutMatrix in parallel |
|
|
187 |
mtx.resid <- future_lapply( |
|
|
188 |
X = 1:nrow(chunks), |
|
|
189 |
FUN = function(i) { |
|
|
190 |
row <- chunks[i, ] |
|
|
191 |
group <- row[[1]] |
|
|
192 |
index <- as.numeric(row[[2]]) |
|
|
193 |
return(RegressOutMatrix( |
|
|
194 |
mtx = mtx[chunk.points[1, index]:chunk.points[2, index], split.cells[[group]], drop = FALSE], |
|
|
195 |
covariates = covariates[split.cells[[group]], , drop = FALSE], |
|
|
196 |
# features_idx = features_idx[chunk.points[1, index]:chunk.points[2, index]], |
|
|
197 |
verbose = FALSE |
|
|
198 |
)) |
|
|
199 |
} |
|
|
200 |
) |
|
|
201 |
|
|
|
202 |
# Merge splitted cells |
|
|
203 |
if (length(split.cells) > 1) { |
|
|
204 |
merge.indices <- lapply( |
|
|
205 |
X = 1:length(x = split.cells), |
|
|
206 |
FUN = seq.int, |
|
|
207 |
to = length(mtx.resid), |
|
|
208 |
by = length(split.cells) |
|
|
209 |
) |
|
|
210 |
mtx.resid <- lapply( |
|
|
211 |
X = merge.indices, |
|
|
212 |
FUN = function(x) { |
|
|
213 |
return(do.call( 'rbind', mtx.resid[x])) |
|
|
214 |
} |
|
|
215 |
) |
|
|
216 |
mtx.resid <- do.call('cbind', mtx.resid) |
|
|
217 |
} else { |
|
|
218 |
mtx.resid <- do.call( 'rbind', mtx.resid) |
|
|
219 |
} |
|
|
220 |
} else { |
|
|
221 |
|
|
|
222 |
mtx.resid <- lapply( |
|
|
223 |
X = names(split.cells), |
|
|
224 |
FUN = function(x) { |
|
|
225 |
if (verbose && length(split.cells) > 1) { |
|
|
226 |
message("Regressing out variables from split ", x) |
|
|
227 |
} |
|
|
228 |
return(RegressOutMatrix( |
|
|
229 |
mtx = mtx[, split.cells[[x]], drop = FALSE], |
|
|
230 |
covariates = covariates[split.cells[[x]], , drop = FALSE], |
|
|
231 |
features_idx = features_idx, |
|
|
232 |
verbose = verbose |
|
|
233 |
)) |
|
|
234 |
} |
|
|
235 |
) |
|
|
236 |
mtx.resid <- do.call('cbind', mtx.resid) |
|
|
237 |
} |
|
|
238 |
# dimnames(mtx.resid) <- dimnames(mtx) |
|
|
239 |
return(mtx.resid) |
|
|
240 |
} |
|
|
241 |
|
|
|
242 |
RegressOutMatrix <- function(mtx, covariates = NULL, features_idx = NULL, verbose = TRUE) { |
|
|
243 |
|
|
|
244 |
# Check features_idx |
|
|
245 |
if (is.null(features_idx)) { |
|
|
246 |
features_idx <- 1:nrow(mtx) |
|
|
247 |
} |
|
|
248 |
if (is.character(features_idx)) { |
|
|
249 |
features_idx <- intersect(features_idx, rownames(mtx)) |
|
|
250 |
if (length(features_idx) == 0) { |
|
|
251 |
stop("Cannot use features that are beyond the scope of mtx") |
|
|
252 |
} |
|
|
253 |
} else if (max(features_idx) > nrow(mtx)) { |
|
|
254 |
stop("Cannot use features that are beyond the scope of mtx") |
|
|
255 |
} |
|
|
256 |
|
|
|
257 |
# Check dataset dimensions |
|
|
258 |
if (nrow(covariates) != ncol(mtx)) { |
|
|
259 |
stop("Uneven number of cells between latent data and expression data") |
|
|
260 |
} |
|
|
261 |
|
|
|
262 |
# Subset |
|
|
263 |
mtx <- mtx[features_idx,] |
|
|
264 |
mtx.dimnames <- dimnames(mtx) |
|
|
265 |
|
|
|
266 |
# Create formula for regression |
|
|
267 |
vars.to.regress <- colnames(covariates) |
|
|
268 |
fmla <- paste('GENE ~', paste(vars.to.regress, collapse = '+')) %>% as.formula |
|
|
269 |
|
|
|
270 |
# In this code, we'll repeatedly regress different Y against the same X |
|
|
271 |
# (covariates) in order to calculate residuals. Rather that repeatedly |
|
|
272 |
# call lm to do this, we'll avoid recalculating the QR decomposition for the |
|
|
273 |
# covariates matrix each time by reusing it after calculating it once |
|
|
274 |
regression.mat <- cbind(covariates, mtx[1,]) |
|
|
275 |
colnames(regression.mat) <- c(colnames(covariates), "GENE") |
|
|
276 |
qr <- lm(fmla, data = regression.mat, qr = TRUE)$qr |
|
|
277 |
rm(regression.mat) |
|
|
278 |
|
|
|
279 |
# Make results matrix |
|
|
280 |
data.resid <- matrix( |
|
|
281 |
nrow = nrow(mtx), |
|
|
282 |
ncol = ncol(mtx) |
|
|
283 |
) |
|
|
284 |
|
|
|
285 |
if (verbose) pb <- txtProgressBar(char = '=', style = 3, file = stderr()) |
|
|
286 |
|
|
|
287 |
# Extract residuals from each feature by using the pre-computed QR decomposition |
|
|
288 |
for (i in 1:length(features_idx)) { |
|
|
289 |
regression.mat <- cbind(covariates, mtx[features_idx[i], ]) |
|
|
290 |
colnames(regression.mat) <- c(vars.to.regress, 'GENE') |
|
|
291 |
regression.mat <- qr.resid(qr = qr, y = mtx[features_idx[i],]) # The function qr.resid returns the residuals when fitting y to the matrix with QR decomposition. |
|
|
292 |
data.resid[i, ] <- regression.mat |
|
|
293 |
if (verbose) { |
|
|
294 |
setTxtProgressBar(pb = pb, value = i / length(features_idx)) |
|
|
295 |
} |
|
|
296 |
} |
|
|
297 |
|
|
|
298 |
if (verbose) close(con = pb) |
|
|
299 |
|
|
|
300 |
dimnames(data.resid) <- mtx.dimnames |
|
|
301 |
|
|
|
302 |
return(data.resid) |
|
|
303 |
} |
|
|
304 |
|
|
|
305 |
|
|
|
306 |
|
|
|
307 |
# Generate chunk points |
|
|
308 |
# |
|
|
309 |
# @param dsize How big is the data being chunked |
|
|
310 |
# @param csize How big should each chunk be |
|
|
311 |
# |
|
|
312 |
# @return A matrix where each column is a chunk, row 1 is start points, row 2 is end points |
|
|
313 |
# |
|
|
314 |
ChunkPoints <- function(dsize, csize) { |
|
|
315 |
return(vapply( |
|
|
316 |
X = 1L:ceiling(dsize / csize), |
|
|
317 |
FUN = function(i) { |
|
|
318 |
return(c( |
|
|
319 |
start = (csize * (i - 1L)) + 1L, |
|
|
320 |
end = min(csize * i, dsize) |
|
|
321 |
)) |
|
|
322 |
}, |
|
|
323 |
FUN.VALUE = numeric(length = 2L) |
|
|
324 |
)) |
|
|
325 |
} |
|
|
326 |
|
|
|
327 |
|
|
|
328 |
"%ni%" <- Negate("%in%") |
|
|
329 |
|
|
|
330 |
ggplot_theme_NoAxes <- function() { |
|
|
331 |
theme( |
|
|
332 |
axis.title = element_blank(), |
|
|
333 |
axis.line = element_blank(), |
|
|
334 |
axis.text = element_blank(), |
|
|
335 |
axis.ticks = element_blank() |
|
|
336 |
) |
|
|
337 |
} |
|
|
338 |
|
|
|
339 |
minmax.normalisation <- function(x) |
|
|
340 |
{ |
|
|
341 |
return((x-min(x,na.rm=T)) /(max(x,na.rm=T)-min(x,na.rm=T))) |
|
|
342 |
} |
|
|
343 |
|
|
|
344 |
getmode <- function(v, dist) { |
|
|
345 |
tab <- table(v) |
|
|
346 |
#if tie, break to shortest distance |
|
|
347 |
if(sum(tab == max(tab)) > 1){ |
|
|
348 |
tied <- names(tab)[tab == max(tab)] |
|
|
349 |
sub <- dist[v %in% tied] |
|
|
350 |
names(sub) <- v[v %in% tied] |
|
|
351 |
return(names(sub)[which.min(sub)]) |
|
|
352 |
} else { |
|
|
353 |
return(names(tab)[which.max(tab)]) |
|
|
354 |
} |
|
|
355 |
} |
|
|
356 |
|
|
|
357 |
GRangesToString <- function(grange, sep = c("-", "-")) { |
|
|
358 |
regions <- paste0( |
|
|
359 |
as.character(x = seqnames(x = grange)), |
|
|
360 |
sep[[1]], |
|
|
361 |
start(x = grange), |
|
|
362 |
sep[[2]], |
|
|
363 |
end(x = grange) |
|
|
364 |
) |
|
|
365 |
return(regions) |
|
|
366 |
} |
|
|
367 |
|
|
|
368 |
|
|
|
369 |
## sparse -> matrix |
|
|
370 |
dropNA2matrix <- function(x) { |
|
|
371 |
if(!is(x, "dsparseMatrix")) stop("x needs to be a subclass of dsparseMatrix!") |
|
|
372 |
|
|
|
373 |
x <- as(x, "dgCMatrix") |
|
|
374 |
|
|
|
375 |
## remember true NAs |
|
|
376 |
nas <- Matrix::which(is.na(x), arr.ind=TRUE) |
|
|
377 |
|
|
|
378 |
x@x[x@x==0] <- NA |
|
|
379 |
zeros <- Matrix::which(is.na(x), arr.ind=TRUE) |
|
|
380 |
x <- as(x, "matrix") |
|
|
381 |
x[x==0] <- NA |
|
|
382 |
x[zeros] <- 0 |
|
|
383 |
x[nas] <- NA |
|
|
384 |
x |
|
|
385 |
} |
|
|
386 |
|
|
|
387 |
# dropNA2vector <- function(x) { |
|
|
388 |
# stop("NEEDS TO BE FIXED") |
|
|
389 |
# if(!is(x, "dsparseMatrix")) stop("x needs to be a subclass of dsparseMatrix!") |
|
|
390 |
# x <- as(x, "numeric") |
|
|
391 |
|
|
|
392 |
# # true NAs |
|
|
393 |
# nas <- which(is.na(x), arr.ind=TRUE) |
|
|
394 |
|
|
|
395 |
# x[x==0] <- NA |
|
|
396 |
# zeros <- which(is.na(x), arr.ind=TRUE) |
|
|
397 |
# x[nas] <- NA |
|
|
398 |
# x[zeros] <- 0 |
|
|
399 |
# } |
|
|
400 |
|
|
|
401 |
## matrix -> sparse |
|
|
402 |
dropNA <- function(x) { |
|
|
403 |
if(!is(x, "matrix")) stop("x needs to be a matrix!") |
|
|
404 |
|
|
|
405 |
zeros <- which(x==0, arr.ind=TRUE) |
|
|
406 |
## keep zeros |
|
|
407 |
x[is.na(x)] <- 0 |
|
|
408 |
x[zeros] <- NA |
|
|
409 |
x <- Matrix::drop0(x) |
|
|
410 |
x[zeros] <- 0 |
|
|
411 |
x |
|
|
412 |
} |
|
|
413 |
|
|
|
414 |
# dropNAis.na <- function(x) { |
|
|
415 |
# if(!is(x, "dsparseMatrix")) stop("x needs to be a subclass of dsparseMatrix!") |
|
|
416 |
# x <- as(x, "dgCMatrix") |
|
|
417 |
# |
|
|
418 |
# ### not represented means NA and 0 means 0 |
|
|
419 |
# ### this coercion keeps 0 |
|
|
420 |
# !as(x, "ngCMatrix") |
|
|
421 |
# } |
|
|
422 |
|
|
|
423 |
give.n <- function(x){ |
|
|
424 |
return(c(y = mean(x), label = length(x))) |
|
|
425 |
} |
|
|
426 |
|
|
|
427 |
sort.abs <- function(dt, sort.field) dt[order(-abs(dt[[sort.field]]))] |
|
|
428 |
|
|
|
429 |
|
|
|
430 |
# function to pseudobulk a SingleCellExperiment object |
|
|
431 |
pseudobulk_sce_fn <- function(x, assay = NULL, by, fun = c("sum", "mean", "median"), scale = FALSE) { |
|
|
432 |
|
|
|
433 |
# check validity of input arguments |
|
|
434 |
fun <- match.arg(fun) |
|
|
435 |
if (is.null(assay)) assay <- assayNames(x)[1] |
|
|
436 |
|
|
|
437 |
# store aggregation parameters & |
|
|
438 |
# nb. of cells that went into aggregation |
|
|
439 |
md <- metadata(x) |
|
|
440 |
md$agg_pars <- list(assay = assay, by = by, fun = fun, scale = scale) |
|
|
441 |
|
|
|
442 |
# get aggregation function |
|
|
443 |
# fun <- switch(fun, sum = "rowSums", mean = "rowMeans", median = "rowMedians") |
|
|
444 |
|
|
|
445 |
# drop missing factor levels & tabulate number of cells |
|
|
446 |
cd <- dplyr::mutate_if(as.data.frame(colData(x)), is.factor, droplevels) |
|
|
447 |
colData(x) <- DataFrame(cd, row.names = colnames(x),check.names = FALSE) |
|
|
448 |
md$n_cells <- table(as.data.frame(colData(x)[, by])) |
|
|
449 |
|
|
|
450 |
# assure 'by' colData columns are factors so that missing combinations aren't dropped |
|
|
451 |
for (i in by) |
|
|
452 |
if (!is.factor(x[[i]])) |
|
|
453 |
x[[i]] <- factor(x[[i]]) |
|
|
454 |
|
|
|
455 |
# split cells & compute pseudo-bulks |
|
|
456 |
cs <- .split_cells(x, by) |
|
|
457 |
# pb <- .pb(x, cs, assay, fun) |
|
|
458 |
pb <- .pb(x=x, by=by, fun=fun) |
|
|
459 |
if (scale & length(by) == 2) { |
|
|
460 |
ls <- lapply(.pb(x, cs, "counts", "rowSums"), colSums) |
|
|
461 |
pb <- lapply(seq_along(pb), function(i) pb[[i]] / 1e6 * ls[[i]]) |
|
|
462 |
names(pb) <- names(ls) |
|
|
463 |
} |
|
|
464 |
|
|
|
465 |
# construct SCE |
|
|
466 |
pb <- SingleCellExperiment(pb, metadata = md) |
|
|
467 |
|
|
|
468 |
# propagate 'colData' columns that are unique across 2nd 'by' |
|
|
469 |
if (length(by) == 2) { |
|
|
470 |
cd <- colData(x) |
|
|
471 |
ids <- colnames(pb) |
|
|
472 |
counts <- vapply(ids, function(u) { |
|
|
473 |
m <- as.logical(match(cd[, by[2]], u, nomatch = 0)) |
|
|
474 |
vapply(cd[m, ], function(u) length(unique(u)), numeric(1)) |
|
|
475 |
}, numeric(ncol(colData(x)))) |
|
|
476 |
cd_keep <- apply(counts, 1, function(u) all(u == 1)) |
|
|
477 |
cd_keep <- setdiff(names(which(cd_keep)), by) |
|
|
478 |
if (length(cd_keep) != 0) { |
|
|
479 |
m <- match(ids, cd[, by[2]], nomatch = 0) |
|
|
480 |
cd <- cd[m, cd_keep, drop = FALSE] |
|
|
481 |
rownames(cd) <- ids |
|
|
482 |
colData(pb) <- cd |
|
|
483 |
} |
|
|
484 |
} |
|
|
485 |
return(pb) |
|
|
486 |
} |
|
|
487 |
|
|
|
488 |
|
|
|
489 |
# split cells by cluster-sample |
|
|
490 |
# auxiliary function to pseudobulk a SingleCellExperiment object |
|
|
491 |
# - by: character vector specifying colData column(s) to split by. If length(by) == 1, a list of length nlevels(colData$by), else, |
|
|
492 |
# a nested list with 2nd level of length nlevels(colData$by[2]) |
|
|
493 |
.split_cells <- function(x, by) { |
|
|
494 |
if (is(x, "SingleCellExperiment")) x <- colData(x) |
|
|
495 |
cd <- data.frame(x[by], check.names = FALSE) |
|
|
496 |
cd <- data.table(cd, cell = rownames(x)) %>% split(by = by, sorted = TRUE, flatten = FALSE) |
|
|
497 |
purrr::map_depth(cd, length(by), "cell") |
|
|
498 |
} |
|
|
499 |
|
|
|
500 |
|
|
|
501 |
# auxiliary function to pseudobulk a SingleCellExperiment object |
|
|
502 |
.pb <- function(x, by, fun) { |
|
|
503 |
|
|
|
504 |
# compute pseudobulks |
|
|
505 |
# y <- scuttle::summarizeAssayByGroup(x, assay.type = assay, ids = (ids <- colData(x)[by]), statistics = fun, BPPARAM = BiocParallel::SerialParam()) |
|
|
506 |
y <- scuttle::summarizeAssayByGroup(x, ids = colData(x)[by], statistics = fun) |
|
|
507 |
colnames(y) <- y[[by[length(by)]]] |
|
|
508 |
|
|
|
509 |
if (length(by) == 1) return(assay(y)) |
|
|
510 |
|
|
|
511 |
# reformat into one assay per 'by[1]' |
|
|
512 |
if (is.factor(ids <- y[[by[1]]])) |
|
|
513 |
ids <- droplevels(ids) |
|
|
514 |
is <- split(seq_len(ncol(y)), ids) |
|
|
515 |
ys <- map(is, ~assay(y)[, .]) |
|
|
516 |
|
|
|
517 |
# fill in missing combinations |
|
|
518 |
for (i in seq_along(ys)) { |
|
|
519 |
fill <- setdiff(unique(y[[by[2]]]), colnames(ys[[i]])) |
|
|
520 |
if (length(fill != 0)) { |
|
|
521 |
foo <- matrix(0, nrow(x), length(fill)) |
|
|
522 |
colnames(foo) <- fill |
|
|
523 |
foo <- cbind(ys[[i]], foo) |
|
|
524 |
o <- paste(sort(unique(y[[by[2]]]))) |
|
|
525 |
ys[[i]] <- foo[, o] |
|
|
526 |
} |
|
|
527 |
} |
|
|
528 |
return(ys) |
|
|
529 |
} |
|
|
530 |
|
|
|
531 |
|
|
|
532 |
.summarizeJASPARMotifs <- function(motifs = NULL){ |
|
|
533 |
|
|
|
534 |
motifNames <- lapply(seq_along(motifs), function(x){ |
|
|
535 |
namex <- make.names(motifs[[x]]@name) |
|
|
536 |
if(substr(namex,nchar(namex),nchar(namex))=="."){ |
|
|
537 |
namex <- substr(namex,1,nchar(namex)-1) |
|
|
538 |
} |
|
|
539 |
namex <- paste0(namex, "_", x) |
|
|
540 |
namex |
|
|
541 |
}) %>% unlist(.) |
|
|
542 |
|
|
|
543 |
motifDF <- lapply(seq_along(motifs), function(x){ |
|
|
544 |
data.frame( |
|
|
545 |
row.names = motifNames[x], |
|
|
546 |
name = motifs[[x]]@name[[1]], |
|
|
547 |
ID = motifs[[x]]@ID, |
|
|
548 |
strand = motifs[[x]]@strand, |
|
|
549 |
symbol = ifelse(!is.null(motifs[[x]]@tags$symbol[1]), motifs[[x]]@tags$symbol[1], NA) , |
|
|
550 |
family = ifelse(!is.null(motifs[[x]]@tags$family[1]), motifs[[x]]@tags$family[1], NA), |
|
|
551 |
alias = ifelse(!is.null(motifs[[x]]@tags$alias[1]), motifs[[x]]@tags$alias[1], NA), |
|
|
552 |
stringsAsFactors = FALSE |
|
|
553 |
) |
|
|
554 |
}) %>% Reduce("rbind", .) %>% DataFrame |
|
|
555 |
|
|
|
556 |
names(motifs) <- motifNames |
|
|
557 |
|
|
|
558 |
out <- list(motifs = motifs, motifSummary = motifDF) |
|
|
559 |
|
|
|
560 |
return(out) |
|
|
561 |
|
|
|
562 |
} |
|
|
563 |
|
|
|
564 |
.summarizeChromVARMotifs <- function(motifs = NULL){ |
|
|
565 |
|
|
|
566 |
motifNames <- lapply(seq_along(motifs), function(x){ |
|
|
567 |
namex <- make.names(motifs[[x]]@name) |
|
|
568 |
if(grepl("LINE", namex)){ |
|
|
569 |
splitNamex <- stringr::str_split(motifs[[x]]@ID, pattern="\\_", simplify = TRUE) |
|
|
570 |
namex <- splitNamex[1, grep("LINE",splitNamex[1,]) + 1] |
|
|
571 |
} |
|
|
572 |
if(substr(namex,nchar(namex),nchar(namex))=="."){ |
|
|
573 |
namex <- substr(namex,1,nchar(namex)-1) |
|
|
574 |
} |
|
|
575 |
namex <- paste0(namex, "_", x) |
|
|
576 |
namex |
|
|
577 |
}) %>% unlist(.) |
|
|
578 |
|
|
|
579 |
motifNames2 <- lapply(seq_along(motifs), function(x){ |
|
|
580 |
namex <- make.names(motifs[[x]]@name) |
|
|
581 |
if(grepl("LINE", namex)){ |
|
|
582 |
splitNamex <- stringr::str_split(motifs[[x]]@ID, pattern="\\_", simplify = TRUE) |
|
|
583 |
namex <- splitNamex[1, grep("LINE",splitNamex[1,]) + 1] |
|
|
584 |
} |
|
|
585 |
if(substr(namex,nchar(namex),nchar(namex))=="."){ |
|
|
586 |
namex <- substr(namex,1,nchar(namex)-1) |
|
|
587 |
} |
|
|
588 |
namex |
|
|
589 |
}) %>% unlist(.) |
|
|
590 |
|
|
|
591 |
motifDF <- lapply(seq_along(motifs), function(x){ |
|
|
592 |
df <- data.frame( |
|
|
593 |
row.names = motifNames[x], |
|
|
594 |
name = motifNames2[[x]], |
|
|
595 |
ID = motifs[[x]]@ID, |
|
|
596 |
strand = motifs[[x]]@strand, |
|
|
597 |
stringsAsFactors = FALSE |
|
|
598 |
) |
|
|
599 |
}) %>% Reduce("rbind", .) %>% DataFrame |
|
|
600 |
|
|
|
601 |
names(motifs) <- motifNames |
|
|
602 |
|
|
|
603 |
out <- list(motifs = motifs, motifSummary = motifDF) |
|
|
604 |
|
|
|
605 |
return(out) |
|
|
606 |
|
|
|
607 |
} |
|
|
608 |
|
|
|
609 |
.augment_matrix <-function(mtx, samples) { |
|
|
610 |
samples <- unique(samples) |
|
|
611 |
mtx <- t(mtx) |
|
|
612 |
aug_mtx<-matrix(NA, ncol=ncol(mtx), nrow=length(samples)) |
|
|
613 |
aug_mtx<-mtx[match(samples,rownames(mtx)),,drop=FALSE] |
|
|
614 |
rownames(aug_mtx)<-samples |
|
|
615 |
colnames(aug_mtx)<-colnames(mtx) |
|
|
616 |
return(t(aug_mtx)) |
|
|
617 |
} |
|
|
618 |
|
|
|
619 |
|
|
|
620 |
stop_quietly <- function() { |
|
|
621 |
opt <- options(show.error.messages = FALSE) |
|
|
622 |
on.exit(options(opt)) |
|
|
623 |
stop() |
|
|
624 |
} |