--- a +++ b/utils.R @@ -0,0 +1,624 @@ +############################# +## Commonly-used functions ## +############################# + +load_SingleCellExperiment <- function(file, normalise = FALSE, features = NULL, cells = NULL, remove_non_expressed_genes = FALSE) { + library(SingleCellExperiment); library(scran); library(scater); + sce <- readRDS(file) + if (!is.null(cells)) sce <- sce[,cells] + if (!is.null(features)) sce <- sce[features,] + if (remove_non_expressed_genes) sce <- sce[which(Matrix::rowSums(counts(sce))>15),] + if (normalise) sce <- logNormCounts(sce) + + return(sce) +} + +load_Seurat <- function(file, assay = "RNA", normalise = FALSE, features = NULL, cells = NULL, remove_non_expressed_genes = FALSE, ...) { + library(Seurat) + seurat <- readRDS(file) + # if (assay%in%Seurat::Assays(seurat)) seurat <- seurat[[assay]] + if (!is.null(cells)) seurat <- seurat[,cells] + if (!is.null(features)) seurat <- seurat[features,] + if (normalise) { + seurat <- NormalizeData(seurat, normalization.method = "LogNormalize") + seurat <- ScaleData(seurat, ...) + } + if (remove_non_expressed_genes) seurat <- seurat[which(Matrix::rowMeans(seurat@assays[[assay]]@counts)>1e-4),] + return(seurat) +} + +matrix.please<-function(x) { + m<-as.matrix(x[,-1]) + rownames(m)<-x[[1]] + m +} + +#' method=1: The TF-IDF implementation used by Stuart & Butler et al. 2019. This computes \eqn{\log(TF \times IDF)}. +#' method=2: The TF-IDF implementation used by Cusanovich & Hill et al. 2018. This computes \eqn{TF \times (\log(IDF))}. +#' method=3: The log-TF method used by Andrew Hill. This computes \eqn{\log(TF) \times \log(IDF)}. +tfidf <- function(mtx, method = 1, scale.factor = 1e4) { + npeaks <- colSums(mtx) + if (any(npeaks == 0)) { + warning("Some cells contain 0 total counts") + } + + tf <- Matrix::tcrossprod(mtx, y = Matrix::Diagonal(x=1/npeaks)) + + rsums <- rowSums(mtx) + if (any(rsums == 0)) { + warning("Some features contain 0 total counts") + } + idf <- ncol(mtx) / rsums + + if (method == 2) { + idf <- log(1 + idf) + } else if (method == 3) { + tf <- log1p(tf * scale.factor) + idf <- log(1 + idf) + } + mtx.tfidf <- Matrix::Diagonal(n = length(idf), x = idf) %*% tf + + if (method == 1) { + mtx.tfidf <- log1p(mtx.tfidf * scale.factor) + } + colnames(mtx.tfidf) <- colnames(mtx) + rownames(mtx.tfidf) <- rownames(mtx) + + # set NA values to 0 + mtx.tfidf[is.na(mtx.tfidf)] <- 0 + + return(mtx.tfidf) +} + +pdist <- function(tmat){ + # @param tmat A non-negative matrix with samples by features + # @reference http://r.789695.n4.nabble.com/dist-function-in-R-is-very-slow-td4738317.html + mtm <- Matrix::tcrossprod(tmat) + sq <- rowSums(tmat^2) + out0 <- outer(sq, sq, "+") - 2 * mtm + out0[out0 < 0] <- 0 + + sqrt(out0) +} + +smoother_aggregate_nearest_nb <- function(mat, D, k){ + # @param mat A matrix in a shape of #genes x #samples. + # @param D A predefined distance matrix in a shape of #samples x #samples. + # @param k An integer to choose \code{k} nearest samples (self-inclusive) to + # aggregate based on the distance matrix \code{D}. + denoised_mat <- sapply(seq_len(ncol(mat)), function(cid){ + nb_cid <- head(order(D[cid, ]), k) + closest_mat <- mat[, nb_cid, drop=FALSE] + # return(Matrix::rowSums(closest_mat)) + return(Matrix::rowMeans(closest_mat)) + }) + dimnames(denoised_mat) <- dimnames(mat) + return(denoised_mat) +} + + +# TO-FINISH..... +smoother_aggregate_nearest_nb_parallel <- function(mat, D, k, cores=1){ + # @param mat A matrix in a shape of #genes x #samples. + # @param D A predefined distance matrix in a shape of #samples x #samples. + # @param k An integer to choose \code{k} nearest samples (self-inclusive) to + # aggregate based on the distance matrix \code{D}. + + # library(future) + library(future.apply) + plan("multiprocess", workers = ncores) + + + # sapply(seq_len(ncol(mat)), function(cid){ + future_sapply(seq_len(ncol(mat)), function(cid){ + nb_cid <- head(order(D[cid, ]), k) + closest_mat <- mat[, nb_cid, drop=FALSE] + # return(Matrix::rowSums(closest_mat)) + return(Matrix::rowMeans(closest_mat)) + }) +} + +# regress_covariates <- function(mtx, vars.to.regress) { +# data <- scale(t(logcounts(sce_filt)), center = T, scale = F) +# data_regressed <- apply(data, 2, function(x) { +# lm.out <- lm(formula=expr~covariate, data=data.frame(expr=x, covariate=factor(sce_filt$stage))); +# residuals <- lm.out[["residuals"]]+lm.out[["coefficients"]][1] +# }) +# } + +# Remove unwanted effects from a matrix +# +# @parm mtx An expression matrix to regress the effects of covariates out +# of should be the complete expression matrix in genes x cells +# @param covariates A matrix or data.frame of latent variables, should be cells +# x covariates, the colnames should be the variables to regress +# @param features_idx An integer vector representing the indices of the +# genes to run regression on +# @param model.use Model to use, one of 'linear', 'poisson', or 'negbinom'; pass +# NULL to simply return mtx +# @param verbose Display a progress bar +#' @importFrom stats as.formula lm +#' @importFrom utils txtProgressBar setTxtProgressBar +# +RegressOutMatrix_parallel <- function(mtx, covariates = NULL, features_idx = NULL, split.by = NULL, block.size = 1000, min.cells.to.block = 3000, ncores = 1, verbose = TRUE) { + + library(future) + library(future.apply) + plan("multiprocess", workers = ncores) + + # Check features_idx + if (is.null(features_idx)) { + features_idx <- 1:nrow(mtx) + } + if (is.character(features_idx)) { + features_idx <- intersect(features_idx, rownames(mtx)) + if (length(features_idx) == 0) { + stop("Cannot use features that are beyond the scope of mtx") + } + } else if (max(features_idx) > nrow(mtx)) { + stop("Cannot use features that are beyond the scope of mtx") + } + + # Check dataset dimensions + if (nrow(covariates) != ncol(mtx)) { + stop("Uneven number of cells between latent data and expression data") + } + + # Subset + mtx <- mtx[features_idx,] + mtx.dimnames <- dimnames(mtx) + + # Define chunck points + chunk.points <- ChunkPoints(dsize = nrow(mtx), csize = block.size) + + # Define cell splitting + split.cells <- split(colnames(mtx), f = split.by %||% TRUE) + + if (nbrOfWorkers() > 1) { + + # Define chuncks + chunks <- expand.grid( + names(split.cells), + 1:ncol(chunk.points), + stringsAsFactors = FALSE + ) + + # Run RegressOutMatrix in parallel + mtx.resid <- future_lapply( + X = 1:nrow(chunks), + FUN = function(i) { + row <- chunks[i, ] + group <- row[[1]] + index <- as.numeric(row[[2]]) + return(RegressOutMatrix( + mtx = mtx[chunk.points[1, index]:chunk.points[2, index], split.cells[[group]], drop = FALSE], + covariates = covariates[split.cells[[group]], , drop = FALSE], + # features_idx = features_idx[chunk.points[1, index]:chunk.points[2, index]], + verbose = FALSE + )) + } + ) + + # Merge splitted cells + if (length(split.cells) > 1) { + merge.indices <- lapply( + X = 1:length(x = split.cells), + FUN = seq.int, + to = length(mtx.resid), + by = length(split.cells) + ) + mtx.resid <- lapply( + X = merge.indices, + FUN = function(x) { + return(do.call( 'rbind', mtx.resid[x])) + } + ) + mtx.resid <- do.call('cbind', mtx.resid) + } else { + mtx.resid <- do.call( 'rbind', mtx.resid) + } + } else { + + mtx.resid <- lapply( + X = names(split.cells), + FUN = function(x) { + if (verbose && length(split.cells) > 1) { + message("Regressing out variables from split ", x) + } + return(RegressOutMatrix( + mtx = mtx[, split.cells[[x]], drop = FALSE], + covariates = covariates[split.cells[[x]], , drop = FALSE], + features_idx = features_idx, + verbose = verbose + )) + } + ) + mtx.resid <- do.call('cbind', mtx.resid) + } + # dimnames(mtx.resid) <- dimnames(mtx) + return(mtx.resid) + } + +RegressOutMatrix <- function(mtx, covariates = NULL, features_idx = NULL, verbose = TRUE) { + + # Check features_idx + if (is.null(features_idx)) { + features_idx <- 1:nrow(mtx) + } + if (is.character(features_idx)) { + features_idx <- intersect(features_idx, rownames(mtx)) + if (length(features_idx) == 0) { + stop("Cannot use features that are beyond the scope of mtx") + } + } else if (max(features_idx) > nrow(mtx)) { + stop("Cannot use features that are beyond the scope of mtx") + } + + # Check dataset dimensions + if (nrow(covariates) != ncol(mtx)) { + stop("Uneven number of cells between latent data and expression data") + } + + # Subset + mtx <- mtx[features_idx,] + mtx.dimnames <- dimnames(mtx) + + # Create formula for regression + vars.to.regress <- colnames(covariates) + fmla <- paste('GENE ~', paste(vars.to.regress, collapse = '+')) %>% as.formula + + # In this code, we'll repeatedly regress different Y against the same X + # (covariates) in order to calculate residuals. Rather that repeatedly + # call lm to do this, we'll avoid recalculating the QR decomposition for the + # covariates matrix each time by reusing it after calculating it once + regression.mat <- cbind(covariates, mtx[1,]) + colnames(regression.mat) <- c(colnames(covariates), "GENE") + qr <- lm(fmla, data = regression.mat, qr = TRUE)$qr + rm(regression.mat) + + # Make results matrix + data.resid <- matrix( + nrow = nrow(mtx), + ncol = ncol(mtx) + ) + + if (verbose) pb <- txtProgressBar(char = '=', style = 3, file = stderr()) + + # Extract residuals from each feature by using the pre-computed QR decomposition + for (i in 1:length(features_idx)) { + regression.mat <- cbind(covariates, mtx[features_idx[i], ]) + colnames(regression.mat) <- c(vars.to.regress, 'GENE') + regression.mat <- qr.resid(qr = qr, y = mtx[features_idx[i],]) # The function qr.resid returns the residuals when fitting y to the matrix with QR decomposition. + data.resid[i, ] <- regression.mat + if (verbose) { + setTxtProgressBar(pb = pb, value = i / length(features_idx)) + } + } + + if (verbose) close(con = pb) + + dimnames(data.resid) <- mtx.dimnames + + return(data.resid) +} + + + +# Generate chunk points +# +# @param dsize How big is the data being chunked +# @param csize How big should each chunk be +# +# @return A matrix where each column is a chunk, row 1 is start points, row 2 is end points +# +ChunkPoints <- function(dsize, csize) { + return(vapply( + X = 1L:ceiling(dsize / csize), + FUN = function(i) { + return(c( + start = (csize * (i - 1L)) + 1L, + end = min(csize * i, dsize) + )) + }, + FUN.VALUE = numeric(length = 2L) + )) +} + + +"%ni%" <- Negate("%in%") + +ggplot_theme_NoAxes <- function() { + theme( + axis.title = element_blank(), + axis.line = element_blank(), + axis.text = element_blank(), + axis.ticks = element_blank() + ) +} + +minmax.normalisation <- function(x) +{ + return((x-min(x,na.rm=T)) /(max(x,na.rm=T)-min(x,na.rm=T))) +} + +getmode <- function(v, dist) { + tab <- table(v) + #if tie, break to shortest distance + if(sum(tab == max(tab)) > 1){ + tied <- names(tab)[tab == max(tab)] + sub <- dist[v %in% tied] + names(sub) <- v[v %in% tied] + return(names(sub)[which.min(sub)]) + } else { + return(names(tab)[which.max(tab)]) + } +} + +GRangesToString <- function(grange, sep = c("-", "-")) { + regions <- paste0( + as.character(x = seqnames(x = grange)), + sep[[1]], + start(x = grange), + sep[[2]], + end(x = grange) + ) + return(regions) +} + + +## sparse -> matrix +dropNA2matrix <- function(x) { + if(!is(x, "dsparseMatrix")) stop("x needs to be a subclass of dsparseMatrix!") + + x <- as(x, "dgCMatrix") + + ## remember true NAs + nas <- Matrix::which(is.na(x), arr.ind=TRUE) + + x@x[x@x==0] <- NA + zeros <- Matrix::which(is.na(x), arr.ind=TRUE) + x <- as(x, "matrix") + x[x==0] <- NA + x[zeros] <- 0 + x[nas] <- NA + x +} + +# dropNA2vector <- function(x) { +# stop("NEEDS TO BE FIXED") +# if(!is(x, "dsparseMatrix")) stop("x needs to be a subclass of dsparseMatrix!") +# x <- as(x, "numeric") + +# # true NAs +# nas <- which(is.na(x), arr.ind=TRUE) + +# x[x==0] <- NA +# zeros <- which(is.na(x), arr.ind=TRUE) +# x[nas] <- NA +# x[zeros] <- 0 +# } + +## matrix -> sparse +dropNA <- function(x) { + if(!is(x, "matrix")) stop("x needs to be a matrix!") + + zeros <- which(x==0, arr.ind=TRUE) + ## keep zeros + x[is.na(x)] <- 0 + x[zeros] <- NA + x <- Matrix::drop0(x) + x[zeros] <- 0 + x +} + +# dropNAis.na <- function(x) { +# if(!is(x, "dsparseMatrix")) stop("x needs to be a subclass of dsparseMatrix!") +# x <- as(x, "dgCMatrix") +# +# ### not represented means NA and 0 means 0 +# ### this coercion keeps 0 +# !as(x, "ngCMatrix") +# } + +give.n <- function(x){ + return(c(y = mean(x), label = length(x))) +} + +sort.abs <- function(dt, sort.field) dt[order(-abs(dt[[sort.field]]))] + + +# function to pseudobulk a SingleCellExperiment object +pseudobulk_sce_fn <- function(x, assay = NULL, by, fun = c("sum", "mean", "median"), scale = FALSE) { + + # check validity of input arguments + fun <- match.arg(fun) + if (is.null(assay)) assay <- assayNames(x)[1] + + # store aggregation parameters & + # nb. of cells that went into aggregation + md <- metadata(x) + md$agg_pars <- list(assay = assay, by = by, fun = fun, scale = scale) + + # get aggregation function + # fun <- switch(fun, sum = "rowSums", mean = "rowMeans", median = "rowMedians") + + # drop missing factor levels & tabulate number of cells + cd <- dplyr::mutate_if(as.data.frame(colData(x)), is.factor, droplevels) + colData(x) <- DataFrame(cd, row.names = colnames(x),check.names = FALSE) + md$n_cells <- table(as.data.frame(colData(x)[, by])) + + # assure 'by' colData columns are factors so that missing combinations aren't dropped + for (i in by) + if (!is.factor(x[[i]])) + x[[i]] <- factor(x[[i]]) + + # split cells & compute pseudo-bulks + cs <- .split_cells(x, by) + # pb <- .pb(x, cs, assay, fun) + pb <- .pb(x=x, by=by, fun=fun) + if (scale & length(by) == 2) { + ls <- lapply(.pb(x, cs, "counts", "rowSums"), colSums) + pb <- lapply(seq_along(pb), function(i) pb[[i]] / 1e6 * ls[[i]]) + names(pb) <- names(ls) + } + + # construct SCE + pb <- SingleCellExperiment(pb, metadata = md) + + # propagate 'colData' columns that are unique across 2nd 'by' + if (length(by) == 2) { + cd <- colData(x) + ids <- colnames(pb) + counts <- vapply(ids, function(u) { + m <- as.logical(match(cd[, by[2]], u, nomatch = 0)) + vapply(cd[m, ], function(u) length(unique(u)), numeric(1)) + }, numeric(ncol(colData(x)))) + cd_keep <- apply(counts, 1, function(u) all(u == 1)) + cd_keep <- setdiff(names(which(cd_keep)), by) + if (length(cd_keep) != 0) { + m <- match(ids, cd[, by[2]], nomatch = 0) + cd <- cd[m, cd_keep, drop = FALSE] + rownames(cd) <- ids + colData(pb) <- cd + } + } + return(pb) +} + + +# split cells by cluster-sample +# auxiliary function to pseudobulk a SingleCellExperiment object +# - by: character vector specifying colData column(s) to split by. If length(by) == 1, a list of length nlevels(colData$by), else, +# a nested list with 2nd level of length nlevels(colData$by[2]) +.split_cells <- function(x, by) { + if (is(x, "SingleCellExperiment")) x <- colData(x) + cd <- data.frame(x[by], check.names = FALSE) + cd <- data.table(cd, cell = rownames(x)) %>% split(by = by, sorted = TRUE, flatten = FALSE) + purrr::map_depth(cd, length(by), "cell") +} + + +# auxiliary function to pseudobulk a SingleCellExperiment object +.pb <- function(x, by, fun) { + + # compute pseudobulks + # y <- scuttle::summarizeAssayByGroup(x, assay.type = assay, ids = (ids <- colData(x)[by]), statistics = fun, BPPARAM = BiocParallel::SerialParam()) + y <- scuttle::summarizeAssayByGroup(x, ids = colData(x)[by], statistics = fun) + colnames(y) <- y[[by[length(by)]]] + + if (length(by) == 1) return(assay(y)) + + # reformat into one assay per 'by[1]' + if (is.factor(ids <- y[[by[1]]])) + ids <- droplevels(ids) + is <- split(seq_len(ncol(y)), ids) + ys <- map(is, ~assay(y)[, .]) + + # fill in missing combinations + for (i in seq_along(ys)) { + fill <- setdiff(unique(y[[by[2]]]), colnames(ys[[i]])) + if (length(fill != 0)) { + foo <- matrix(0, nrow(x), length(fill)) + colnames(foo) <- fill + foo <- cbind(ys[[i]], foo) + o <- paste(sort(unique(y[[by[2]]]))) + ys[[i]] <- foo[, o] + } + } + return(ys) +} + + +.summarizeJASPARMotifs <- function(motifs = NULL){ + + motifNames <- lapply(seq_along(motifs), function(x){ + namex <- make.names(motifs[[x]]@name) + if(substr(namex,nchar(namex),nchar(namex))=="."){ + namex <- substr(namex,1,nchar(namex)-1) + } + namex <- paste0(namex, "_", x) + namex + }) %>% unlist(.) + + motifDF <- lapply(seq_along(motifs), function(x){ + data.frame( + row.names = motifNames[x], + name = motifs[[x]]@name[[1]], + ID = motifs[[x]]@ID, + strand = motifs[[x]]@strand, + symbol = ifelse(!is.null(motifs[[x]]@tags$symbol[1]), motifs[[x]]@tags$symbol[1], NA) , + family = ifelse(!is.null(motifs[[x]]@tags$family[1]), motifs[[x]]@tags$family[1], NA), + alias = ifelse(!is.null(motifs[[x]]@tags$alias[1]), motifs[[x]]@tags$alias[1], NA), + stringsAsFactors = FALSE + ) + }) %>% Reduce("rbind", .) %>% DataFrame + + names(motifs) <- motifNames + + out <- list(motifs = motifs, motifSummary = motifDF) + + return(out) + +} + +.summarizeChromVARMotifs <- function(motifs = NULL){ + + motifNames <- lapply(seq_along(motifs), function(x){ + namex <- make.names(motifs[[x]]@name) + if(grepl("LINE", namex)){ + splitNamex <- stringr::str_split(motifs[[x]]@ID, pattern="\\_", simplify = TRUE) + namex <- splitNamex[1, grep("LINE",splitNamex[1,]) + 1] + } + if(substr(namex,nchar(namex),nchar(namex))=="."){ + namex <- substr(namex,1,nchar(namex)-1) + } + namex <- paste0(namex, "_", x) + namex + }) %>% unlist(.) + + motifNames2 <- lapply(seq_along(motifs), function(x){ + namex <- make.names(motifs[[x]]@name) + if(grepl("LINE", namex)){ + splitNamex <- stringr::str_split(motifs[[x]]@ID, pattern="\\_", simplify = TRUE) + namex <- splitNamex[1, grep("LINE",splitNamex[1,]) + 1] + } + if(substr(namex,nchar(namex),nchar(namex))=="."){ + namex <- substr(namex,1,nchar(namex)-1) + } + namex + }) %>% unlist(.) + + motifDF <- lapply(seq_along(motifs), function(x){ + df <- data.frame( + row.names = motifNames[x], + name = motifNames2[[x]], + ID = motifs[[x]]@ID, + strand = motifs[[x]]@strand, + stringsAsFactors = FALSE + ) + }) %>% Reduce("rbind", .) %>% DataFrame + + names(motifs) <- motifNames + + out <- list(motifs = motifs, motifSummary = motifDF) + + return(out) + +} + +.augment_matrix <-function(mtx, samples) { + samples <- unique(samples) + mtx <- t(mtx) + aug_mtx<-matrix(NA, ncol=ncol(mtx), nrow=length(samples)) + aug_mtx<-mtx[match(samples,rownames(mtx)),,drop=FALSE] + rownames(aug_mtx)<-samples + colnames(aug_mtx)<-colnames(mtx) + return(t(aug_mtx)) +} + + +stop_quietly <- function() { + opt <- options(show.error.messages = FALSE) + on.exit(options(opt)) + stop() +} \ No newline at end of file