--- a
+++ b/utils.R
@@ -0,0 +1,624 @@
+#############################
+## Commonly-used functions ##
+#############################
+
+load_SingleCellExperiment <- function(file, normalise = FALSE, features = NULL, cells = NULL, remove_non_expressed_genes = FALSE) {
+  library(SingleCellExperiment); library(scran); library(scater);
+  sce <- readRDS(file)
+  if (!is.null(cells)) sce <- sce[,cells]
+  if (!is.null(features)) sce <- sce[features,]
+  if (remove_non_expressed_genes) sce <- sce[which(Matrix::rowSums(counts(sce))>15),]
+  if (normalise) sce <- logNormCounts(sce)
+  
+  return(sce)
+}
+
+load_Seurat <- function(file, assay = "RNA", normalise = FALSE, features = NULL, cells = NULL, remove_non_expressed_genes = FALSE, ...) {
+  library(Seurat)
+  seurat <- readRDS(file)
+  # if (assay%in%Seurat::Assays(seurat)) seurat <- seurat[[assay]]
+  if (!is.null(cells)) seurat <- seurat[,cells]
+  if (!is.null(features)) seurat <- seurat[features,]
+  if (normalise) {
+    seurat <- NormalizeData(seurat, normalization.method = "LogNormalize")
+    seurat <- ScaleData(seurat, ...)
+  }
+  if (remove_non_expressed_genes) seurat <- seurat[which(Matrix::rowMeans(seurat@assays[[assay]]@counts)>1e-4),]
+  return(seurat)
+}
+
+matrix.please<-function(x) {
+  m<-as.matrix(x[,-1])
+  rownames(m)<-x[[1]]
+  m
+}
+
+#'  method=1: The TF-IDF implementation used by Stuart & Butler et al. 2019. This computes \eqn{\log(TF \times IDF)}.
+#'  method=2: The TF-IDF implementation used by Cusanovich & Hill et al. 2018. This computes \eqn{TF \times (\log(IDF))}.
+#'  method=3: The log-TF method used by Andrew Hill. This computes \eqn{\log(TF) \times \log(IDF)}.
+tfidf <- function(mtx, method = 1, scale.factor = 1e4) {
+  npeaks <- colSums(mtx)
+  if (any(npeaks == 0)) {
+    warning("Some cells contain 0 total counts")
+  }
+
+  tf <- Matrix::tcrossprod(mtx, y = Matrix::Diagonal(x=1/npeaks))
+
+  rsums <- rowSums(mtx)
+  if (any(rsums == 0)) {
+    warning("Some features contain 0 total counts")
+  }
+  idf <- ncol(mtx) / rsums
+
+  if (method == 2) {
+    idf <- log(1 + idf)
+  } else if (method == 3) {
+    tf <- log1p(tf * scale.factor)
+    idf <- log(1 + idf)
+  }
+  mtx.tfidf <- Matrix::Diagonal(n = length(idf), x = idf) %*% tf
+
+  if (method == 1) {
+    mtx.tfidf <- log1p(mtx.tfidf * scale.factor)
+  }
+  colnames(mtx.tfidf) <- colnames(mtx)
+  rownames(mtx.tfidf) <- rownames(mtx)
+
+  # set NA values to 0
+  mtx.tfidf[is.na(mtx.tfidf)] <- 0
+
+  return(mtx.tfidf)
+}
+
+pdist <- function(tmat){
+  # @param tmat A non-negative matrix with samples by features
+  # @reference http://r.789695.n4.nabble.com/dist-function-in-R-is-very-slow-td4738317.html
+  mtm <- Matrix::tcrossprod(tmat)
+  sq <- rowSums(tmat^2)
+  out0 <- outer(sq, sq, "+") - 2 * mtm
+  out0[out0 < 0] <- 0
+  
+  sqrt(out0)
+}
+
+smoother_aggregate_nearest_nb <- function(mat, D, k){
+  # @param mat A matrix in a shape of #genes x #samples.
+  # @param D A predefined distance matrix in a shape of #samples x #samples.
+  # @param k An integer to choose \code{k} nearest samples (self-inclusive) to
+  #  aggregate based on the distance matrix \code{D}.
+  denoised_mat <- sapply(seq_len(ncol(mat)), function(cid){
+    nb_cid <- head(order(D[cid, ]), k)
+    closest_mat <- mat[, nb_cid, drop=FALSE]
+    # return(Matrix::rowSums(closest_mat))
+    return(Matrix::rowMeans(closest_mat))
+  })
+  dimnames(denoised_mat) <- dimnames(mat)
+  return(denoised_mat)
+}
+
+
+# TO-FINISH.....
+smoother_aggregate_nearest_nb_parallel <- function(mat, D, k, cores=1){
+  # @param mat A matrix in a shape of #genes x #samples.
+  # @param D A predefined distance matrix in a shape of #samples x #samples.
+  # @param k An integer to choose \code{k} nearest samples (self-inclusive) to
+  #  aggregate based on the distance matrix \code{D}.
+
+  # library(future)
+  library(future.apply)
+  plan("multiprocess", workers = ncores)
+
+
+  # sapply(seq_len(ncol(mat)), function(cid){
+  future_sapply(seq_len(ncol(mat)), function(cid){
+    nb_cid <- head(order(D[cid, ]), k)
+    closest_mat <- mat[, nb_cid, drop=FALSE]
+    # return(Matrix::rowSums(closest_mat))
+    return(Matrix::rowMeans(closest_mat))
+  })
+}
+
+# regress_covariates <- function(mtx, vars.to.regress) {
+#   data <- scale(t(logcounts(sce_filt)), center = T, scale = F)
+#   data_regressed <- apply(data, 2, function(x) {
+#     lm.out <- lm(formula=expr~covariate, data=data.frame(expr=x, covariate=factor(sce_filt$stage)));
+#     residuals <- lm.out[["residuals"]]+lm.out[["coefficients"]][1]
+#   })
+# }
+
+# Remove unwanted effects from a matrix
+#
+# @parm mtx An expression matrix to regress the effects of covariates out
+# of should be the complete expression matrix in genes x cells
+# @param covariates A matrix or data.frame of latent variables, should be cells
+# x covariates, the colnames should be the variables to regress
+# @param features_idx An integer vector representing the indices of the
+# genes to run regression on
+# @param model.use Model to use, one of 'linear', 'poisson', or 'negbinom'; pass
+# NULL to simply return mtx
+# @param verbose Display a progress bar
+#' @importFrom stats as.formula lm
+#' @importFrom utils txtProgressBar setTxtProgressBar
+#
+RegressOutMatrix_parallel <- function(mtx, covariates = NULL, features_idx = NULL, split.by = NULL, block.size = 1000, min.cells.to.block = 3000, ncores = 1, verbose = TRUE) {
+  
+  library(future)
+  library(future.apply)
+  plan("multiprocess", workers = ncores)
+  
+  # Check features_idx
+  if (is.null(features_idx)) {
+    features_idx <- 1:nrow(mtx)
+  }
+  if (is.character(features_idx)) {
+    features_idx <- intersect(features_idx, rownames(mtx))
+    if (length(features_idx) == 0) {
+      stop("Cannot use features that are beyond the scope of mtx")
+    }
+  } else if (max(features_idx) > nrow(mtx)) {
+    stop("Cannot use features that are beyond the scope of mtx")
+  }
+  
+  # Check dataset dimensions
+  if (nrow(covariates) != ncol(mtx)) {
+    stop("Uneven number of cells between latent data and expression data")
+  }
+  
+  # Subset
+  mtx <- mtx[features_idx,]
+  mtx.dimnames <- dimnames(mtx)
+  
+  # Define chunck points
+  chunk.points <- ChunkPoints(dsize = nrow(mtx), csize = block.size)
+  
+  # Define cell splitting
+  split.cells <- split(colnames(mtx), f = split.by %||% TRUE)
+
+   if (nbrOfWorkers() > 1) {
+
+    # Define chuncks
+      chunks <- expand.grid(
+        names(split.cells),
+        1:ncol(chunk.points),
+        stringsAsFactors = FALSE
+      )
+
+      # Run RegressOutMatrix in parallel
+      mtx.resid <- future_lapply(
+        X = 1:nrow(chunks),
+        FUN = function(i) {
+          row <- chunks[i, ]
+          group <- row[[1]]
+          index <- as.numeric(row[[2]])
+          return(RegressOutMatrix(
+            mtx = mtx[chunk.points[1, index]:chunk.points[2, index], split.cells[[group]], drop = FALSE],
+            covariates = covariates[split.cells[[group]], , drop = FALSE],
+            # features_idx = features_idx[chunk.points[1, index]:chunk.points[2, index]],
+            verbose = FALSE
+          ))
+        }
+      )
+
+      # Merge splitted cells
+      if (length(split.cells) > 1) {
+        merge.indices <- lapply(
+          X = 1:length(x = split.cells),
+          FUN = seq.int,
+          to = length(mtx.resid),
+          by = length(split.cells)
+        )
+        mtx.resid <- lapply(
+          X = merge.indices,
+          FUN = function(x) {
+            return(do.call( 'rbind', mtx.resid[x]))
+          }
+        )
+        mtx.resid <- do.call('cbind', mtx.resid)
+      } else {
+        mtx.resid <- do.call( 'rbind', mtx.resid)
+      }
+    } else {
+      
+      mtx.resid <- lapply(
+        X = names(split.cells),
+        FUN = function(x) {
+          if (verbose && length(split.cells) > 1) {
+            message("Regressing out variables from split ", x)
+          }
+          return(RegressOutMatrix(
+            mtx = mtx[, split.cells[[x]], drop = FALSE],
+            covariates = covariates[split.cells[[x]], , drop = FALSE],
+            features_idx = features_idx,
+            verbose = verbose
+          ))
+        }
+      )
+      mtx.resid <- do.call('cbind', mtx.resid)
+    }
+    # dimnames(mtx.resid) <- dimnames(mtx)
+    return(mtx.resid)
+  }
+
+RegressOutMatrix <- function(mtx, covariates = NULL, features_idx = NULL, verbose = TRUE) {
+  
+  # Check features_idx
+  if (is.null(features_idx)) {
+    features_idx <- 1:nrow(mtx)
+  }
+  if (is.character(features_idx)) {
+    features_idx <- intersect(features_idx, rownames(mtx))
+    if (length(features_idx) == 0) {
+      stop("Cannot use features that are beyond the scope of mtx")
+    }
+  } else if (max(features_idx) > nrow(mtx)) {
+    stop("Cannot use features that are beyond the scope of mtx")
+  }
+  
+  # Check dataset dimensions
+  if (nrow(covariates) != ncol(mtx)) {
+    stop("Uneven number of cells between latent data and expression data")
+  }
+  
+  # Subset
+  mtx <- mtx[features_idx,]
+  mtx.dimnames <- dimnames(mtx)
+  
+  # Create formula for regression
+  vars.to.regress <- colnames(covariates)
+  fmla <- paste('GENE ~', paste(vars.to.regress, collapse = '+')) %>% as.formula
+
+  # In this code, we'll repeatedly regress different Y against the same X
+  # (covariates) in order to calculate residuals.  Rather that repeatedly
+  # call lm to do this, we'll avoid recalculating the QR decomposition for the
+  # covariates matrix each time by reusing it after calculating it once
+  regression.mat <- cbind(covariates, mtx[1,])
+  colnames(regression.mat) <- c(colnames(covariates), "GENE")
+  qr <- lm(fmla, data = regression.mat, qr = TRUE)$qr
+  rm(regression.mat)
+
+  # Make results matrix
+  data.resid <- matrix(
+    nrow = nrow(mtx),
+    ncol = ncol(mtx)
+  )
+
+  if (verbose) pb <- txtProgressBar(char = '=', style = 3, file = stderr())
+
+  # Extract residuals from each feature by using the pre-computed QR decomposition
+  for (i in 1:length(features_idx)) {
+    regression.mat <- cbind(covariates, mtx[features_idx[i], ])
+    colnames(regression.mat) <- c(vars.to.regress, 'GENE')
+    regression.mat <- qr.resid(qr = qr, y = mtx[features_idx[i],])  # The function qr.resid returns the residuals when fitting y to the matrix with QR decomposition.
+    data.resid[i, ] <- regression.mat
+    if (verbose) {
+      setTxtProgressBar(pb = pb, value = i / length(features_idx))
+    }
+  }
+
+  if (verbose) close(con = pb)
+
+  dimnames(data.resid) <- mtx.dimnames
+  
+  return(data.resid)
+}
+
+
+
+# Generate chunk points
+#
+# @param dsize How big is the data being chunked
+# @param csize How big should each chunk be
+#
+# @return A matrix where each column is a chunk, row 1 is start points, row 2 is end points
+#
+ChunkPoints <- function(dsize, csize) {
+  return(vapply(
+    X = 1L:ceiling(dsize / csize),
+    FUN = function(i) {
+      return(c(
+        start = (csize * (i - 1L)) + 1L,
+        end = min(csize * i, dsize)
+      ))
+    },
+    FUN.VALUE = numeric(length = 2L)
+  ))
+}
+
+
+"%ni%" <- Negate("%in%")
+
+ggplot_theme_NoAxes <- function() {
+  theme(
+    axis.title = element_blank(),
+    axis.line = element_blank(),
+    axis.text = element_blank(),
+    axis.ticks = element_blank()
+  )
+}
+
+minmax.normalisation <- function(x)
+{
+    return((x-min(x,na.rm=T)) /(max(x,na.rm=T)-min(x,na.rm=T)))
+}
+
+getmode <- function(v, dist) {
+  tab <- table(v)
+  #if tie, break to shortest distance
+  if(sum(tab == max(tab)) > 1){
+    tied <- names(tab)[tab == max(tab)]
+    sub  <- dist[v %in% tied]
+    names(sub) <- v[v %in% tied]
+    return(names(sub)[which.min(sub)])
+  } else {
+    return(names(tab)[which.max(tab)])
+  }
+}
+
+GRangesToString <- function(grange, sep = c("-", "-")) {
+  regions <- paste0(
+    as.character(x = seqnames(x = grange)),
+    sep[[1]],
+    start(x = grange),
+    sep[[2]],
+    end(x = grange)
+  )
+  return(regions)
+}
+
+
+## sparse -> matrix
+dropNA2matrix <- function(x) {
+  if(!is(x, "dsparseMatrix")) stop("x needs to be a subclass of dsparseMatrix!")
+  
+  x <- as(x, "dgCMatrix")
+  
+  ## remember true NAs
+  nas <- Matrix::which(is.na(x), arr.ind=TRUE)
+  
+  x@x[x@x==0] <- NA
+  zeros <- Matrix::which(is.na(x), arr.ind=TRUE)
+  x <- as(x, "matrix")
+  x[x==0] <- NA
+  x[zeros] <- 0
+  x[nas] <- NA
+  x
+}
+
+# dropNA2vector <- function(x) {
+#   stop("NEEDS TO BE FIXED")
+#   if(!is(x, "dsparseMatrix")) stop("x needs to be a subclass of dsparseMatrix!")
+#   x <- as(x, "numeric")
+
+#   # true NAs
+#   nas <- which(is.na(x), arr.ind=TRUE)
+
+#   x[x==0] <- NA
+#   zeros <- which(is.na(x), arr.ind=TRUE)
+#   x[nas] <- NA
+#   x[zeros] <- 0
+# }
+
+## matrix -> sparse
+dropNA <- function(x) {
+  if(!is(x, "matrix")) stop("x needs to be a matrix!")
+  
+  zeros <- which(x==0, arr.ind=TRUE)
+  ## keep zeros
+  x[is.na(x)] <- 0
+  x[zeros] <- NA
+  x <- Matrix::drop0(x)
+  x[zeros] <- 0
+  x
+}
+
+# dropNAis.na <- function(x) {
+#   if(!is(x, "dsparseMatrix")) stop("x needs to be a subclass of dsparseMatrix!")
+#   x <- as(x, "dgCMatrix")
+#   
+#   ### not represented means NA and 0 means 0
+#   ### this coercion keeps 0
+#   !as(x, "ngCMatrix")
+# }
+
+give.n <- function(x){
+  return(c(y = mean(x), label = length(x)))
+}
+
+sort.abs <- function(dt, sort.field) dt[order(-abs(dt[[sort.field]]))]
+
+
+# function to pseudobulk a SingleCellExperiment object
+pseudobulk_sce_fn <- function(x, assay = NULL, by, fun = c("sum", "mean", "median"), scale = FALSE) {
+  
+  # check validity of input arguments
+  fun <- match.arg(fun)
+  if (is.null(assay))  assay <- assayNames(x)[1] 
+  
+  # store aggregation parameters &
+  # nb. of cells that went into aggregation
+  md <- metadata(x)
+  md$agg_pars <- list(assay = assay, by = by, fun = fun, scale = scale)
+  
+  # get aggregation function
+  # fun <- switch(fun, sum = "rowSums", mean = "rowMeans", median = "rowMedians")
+  
+  # drop missing factor levels & tabulate number of cells
+  cd <- dplyr::mutate_if(as.data.frame(colData(x)), is.factor, droplevels)
+  colData(x) <- DataFrame(cd, row.names = colnames(x),check.names = FALSE)
+  md$n_cells <- table(as.data.frame(colData(x)[, by]))
+  
+  # assure 'by' colData columns are factors so that missing combinations aren't dropped
+  for (i in by) 
+    if (!is.factor(x[[i]])) 
+      x[[i]] <- factor(x[[i]])
+  
+  # split cells & compute pseudo-bulks
+  cs <- .split_cells(x, by)
+  # pb <- .pb(x, cs, assay, fun)
+  pb <- .pb(x=x, by=by, fun=fun)
+  if (scale & length(by) == 2) {
+    ls <- lapply(.pb(x, cs, "counts", "rowSums"), colSums)
+    pb <- lapply(seq_along(pb), function(i) pb[[i]] / 1e6 * ls[[i]])
+    names(pb) <- names(ls)
+  }
+  
+  # construct SCE
+  pb <- SingleCellExperiment(pb, metadata = md)
+  
+  # propagate 'colData' columns that are unique across 2nd 'by'
+  if (length(by) == 2) {
+    cd <- colData(x)
+    ids <- colnames(pb)
+    counts <- vapply(ids, function(u) {
+      m <- as.logical(match(cd[, by[2]], u, nomatch = 0))
+      vapply(cd[m, ], function(u) length(unique(u)), numeric(1))
+    }, numeric(ncol(colData(x))))
+    cd_keep <- apply(counts, 1, function(u) all(u == 1))
+    cd_keep <- setdiff(names(which(cd_keep)), by)
+    if (length(cd_keep) != 0) {
+      m <- match(ids, cd[, by[2]], nomatch = 0)
+      cd <- cd[m, cd_keep, drop = FALSE]
+      rownames(cd) <- ids
+      colData(pb) <- cd
+    }
+  }
+  return(pb)
+}
+
+
+# split cells by cluster-sample
+# auxiliary function to pseudobulk a SingleCellExperiment object
+#   - by: character vector specifying colData column(s) to split by. If length(by) == 1, a list of length nlevels(colData$by), else,
+#          a nested list with 2nd level of length nlevels(colData$by[2])
+.split_cells <- function(x, by) {
+  if (is(x, "SingleCellExperiment")) x <- colData(x)
+  cd <- data.frame(x[by], check.names = FALSE)
+  cd <- data.table(cd, cell = rownames(x)) %>% split(by = by, sorted = TRUE, flatten = FALSE)
+  purrr::map_depth(cd, length(by), "cell")
+}
+
+
+# auxiliary function to pseudobulk a SingleCellExperiment object
+.pb <- function(x, by, fun) {
+  
+  # compute pseudobulks
+  # y <- scuttle::summarizeAssayByGroup(x, assay.type = assay, ids = (ids <- colData(x)[by]), statistics = fun, BPPARAM = BiocParallel::SerialParam())
+  y <- scuttle::summarizeAssayByGroup(x, ids = colData(x)[by], statistics = fun)
+  colnames(y) <- y[[by[length(by)]]]
+  
+  if (length(by) == 1)  return(assay(y))
+  
+  # reformat into one assay per 'by[1]'
+  if (is.factor(ids <- y[[by[1]]]))
+    ids <- droplevels(ids)
+  is <- split(seq_len(ncol(y)), ids)
+  ys <- map(is, ~assay(y)[, .])
+  
+  # fill in missing combinations
+  for (i in seq_along(ys)) {
+    fill <- setdiff(unique(y[[by[2]]]), colnames(ys[[i]]))
+    if (length(fill != 0)) {
+      foo <- matrix(0, nrow(x), length(fill))
+      colnames(foo) <- fill
+      foo <- cbind(ys[[i]], foo)
+      o <- paste(sort(unique(y[[by[2]]])))
+      ys[[i]] <- foo[, o]
+    }
+  }
+  return(ys)
+}
+
+
+.summarizeJASPARMotifs <- function(motifs = NULL){
+
+  motifNames <- lapply(seq_along(motifs), function(x){
+    namex <- make.names(motifs[[x]]@name)
+    if(substr(namex,nchar(namex),nchar(namex))=="."){
+      namex <- substr(namex,1,nchar(namex)-1)
+    }
+    namex <- paste0(namex, "_", x)
+    namex
+  }) %>% unlist(.)
+
+  motifDF <- lapply(seq_along(motifs), function(x){
+    data.frame(
+      row.names = motifNames[x],
+      name = motifs[[x]]@name[[1]],
+      ID = motifs[[x]]@ID,
+      strand = motifs[[x]]@strand,
+      symbol = ifelse(!is.null(motifs[[x]]@tags$symbol[1]), motifs[[x]]@tags$symbol[1], NA) ,
+      family = ifelse(!is.null(motifs[[x]]@tags$family[1]), motifs[[x]]@tags$family[1], NA),
+      alias = ifelse(!is.null(motifs[[x]]@tags$alias[1]), motifs[[x]]@tags$alias[1], NA),
+      stringsAsFactors = FALSE
+    )
+  }) %>% Reduce("rbind", .) %>% DataFrame
+  
+  names(motifs) <- motifNames
+
+  out <- list(motifs = motifs, motifSummary = motifDF)
+
+  return(out)
+  
+}
+
+.summarizeChromVARMotifs <- function(motifs = NULL){
+
+  motifNames <- lapply(seq_along(motifs), function(x){
+    namex <- make.names(motifs[[x]]@name)
+    if(grepl("LINE", namex)){
+      splitNamex <- stringr::str_split(motifs[[x]]@ID, pattern="\\_", simplify = TRUE)
+      namex <- splitNamex[1, grep("LINE",splitNamex[1,]) + 1]
+    }
+    if(substr(namex,nchar(namex),nchar(namex))=="."){
+      namex <- substr(namex,1,nchar(namex)-1)
+    }
+    namex <- paste0(namex, "_", x)
+    namex
+  }) %>% unlist(.)
+
+  motifNames2 <- lapply(seq_along(motifs), function(x){
+    namex <- make.names(motifs[[x]]@name)
+    if(grepl("LINE", namex)){
+      splitNamex <- stringr::str_split(motifs[[x]]@ID, pattern="\\_", simplify = TRUE)
+      namex <- splitNamex[1, grep("LINE",splitNamex[1,]) + 1]
+    }
+    if(substr(namex,nchar(namex),nchar(namex))=="."){
+      namex <- substr(namex,1,nchar(namex)-1)
+    }
+    namex
+  }) %>% unlist(.)
+
+  motifDF <- lapply(seq_along(motifs), function(x){
+    df <- data.frame(
+      row.names = motifNames[x],
+      name = motifNames2[[x]],
+      ID = motifs[[x]]@ID,
+      strand = motifs[[x]]@strand,
+      stringsAsFactors = FALSE
+    )
+  }) %>% Reduce("rbind", .) %>% DataFrame
+
+  names(motifs) <- motifNames
+
+  out <- list(motifs = motifs, motifSummary = motifDF)
+
+  return(out)
+
+}
+
+.augment_matrix <-function(mtx, samples) {
+  samples <- unique(samples)
+  mtx <- t(mtx)
+  aug_mtx<-matrix(NA, ncol=ncol(mtx), nrow=length(samples))
+  aug_mtx<-mtx[match(samples,rownames(mtx)),,drop=FALSE]
+  rownames(aug_mtx)<-samples
+  colnames(aug_mtx)<-colnames(mtx)
+  return(t(aug_mtx))
+}
+
+
+stop_quietly <- function() {
+  opt <- options(show.error.messages = FALSE)
+  on.exit(options(opt))
+  stop()
+}
\ No newline at end of file