mouse_organogenesis-m-om / Git / [dbb3ea] /utils.R

Models:
AlyssaS/
mouse_organogenesis-m-om
Downloads: 1
[dbb3ea]: / utils.R
History
Download this file
624 lines (521 with data), 19.2 kB

#############################
## Commonly-used functions ##
#############################

load_SingleCellExperiment <- function(file, normalise = FALSE, features = NULL, cells = NULL, remove_non_expressed_genes = FALSE) {
  library(SingleCellExperiment); library(scran); library(scater);
  sce <- readRDS(file)
  if (!is.null(cells)) sce <- sce[,cells]
  if (!is.null(features)) sce <- sce[features,]
  if (remove_non_expressed_genes) sce <- sce[which(Matrix::rowSums(counts(sce))>15),]
  if (normalise) sce <- logNormCounts(sce)
  
  return(sce)
}

load_Seurat <- function(file, assay = "RNA", normalise = FALSE, features = NULL, cells = NULL, remove_non_expressed_genes = FALSE, ...) {
  library(Seurat)
  seurat <- readRDS(file)
  # if (assay%in%Seurat::Assays(seurat)) seurat <- seurat[[assay]]
  if (!is.null(cells)) seurat <- seurat[,cells]
  if (!is.null(features)) seurat <- seurat[features,]
  if (normalise) {
    seurat <- NormalizeData(seurat, normalization.method = "LogNormalize")
    seurat <- ScaleData(seurat, ...)
  }
  if (remove_non_expressed_genes) seurat <- seurat[which(Matrix::rowMeans(seurat@assays[[assay]]@counts)>1e-4),]
  return(seurat)
}

matrix.please<-function(x) {
  m<-as.matrix(x[,-1])
  rownames(m)<-x[[1]]
  m
}

#'  method=1: The TF-IDF implementation used by Stuart & Butler et al. 2019. This computes \eqn{\log(TF \times IDF)}.
#'  method=2: The TF-IDF implementation used by Cusanovich & Hill et al. 2018. This computes \eqn{TF \times (\log(IDF))}.
#'  method=3: The log-TF method used by Andrew Hill. This computes \eqn{\log(TF) \times \log(IDF)}.
tfidf <- function(mtx, method = 1, scale.factor = 1e4) {
  npeaks <- colSums(mtx)
  if (any(npeaks == 0)) {
    warning("Some cells contain 0 total counts")
  }

  tf <- Matrix::tcrossprod(mtx, y = Matrix::Diagonal(x=1/npeaks))

  rsums <- rowSums(mtx)
  if (any(rsums == 0)) {
    warning("Some features contain 0 total counts")
  }
  idf <- ncol(mtx) / rsums

  if (method == 2) {
    idf <- log(1 + idf)
  } else if (method == 3) {
    tf <- log1p(tf * scale.factor)
    idf <- log(1 + idf)
  }
  mtx.tfidf <- Matrix::Diagonal(n = length(idf), x = idf) %*% tf

  if (method == 1) {
    mtx.tfidf <- log1p(mtx.tfidf * scale.factor)
  }
  colnames(mtx.tfidf) <- colnames(mtx)
  rownames(mtx.tfidf) <- rownames(mtx)

  # set NA values to 0
  mtx.tfidf[is.na(mtx.tfidf)] <- 0

  return(mtx.tfidf)
}

pdist <- function(tmat){
  # @param tmat A non-negative matrix with samples by features
  # @reference http://r.789695.n4.nabble.com/dist-function-in-R-is-very-slow-td4738317.html
  mtm <- Matrix::tcrossprod(tmat)
  sq <- rowSums(tmat^2)
  out0 <- outer(sq, sq, "+") - 2 * mtm
  out0[out0 < 0] <- 0
  
  sqrt(out0)
}

smoother_aggregate_nearest_nb <- function(mat, D, k){
  # @param mat A matrix in a shape of #genes x #samples.
  # @param D A predefined distance matrix in a shape of #samples x #samples.
  # @param k An integer to choose \code{k} nearest samples (self-inclusive) to
  #  aggregate based on the distance matrix \code{D}.
  denoised_mat <- sapply(seq_len(ncol(mat)), function(cid){
    nb_cid <- head(order(D[cid, ]), k)
    closest_mat <- mat[, nb_cid, drop=FALSE]
    # return(Matrix::rowSums(closest_mat))
    return(Matrix::rowMeans(closest_mat))
  })
  dimnames(denoised_mat) <- dimnames(mat)
  return(denoised_mat)
}


# TO-FINISH.....
smoother_aggregate_nearest_nb_parallel <- function(mat, D, k, cores=1){
  # @param mat A matrix in a shape of #genes x #samples.
  # @param D A predefined distance matrix in a shape of #samples x #samples.
  # @param k An integer to choose \code{k} nearest samples (self-inclusive) to
  #  aggregate based on the distance matrix \code{D}.

  # library(future)
  library(future.apply)
  plan("multiprocess", workers = ncores)


  # sapply(seq_len(ncol(mat)), function(cid){
  future_sapply(seq_len(ncol(mat)), function(cid){
    nb_cid <- head(order(D[cid, ]), k)
    closest_mat <- mat[, nb_cid, drop=FALSE]
    # return(Matrix::rowSums(closest_mat))
    return(Matrix::rowMeans(closest_mat))
  })
}

# regress_covariates <- function(mtx, vars.to.regress) {
#   data <- scale(t(logcounts(sce_filt)), center = T, scale = F)
#   data_regressed <- apply(data, 2, function(x) {
#     lm.out <- lm(formula=expr~covariate, data=data.frame(expr=x, covariate=factor(sce_filt$stage)));
#     residuals <- lm.out[["residuals"]]+lm.out[["coefficients"]][1]
#   })
# }

# Remove unwanted effects from a matrix
#
# @parm mtx An expression matrix to regress the effects of covariates out
# of should be the complete expression matrix in genes x cells
# @param covariates A matrix or data.frame of latent variables, should be cells
# x covariates, the colnames should be the variables to regress
# @param features_idx An integer vector representing the indices of the
# genes to run regression on
# @param model.use Model to use, one of 'linear', 'poisson', or 'negbinom'; pass
# NULL to simply return mtx
# @param verbose Display a progress bar
#' @importFrom stats as.formula lm
#' @importFrom utils txtProgressBar setTxtProgressBar
#
RegressOutMatrix_parallel <- function(mtx, covariates = NULL, features_idx = NULL, split.by = NULL, block.size = 1000, min.cells.to.block = 3000, ncores = 1, verbose = TRUE) {
  
  library(future)
  library(future.apply)
  plan("multiprocess", workers = ncores)
  
  # Check features_idx
  if (is.null(features_idx)) {
    features_idx <- 1:nrow(mtx)
  }
  if (is.character(features_idx)) {
    features_idx <- intersect(features_idx, rownames(mtx))
    if (length(features_idx) == 0) {
      stop("Cannot use features that are beyond the scope of mtx")
    }
  } else if (max(features_idx) > nrow(mtx)) {
    stop("Cannot use features that are beyond the scope of mtx")
  }
  
  # Check dataset dimensions
  if (nrow(covariates) != ncol(mtx)) {
    stop("Uneven number of cells between latent data and expression data")
  }
  
  # Subset
  mtx <- mtx[features_idx,]
  mtx.dimnames <- dimnames(mtx)
  
  # Define chunck points
  chunk.points <- ChunkPoints(dsize = nrow(mtx), csize = block.size)
  
  # Define cell splitting
  split.cells <- split(colnames(mtx), f = split.by %||% TRUE)

   if (nbrOfWorkers() > 1) {

    # Define chuncks
      chunks <- expand.grid(
        names(split.cells),
        1:ncol(chunk.points),
        stringsAsFactors = FALSE
      )

      # Run RegressOutMatrix in parallel
      mtx.resid <- future_lapply(
        X = 1:nrow(chunks),
        FUN = function(i) {
          row <- chunks[i, ]
          group <- row[[1]]
          index <- as.numeric(row[[2]])
          return(RegressOutMatrix(
            mtx = mtx[chunk.points[1, index]:chunk.points[2, index], split.cells[[group]], drop = FALSE],
            covariates = covariates[split.cells[[group]], , drop = FALSE],
            # features_idx = features_idx[chunk.points[1, index]:chunk.points[2, index]],
            verbose = FALSE
          ))
        }
      )

      # Merge splitted cells
      if (length(split.cells) > 1) {
        merge.indices <- lapply(
          X = 1:length(x = split.cells),
          FUN = seq.int,
          to = length(mtx.resid),
          by = length(split.cells)
        )
        mtx.resid <- lapply(
          X = merge.indices,
          FUN = function(x) {
            return(do.call( 'rbind', mtx.resid[x]))
          }
        )
        mtx.resid <- do.call('cbind', mtx.resid)
      } else {
        mtx.resid <- do.call( 'rbind', mtx.resid)
      }
    } else {
      
      mtx.resid <- lapply(
        X = names(split.cells),
        FUN = function(x) {
          if (verbose && length(split.cells) > 1) {
            message("Regressing out variables from split ", x)
          }
          return(RegressOutMatrix(
            mtx = mtx[, split.cells[[x]], drop = FALSE],
            covariates = covariates[split.cells[[x]], , drop = FALSE],
            features_idx = features_idx,
            verbose = verbose
          ))
        }
      )
      mtx.resid <- do.call('cbind', mtx.resid)
    }
    # dimnames(mtx.resid) <- dimnames(mtx)
    return(mtx.resid)
  }

RegressOutMatrix <- function(mtx, covariates = NULL, features_idx = NULL, verbose = TRUE) {
  
  # Check features_idx
  if (is.null(features_idx)) {
    features_idx <- 1:nrow(mtx)
  }
  if (is.character(features_idx)) {
    features_idx <- intersect(features_idx, rownames(mtx))
    if (length(features_idx) == 0) {
      stop("Cannot use features that are beyond the scope of mtx")
    }
  } else if (max(features_idx) > nrow(mtx)) {
    stop("Cannot use features that are beyond the scope of mtx")
  }
  
  # Check dataset dimensions
  if (nrow(covariates) != ncol(mtx)) {
    stop("Uneven number of cells between latent data and expression data")
  }
  
  # Subset
  mtx <- mtx[features_idx,]
  mtx.dimnames <- dimnames(mtx)
  
  # Create formula for regression
  vars.to.regress <- colnames(covariates)
  fmla <- paste('GENE ~', paste(vars.to.regress, collapse = '+')) %>% as.formula

  # In this code, we'll repeatedly regress different Y against the same X
  # (covariates) in order to calculate residuals.  Rather that repeatedly
  # call lm to do this, we'll avoid recalculating the QR decomposition for the
  # covariates matrix each time by reusing it after calculating it once
  regression.mat <- cbind(covariates, mtx[1,])
  colnames(regression.mat) <- c(colnames(covariates), "GENE")
  qr <- lm(fmla, data = regression.mat, qr = TRUE)$qr
  rm(regression.mat)

  # Make results matrix
  data.resid <- matrix(
    nrow = nrow(mtx),
    ncol = ncol(mtx)
  )

  if (verbose) pb <- txtProgressBar(char = '=', style = 3, file = stderr())

  # Extract residuals from each feature by using the pre-computed QR decomposition
  for (i in 1:length(features_idx)) {
    regression.mat <- cbind(covariates, mtx[features_idx[i], ])
    colnames(regression.mat) <- c(vars.to.regress, 'GENE')
    regression.mat <- qr.resid(qr = qr, y = mtx[features_idx[i],])  # The function qr.resid returns the residuals when fitting y to the matrix with QR decomposition.
    data.resid[i, ] <- regression.mat
    if (verbose) {
      setTxtProgressBar(pb = pb, value = i / length(features_idx))
    }
  }

  if (verbose) close(con = pb)

  dimnames(data.resid) <- mtx.dimnames
  
  return(data.resid)
}



# Generate chunk points
#
# @param dsize How big is the data being chunked
# @param csize How big should each chunk be
#
# @return A matrix where each column is a chunk, row 1 is start points, row 2 is end points
#
ChunkPoints <- function(dsize, csize) {
  return(vapply(
    X = 1L:ceiling(dsize / csize),
    FUN = function(i) {
      return(c(
        start = (csize * (i - 1L)) + 1L,
        end = min(csize * i, dsize)
      ))
    },
    FUN.VALUE = numeric(length = 2L)
  ))
}


"%ni%" <- Negate("%in%")

ggplot_theme_NoAxes <- function() {
  theme(
    axis.title = element_blank(),
    axis.line = element_blank(),
    axis.text = element_blank(),
    axis.ticks = element_blank()
  )
}

minmax.normalisation <- function(x)
{
    return((x-min(x,na.rm=T)) /(max(x,na.rm=T)-min(x,na.rm=T)))
}

getmode <- function(v, dist) {
  tab <- table(v)
  #if tie, break to shortest distance
  if(sum(tab == max(tab)) > 1){
    tied <- names(tab)[tab == max(tab)]
    sub  <- dist[v %in% tied]
    names(sub) <- v[v %in% tied]
    return(names(sub)[which.min(sub)])
  } else {
    return(names(tab)[which.max(tab)])
  }
}

GRangesToString <- function(grange, sep = c("-", "-")) {
  regions <- paste0(
    as.character(x = seqnames(x = grange)),
    sep[[1]],
    start(x = grange),
    sep[[2]],
    end(x = grange)
  )
  return(regions)
}


## sparse -> matrix
dropNA2matrix <- function(x) {
  if(!is(x, "dsparseMatrix")) stop("x needs to be a subclass of dsparseMatrix!")
  
  x <- as(x, "dgCMatrix")
  
  ## remember true NAs
  nas <- Matrix::which(is.na(x), arr.ind=TRUE)
  
  x@x[x@x==0] <- NA
  zeros <- Matrix::which(is.na(x), arr.ind=TRUE)
  x <- as(x, "matrix")
  x[x==0] <- NA
  x[zeros] <- 0
  x[nas] <- NA
  x
}

# dropNA2vector <- function(x) {
#   stop("NEEDS TO BE FIXED")
#   if(!is(x, "dsparseMatrix")) stop("x needs to be a subclass of dsparseMatrix!")
#   x <- as(x, "numeric")

#   # true NAs
#   nas <- which(is.na(x), arr.ind=TRUE)

#   x[x==0] <- NA
#   zeros <- which(is.na(x), arr.ind=TRUE)
#   x[nas] <- NA
#   x[zeros] <- 0
# }

## matrix -> sparse
dropNA <- function(x) {
  if(!is(x, "matrix")) stop("x needs to be a matrix!")
  
  zeros <- which(x==0, arr.ind=TRUE)
  ## keep zeros
  x[is.na(x)] <- 0
  x[zeros] <- NA
  x <- Matrix::drop0(x)
  x[zeros] <- 0
  x
}

# dropNAis.na <- function(x) {
#   if(!is(x, "dsparseMatrix")) stop("x needs to be a subclass of dsparseMatrix!")
#   x <- as(x, "dgCMatrix")
#   
#   ### not represented means NA and 0 means 0
#   ### this coercion keeps 0
#   !as(x, "ngCMatrix")
# }

give.n <- function(x){
  return(c(y = mean(x), label = length(x)))
}

sort.abs <- function(dt, sort.field) dt[order(-abs(dt[[sort.field]]))]


# function to pseudobulk a SingleCellExperiment object
pseudobulk_sce_fn <- function(x, assay = NULL, by, fun = c("sum", "mean", "median"), scale = FALSE) {
  
  # check validity of input arguments
  fun <- match.arg(fun)
  if (is.null(assay))  assay <- assayNames(x)[1] 
  
  # store aggregation parameters &
  # nb. of cells that went into aggregation
  md <- metadata(x)
  md$agg_pars <- list(assay = assay, by = by, fun = fun, scale = scale)
  
  # get aggregation function
  # fun <- switch(fun, sum = "rowSums", mean = "rowMeans", median = "rowMedians")
  
  # drop missing factor levels & tabulate number of cells
  cd <- dplyr::mutate_if(as.data.frame(colData(x)), is.factor, droplevels)
  colData(x) <- DataFrame(cd, row.names = colnames(x),check.names = FALSE)
  md$n_cells <- table(as.data.frame(colData(x)[, by]))
  
  # assure 'by' colData columns are factors so that missing combinations aren't dropped
  for (i in by) 
    if (!is.factor(x[[i]])) 
      x[[i]] <- factor(x[[i]])
  
  # split cells & compute pseudo-bulks
  cs <- .split_cells(x, by)
  # pb <- .pb(x, cs, assay, fun)
  pb <- .pb(x=x, by=by, fun=fun)
  if (scale & length(by) == 2) {
    ls <- lapply(.pb(x, cs, "counts", "rowSums"), colSums)
    pb <- lapply(seq_along(pb), function(i) pb[[i]] / 1e6 * ls[[i]])
    names(pb) <- names(ls)
  }
  
  # construct SCE
  pb <- SingleCellExperiment(pb, metadata = md)
  
  # propagate 'colData' columns that are unique across 2nd 'by'
  if (length(by) == 2) {
    cd <- colData(x)
    ids <- colnames(pb)
    counts <- vapply(ids, function(u) {
      m <- as.logical(match(cd[, by[2]], u, nomatch = 0))
      vapply(cd[m, ], function(u) length(unique(u)), numeric(1))
    }, numeric(ncol(colData(x))))
    cd_keep <- apply(counts, 1, function(u) all(u == 1))
    cd_keep <- setdiff(names(which(cd_keep)), by)
    if (length(cd_keep) != 0) {
      m <- match(ids, cd[, by[2]], nomatch = 0)
      cd <- cd[m, cd_keep, drop = FALSE]
      rownames(cd) <- ids
      colData(pb) <- cd
    }
  }
  return(pb)
}


# split cells by cluster-sample
# auxiliary function to pseudobulk a SingleCellExperiment object
#   - by: character vector specifying colData column(s) to split by. If length(by) == 1, a list of length nlevels(colData$by), else,
#          a nested list with 2nd level of length nlevels(colData$by[2])
.split_cells <- function(x, by) {
  if (is(x, "SingleCellExperiment")) x <- colData(x)
  cd <- data.frame(x[by], check.names = FALSE)
  cd <- data.table(cd, cell = rownames(x)) %>% split(by = by, sorted = TRUE, flatten = FALSE)
  purrr::map_depth(cd, length(by), "cell")
}


# auxiliary function to pseudobulk a SingleCellExperiment object
.pb <- function(x, by, fun) {
  
  # compute pseudobulks
  # y <- scuttle::summarizeAssayByGroup(x, assay.type = assay, ids = (ids <- colData(x)[by]), statistics = fun, BPPARAM = BiocParallel::SerialParam())
  y <- scuttle::summarizeAssayByGroup(x, ids = colData(x)[by], statistics = fun)
  colnames(y) <- y[[by[length(by)]]]
  
  if (length(by) == 1)  return(assay(y))
  
  # reformat into one assay per 'by[1]'
  if (is.factor(ids <- y[[by[1]]]))
    ids <- droplevels(ids)
  is <- split(seq_len(ncol(y)), ids)
  ys <- map(is, ~assay(y)[, .])
  
  # fill in missing combinations
  for (i in seq_along(ys)) {
    fill <- setdiff(unique(y[[by[2]]]), colnames(ys[[i]]))
    if (length(fill != 0)) {
      foo <- matrix(0, nrow(x), length(fill))
      colnames(foo) <- fill
      foo <- cbind(ys[[i]], foo)
      o <- paste(sort(unique(y[[by[2]]])))
      ys[[i]] <- foo[, o]
    }
  }
  return(ys)
}


.summarizeJASPARMotifs <- function(motifs = NULL){

  motifNames <- lapply(seq_along(motifs), function(x){
    namex <- make.names(motifs[[x]]@name)
    if(substr(namex,nchar(namex),nchar(namex))=="."){
      namex <- substr(namex,1,nchar(namex)-1)
    }
    namex <- paste0(namex, "_", x)
    namex
  }) %>% unlist(.)

  motifDF <- lapply(seq_along(motifs), function(x){
    data.frame(
      row.names = motifNames[x],
      name = motifs[[x]]@name[[1]],
      ID = motifs[[x]]@ID,
      strand = motifs[[x]]@strand,
      symbol = ifelse(!is.null(motifs[[x]]@tags$symbol[1]), motifs[[x]]@tags$symbol[1], NA) ,
      family = ifelse(!is.null(motifs[[x]]@tags$family[1]), motifs[[x]]@tags$family[1], NA),
      alias = ifelse(!is.null(motifs[[x]]@tags$alias[1]), motifs[[x]]@tags$alias[1], NA),
      stringsAsFactors = FALSE
    )
  }) %>% Reduce("rbind", .) %>% DataFrame
  
  names(motifs) <- motifNames

  out <- list(motifs = motifs, motifSummary = motifDF)

  return(out)
  
}

.summarizeChromVARMotifs <- function(motifs = NULL){

  motifNames <- lapply(seq_along(motifs), function(x){
    namex <- make.names(motifs[[x]]@name)
    if(grepl("LINE", namex)){
      splitNamex <- stringr::str_split(motifs[[x]]@ID, pattern="\\_", simplify = TRUE)
      namex <- splitNamex[1, grep("LINE",splitNamex[1,]) + 1]
    }
    if(substr(namex,nchar(namex),nchar(namex))=="."){
      namex <- substr(namex,1,nchar(namex)-1)
    }
    namex <- paste0(namex, "_", x)
    namex
  }) %>% unlist(.)

  motifNames2 <- lapply(seq_along(motifs), function(x){
    namex <- make.names(motifs[[x]]@name)
    if(grepl("LINE", namex)){
      splitNamex <- stringr::str_split(motifs[[x]]@ID, pattern="\\_", simplify = TRUE)
      namex <- splitNamex[1, grep("LINE",splitNamex[1,]) + 1]
    }
    if(substr(namex,nchar(namex),nchar(namex))=="."){
      namex <- substr(namex,1,nchar(namex)-1)
    }
    namex
  }) %>% unlist(.)

  motifDF <- lapply(seq_along(motifs), function(x){
    df <- data.frame(
      row.names = motifNames[x],
      name = motifNames2[[x]],
      ID = motifs[[x]]@ID,
      strand = motifs[[x]]@strand,
      stringsAsFactors = FALSE
    )
  }) %>% Reduce("rbind", .) %>% DataFrame

  names(motifs) <- motifNames

  out <- list(motifs = motifs, motifSummary = motifDF)

  return(out)

}

.augment_matrix <-function(mtx, samples) {
  samples <- unique(samples)
  mtx <- t(mtx)
  aug_mtx<-matrix(NA, ncol=ncol(mtx), nrow=length(samples))
  aug_mtx<-mtx[match(samples,rownames(mtx)),,drop=FALSE]
  rownames(aug_mtx)<-samples
  colnames(aug_mtx)<-colnames(mtx)
  return(t(aug_mtx))
}


stop_quietly <- function() {
  opt <- options(show.error.messages = FALSE)
  on.exit(options(opt))
  stop()
}