--- a
+++ b/R/scAI_model.R
@@ -0,0 +1,1691 @@
+#' The scAI Class
+#'
+#' The scAI object is created from paired single-cell transcriptomic and epigenomic data.
+#' It takes a list of two digital data matrices as input. Genes/loci should be in rows and cells in columns; rownames and colnames should be included.
+#' The class provides functions for data preprocessing, integrative analysis, and visualization.
+#'
+#' The key slots used in the scAI object are described below.
+#'
+#' @slot raw.data List of raw data matrices, one per dataset (genes/loci in rows and cells in columns)
+#' @slot norm.data List of normalized matrices (genes/loci by cells)
+#' @slot agg.data Aggregated epigenomic data within similar cells
+#' @slot scale.data List of scaled matrices
+#' @slot pData Data frame storing the information associated with each cell
+#' @slot var.features List of informative features to be used, one giving informative genes and the other giving informative loci
+#' @slot fit List of inferred low-rank matrices, including W1, W2, H, Z, R
+#' @slot fit.variedK List of inferred low-rank matrices when varying the rank K
+#' @slot embed List of the reduced 2D coordinates, one per method, e.g., t-SNE/FIt-SNE/umap
+#' @slot identity A factor defining the cell identity
+#' @slot cluster List of consensus clustering results
+#' @slot options List of parameters used throughout analysis
+#'
+#' @exportClass scAI
+#' @importFrom Rcpp evalCpp
+#' @importFrom methods setClass
+#' @useDynLib scAI
+scAI <- methods::setClass("scAI",
+                          slots = c(raw.data = "list",
+                                    norm.data = "list",
+                                    agg.data = "matrix",
+                                    scale.data = "list",
+                                    pData = "data.frame",
+                                    var.features = "list",
+                                    fit = "list",
+                                    fit.variedK = "list",
+                                    embed = "list",
+                                    identity = "factor",
+                                    cluster = "list",
+                                    options = "list")
+)
+
+#' show method for scAI
+#'
+#' @param object an scAI object
+#' @docType methods
+#'
+setMethod(f = "show", signature = "scAI", definition = function(object) {
+    cat("An object of class", class(object), "\n", length(object@raw.data), "datasets.\n")
+    invisible(x = NULL)
+})
+
+
+
+#' Create a new scAI object
+#'
+#' @param raw.data List of raw data matrices, a paired single-cell transcriptomic and epigenomic dataset
+#' @param do.sparse Whether to store the data in sparse format
+#'
+#' @return an scAI object
+#' @export
+#'
+#' @examples
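+#' # A minimal sketch (not run). `rna.mat` and `atac.mat` are hypothetical
+#' # gene-by-cell and locus-by-cell matrices sharing column (cell) names.
+#' \dontrun{
+#' obj <- create_scAIobject(raw.data = list(RNA = rna.mat, ATAC = atac.mat))
+#' }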
+#' @importFrom methods as new
+create_scAIobject <- function(raw.data, do.sparse = TRUE) {
+    object <- methods::new(Class = "scAI",
+                           raw.data = raw.data)
+    if (do.sparse) {
+        raw.data <- lapply(raw.data, function(x) {
+            as(as.matrix(x), "dgCMatrix")
+        })
+    }
+    object@raw.data <- raw.data
+    return(object)
+}
+
+
+
+#' Preprocess the raw.data, including quality control and normalization
+#'
+#' @param object scAI object
+#' @param assay List of assay names to be normalized
+#' @param minFeatures Filter out cells with expressed features < minFeatures
+#' @param minCells Filter out features expressed in fewer than minCells cells
+#' @param minCounts Filter out cells with total counts < minCounts
+#' @param maxCounts Filter out cells with total counts > maxCounts
+#' @param libararyflag Whether to do library-size normalization
+#' @param logNormalize Whether to do log transformation
+#'
+#' @return an scAI object with the normalized data in the norm.data slot
+#' @export
+#'
+#' @examples
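+#' # A minimal sketch (not run), assuming an scAI object `obj` with assays
+#' # named "RNA" and "ATAC"; thresholds are illustrative defaults.
+#' \dontrun{
+#' obj <- preprocessing(obj, assay = list("RNA", "ATAC"), minFeatures = 200, minCells = 3)
+#' }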
+preprocessing <- function(object, assay = list("RNA", "ATAC"), minFeatures = 200, minCells = 3, minCounts = NULL, maxCounts = NULL, libararyflag = TRUE, logNormalize = TRUE) {
+    if (is.null(assay)) {
+        for (i in 1:length(object@raw.data)) {
+            object@norm.data[[i]] <- object@raw.data[[i]]
+        }
+    } else {
+        for (i in 1:length(assay)) {
+            iniData <- object@raw.data[[assay[[i]]]]
+            print(dim(iniData))
+            if (is.data.frame(iniData)) {
+                iniData <- as.matrix(iniData)
+            }
+            # filter cells that express fewer than minFeatures features
+            msum <- Matrix::colSums(iniData != 0)
+            proData <- iniData[, msum >= minFeatures]
+            # filter cells with total counts less than minCounts
+            if (!is.null(minCounts)) {
+                proData <- proData[, Matrix::colSums(proData) >= minCounts]
+            }
+            # filter cells with total counts higher than maxCounts
+            if (!is.null(maxCounts)) {
+                proData <- proData[, Matrix::colSums(proData) <= maxCounts]
+            }
+            # filter features expressed in fewer than minCells cells
+            proData <- proData[Matrix::rowSums(proData != 0) >= minCells, ]
+            # normalization: a global-scaling method that normalizes the measurements
+            # for each cell by the total counts, multiplied by a scale factor (10,000 by default)
+            if (libararyflag) {
+                proData <- sweep(proData, 2, Matrix::colSums(proData), FUN = `/`) * 10000
+            }
+            if (logNormalize) {
+                proData <- log(proData + 1)
+            }
+            object@norm.data[[assay[[i]]]] <- proData
+        }
+    }
+    if (length(assay) == 2) {
+        X1 <- object@norm.data[[assay[[1]]]]
+        X2 <- object@norm.data[[assay[[2]]]]
+        cell.keep <- intersect(colnames(X1), colnames(X2))
+        object@norm.data[[assay[[1]]]] <- X1[, cell.keep]
+        object@norm.data[[assay[[2]]]] <- X2[, cell.keep]
+    } else if (length(assay) == 1) {
+        X1 <- object@norm.data[[assay[[1]]]]
+        assay2 <- setdiff(names(object@raw.data), assay[[1]])
+        X2 <- object@raw.data[[assay2]]
+        cell.keep <- intersect(colnames(X1), colnames(X2))
+        object@norm.data[[assay[[1]]]] <- X1[, cell.keep]
+        object@norm.data[[assay2]] <- X2[, cell.keep]
+    }
+    names(object@norm.data) <- names(object@raw.data)
+    return(object)
+}
+
+
+
+#' Add cell information into the pData slot
+#'
+#' @param object scAI object
+#' @param pdata Cell information to be added
+#' @param pdata.name The name of the column to be assigned
+#'
+#' @return an scAI object with the updated pData slot
+#' @export
+#'
+#' @examples
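+#' # A minimal sketch (not run); `cell.labels` is a hypothetical data frame of
+#' # per-cell annotations whose rows match the cells in the object.
+#' \dontrun{
+#' obj <- addpData(obj, pdata = cell.labels)
+#' }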
+addpData <- function(object, pdata, pdata.name = NULL) {
+    if (is.null(x = pdata.name) && is.atomic(x = pdata)) {
+        stop("'pdata.name' must be provided for atomic pdata types (e.g. vectors)")
+    }
+    if (inherits(x = pdata, what = c("matrix", "Matrix"))) {
+        pdata <- as.data.frame(x = pdata)
+    }
+    if (is.atomic(x = pdata)) {
+        # wrap atomic vectors so that the assignment to the data.frame slot succeeds
+        pdata <- data.frame(pdata)
+    }
+    if (is.null(x = pdata.name)) {
+        pdata.name <- names(pdata)
+    } else {
+        names(pdata) <- pdata.name
+    }
+    object@pData <- pdata
+    return(object)
+}
+
+
+
+#' Run the scAI model
+#'
+#' @param object scAI object
+#' @param K An integer indicating the rank of the inferred factors
+#' @param do.fast Whether to use the Python implementation of the scAI optimization model
+#' @param nrun Number of times to repeat the run
+#' @param hvg.use1 Whether to use highly variable genes for the RNA-seq data
+#' @param hvg.use2 Whether to use highly variable loci for the ATAC-seq data
+#' @param keep_all Whether to keep the results from all runs
+#' @param s Probability of the Bernoulli distribution
+#' @param alpha model parameter
+#' @param lambda model parameter
+#' @param gamma model parameter
+#' @param maxIter An integer indicating the maximum number of iterations
+#' @param stop_rule Stop rule to be used
+#' @param init List of the initialized low-rank matrices
+#' @param rand.seed An integer indicating the seed
+#'
+#' @return an scAI object with the inferred low-rank matrices in the fit slot
+#' @export
+#'
+#' @examples
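+#' # A minimal sketch (not run); K, nrun and s are illustrative values that
+#' # should be tuned per dataset (see selectK for choosing K).
+#' \dontrun{
+#' obj <- run_scAI(obj, K = 8, nrun = 5, s = 0.25)
+#' }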
+#' @importFrom foreach foreach "%dopar%"
+#' @importFrom parallel makeForkCluster makeCluster detectCores
+#' @importFrom doParallel registerDoParallel
+#' @importFrom reticulate r_to_py source_python
+run_scAI <- function(object, K, do.fast = FALSE, nrun = 5L, hvg.use1 = FALSE, hvg.use2 = FALSE, keep_all = FALSE, s = 0.25, alpha = 1, lambda = 100000, gamma = 1, maxIter = 500L, stop_rule = 1L, init = NULL, rand.seed = 1L) {
+    if (!is.null(init)) {
+        W1.init <- init$W1.init
+        W2.init <- init$W2.init
+        H.init <- init$H.init
+        Z.init <- init$Z.init
+        R.init <- init$R.init
+    } else {
+        R.init <- NULL
+        W1.init <- NULL
+        W2.init <- NULL
+        H.init <- NULL
+        Z.init <- NULL
+    }
+    options(warn = -1)
+    # calculate the number of cores
+    numCores <- min(parallel::detectCores(), nrun)
+    cl <- tryCatch({
+        parallel::makeForkCluster(numCores)
+    }, error = function(e) {
+        parallel::makeCluster(numCores)
+    })
+    doParallel::registerDoParallel(cl)
+    if (hvg.use1) {
+        X1 <- as.matrix(object@norm.data[[1]][object@var.features[[1]], ])
+    } else {
+        X1 <- as.matrix(object@norm.data[[1]])
+    }
+    if (hvg.use2) {
+        X2 <- as.matrix(object@norm.data[[2]][object@var.features[[2]], ])
+    } else {
+        X2 <- as.matrix(object@norm.data[[2]])
+    }
+
+    if (!do.fast) {
+        outs <- foreach(i = 1:nrun, .packages = c("Matrix")) %dopar% {
+            set.seed(rand.seed + i - 1)
+            scAImodel(X1, X2, K = K, s = s, alpha = alpha, lambda = lambda, gamma = gamma, maxIter = maxIter, stop_rule = stop_rule,
+                      R.init = R.init, W1.init = W1.init, W2.init = W2.init, H.init = H.init, Z.init = Z.init)
+        }
+    } else {
+        barcodes <- colnames(X2)
+        feature1 <- rownames(X1)
+        feature2 <- rownames(X2)
+        names_com <- paste0("factor", seq_len(K))
+
+        path <- paste(system.file(package = "scAI"), "scAImodel_py.py", sep = "/")
+        reticulate::source_python(path)
+        X1py <- r_to_py(X1)
+        X2py <- r_to_py(X2)
+        R.initpy <- r_to_py(R.init); W1.initpy <- r_to_py(W1.init); W2.initpy <- r_to_py(W2.init); H.initpy <- r_to_py(H.init); Z.initpy <- r_to_py(Z.init)
+        K <- as.integer(K); maxIter <- as.integer(maxIter)
+        outs <- pbapply::pbsapply(
+            X = 1:nrun,
+            FUN = function(x) {
+                seed <- as.integer(rand.seed + x - 1)
+                ptm <- Sys.time()
+                results <- scAImodel_py(X1 = X1py, X2 = X2py, K = K, S = s, Alpha = alpha, Lambda = lambda, Gamma = gamma, Maxiter = maxIter, Stop_rule = stop_rule,
+                                        Seed = seed, W1 = W1.initpy, W2 = W2.initpy, H = H.initpy, Z = Z.initpy, R = R.initpy)
+                execution.time <- Sys.time() - ptm
+                names(results) <- c("W1", "W2", "H", "Z", "R")
+                attr(results$W1, "dimnames") <- list(feature1, names_com)
+                attr(results$W2, "dimnames") <- list(feature2, names_com)
+                attr(results$H, "dimnames") <- list(names_com, barcodes)
+                attr(results$Z, "dimnames") <- list(barcodes, barcodes)
+                results$options <- list(s = s, alpha = alpha, lambda = lambda, gamma = gamma, maxIter = maxIter, stop_rule = stop_rule, run.time = execution.time)
+                return(results)
+            },
+            simplify = FALSE
+        )
+    }
+
+    objs <- foreach(i = 1:nrun, .combine = c) %dopar% {
+        W1 <- outs[[i]]$W1
+        W2 <- outs[[i]]$W2
+        sum(cor(as.matrix(W1))) + sum(cor(as.matrix(W2)))
+    }
+    parallel::stopCluster(cl)
+    N <- ncol(X1)
+    C <- base::matrix(0, N, N)
+    for (i in seq_len(nrun)) {
+        H <- outs[[i]]$H
+        H <- sweep(H, 2, colSums(H), FUN = `/`)
+        clusIndex <- apply(H, 2, which.max)
+        # compute the consensus matrix
+        adjMat <- clust2Mat(clusIndex)
+        C <- C + adjMat
+    }
+    CM <- C/nrun
+
+    if (!keep_all) {
+        message(sprintf("The best seed is %d", which.min(objs)))
+        outs_final <- outs[[which.min(objs)]]
+        # object@agg.data <- outs_final$agg.data
+        W <- list(W1 = outs_final$W1, W2 = outs_final$W2)
+        names(W) <- names(object@norm.data)
+        object@fit <- list(W = W, H = outs_final$H, Z = outs_final$Z, R = outs_final$R)
+        object <- getAggregatedData(object)
+        object@cluster$consensus <- CM
+        object@options$cost <- objs
+        object@options$paras <- outs_final$options
+        object@options$paras$nrun <- nrun
+        object@options$best.seed <- which.min(objs)
+        return(object)
+    } else {
+        outs_final <- list()
+        outs_final$best <- outs[[which.min(objs)]]
+        outs_final$best$consensus <- CM
+        outs_final$nruns <- outs
+        outs_final$options$cost <- objs
+        return(outs_final)
+    }
+}
+
+
+
+#' Solve the optimization problem in scAI
+#'
+#' @param X1 Single-cell transcriptomic data matrix (norm.data)
+#' @param X2 Single-cell epigenomic data matrix (norm.data)
+#' @param K Rank of inferred factors
+#' @param s Probability of the Bernoulli distribution
+#' @param alpha model parameter
+#' @param lambda model parameter
+#' @param gamma model parameter
+#' @param maxIter Maximum number of iterations
+#' @param stop_rule Stop rule to be used
+#' @param R.init initialization of R
+#' @param W1.init initialization of W1
+#' @param W2.init initialization of W2
+#' @param H.init initialization of H
+#' @param Z.init initialization of Z
+#'
+#' @return a list of the inferred low-rank matrices W1, W2, H, Z, R
+#' @export
+#'
+#' @examples
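+#' # A minimal sketch (not run) on random inputs, just to show the call shape;
+#' # in practice X1/X2 are the normalized RNA/ATAC matrices (cells in columns).
+#' \dontrun{
+#' X1 <- matrix(runif(200), 20, 10); colnames(X1) <- paste0("cell", 1:10)
+#' X2 <- matrix(runif(300), 30, 10); colnames(X2) <- colnames(X1)
+#' out <- scAImodel(X1, X2, K = 3)
+#' str(out$H)
+#' }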
+#' @importFrom stats rbinom runif
+#' @importFrom rfunctions crossprodcpp
+scAImodel <- function(X1, X2, K, s = 0.25, alpha = 1, lambda = 100000, gamma = 1, maxIter = 500, stop_rule = 1,
+                      R.init = NULL, W1.init = NULL, W2.init = NULL, H.init = NULL, Z.init = NULL) {
+    # initialize W1, W2, H and Z
+    p <- nrow(X1)
+    n <- ncol(X1)
+    q <- nrow(X2)
+    if (is.null(W1.init)) {
+        W1.init <- matrix(runif(p * K), p, K)
+    }
+    if (is.null(W2.init)) {
+        W2.init <- matrix(runif(q * K), q, K)
+    }
+    if (is.null(H.init)) {
+        H.init <- matrix(runif(K * n), K, n)
+    }
+    if (is.null(Z.init)) {
+        Z.init <- matrix(runif(n * n), n, n)  # draw all n*n entries (runif(n) would recycle a single column)
+    }
+    if (is.null(R.init)) {
+        R.init <- matrix(rbinom(n * n, 1, s), n, n)
+    }
+    W1 <- W1.init
+    W2 <- W2.init
+    H <- H.init
+    Z <- Z.init
+    R <- R.init
+
+    # start the clock to measure the execution time
+    ptm <- proc.time()
+    eps <- .Machine$double.eps
+    onesM_K <- matrix(1, K, K)
+
+    XtX2 <- crossprodcpp(X2)
+    index <- which(R == 0)
+    for (iter in 1:maxIter) {
+        # normalize H
+        H <- H/rowSums(H)
+        # update W1
+        HHt <- tcrossprod(H)
+        X1Ht <- eigenMapMattcrossprod(X1, H)
+        W1HHt <- eigenMapMatMult(W1, HHt)
+        W1 <- W1 * X1Ht/(W1HHt + eps)
+
+        # update W2
+        ZR <- Z
+        ZR[index] <- 0
+        ZRHt <- eigenMapMattcrossprod(ZR, H)
+        X2ZRHt <- eigenMapMatMult(X2, ZRHt)
+        W2HHt <- eigenMapMatMult(W2, HHt)
+        W2 <- W2 * X2ZRHt/(W2HHt + eps)
+
+        # update H
+        W1tX1 <- eigenMapMatcrossprod(W1, X1)
+        W2tX2 <- eigenMapMatcrossprod(W2, X2)
+        W2tX2ZR <- eigenMapMatMult(W2tX2, ZR)
+        HZZt <- eigenMapMatMult(H, Z + t(Z))
+        W1tW1 <- crossprodcpp(W1)
+        W2tW2 <- crossprodcpp(W2)
+        temp1 <- H * (alpha * W1tX1 + W2tX2ZR + lambda * HZZt)
+        temp2 <- eigenMapMatMult(alpha * W1tW1 + W2tW2 + 2 * lambda * HHt + gamma * onesM_K, H)
+        H <- temp1/(temp2 + eps)
+
+        # update Z
+        HtH <- crossprodcpp(H)
+        X2tW2H <- eigenMapMatcrossprod(W2tX2, H)
+        RX2tW2H <- X2tW2H
+        RX2tW2H[index] <- 0
+        XtX2ZR <- eigenMapMatMult(XtX2, ZR)
+        XtX2ZRR <- XtX2ZR
+        XtX2ZRR[index] <- 0
+        Z <- Z * (RX2tW2H + lambda * HtH)/(XtX2ZRR + lambda * Z + eps)
+
+        if (stop_rule == 2) {
+            obj <- alpha * norm(X1 - W1 %*% H, "F")^2 + norm(X2 %*% (Z * R) - W2 %*% H, "F")^2 + lambda * norm(Z - HtH, "F") + gamma * sum(colSums(H) * colSums(H))
+            if ((iter > 1 && (obj_old - obj)/obj_old < 1e-6) || iter == maxIter) {
+                break
+            }
+            obj_old <- obj
+        }
+    }
+    # compute the execution time
+    execution.time <- proc.time() - ptm
+
+    barcodes <- colnames(X2)
+    feature1 <- rownames(X1)
+    feature2 <- rownames(X2)
+    names_com <- paste0("factor", seq_len(K))
+    attr(W1, "dimnames") <- list(feature1, names_com)
+    attr(W2, "dimnames") <- list(feature2, names_com)
+    attr(H, "dimnames") <- list(names_com, barcodes)
+    attr(Z, "dimnames") <- list(barcodes, barcodes)
+
+    outs <- list(W1 = W1, W2 = W2, H = H, Z = Z, R = R,
+                 options = list(s = s, alpha = alpha, lambda = lambda, gamma = gamma, maxIter = maxIter, stop_rule = stop_rule, run.time = execution.time))
+    return(outs)
+}
+
+
+#' Generate the aggregated epigenomic data
+#'
+#' @param object an scAI object after running run_scAI
+#' @param group Cell group information if available; aggregate epigenomic data based on the given cell groups instead of the cell-cell similarity matrix learned by scAI
+#'
+#' @return an scAI object with the aggregated epigenomic data in the agg.data slot
+#' @export
+#'
+getAggregatedData <- function(object, group = NULL) {
+    if (is.null(group)) {
+        Z <- object@fit$Z
+    } else {
+        if (is.factor(group) | is.character(group)) {
+            group <- as.numeric(factor(group))
+        }
+        Z <- clust2Mat(group)
+        diag(Z) <- 1
+        object@fit$Z <- Z
+    }
+    R <- object@fit$R
+    ZR <- Z * R
+    ZR <- sweep(ZR, 2, colSums(ZR), FUN = `/`)
+    X2 <- as.matrix(object@norm.data[[2]])
+    X2agg <- eigenMapMatMult(X2, ZR)
+    X2agg <- sweep(X2agg, 2, colSums(X2agg), FUN = `/`) * 10000
+    X2agg <- log(1 + X2agg)
+    attr(X2agg, "dimnames") <- list(rownames(object@norm.data[[2]]), colnames(object@norm.data[[2]]))
+    object@agg.data <- X2agg
+    return(object)
+}
+
+
+
+#' Perform dimensional reduction
+#'
+#' Dimension reduction by PCA, t-SNE or UMAP
+#' @param object scAI object
+#' @param return.object Whether to return an scAI object
+#' @param data.use input data
+#' @param do.scale Whether to scale the data
+#' @param do.center Whether to center the data
+#' @param method Method of dimensional reduction, one of pca, tsne, FItsne and umap
+#' @param rand.seed Set a random seed. By default, sets the seed to 42.
+#' @param perplexity perplexity parameter in tsne
+#' @param theta theta parameter in tsne
+#' @param check_duplicates check_duplicates parameter in tsne
+#' @param FItsne.path File path of FIt-SNE
+#' @param dim.embed dimensions of the t-SNE embedding
+#' @param dim.use number of PCs used for t-SNE
+#' @param do.fast whether to do fast PCA
+#' @param dimPC the number of components to keep in PCA
+#' @param weight.by.var whether to weight the pc.scores by the variance
+#' @param n.neighbors This determines the number of neighboring points used in
+#' local approximations of manifold structure. Larger values will result in more
+#' global structure being preserved at the loss of detailed local structure. In general this parameter should often be in the range 5 to 50.
+#' @param n.components The dimension of the space to embed into.
+#' @param distance This determines the choice of metric used to measure distance in the input space.
+#' @param n.epochs The number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in more accurate embeddings. If NULL is specified, a value will be selected based on the size of the input dataset (200 for large datasets, 500 for small).
+#' @param learning.rate The initial learning rate for the embedding optimization.
+#' @param min.dist This controls how tightly the embedding is allowed to compress points together.
+#' Larger values ensure embedded points are more evenly distributed, while smaller values allow the
+#' algorithm to optimise more accurately with regard to local structure. Sensible values are in the range 0.001 to 0.5.
+#' @param spread The effective scale of embedded points. In combination with min.dist this determines how clustered/clumped the embedded points are.
+#' @param set.op.mix.ratio Interpolate between (fuzzy) union and intersection as the set operation used to combine local fuzzy simplicial sets to obtain a global fuzzy simplicial set.
+#' @param local.connectivity The local connectivity required - i.e. the number of nearest neighbors
+#' that should be assumed to be connected at a local level. The higher this value the more connected
+#' the manifold becomes locally. In practice this should not be more than the local intrinsic dimension of the manifold.
+#' @param repulsion.strength Weighting applied to negative samples in low dimensional embedding
+#' optimization. Values higher than one will result in greater weight being given to negative samples.
+#' @param negative.sample.rate The number of negative samples to select per positive sample in the
+#' optimization process. Increasing this value will result in greater repulsive force being applied, greater optimization cost, but slightly more accuracy.
+#' @param a More specific parameters controlling the embedding. If NULL, these values are set automatically as determined by min.dist and spread.
+#' @param b More specific parameters controlling the embedding. If NULL, these values are set automatically as determined by min.dist and spread.
+#'
+#' @return an scAI object, or the cell coordinates if return.object = FALSE
+#' @export
+#'
+#' @examples
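+#' # A minimal sketch (not run); by default reduces the factor matrix H to 2D.
+#' \dontrun{
+#' obj <- reducedDims(obj, method = "umap")
+#' }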
+#' @importFrom Rtsne Rtsne
+#' @importFrom reticulate py_module_available py_set_seed import
+#'
+reducedDims <- function(object, data.use = object@fit$H, do.scale = TRUE, do.center = TRUE, return.object = TRUE, method = "umap",
+                        dim.embed = 2, dim.use = NULL, perplexity = 30, theta = 0.5, check_duplicates = FALSE, rand.seed = 42L,
+                        FItsne.path = NULL,
+                        dimPC = 40, do.fast = TRUE, weight.by.var = TRUE,
+                        n.neighbors = 30L, n.components = 2L, distance = "correlation", n.epochs = NULL, learning.rate = 1.0, min.dist = 0.3, spread = 1.0, set.op.mix.ratio = 1.0, local.connectivity = 1L,
+                        repulsion.strength = 1, negative.sample.rate = 5, a = NULL, b = NULL) {
+    data.use <- as.matrix(data.use)
+    if (do.scale) {
+        data.use <- t(scale(t(data.use), center = do.center, scale = TRUE))
+        data.use[is.na(data.use)] <- 0
+    }
+    if (!is.null(dim.use)) {
+        data.use <- object@embed$pca[, dim.use]
+    }
+    if (method == "pca") {
+        cell_coords <- runPCA(data.use, do.fast = do.fast, dimPC = dimPC, seed.use = rand.seed, weight.by.var = weight.by.var)
+        object@embed$pca <- cell_coords
+    } else if (method == "tsne") {
+        set.seed(rand.seed)
+        cell_coords <- Rtsne::Rtsne(t(data.use), pca = FALSE, dims = dim.embed, theta = theta, perplexity = perplexity, check_duplicates = check_duplicates)$Y
+        rownames(cell_coords) <- rownames(t(data.use))
+        object@embed$tsne <- cell_coords
+    } else if (method == "FItsne") {
+        if (!exists("FItsne")) {
+            if (is.null(FItsne.path)) {
+                stop("Please pass in the path to the FIt-SNE directory as FItsne.path.")
+            }
+            source(paste0(FItsne.path, "/fast_tsne.R"), chdir = TRUE)
+        }
+        cell_coords <- fftRtsne(t(data.use), pca = FALSE, dims = dim.embed, theta = theta, perplexity = perplexity, check_duplicates = check_duplicates, rand_seed = rand.seed)
+        rownames(cell_coords) <- rownames(t(data.use))
+        object@embed$FItsne <- cell_coords
+    } else if (method == "umap") {
+        cell_coords <- runUMAP(data.use,
+                               n.neighbors = n.neighbors,
+                               n.components = n.components,
+                               distance = distance,
+                               n.epochs = n.epochs,
+                               learning.rate = learning.rate,
+                               min.dist = min.dist,
+                               spread = spread,
+                               set.op.mix.ratio = set.op.mix.ratio,
+                               local.connectivity = local.connectivity,
+                               repulsion.strength = repulsion.strength,
+                               negative.sample.rate = negative.sample.rate,
+                               a = a,
+                               b = b,
+                               seed.use = rand.seed
+        )
+        object@embed$umap <- cell_coords
+    } else {
+        stop("Error: unrecognized dimensionality reduction method.")
+    }
+    if (return.object) {
+        return(object)
+    } else {
+        return(cell_coords)
+    }
+}
+
+
+#' Dimension reduction using PCA
+#'
+#' @param data.use input data
+#' @param do.fast whether to do fast PCA
+#' @param dimPC the number of components to keep
+#' @param seed.use set a seed
+#' @param weight.by.var whether to weight the pc.scores by the variance
+#' @importFrom stats prcomp
+#' @importFrom irlba irlba
+#' @return a matrix of cell coordinates in PC space
+#' @export
+#'
+#' @examples
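+#' # A minimal sketch (not run) on random data (features in rows, cells in columns).
+#' \dontrun{
+#' pc <- runPCA(matrix(rnorm(2000), 40, 50), dimPC = 10)
+#' dim(pc)
+#' }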
+runPCA <- function(data.use, do.fast = TRUE, dimPC = 50, seed.use = 42, weight.by.var = TRUE) {
+    set.seed(seed = seed.use)
+    if (do.fast) {
+        dimPC <- min(dimPC, nrow(data.use) - 1)
+        pca.res <- irlba::irlba(t(data.use), nv = dimPC)
+        sdev <- pca.res$d/sqrt(max(1, ncol(data.use) - 1))
+        if (weight.by.var) {
+            pc.scores <- pca.res$u %*% diag(pca.res$d)
+        } else {
+            pc.scores <- pca.res$u
+        }
+    } else {
+        dimPC <- min(dimPC, nrow(data.use) - 1)
+        pca.res <- stats::prcomp(x = t(data.use), rank. = dimPC)
+        sdev <- pca.res$sdev
+        if (weight.by.var) {
+            pc.scores <- pca.res$x %*% diag(pca.res$sdev[1:dimPC]^2)
+        } else {
+            pc.scores <- pca.res$x
+        }
+    }
+    rownames(pc.scores) <- colnames(data.use)
+    colnames(pc.scores) <- paste0('PC', 1:ncol(pc.scores))
+    cell_coords <- pc.scores
+    return(cell_coords)
+}
+
+#' Perform dimension reduction using UMAP
+#'
+#' This function is modified from the Seurat package.
+#' @param data.use input data
+#' @param n.neighbors This determines the number of neighboring points used in
+#' local approximations of manifold structure. Larger values will result in more
+#' global structure being preserved at the loss of detailed local structure. In general this parameter should often be in the range 5 to 50.
+#' @param n.components The dimension of the space to embed into.
+#' @param distance This determines the choice of metric used to measure distance in the input space.
+#' @param n.epochs The number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in more accurate embeddings. If NULL is specified, a value will be selected based on the size of the input dataset (200 for large datasets, 500 for small).
+#' @param learning.rate The initial learning rate for the embedding optimization.
+#' @param min.dist This controls how tightly the embedding is allowed to compress points together.
+#' Larger values ensure embedded points are more evenly distributed, while smaller values allow the
+#' algorithm to optimise more accurately with regard to local structure. Sensible values are in the range 0.001 to 0.5.
+#' @param spread The effective scale of embedded points. In combination with min.dist this determines how clustered/clumped the embedded points are.
+#' @param set.op.mix.ratio Interpolate between (fuzzy) union and intersection as the set operation used to combine local fuzzy simplicial sets to obtain a global fuzzy simplicial set.
+#' @param local.connectivity The local connectivity required - i.e. the number of nearest neighbors
+#' that should be assumed to be connected at a local level. The higher this value the more connected
+#' the manifold becomes locally. In practice this should not be more than the local intrinsic dimension of the manifold.
+#' @param repulsion.strength Weighting applied to negative samples in low dimensional embedding
+#' optimization. Values higher than one will result in greater weight being given to negative samples.
+#' @param negative.sample.rate The number of negative samples to select per positive sample in the
+#' optimization process. Increasing this value will result in greater repulsive force being applied, greater optimization cost, but slightly more accuracy.
+#' @param a More specific parameters controlling the embedding. If NULL, these values are set automatically as determined by min.dist and spread.
+#' @param b More specific parameters controlling the embedding. If NULL, these values are set automatically as determined by min.dist and spread.
+#' @param seed.use Set a random seed. By default, sets the seed to 42.
+#' @param metric.kwds A dictionary of arguments to pass on to the metric, such as the p value for Minkowski distance
+#' @param angular.rp.forest Whether to use an angular random projection forest to initialise the
+#' approximate nearest neighbor search. This can be faster, but is mostly only useful for metrics that
+#' use an angular style distance, such as cosine, correlation etc. In the case of those metrics angular forests will be chosen automatically.
+#' @param verbose Controls verbosity
+#' @return a matrix of cell coordinates in UMAP space
+#' @export
+#' @importFrom reticulate py_module_available py_set_seed import
+#' @examples
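+#' # A minimal sketch (not run); requires the Python 'umap-learn' module.
+#' # `data.use` is a hypothetical feature-by-cell matrix.
+#' \dontrun{
+#' emb <- runUMAP(data.use, n.neighbors = 30L, min.dist = 0.3)
+#' }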
+runUMAP <- function(
+    data.use,
+    n.neighbors = 30L,
+    n.components = 2L,
+    distance = "correlation",
+    n.epochs = NULL,
+    learning.rate = 1.0,
+    min.dist = 0.3,
+    spread = 1.0,
+    set.op.mix.ratio = 1.0,
+    local.connectivity = 1L,
+    repulsion.strength = 1,
+    negative.sample.rate = 5,
+    a = NULL,
+    b = NULL,
+    seed.use = 42L,
+    metric.kwds = NULL,
+    angular.rp.forest = FALSE,
+    verbose = TRUE) {
+    if (!reticulate::py_module_available(module = 'umap')) {
+        stop("Cannot find UMAP, please install through pip (e.g. pip install umap-learn or reticulate::py_install(packages = 'umap-learn')).")
+    }
+    set.seed(seed.use)
+    reticulate::py_set_seed(seed.use)
+    umap_import <- reticulate::import(module = "umap", delay_load = TRUE)
+    umap <- umap_import$UMAP(
+        n_neighbors = as.integer(n.neighbors),
+        n_components = as.integer(n.components),
+        metric = distance,
+        n_epochs = n.epochs,
+        learning_rate = learning.rate,
+        min_dist = min.dist,
+        spread = spread,
+        set_op_mix_ratio = set.op.mix.ratio,
+        local_connectivity = local.connectivity,
+        repulsion_strength = repulsion.strength,
+        negative_sample_rate = negative.sample.rate,
+        a = a,
+        b = b,
+        metric_kwds = metric.kwds,
+        angular_rp_forest = angular.rp.forest,
+        verbose = verbose
+    )
+    Rumap <- umap$fit_transform
+    umap_output <- Rumap(t(data.use))
+    colnames(umap_output) <- paste0('UMAP', 1:ncol(umap_output))
+    rownames(umap_output) <- colnames(data.use)
+    return(umap_output)
+}
+
+
+
+
+#' Identify enriched features in each factor
+#'
+#' Rank features in each factor by factor loading analysis
+#' @param object scAI object
+#' @param assay Name of assay to be analyzed
+#' @param features a vector of features
+#' @param cutoff.W Threshold of feature loading values
+#' @param cutoff.H Threshold of cell loading values
+#' @param thresh.pc Threshold of the percent of cells enriched in one factor
+#' @param thresh.fc Threshold of fold change
+#' @param thresh.p Threshold of p-values
+#' @param n.top Number of top features to be returned
+#' @importFrom stats sd wilcox.test
+#' @importFrom dplyr top_n slice
+#' @importFrom magrittr %>%
+#' @importFrom future nbrOfWorkers
+#' @importFrom future.apply future_sapply
+#' @importFrom pbapply pbsapply
+#' @importFrom stats p.adjust
+#'
+#' @return a list of data frames containing all markers and the top n.top markers per factor
+#' @export
+#'
+#' @examples
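+#' # A minimal sketch (not run); run after run_scAI so that the fit slot is filled.
+#' \dontrun{
+#' markers <- identifyFactorMarkers(obj, assay = "RNA")
+#' head(markers$markers.top)
+#' }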
+identifyFactorMarkers <- function(object, assay, features = NULL, cutoff.W = 0.5, cutoff.H = 0.5,
+                                  thresh.pc = 0.05, thresh.fc = 0.25, thresh.p = 0.05, n.top = 10) {
+    if (assay == "RNA") {
+        X <- as.matrix(object@norm.data[[assay]])
+    } else {
+        X <- as.matrix(object@agg.data)
+    }
+    H <- object@fit$H
+    W <- object@fit$W[[assay]]
+    if (is.null(features)) {
+        features <- row.names(W)
+    }
+
+    H <- sweep(H, 2, colSums(H), FUN = `/`)
+    K <- ncol(W)
+    lib_W <- base::rowSums(W)
+    lib_W[lib_W == 0] <- 1
+    lib_W[lib_W < mean(lib_W) - 5 * sd(lib_W)] <- 1  # omit the nearly null rows
+    W <- sweep(W, 1, lib_W, FUN = `/`)
+    MW <- base::colMeans(W)
+    sW <- apply(W, 2, sd)
+    # candidate markers for each factor
+    IndexW_record <- vector("list", K)
+    for (j in 1:K) {
+        IndexW_record[[j]] <- which(W[, j] > MW[j] + cutoff.W * sW[j])
+    }
+
+    my.sapply <- ifelse(
+        test = future::nbrOfWorkers() == 1,
+        yes = pbapply::pbsapply,
+        no = future.apply::future_sapply
+    )
+
+    # divide cells into two groups
+    mH <- apply(H, 1, mean)
+    sH <- apply(H, 1, sd)
+    IndexH_record1 <- vector("list", K)
+    IndexH_record2 <- vector("list", K)
+    for (i in 1:K) {
+        IndexH_record1[[i]] <- which(H[i, ] > mH[i] + cutoff.H * sH[i])
+        IndexH_record2[[i]] <- base::setdiff(c(1:ncol(H)), IndexH_record1[[i]])
+    }
+
+    # identify factor-specific markers
+    factors <- c()
+    Features <- c()
+    Pvalues <- c()
+    Log2FC <- c()
+    for (i in 1:K) {
+        data1 <- X[IndexW_record[[i]], IndexH_record1[[i]]]
+        data2 <- X[IndexW_record[[i]], IndexH_record2[[i]]]
+        idx1 <- which(base::rowSums(data1 != 0) > thresh.pc * ncol(data1))  # expressed in at least thresh.pc of the cells in one group
+        FC <- log2(base::rowMeans(data1)/base::rowMeans(data2))
+        idx2 <- which(FC > thresh.fc)
+        pvalues <- my.sapply(
+            X = 1:nrow(x = data1),
+            FUN = function(x) {
+                return(wilcox.test(data1[x, ], data2[x, ], alternative = "greater")$p.value)
+            }
+        )
+        idx3 <- which(pvalues < thresh.p)
+        idx <- intersect(intersect(idx1, idx2), idx3)
+
+        # order by fold change
+        FC <- FC[idx]
+        c <- sort(FC, decreasing = TRUE, index.return = TRUE)$ix
+        ri <- IndexW_record[[i]]
+
+        Pvalues <- c(Pvalues, pvalues[idx[c]])  # pvalues is indexed within data1, not within X
+        Log2FC <- c(Log2FC, FC[c])
+        factors <- c(factors, rep(i, length(c)))
+        Features <- c(Features, features[ri[idx[c]]])
+    }
+    # build the marker table with typed columns (cbind would coerce everything to character)
+    markers.all <- data.frame(factors = factors, features = Features, pvalues = Pvalues, log2FC = Log2FC,
+                              stringsAsFactors = FALSE)
+    markers.top <- markers.all %>% dplyr::group_by(factors) %>% top_n(n.top, log2FC) %>% dplyr::slice(1:n.top)
+
+    markers <- list(markers.all = markers.all, markers.top = markers.top)
+    return(markers)
+}
+
+
+
+#' Compute the 2D coordinates of embedded cells, genes, loci and factors for VscAI visualization
+#'
+#' @param object scAI object
+#' @param genes.embed A vector of genes to be embedded
+#' @param loci.embed A vector of loci to be embedded
+#' @param alpha.embed Parameter controlling the distance between the cells and the factors
+#' @param snn.embed Number of neighbors
+#'
+#' @return an scAI object with the VscAI coordinates in the embed slot
+#' @export
+#'
+#' @examples
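+#' # A minimal sketch (not run); gene names here are placeholders.
+#' \dontrun{
+#' obj <- getEmbeddings(obj, genes.embed = c("GeneA", "GeneB"))
+#' }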
+#' @importFrom Matrix Matrix
+getEmbeddings <- function(object, genes.embed = NULL, loci.embed = NULL, alpha.embed = 1.9, snn.embed = 5) {
+    H <- object@fit$H
+    W1 <- object@fit$W[[1]]
+    W2 <- object@fit$W[[2]]
+    Z <- object@fit$Z
+
+    if (nrow(H) < 3 & length(object@fit.variedK) == 0) {
+        print("VscAI needs at least three factors for embedding. Now rerunning scAI with rank 3...")
+        outs <- run_scAI(object, K = 3, nrun = object@options$paras$nrun,
+                         s = object@options$paras$s, alpha = object@options$paras$alpha, lambda = object@options$paras$lambda, gamma = object@options$paras$gamma,
+                         maxIter = object@options$paras$maxIter, stop_rule = object@options$paras$stop_rule)
+        W <- outs@fit$W
+        H <- outs@fit$H
+        Z <- outs@fit$Z
+        W1 <- W[[1]]
+        W2 <- W[[2]]
+        object@fit.variedK$W <- W
+        object@fit.variedK$H <- H
+        object@fit.variedK$Z <- Z
+    } else if (nrow(H) < 3 & length(object@fit.variedK) != 0) {
+        print("Using the previously calculated factors for embedding.")
+        W1 <- object@fit.variedK$W[[1]]
+        W2 <- object@fit.variedK$W[[2]]
+        H <- object@fit.variedK$H
+        Z <- object@fit.variedK$Z
+    }
+
+    nmf.scores <- H
+
+    Zsym <- Z/max(Z)
+    Zsym[Zsym < 10^(-6)] <- 0
+    Zsym <- (Zsym + t(Zsym))/2
+    diag(Zsym) <- 1
+    rownames(Zsym) <- colnames(H)
+    colnames(Zsym) <- rownames(Zsym)
+
+    alpha.exp <- alpha.embed  # increase this > 1.0 to move the cells closer to the factors; values > 2 start to distort the data
+    snn.exp <- snn.embed  # lower this < 1.0 to move similar cells closer to each other
+    n_pull <- nrow(H)  # the number of factors pulling on each cell; must be at least 3
+
+    Zsym <- Matrix(data = Zsym, sparse = TRUE)
+    snn <- Zsym
+    swne.embedding <- swne::EmbedSWNE(nmf.scores, snn, alpha.exp = alpha.exp, snn.exp = snn.exp, n_pull = n_pull, proj.method = "sammon", dist.use = "cosine")
+    # project cells and factors
+    factor_coords <- swne.embedding$H.coords
+    cell_coords <- swne.embedding$sample.coords
+    rownames(factor_coords) <- rownames(H)
+    rownames(cell_coords) <- colnames(H)
+
+    # project genes onto the low dimensional space by VscAI
+    if (is.null(genes.embed)) {
+        genes.embed <- rownames(W1)
+    } else {
+        genes.embed <- as.character(as.vector(as.matrix((genes.embed))))
+    }
+    swne.embedding <- swne::EmbedFeatures(swne.embedding, W1, genes.embed, n_pull = n_pull)
+    gene_coords <- swne.embedding$feature.coords
+    rownames(gene_coords) <- gene_coords$name
+
+    # project loci
+    if (is.null(loci.embed)) {
+        loci.embed <- rownames(W2)
+    } else {
+        loci.embed <- as.character(as.vector(as.matrix((loci.embed))))
+    }
+    swne.embedding <- swne::EmbedFeatures(swne.embedding, W2, loci.embed, n_pull = n_pull)
+    loci_coords <- swne.embedding$feature.coords
+    rownames(loci_coords) <- loci_coords$name
+
+    object@embed$VscAI$cells <- cell_coords
+    object@embed$VscAI$factors <- factor_coords
+    object@embed$VscAI$genes <- gene_coords
+    object@embed$VscAI$loci <- loci_coords
+    return(object)
+}
+
+
+
+#' Identify cell clusters
+#'
+#' @param object scAI object
+#' @param partition.type Type of partition to use. Defaults to RBConfigurationVertexPartition. Options include: ModularityVertexPartition, RBERVertexPartition, CPMVertexPartition, MutableVertexPartition, SignificanceVertexPartition, SurpriseVertexPartition (see the Leiden python module documentation for more details)
+#' @param seed.use set seed
+#' @param n.iter number of iterations
+#' @param initial.membership Parameter to pass to the Python leidenalg function; defaults to initial_membership=None
+#' @param weights defaults to weights=None
+#' @param node.sizes Parameter to pass to the Python leidenalg function
+#' @param resolution A parameter controlling the coarseness of the clusters
+#' @param K Number of clusters if performing hierarchical clustering of the consensus matrix
+#' @return an scAI object with the cluster labels in the identity slot
+#' @export
+#'
+#' @examples
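+#' # A minimal sketch (not run); Leiden clustering on the factor matrix H
+#' # (requires the Python 'leidenalg' module). Alternatively, set K to cluster
+#' # the consensus matrix hierarchically.
+#' \dontrun{
+#' obj <- identifyClusters(obj, resolution = 1)
+#' }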
+#' @importFrom stats as.dist cophenetic cor cutree dist hclust
+identifyClusters <- function(object, resolution = 1, partition.type = "RBConfigurationVertexPartition",
+                             seed.use = 42L, n.iter = 10L,
+                             initial.membership = NULL, weights = NULL, node.sizes = NULL,
+                             K = NULL) {
+    if (is.null(K) & !is.null(resolution)) {
+        data.use <- object@fit$H
+        data.use <- as.matrix(data.use)
+        data.use <- t(scale(t(data.use), center = TRUE, scale = TRUE))
+        snn <- swne::CalcSNN(data.use, k = 20, prune.SNN = 1/15)
+        idents <- runLeiden(SNN = snn, resolution = resolution, partition_type = partition.type,
+                            seed.use = seed.use, n.iter = n.iter,
+                            initial.membership = initial.membership, weights = weights, node.sizes = node.sizes)
+    } else {
+        CM <- object@cluster$consensus
+        d <- as.dist(1 - CM)
+        hc <- hclust(d, "ave")
+        idents <- hc %>% cutree(k = K)
+        coph <- cophenetic(hc)
+        cs <- cor(d, coph)
+        object@cluster$cluster.stability <- cs
+    }
+    names(idents) <- rownames(object@pData)
+    object@identity <- factor(idents)
+    object@pData$cluster <- factor(idents)
+    return(object)
+}
+
+
+#' Run the Leiden clustering algorithm
+#'
+#' This code is modified from Tom Kelly (https://github.com/TomKellyGenetics/leiden); we added more parameters (seed.use and n.iter) for running the Python version, and we also take care of singletons after running the Leiden algorithm.
+#' @description Implements the Leiden clustering algorithm in R using reticulate to run the Python version. Requires the python "leidenalg" and "igraph" modules to be installed. Returns a vector of partition indices.
+#' @param SNN An adjacency matrix compatible with an \code{\link[igraph]{igraph}} object.
+#' @param seed.use set seed
+#' @param n.iter number of iterations
+#' @param initial.membership Parameter to pass to the Python leidenalg function; defaults to initial_membership=None
+#' @param node.sizes Parameter to pass to the Python leidenalg function
+#' @param partition_type Type of partition to use. Defaults to RBConfigurationVertexPartition. Options include: ModularityVertexPartition, RBERVertexPartition, CPMVertexPartition, MutableVertexPartition, SignificanceVertexPartition, SurpriseVertexPartition (see the Leiden python module documentation for more details)
+#' @param resolution A parameter controlling the coarseness of the clusters
+#' @param weights defaults to weights=None
+#' @return A partition of clusters as a vector of integers
+#'
+#' @keywords graph network igraph mvtnorm simulation
+#' @importFrom reticulate import r_to_py
+#' @export
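+#' @examples
+#' # A minimal sketch (not run); `snn` is a hypothetical SNN adjacency matrix,
+#' # e.g. built with swne::CalcSNN(). Requires the Python 'leidenalg' module.
+#' \dontrun{
+#' idents <- runLeiden(SNN = as.matrix(snn), resolution = 1)
+#' }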
+runLeiden <- function(SNN = matrix(), resolution = 1, partition_type = c(
+                          'RBConfigurationVertexPartition',
+                          'ModularityVertexPartition',
+                          'RBERVertexPartition',
+                          'CPMVertexPartition',
+                          'MutableVertexPartition',
+                          'SignificanceVertexPartition',
+                          'SurpriseVertexPartition'),
+                      seed.use = 42L,
+                      n.iter = 10L,
+                      initial.membership = NULL, weights = NULL, node.sizes = NULL) {
+    if (!reticulate::py_module_available(module = 'leidenalg')) {
+        stop("Cannot find the Leiden algorithm, please install it through pip (e.g. pip install leidenalg).")
+    }
+
+    # import python modules with reticulate
+    leidenalg <- import("leidenalg", delay_load = TRUE)
+    ig <- import("igraph", delay_load = TRUE)
+
+    resolution_parameter <- resolution
+    initial_membership <- initial.membership
+    node_sizes <- node.sizes
+    # convert matrix input (corrects for sparse matrix input)
+    adj_mat <- as.matrix(SNN)
+
+    # compute weights if a non-binary adjacency matrix is given
+    is_pure_adj <- all(as.logical(adj_mat) == adj_mat)
+    if (is.null(weights) && !is_pure_adj) {
+        # assign weights to edges (without a dependency on igraph);
+        # remove zeroes from the rows of the matrix and return a vector of edge weights
+        weights <- t(adj_mat)[t(adj_mat) != 0]
+    }
+
+    # convert to a python numpy.ndarray, then a list
+    adj_mat_py <- r_to_py(adj_mat)
+    adj_mat_py <- adj_mat_py$tolist()
+
+    # convert the graph structure to a Python compatible object
+    GraphClass <- if (!is.null(weights) && !is_pure_adj) {
+        ig$Graph$Weighted_Adjacency
+    } else {
+        ig$Graph$Adjacency
+    }
+    snn_graph <- GraphClass(adj_mat_py)
+
+    # compute partitions
+    partition_type <- match.arg(partition_type)
+    part <- switch(
+        EXPR = partition_type,
+        'RBConfigurationVertexPartition' = leidenalg$find_partition(
+            snn_graph,
+            leidenalg$RBConfigurationVertexPartition,
+            initial_membership = initial.membership, weights = weights,
+            resolution_parameter = resolution,
+            n_iterations = n.iter,
+            seed = seed.use
+        ),
+        'ModularityVertexPartition' = leidenalg$find_partition(
+            snn_graph,
+            leidenalg$ModularityVertexPartition,
+            initial_membership = initial.membership, weights = weights,
+            n_iterations = n.iter,
+            seed = seed.use
+        ),
+        'RBERVertexPartition' = leidenalg$find_partition(
+            snn_graph,
+            leidenalg$RBERVertexPartition,
+            initial_membership = initial.membership, weights = weights, node_sizes = node.sizes,
+            resolution_parameter = resolution_parameter,
+            n_iterations = n.iter,
+            seed = seed.use
+        ),
+        'CPMVertexPartition' = leidenalg$find_partition(
+            snn_graph,
+            leidenalg$CPMVertexPartition,
+            initial_membership = initial.membership, weights = weights, node_sizes = node.sizes,
+            resolution_parameter = resolution,
+            n_iterations = n.iter,
+            seed = seed.use
+        ),
+        'MutableVertexPartition' = leidenalg$find_partition(
+            snn_graph,
+            leidenalg$MutableVertexPartition,
+            initial_membership = initial.membership,
+            n_iterations = n.iter,
+            seed = seed.use
+        ),
+        'SignificanceVertexPartition' = leidenalg$find_partition(
+            snn_graph,
+            leidenalg$SignificanceVertexPartition,
+            initial_membership = initial.membership, node_sizes = node.sizes,
+            resolution_parameter = resolution,
+            n_iterations = n.iter,
+            seed = seed.use
+        ),
+        'SurpriseVertexPartition' = leidenalg$find_partition(
+            snn_graph,
+            leidenalg$SurpriseVertexPartition,
+            initial_membership = initial.membership, weights = weights, node_sizes = node.sizes,
+            n_iterations = n.iter,
+            seed = seed.use
+        ),
+        stop("please specify a partition type as a string out of those documented")
+    )
+    partition <- part$membership + 1
+    idents <- partition
+
+    if (min(table(idents)) == 1) {
+        idents <- assignSingletons(idents, SNN)
+    }
+    idents <- factor(idents)
+    names(idents) <- row.names(SNN)
+    return(idents)
+}
+# Group single cells that make up their own cluster in with the cluster they are most connected to.
+#
+# @param idents clustering result
+# @param SNN SNN graph
+# @return Returns the cluster labels with all singletons merged into the most connected cluster
+assignSingletons <- function(idents, SNN) {
+    # identify singletons
+    singletons <- c()
+    for (cluster in unique(idents)) {
+        if (length(which(idents %in% cluster)) == 1) {
+            singletons <- append(x = singletons, values = cluster)
+        }
+    }
+    # singletons = names(table(idents))[which(table(idents)==1)]
+    # calculate connectivity of singletons to other clusters, and add each singleton
+    # to the cluster it is most connected to
+    cluster_names <- unique(x = idents)
+    cluster_names <- setdiff(x = cluster_names, y = singletons)
+    connectivity <- vector(mode = "numeric", length = length(x = cluster_names))
+    names(x = connectivity) <- cluster_names
+    for (i in singletons) {
+        print(i)
+        for (j in cluster_names) {
+            subSNN <- SNN[
+                which(idents %in% i),
+                which(idents %in% j)
+            ]
+            if (is.object(x = subSNN)) {
+                connectivity[j] <- sum(subSNN) / (nrow(x = subSNN) * ncol(x = subSNN))
+            } else {
+                connectivity[j] <- mean(x = subSNN)
+            }
+        }
+        m <- max(connectivity, na.rm = TRUE)
+        mi <- which(x = connectivity == m, arr.ind = TRUE)
+        closest_cluster <- sample(x = names(x = connectivity[mi]), 1)
+        # reassign the singleton (the original indexed into a temporary vector, which had no effect)
+        idents[which(idents %in% i)] <- closest_cluster
+    }
+    if (length(x = singletons) > 0) {
+        message(paste(
+            length(x = singletons),
+            "singletons identified.",
+            length(x = unique(idents)),
+            "final clusters."
+        ))
+    }
+    return(idents)
+}
+
+
+#' Convert membership into an adjacency matrix
+#'
+#' @param memb Membership vector
+#'
+#' @return a binary adjacency matrix with zero diagonal
+#' @export
+#'
+#' @examples
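+#' # Small runnable example: the adjacency matrix of a two-cluster membership
+#' # vector (off-diagonal 1 within each cluster, 0 elsewhere).
+#' clust2Mat(c(1, 1, 2, 2))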
+clust2Mat <- function(memb) {
+    N <- length(memb)
+    return(as.numeric(outer(memb, memb, FUN = "==")) - outer(1:N, 1:N, "=="))
+}
+
+
+#' Identify markers in each cell cluster
+#'
+#' @param object scAI object
+#' @param assay Name of assay to be analyzed
+#' @param features a vector of features
+#' @param use.agg whether to use the aggregated data
+#' @param test.use which test to use ("bimod" or "wilcox")
+#' @param thresh.pc Threshold of the percent of cells enriched in one cluster
+#' @param thresh.fc Threshold of fold change
+#' @param thresh.p Threshold of p-values
+#' @importFrom future nbrOfWorkers
+#' @importFrom pbapply pbsapply
+#' @importFrom future.apply future_sapply
+#' @importFrom stats sd wilcox.test
+#' @importFrom dplyr top_n slice
+#' @importFrom stats p.adjust
+#'
+#' @return a data frame of the markers of each cluster
+#' @export
+#'
+#' @examples
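+#' # A minimal sketch (not run); requires cluster labels in the identity slot
+#' # (see identifyClusters).
+#' \dontrun{
+#' markers <- identifyClusterMarkers(obj, assay = "RNA")
+#' head(markers)
+#' }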
+identifyClusterMarkers <- function(object, assay, features = NULL, use.agg = TRUE, test.use = "bimod",
+                                   thresh.pc = 0.25, thresh.fc = 0.25, thresh.p = 0.01) {
+    if (assay == "RNA") {
+        X <- object@norm.data[[assay]]
+    } else {
+        if (use.agg) {
+            X <- object@agg.data
+        } else {
+            X <- object@norm.data[[assay]]
+        }
+    }
+    if (is.null(features)) {
+        features.use <- row.names(X)
+    } else {
+        features.use <- intersect(features, row.names(X))
+    }
+    data.use <- X[features.use, ]
+    data.use <- as.matrix(data.use)
+
+    groups <- object@identity
+    labels <- groups
+    level.use <- levels(labels)
+    numCluster <- length(level.use)
+
+    my.sapply <- ifelse(
+        test = future::nbrOfWorkers() == 1,
+        yes = pbapply::pbsapply,
+        no = future.apply::future_sapply
+    )
+
+    mean.fxn <- function(x) {
+        return(log(x = mean(x = expm1(x = x)) + 1))
+    }
+    genes.de <- vector("list", length = numCluster)
+    for (i in 1:numCluster) {
+        features <- features.use
+        cell.use1 <- which(labels %in% level.use[i])
+        cell.use2 <- base::setdiff(1:length(labels), cell.use1)
+
+        # feature selection (based on percentages)
+        thresh.min <- 0
+        pct.1 <- round(
+            x = rowSums(x = data.use[features, cell.use1, drop = FALSE] > thresh.min) /
+                length(x = cell.use1),
+            digits = 3
+        )
+        pct.2 <- round(
+            x = rowSums(x = data.use[features, cell.use2, drop = FALSE] > thresh.min) /
+                length(x = cell.use2),
+            digits = 3
+        )
+        data.alpha <- cbind(pct.1, pct.2)
+        colnames(x = data.alpha) <- c("pct.1", "pct.2")
+        alpha.min <- apply(X = data.alpha, MARGIN = 1, FUN = max)
+        names(x = alpha.min) <- rownames(x = data.alpha)
+        features <- names(x = which(x = alpha.min > thresh.pc))
+        if (length(x = features) == 0) {
+            stop("No features pass thresh.pc threshold")
+        }
+
+        # feature selection (based on average difference)
+        data.1 <- apply(X = data.use[features, cell.use1, drop = FALSE], MARGIN = 1, FUN = mean.fxn)
+        data.2 <- apply(X = data.use[features, cell.use2, drop = FALSE], MARGIN = 1, FUN = mean.fxn)
+        FC <- (data.1 - data.2)
+        features.diff <- names(x = which(x = FC > thresh.fc))
+        features <- intersect(x = features, y = features.diff)
+        if (length(x = features) == 0) {
+            stop("No features pass thresh.fc threshold")
+        }
+
+        data1 <- data.use[features, cell.use1, drop = FALSE]
+        data2 <- data.use[features, cell.use2, drop = FALSE]
+
+        if (test.use == "wilcox") {
+            pvalues <- unlist(
+                x = my.sapply(
+                    X = 1:nrow(x = data1),
+                    FUN = function(x) {
+                        return(wilcox.test(data1[x, ], data2[x, ], alternative = "greater")$p.value)
+                    }
+                )
+            )
+        } else if (test.use == 'bimod') {
+            pvalues <- unlist(
+                x = my.sapply(
+                    X = 1:nrow(x = data1),
+                    FUN = function(x) {
+                        return(DifferentialLRT(
+                            x = as.numeric(x = data1[x, ]),
+                            y = as.numeric(x = data2[x, ])
+                        ))
+                    }
+                )
+            )
+        }
+
+        pval.adj <- stats::p.adjust(
+            p = pvalues,
+            method = "bonferroni",
+            n = nrow(X)
+        )
+        genes.de[[i]] <- data.frame(clusters = level.use[i], features = as.character(rownames(data1)), pvalues = pvalues, pval_adj = pval.adj, logFC = FC[features], data.alpha[features, ])
+    }
+
+    markers.all <- data.frame()
+    for (i in 1:numCluster) {
+        gde <- genes.de[[i]]
+        gde <- gde[order(gde$pvalues, -gde$logFC), ]
+        gde <- subset(gde, subset = pvalues < thresh.p)
+        if (nrow(gde) > 0) {
+            markers.all <- rbind(markers.all, gde)
+        }
+    }
+    markers.all$features <- as.character(markers.all$features)
+    return(markers.all)
+}
+
+# function to run the McDavid et al. DE test
+#' Likelihood ratio test
+#'
+#' @param x a vector
+#' @param y a vector
+#' @param xmin threshold for the values in the vector
+#'
+#' @importFrom stats pchisq
+DifferentialLRT <- function(x, y, xmin = 0) {
+    lrtX <- bimodLikData(x = x)
+    lrtY <- bimodLikData(x = y)
+    lrtZ <- bimodLikData(x = c(x, y))
+    lrt_diff <- 2 * (lrtX + lrtY - lrtZ)
+    return(pchisq(q = lrt_diff, df = 3, lower.tail = FALSE))
+}
+
+#' Bimodal likelihood of a vector
+#' @importFrom stats sd dnorm
+#' @param x a vector
+#' @param xmin threshold for the values in the vector
+bimodLikData <- function(x, xmin = 0) {
+    x1 <- x[x <= xmin]
+    x2 <- x[x > xmin]
+    xal <- length(x = x2) / length(x = x)
+    xal[xal > 1 - 1e-5] <- 1 - 1e-5
+    xal[xal < 1e-5] <- 1e-5
+    likA <- length(x = x1) * log(x = 1 - xal)
+    if (length(x = x2) < 2) {
+        mysd <- 1
+    } else {
+        mysd <- sd(x = x2)
+    }
+    likB <- length(x = x2) *
+        log(x = xal) +
+        sum(dnorm(x = x2, mean = mean(x = x2), sd = mysd, log = TRUE))
+    return(likA + likB)
+}
+
+
+#' Infer regulatory relationships
+#'
+#' @param object scAI object
+#' @param gene.use genes to be inferred
+#' @param candinate_loci candidate loci, e.g., the factor markers returned by identifyFactorMarkers
+#' @param cutoff_H threshold of coefficient values
+#' @param cutoff the difference of correlation to be considered as significant
+#' @param thresh_corr threshold of the correlation coefficient
+#'
+#' @return a list of inferred links and their intensities for each gene
+#' @export
+#'
+#' @examples
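+#' # A minimal sketch (not run); `factor.markers` is a hypothetical result of
+#' # identifyFactorMarkers() on the epigenomic assay, and the gene name is a placeholder.
+#' \dontrun{
+#' regs <- inferRegulations(obj, gene.use = "GeneA", candinate_loci = factor.markers)
+#' }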
+inferRegulations <- function(object, gene.use, candinate_loci, cutoff_H = 0.5, cutoff = 0.1, thresh_corr = 0.2) {
+    H <- as.matrix(object@fit$H)
+    H <- sweep(H, 2, colSums(H), FUN = `/`)
+    K <- nrow(H)
+    mH <- apply(H, 1, mean)
+    sH <- apply(H, 1, sd)
+    IndexH_record <- vector("list", K)
+    for (i in 1:K) {
+        IndexH_record[[i]] <- which(H[i, ] > mH[i] + cutoff_H * sH[i])
+    }
+    gene.use <- as.character(gene.use)
+    X1 <- object@norm.data[[1]]
+    X1 <- X1[gene.use, ]
+    X2a <- object@agg.data
+
+    regulations <- vector("list", length(gene.use))
+    names(regulations) <- gene.use
+    for (i in 1:length(gene.use)) {
+        regulations_i <- vector("list", K)
+        names(regulations_i) <- rownames(H)
+        for (j in 1:K) {
+            loci_j <- candinate_loci$markers.all$features[candinate_loci$markers.all$factors == j]
+            # compute the correlation between the gene and its candidate loci
+            x1 <- X1[i, ]
+            x2a <- X2a[loci_j, ]
+            cors1 <- cor(x1, t(x2a))
+
+            # set the values of this gene and its candidate loci to zero in the factor-enriched cells
+            X1_new <- X1
+            X2a_new <- X2a
+            X1_new[gene.use[i], IndexH_record[[j]]] <- 0
+            X2a_new[loci_j, IndexH_record[[j]]] <- 0
+            x1_new <- X1_new[gene.use[i], ]
+            x2a_new <- X2a_new[loci_j, ]
+            cors2 <- cor(x1_new, t(x2a))
+            cors3 <- cor(x1, t(x2a_new))
+            cors1[is.na(cors1)] <- 0
+            cors2[is.na(cors2)] <- 0
+            cors3[is.na(cors3)] <- 0
+            D <- rbind(cors1 - cors2, cors1 - cors3)
+            # per locus: flag loci where either perturbation shifts the correlation
+            # (colSums operates over the two perturbation rows of D)
+            flag <- (colSums(abs(D) > cutoff) > 0) & abs(cors1) > thresh_corr
+            regulations_i[[j]]$link <- as.character(loci_j[flag])
+            regulations_i[[j]]$intensity <- cors1[flag]
+        }
+        regulations[[i]] <- regulations_i
+    }
+    return(regulations)
+}
+
+
+
+#' Select highly variable features
+#'
+#' @param object scAI object
+#' @param assay Name of assay
+#' @param do.plot Whether to show the plot
+#' @param do.text Whether to add feature names
+#' @param x.low.cutoff The minimum expression level
+#' @param x.high.cutoff The maximum expression level
+#' @param y.cutoff The minimum Fano factor value
+#' @param y.high.cutoff The maximum Fano factor value
+#' @param num.bin Number of bins
+#' @param pch.use Shape of dots in ggplot
+#' @param col.use Color of dots
+#' @param cex.text.use Size of text
+#'
+#' @return an scAI object with the selected features in the var.features slot
+#' @export
+#'
+#' @examples
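+#' # A minimal sketch (not run); flags highly variable genes in the RNA assay
+#' # and stores them in the var.features slot.
+#' \dontrun{
+#' obj <- selectFeatures(obj, assay = "RNA", y.cutoff = 0.5)
+#' }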
+#' @importFrom graphics smoothScatter text
+selectFeatures <- function(object, assay = "RNA", do.plot = TRUE, do.text = TRUE,
+                           x.low.cutoff = 0.01, x.high.cutoff = 3.5, y.cutoff = 0.5, y.high.cutoff = Inf,
+                           num.bin = 20, pch.use = 16, col.use = "black", cex.text.use = 0.5) {
+    # This function is modified from the Seurat package
+    data <- object@norm.data[[assay]]
+    genes.use <- rownames(data)
+
+    gene.mean <- apply(X = data, MARGIN = 1, FUN = ExpMean)
+    gene.dispersion <- apply(X = data, MARGIN = 1, FUN = LogVMR)
+    gene.dispersion[is.na(x = gene.dispersion)] <- 0
+    gene.mean[is.na(x = gene.mean)] <- 0
+    names(x = gene.mean) <- genes.use
+    data_x_bin <- cut(x = gene.mean, breaks = num.bin)
+    names(x = data_x_bin) <- names(x = gene.mean)
+    mean_y <- tapply(X = gene.dispersion, INDEX = data_x_bin, FUN = mean)
+    sd_y <- tapply(X = gene.dispersion, INDEX = data_x_bin, FUN = sd)
+    gene.dispersion.scaled <- (gene.dispersion - mean_y[as.numeric(x = data_x_bin)])/sd_y[as.numeric(x = data_x_bin)]
+    gene.dispersion.scaled[is.na(x = gene.dispersion.scaled)] <- 0
+    names(x = gene.dispersion.scaled) <- names(x = gene.mean)
+
+    mv.df <- data.frame(gene.mean, gene.dispersion, gene.dispersion.scaled)
+    rownames(x = mv.df) <- rownames(x = data)
+    hvg.info <- mv.df
+    names(x = gene.mean) <- names(x = gene.dispersion) <- names(x = gene.dispersion.scaled) <- rownames(hvg.info)
+    pass.cutoff <- names(x = gene.mean)[which(x = ((gene.mean > x.low.cutoff) & (gene.mean < x.high.cutoff)) & (gene.dispersion.scaled > y.cutoff) & (gene.dispersion.scaled < y.high.cutoff))]
+    if (do.plot) {
+        smoothScatter(x = gene.mean, y = gene.dispersion.scaled, pch = pch.use, cex = 0.5, col = col.use, xlab = "Average expression", ylab = "Dispersion", nrpoints = Inf)
+    }
+    if (do.text) {
+        text(x = gene.mean[pass.cutoff], y = gene.dispersion.scaled[pass.cutoff], labels = pass.cutoff, cex = cex.text.use)
+    }
+    hvg.info <- hvg.info[order(hvg.info$gene.dispersion, decreasing = TRUE), ]
+
+    if (length(object@var.features) == 0) {
+        for (i in 1:length(object@raw.data)) {
+            object@var.features[[i]] <- vector("character")
+        }
+        names(object@var.features) <- names(object@raw.data)
+    }
+
+    object@var.features[[assay]] <- pass.cutoff
+    return(object)
+}
+
+#' Find chromosome regions of genes
+#'
+#' @param genes a given set of genes
+#' @param species mouse or human
+#' @importFrom biomaRt useMart getBM
+#' @export
+#'
+searchGeneRegions <- function(genes, species = "mouse") {
+    if (species == "mouse") {
+        mouse <- biomaRt::useMart("ensembl", dataset = "mmusculus_gene_ensembl")
+        loci <- biomaRt::getBM(attributes = c("mgi_symbol", "chromosome_name", 'start_position', 'end_position'), filters = "mgi_symbol", values = genes, mart = mouse)
+    } else {
+        human <- biomaRt::useMart("ensembl", dataset = "hsapiens_gene_ensembl")
+        loci <- biomaRt::getBM(attributes = c("hgnc_symbol", "chromosome_name", 'start_position', 'end_position'), filters = "hgnc_symbol", values = genes, mart = human)
+    }
+    return(loci)
+}
+
+#' Find the nearby loci in a given loci set for a given set of genes
+#'
+#' @param genes a given set of genes
+#' @param loci the background loci set
+#' @param width the width from the TSS
+#' @param species mouse or human
+#' @importFrom bedr bedr.sort.region in.region
+#' @export
+#'
+findNearbyLoci <- function(genes, loci, width = 50000, species = "mouse") {
+    gene.loci <- searchGeneRegions(genes, species)
+    gene.loci <- gene.loci[gene.loci$chromosome_name %in% c(1:22, "X", "Y"), ]
+    # extend upstream by at most `width` without going below position zero
+    # (pmin works per gene; min would collapse to a single value)
+    gene.loci$start_position <- gene.loci$start_position - pmin(width, gene.loci$start_position)
+    gene.loci$end_position <- gene.loci$end_position + width
+    gene.loci.bed <- paste0("chr", gene.loci$chromosome_name, ":", gene.loci$start_position, "-", gene.loci$end_position)
+    gene.loci.bed.sort <- bedr::bedr.sort.region(gene.loci.bed)
+    # keep only loci on the standard chromosomes
+    loci <- loci[grepl(paste(paste0("chr", c(1:22, "X", "Y"), ":"), collapse = "|"), loci)]
+
+    loci.sort <- bedr::bedr.sort.region(loci)
+    is.region <- bedr::in.region(loci.sort, gene.loci.bed.sort)
+    loci.nearby <- loci.sort[is.region]
+    return(loci.nearby)
+}
+
+#' Select the number of the rank
+#'
+#' @param object scAI object
+#' @param rangeK A predefined range of K
+#'
+#' @return a ggplot object showing the stability score for each K
+#' @importFrom ggplot2 ggplot aes geom_line geom_point theme_classic labs theme element_text
+#' @export
+#'
+#' @examples
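+#' # A minimal sketch (not run); reruns the factorization for each K, so this
+#' # can be slow on large datasets.
+#' \dontrun{
+#' selectK(obj, rangeK = 2:8)
+#' }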
+selectK <- function(object, rangeK = c(2:15)) {
+    coph <- vector("double", length(rangeK))
+    i <- 0
+    for (k in rangeK) {
+        i <- i + 1
+        outs <- run_scAI(object, K = k, nrun = object@options$paras$nrun,
+                         s = object@options$paras$s, alpha = object@options$paras$alpha, lambda = object@options$paras$lambda, gamma = object@options$paras$gamma,
+                         maxIter = object@options$paras$maxIter, stop_rule = object@options$paras$stop_rule)
+        CM <- outs@cluster$consensus
+        d1 <- dist(CM)
+        hc <- hclust(d1, method = "average")
+        d2 <- cophenetic(hc)
+        coph[i] <- cor(d1, d2)
+    }
+
+    df <- data.frame(k = rangeK, Coph = coph)
+    gg <- ggplot(df, aes(x = k, y = Coph)) + geom_line(size = 1) +
+        geom_point() +
+        theme_classic() + labs(x = 'K', y = 'Stability score (Coph)') +
+        scAI_theme_opts() + theme(text = element_text(size = 10)) +
+        theme(legend.position = "right", legend.title = NULL)
+    gg
+}
+
+
+
+#' Reorder features according to the loading values
+#'
+#' @param W Basis matrix
+#' @param cutoff Threshold of the loading values
+#'
+#' @return a list containing the reordered basis matrix and the feature indices enriched in each factor
+#' @export
+#'
+#' @examples
+reorderFeatures <- function(W, cutoff = 0.5) {
+    M <- nrow(W)
+    K <- ncol(W)
+    MW <- colMeans(W)
+    sW <- apply(W, 2, sd)
+    # order features
+    IndexW_record <- vector("list", K)
+    IndexW_record[[1]] <- which(W[, 1] > MW[1] + cutoff * sW[1])
+    n <- length(IndexW_record[[1]])
+    A <- setdiff(1:M, IndexW_record[[1]])  # features not yet assigned to a factor
+    for (j in 2:K) {
+        IndexW_record[[j]] <- which(W[, j] > MW[j] + cutoff * sW[j])
+        s <- 0
+        for (k in seq_len(j - 1)) {
+            s <- s + length(IndexW_record[[k]])
+        }
+        # drop features already placed in the first s rows
+        IndexW_record[[j]] <- setdiff(IndexW_record[[j]], 1:s)
+        n <- length(IndexW_record[[j]])
+        B <- union(1:s, IndexW_record[[j]])
+        A <- setdiff(1:M, B)
+        W0 <- W
+        W[(s + 1):(s + n), ] <- W0[IndexW_record[[j]], ]
+        W[(s + n + 1):M, ] <- W0[A, ]
+    }
+    results <- list()
+    results[["W"]] <- W
+    results[["index_record"]] <- IndexW_record
+    return(results)
+}
+
+#' Calculate the variance to mean ratio of logged values
+#'
+#' Calculate the variance to mean ratio (VMR) in non-logspace (return answer in
+#' log-space)
+#'
+#' @param x A vector of values
+#'
+#' @return Returns the VMR in log-space
+#'
+#' @importFrom stats var
+#'
+#' @export
+#'
+#' @examples
+#' LogVMR(x = c(1, 2, 3))
+#'
+LogVMR <- function(x) {
+    return(log(x = var(x = exp(x = x) - 1) / mean(x = exp(x = x) - 1)))
+}
+
+#' Calculate the mean of logged values
+#'
+#' Calculate mean of logged values in non-log space (return answer in log-space)
+#'
+#' @param x A vector of values
+#'
+#' @return Returns the mean in log-space
+#'
+#' @export
+#'
+#' @examples
+#' ExpMean(x = c(1, 2, 3))
+#'
+ExpMean <- function(x) {
+    return(log(x = mean(x = exp(x = x) - 1) + 1))
+}
+
+
+
+#' Convert a sparse matrix to a dense matrix
+#'
+#' @param mat A sparse matrix
+#' @export
+as_matrix <- function(mat) {
+    Rcpp::sourceCpp(code = '
+#include <Rcpp.h>
+using namespace Rcpp;
+// [[Rcpp::export]]
+IntegerMatrix asMatrix(NumericVector rp,
+                       NumericVector cp,
+                       NumericVector z,
+                       int nrows,
+                       int ncols) {
+    int k = z.size();
+    IntegerMatrix mat(nrows, ncols);
+    for (int i = 0; i < k; i++) {
+        mat(rp[i], cp[i]) = z[i];
+    }
+    return mat;
+}
+')
+    row_pos <- mat@i
+    col_pos <- findInterval(seq(mat@x) - 1, mat@p[-1])
+    # note: the values are stored in an IntegerMatrix, so non-integer entries are truncated
+    tmp <- asMatrix(rp = row_pos, cp = col_pos, z = mat@x,
+                    nrows = mat@Dim[1], ncols = mat@Dim[2])
+    row.names(tmp) <- mat@Dimnames[[1]]
+    colnames(tmp) <- mat@Dimnames[[2]]
+    return(tmp)
+}
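+
+# ---------------------------------------------------------------------------
+# A typical end-to-end workflow (sketch, not run). Object and matrix names are
+# illustrative; parameters should be tuned per dataset.
+#   obj <- create_scAIobject(raw.data = list(RNA = rna.mat, ATAC = atac.mat))
+#   obj <- preprocessing(obj, assay = list("RNA", "ATAC"))
+#   obj <- run_scAI(obj, K = 8, nrun = 5)
+#   obj <- identifyClusters(obj, resolution = 1)
+#   obj <- reducedDims(obj, method = "umap")
+# ---------------------------------------------------------------------------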