Diff of /R/scAI_model.R [000000] .. [4a0329]

Switch to unified view

a b/R/scAI_model.R
1
#' The scAI Class
2
#'
3
#' The scAI object is created from a paired single-cell transcriptomic and epigenomic data.
4
#' It takes a list of two digital data matrices as input. Genes/loci should be in rows and cells in columns. rownames and colnames should be included.
5
#' The class provides functions for data preprocessing, integrative analysis, and visualization.
6
#'
7
#' The key slots used in the scAI object are described below.
8
#'
9
#' @slot raw.data List of raw data matrices, one per dataset (Genes/loci should be in rows and cells in columns)
10
#' @slot norm.data List of normalized matrices (genes/loci by cells)
11
#' @slot agg.data Aggregated epigenomic data within similar cells
12
#' @slot scale.data List of scaled matrices
13
#' @slot pData data frame storing the information associated with each cell
14
#' @slot var.features List of informative features to be used, one giving informative genes and the other giving informative loci
15
#' @slot fit List of inferred low-rank matrices, including W1, W2, H, Z, R
16
#' @slot fit.variedK List of inferred low-rank matrices when varying the rank K
17
#' @slot embed List of the reduced 2D coordinates, one per method, e.g., t-SNE/FIt-SNE/umap
18
#' @slot identity a factor defining the cell identity
19
#' @slot cluster List of consensus clustering results
20
#' @slot options List of parameters used throughout analysis
21
#'
22
#' @exportClass scAI
23
#' @importFrom Rcpp evalCpp
24
#' @importFrom methods setClass
25
#' @useDynLib scAI
26
scAI <- methods::setClass("scAI",
27
                          slots = c(raw.data = "list",
28
                                    norm.data = "list",
29
                                    agg.data = "matrix",
30
                                    scale.data = "list",
31
                                    pData = "data.frame",
32
                                    var.features = "list",
33
                                    fit = "list",
34
                                    fit.variedK = "list",
35
                                    embed = "list",
36
                                    identity = "factor",
37
                                    cluster = "list",
38
                                    options = "list")
39
)
40
#' show method for scAI
41
#'
42
#' @param scAI object
43
#' @param show show the object
44
#' @param object object
45
#' @docType methods
46
#'
47
setMethod(f = "show", signature = "scAI", definition = function(object) {
48
  cat("An object of class", class(object), "\n", length(object@raw.data), "datasets.\n")
49
  invisible(x = NULL)
50
})
51
52
53
54
#' creat a new scAI object
55
#'
56
#' @param raw.data List of raw data matrices, a paired single-cell transcriptomic and epigenomic data
57
#' @param do.sparse whether use sparse format
58
#'
59
#' @return
60
#' @export
61
#'
62
#' @examples
63
#' @importFrom methods as new
64
create_scAIobject <- function(raw.data, do.sparse = T) {
65
  object <- methods::new(Class = "scAI",
66
                         raw.data = raw.data)
67
  if (do.sparse) {
68
    raw.data <- lapply(raw.data, function(x) {
69
      as(as.matrix(x), "dgCMatrix")
70
    })
71
  }
72
  object@raw.data <- raw.data
73
  return(object)
74
}
75
76
77
78
#' preprocess the raw.data including quality control and normalization
79
#'
80
#' @param object scAI object
81
#' @param assay List of assay names to be normalized
82
#' @param minFeatures Filter out cells with expressed features < minimum features
83
#' @param minCells Filter out features expressing in less than minCells
84
#' @param minCounts Filter out cells with expressed count < minCounts
85
#' @param maxCounts Filter out cells with expressed count > minCounts
86
#' @param libararyflag Whether do library size normalization
87
#' @param logNormalize whether do log transformation
88
#'
89
#' @return
90
#' @export
91
#'
92
#' @examples
93
preprocessing <- function(object, assay = list("RNA", "ATAC"), minFeatures = 200, minCells = 3, minCounts = NULL, maxCounts = NULL, libararyflag = TRUE, logNormalize = TRUE) {
94
    if (is.null(assay)) {
95
        for (i in 1:length(object@raw.data)) {
96
            object@norm.data[[i]] <- object@raw.data[[i]]
97
        }
98
99
    } else {
100
101
        for (i in 1:length(assay)) {
102
            iniData <- object@raw.data[[assay[[i]]]]
103
            print(dim(iniData))
104
105
            if (class(iniData) == "data.frame") {
106
                iniData <- as.matrix(iniData)
107
            }
108
            # filter cells that have features less than #minFeatures
109
            msum <- Matrix::colSums(iniData != 0)
110
            proData <- iniData[, msum >= minFeatures]
111
            # filter cells that have UMI counts less than #minCounts
112
            if (!is.null(minCounts)) {
113
                proData <- proData[, Matrix::colSums(proData) >= minCounts]
114
            }
115
116
            # filter cells that have expressed genes high than #maxGenes
117
            if (!is.null(maxCounts)) {
118
                proData <- proData[, Matrix::colSums(proData) <= maxCounts]
119
            }
120
121
            # filter genes that only express less than #minCells cells
122
            proData <- proData[Matrix::rowSums(proData != 0) >= minCells, ]
123
            # normalization:we employ a global-scaling normalization method that normalizes the gene expression measurements for each cell by the total expression multiplies this by a scale factor (10,000 by default)
124
            if (libararyflag) {
125
                proData <- sweep(proData, 2, Matrix::colSums(proData), FUN = `/`) * 10000
126
            }
127
            if (logNormalize) {
128
                proData = log(proData + 1)
129
            }
130
            object@norm.data[[assay[[i]]]] <- proData
131
132
        }
133
    }
134
    if (length(assay) == 2) {
135
        X1 <- object@norm.data[[assay[[1]]]]
136
        X2 <- object@norm.data[[assay[[2]]]]
137
        cell.keep <- intersect(colnames(X1), colnames(X2))
138
        object@norm.data[[assay[[1]]]] <- X1[, cell.keep]
139
        object@norm.data[[assay[[2]]]] <- X2[, cell.keep]
140
    } else if (length(assay) == 1) {
141
        X1 <- object@norm.data[[assay[[1]]]]
142
        assay2 <- setdiff(names(object@raw.data), assay[[1]])
143
        X2 <- object@raw.data[[assay2]]
144
        cell.keep <- intersect(colnames(X1), colnames(X2))
145
        object@norm.data[[assay[[1]]]] <- X1[, cell.keep]
146
        object@norm.data[[assay2]] <- X2[, cell.keep]
147
    }
148
  names(object@norm.data) <- names(object@raw.data)
149
150
    return(object)
151
}
152
153
154
155
#' add the cell information into pData slot
156
#'
157
#' @param object scAi object
158
#' @param pdata cell information to be added
159
#' @param pdata.name the name of column to be assigned
160
#'
161
#' @return
162
#' @export
163
#'
164
#' @examples
165
addpData <- function(object, pdata, pdata.name = NULL) {
166
  if (is.null(x = pdata.name) && is.atomic(x = pdata)) {
167
    stop("'pdata.name' must be provided for atomic pdata types (eg. vectors)")
168
  }
169
  if (inherits(x = pdata, what = c("matrix", "Matrix"))) {
170
    pdata <- as.data.frame(x = pdata)
171
  }
172
173
  if (is.null(x = pdata.name)) {
174
    pdata.name <- names(pdata)
175
  } else {
176
    names(pdata) <- pdata.name
177
  }
178
  object@pData <- pdata
179
  return(object)
180
}
181
182
183
184
#' run scAI model
185
#'
186
#' @param object scAI object
187
#' @param K An integer indicating the Rank of the inferred factor
188
#' @param do.fast whether use the python version for running scAI optimization model
189
#' @param nrun Number of times to repreat the running
190
#' @param hvg.use1 whether use high variable genes for RNA-seq data
191
#' @param hvg.use2 whether use high variable genes for ATAC-seq data
192
#' @param keep_all Whether keep all the results from multiple runs
193
#' @param s Probability of Bernoulli distribution
194
#' @param alpha model parameter
195
#' @param lambda model parameter
196
#' @param gamma model parameter
197
#' @param maxIter An integer indicating Maximum number of iteration
198
#' @param stop_rule Stop rule to be used
199
#' @param init List of the initialized low-rank matrices
200
#' @param rand.seed An integer indicating the seed
201
#'
202
#' @return
203
#' @export
204
#'
205
#' @examples
206
#' @importFrom foreach foreach "%dopar%"
207
#' @importFrom parallel makeForkCluster makeCluster detectCores
208
#' @importFrom doParallel registerDoParallel
209
#' @importFrom reticulate r_to_py source_python
210
run_scAI <- function(object, K, do.fast = FALSE, nrun = 5L, hvg.use1 = FALSE, hvg.use2 = FALSE, keep_all = F, s = 0.25, alpha = 1, lambda = 100000, gamma = 1, maxIter = 500L, stop_rule = 1L, init = NULL, rand.seed = 1L) {
211
    if (!is.null(init)) {
212
        W1.init = init$W1.init
213
        W2.init = init$W2.init
214
        H.init = init$H.init
215
        Z.init = init$Z.init
216
        R.init = init$R.init
217
    } else {
218
        R.init = NULL
219
        W1.init = NULL
220
        W2.init = NULL
221
        H.init = NULL
222
        Z.init = NULL
223
    }
224
    options(warn = -1)
225
    # Calculate the number of cores
226
    numCores <- min(parallel::detectCores(), nrun)
227
    cl <- tryCatch({
228
        parallel::makeForkCluster(numCores)
229
    }, error = function(e) {
230
        parallel::makeCluster(numCores)
231
    })
232
    doParallel::registerDoParallel(cl)
233
    if (hvg.use1) {
234
        X1 <- as.matrix(object@norm.data[[1]][object@var.features[[1]], ])
235
    } else {
236
        X1 <- as.matrix(object@norm.data[[1]])
237
    }
238
    if (hvg.use2) {
239
        X2 <- as.matrix(object@norm.data[[2]][object@var.features[[2]], ])
240
    } else {
241
        X2 <- as.matrix(object@norm.data[[2]])
242
    }
243
244
    if (!do.fast) {
245
      outs <- foreach(i = 1:nrun, .packages = c("Matrix")) %dopar% {
246
        set.seed(rand.seed + i - 1)
247
        scAImodel(X1, X2, K = K, s = s, alpha = alpha, lambda = lambda, gamma = gamma, maxIter = maxIter, stop_rule = stop_rule,
248
                  R.init = R.init, W1.init = W1.init, W2.init = W2.init, H.init = H.init, Z.init = Z.init)
249
      }
250
    } else {
251
      barcodes <- colnames(X2)
252
      feature1 <- rownames(X1)
253
      feature2 <- rownames(X2)
254
      names_com <- paste0("factor", seq_len(K))
255
256
      path <- paste(system.file(package="scAI"), "scAImodel_py.py", sep="/")
257
      reticulate::source_python(path)
258
      X1py = r_to_py(X1)
259
      X2py = r_to_py(X2)
260
      R.initpy = r_to_py(R.init); W1.initpy = r_to_py(W1.init); W2.initpy = r_to_py(W2.init); H.initpy = r_to_py(H.init); Z.initpy = r_to_py(Z.init)
261
      K = as.integer(K); maxIter = as.integer(maxIter)
262
      outs <- pbapply::pbsapply(
263
        X = 1:nrun,
264
        FUN = function(x) {
265
          seed = as.integer(rand.seed + x - 1)
266
          ptm = Sys.time()
267
          results = scAImodel_py(X1 = X1py, X2 = X2py, K = K, S = s, Alpha = alpha, Lambda = lambda, Gamma = gamma, Maxiter = maxIter, Stop_rule = stop_rule,
268
                                 Seed = seed, W1 = W1.initpy, W2 = W2.initpy, H = H.initpy, Z = Z.initpy,R = R.initpy)
269
          execution.time = Sys.time() - ptm
270
          names(results) <- c("W1", "W2", "H", "Z", "R")
271
          attr(results$W1, "dimnames") <- list(feature1, names_com)
272
          attr(results$W2, "dimnames") <- list(feature2, names_com)
273
          attr(results$H, "dimnames") <- list(names_com, barcodes)
274
          attr(results$Z, "dimnames") <- list(barcodes, barcodes)
275
          results$options = list(s = s, alpha = alpha, lambda = lambda, gamma = gamma, maxIter = maxIter, stop_rule = stop_rule, run.time = execution.time)
276
          return(results)
277
        },
278
        simplify = FALSE
279
      )
280
    }
281
282
    objs <- foreach(i = 1:nrun, .combine = c) %dopar% {
283
        W1 <- outs[[i]]$W1
284
        W2 <- outs[[i]]$W2
285
        sum(cor(as.matrix(W1))) + sum(cor(as.matrix(W2)))
286
    }
287
    parallel::stopCluster(cl)
288
    N <- ncol(X1)
289
    C <- base::matrix(0, N, N)
290
    for (i in seq_len(nrun)) {
291
        H <- outs[[i]]$H
292
        H <- sweep(H, 2, colSums(H), FUN = `/`)
293
        clusIndex <- apply(H, 2, which.max)
294
        # compute the consensus matrix
295
        adjMat <- clust2Mat(clusIndex)
296
        C <- C + adjMat
297
    }
298
    CM <- C/nrun
299
300
    if (!keep_all) {
301
        sprintf("The best seed is %d", which.min(objs))
302
        outs_final <- outs[[which.min(objs)]]
303
        #object@agg.data <- outs_final$agg.data
304
        W = list(W1 = outs_final$W1, W2 = outs_final$W2)
305
        names(W) <- names(object@norm.data)
306
        object@fit <- list(W = W, H = outs_final$H, Z = outs_final$Z, R = outs_final$R)
307
        object <- getAggregatedData(object)
308
        object@cluster$consensus <- CM
309
        object@options$cost <- objs
310
        object@options$paras <- outs_final$options
311
        object@options$paras$nrun <- nrun
312
        object@options$best.seed <- which.min(objs)
313
        return(object)
314
    } else {
315
        outs_final <- list()
316
        outs_final$best <- outs[[which.min(objs)]]
317
        outs_final$best$consensus <- CM
318
        outs_final$nruns <- outs
319
        outs_final$options$cost <- objs
320
        return(outs_final)
321
    }
322
}
323
324
325
326
#' Solving the optimization problem in scAI
327
#'
328
#' @param X1 Single-cell transcriptomic data matrix (norm.data)
329
#' @param X2 Single-cell epigenomic data matrix (norm.data)
330
#' @param K Rank of inferred factors
331
#' @param s Probability of Bernoulli distribution
332
#' @param alpha model parameter
333
#' @param lambda model parameter
334
#' @param gamma model parameter
335
#' @param maxIter Maximum number of iteration
336
#' @param stop_rule Stop rule to be used
337
#' @param R.init initialization of R
338
#' @param W1.init initialization of W1
339
#' @param W2.init initialization of W2
340
#' @param H.init initialization of H
341
#' @param Z.init initialization of Z
342
#'
343
#' @return
344
#' @export
345
#'
346
#' @examples
347
#' @importFrom stats rbinom runif
348
#' @importFrom rfunctions crossprodcpp
349
scAImodel <- function(X1, X2, K, s = 0.25, alpha = 1, lambda = 100000, gamma = 1, maxIter = 500, stop_rule = 1,
350
                 R.init = NULL, W1.init = NULL, W2.init = NULL, H.init = NULL, Z.init = NULL) {
351
  # Initialization W1,W2,H and Z
352
  p <- nrow(X1)
353
  n <- ncol(X1)
354
  q = nrow(X2)
355
  if (is.null(W1.init)) {
356
    W1.init = matrix(runif(p * K), p, K)
357
  }
358
  if (is.null(W2.init)) {
359
    W2.init = matrix(runif(q * K), q, K)
360
  }
361
  if (is.null(H.init)) {
362
    H.init = matrix(runif(K * n), K, n)
363
  }
364
  if (is.null(Z.init)) {
365
    Z.init = matrix(runif(n), n, n)
366
  }
367
  if (is.null(R.init)) {
368
    R.init = matrix(rbinom(n * n, 1, s), n, n)
369
  }
370
  W1 = W1.init
371
  W2 = W2.init
372
  H = H.init
373
  Z = Z.init
374
  R = R.init
375
376
  # start the clock to measure the execution time
377
  ptm = proc.time()
378
  eps <- .Machine$double.eps
379
  onesM_K <- matrix(1, K, K)
380
381
  XtX2 <- crossprodcpp(X2)
382
  index <- which(R == 0)
383
  for (iter in 1:maxIter) {
384
    # normalize H
385
    H = H/rowSums(H)
386
    # update W1
387
    HHt <- tcrossprod(H)
388
    X1Ht <- eigenMapMattcrossprod(X1, H)
389
    W1HHt <- eigenMapMatMult(W1, HHt)
390
    W1 <- W1 * X1Ht/(W1HHt + eps)
391
392
    # update W2
393
    ZR <- Z
394
    ZR[index] <- 0
395
    ZRHt <- eigenMapMattcrossprod(ZR, H)
396
    X2ZRHt <- eigenMapMatMult(X2, ZRHt)
397
    W2HHt <- eigenMapMatMult(W2, HHt)
398
    W2 = W2 * X2ZRHt/(W2HHt + eps)
399
400
    # update H
401
    W1tX1 <- eigenMapMatcrossprod(W1, X1)
402
    W2tX2 <- eigenMapMatcrossprod(W2, X2)
403
    W2tX2ZR <- eigenMapMatMult(W2tX2, ZR)
404
    HZZt <- eigenMapMatMult(H, Z + t(Z))
405
    W1tW1 <- crossprodcpp(W1)
406
    W2tW2 <- crossprodcpp(W2)
407
    temp1 <- H * (alpha * W1tX1 + W2tX2ZR + lambda * HZZt)
408
    temp2 <- eigenMapMatMult(alpha * W1tW1 + W2tW2 + 2 * lambda * HHt + gamma * onesM_K, H)
409
    H <- temp1/(temp2 + eps)
410
411
    # update Z
412
    HtH <- crossprodcpp(H)
413
    X2tW2H <- eigenMapMatcrossprod(W2tX2, H)
414
    RX2tW2H = X2tW2H
415
    RX2tW2H[index] = 0
416
    XtX2ZR <- eigenMapMatMult(XtX2, ZR)
417
    XtX2ZRR = XtX2ZR
418
    XtX2ZRR[index] = 0
419
    Z = Z * (RX2tW2H + lambda * HtH)/(XtX2ZRR + lambda * Z + eps)
420
421
    if (stop_rule == 2) {
422
      obj = alpha * norm(X1 - W1 %*% H, "F")^2 + norm(X2 %*% (Z * R) - W2 %*% H, "F")^2 + lambda * norm(Z - HtH, "F") + gamma * sum(colSums(H) * colSums(H))
423
      if (iter > 1 && ((obj_old - obj)/obj_old < 10^(-6)) || iter == maxIter) {
424
        break
425
      }
426
      obj_old = obj
427
    }
428
  }
429
  # compute the execution time
430
  execution.time = proc.time() - ptm
431
432
  barcodes <- colnames(X2)
433
  feature1 <- rownames(X1)
434
  feature2 <- rownames(X2)
435
  names_com <- paste0("factor", seq_len(K))
436
  attr(W1, "dimnames") <- list(feature1, names_com)
437
  attr(W2, "dimnames") <- list(feature2, names_com)
438
  attr(H, "dimnames") <- list(names_com, barcodes)
439
  attr(Z, "dimnames") <- list(barcodes, barcodes)
440
441
  outs <- list(W1 = W1, W2 = W2, H = H, Z = Z, R = R,
442
               options = list(s = s, alpha = alpha, lambda = lambda, gamma = gamma, maxIter = maxIter, stop_rule = stop_rule, run.time = execution.time))
443
  return(outs)
444
445
}
446
447
448
#' Generate the aggregated epigenomic data
449
#'
450
#' @param object an scAI object after running run_scAI
451
#' @param group cell group information if available; aggregate epigenomic data based on the available cell group information instead of the learned cell-cell similarity matrix from scAI
452
#'
453
#' @return
454
#' @export
455
#'
456
getAggregatedData <- function(object, group = NULL) {
457
  if (is.null(group)) {
458
    Z <-  object@fit$Z
459
  } else {
460
    if (is.factor(group) | is.character(group)) {
461
      group <- as.numeric(factor(group))
462
    }
463
    Z <- clust2Mat(group)
464
    diag(Z) <- 1
465
    object@fit$Z <- Z
466
  }
467
  R <- object@fit$R
468
  ZR <- Z * R
469
  ZR <- sweep(ZR, 2, colSums(ZR), FUN = `/`)
470
  X2 <- as.matrix(object@norm.data[[2]])
471
  X2agg <- eigenMapMatMult(X2, ZR)
472
  X2agg <- sweep(X2agg, 2, colSums(X2agg), FUN = `/`) * 10000
473
  X2agg <- log(1 + X2agg)
474
  attr(X2agg, "dimnames") <- list(rownames(object@norm.data[[2]]), colnames(object@norm.data[[2]]))
475
  object@agg.data <- X2agg
476
  return(object)
477
}
478
479
480
481
#' Perform dimensional reduction
482
#'
483
#' Dimension reduction by PCA, t-SNE or UMAP
484
#' @param object scAI object
485
#' @param return.object whether return scAI object
486
#' @param data.use input data
487
#' @param do.scale whether scale the data
488
#' @param do.center whether scale and center the data
489
#' @param method Method of dimensional reduction, one of tsne, FItsne and umap
490
#' @param rand.seed Set a random seed. By default, sets the seed to 42.
491
#' @param perplexity perplexity parameter in tsne
492
#' @param theta parameter in tsne
493
#' @param check_duplicates parameter in tsne
494
495
#' @param FItsne.path File path of FIt-SNE
496
#' @param dim.embed dimensions of t-SNE embedding
497
#' @param dim.use num of PCs used for t-SNE
498
#'
499
#' @param do.fast whether do fast PCA
500
#' @param dimPC the number of components to keep in PCA
501
#' @param weight.by.var whether use weighted pc.scores
502
#'
503
#' @param n.neighbors This determines the number of neighboring points used in
504
#' local approximations of manifold structure. Larger values will result in more
505
#' global structure being preserved at the loss of detailed local structure. In general this parameter should often be in the range 5 to 50.
506
#' @param n.components The dimension of the space to embed into.
507
#' @param distance This determines the choice of metric used to measure distance in the input space.
508
#' @param n.epochs the number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in more accurate embeddings. If NULL is specified, a value will be selected based on the size of the input dataset (200 for large datasets, 500 for small).
509
#' @param learning.rate The initial learning rate for the embedding optimization.
510
#' @param min.dist This controls how tightly the embedding is allowed compress points together.
511
#' Larger values ensure embedded points are moreevenly distributed, while smaller values allow the
512
#' algorithm to optimise more accurately with regard to local structure. Sensible values are in the range 0.001 to 0.5.
513
#' @param spread he effective scale of embedded points. In combination with min.dist this determines how clustered/clumped the embedded points are.
514
#' @param set.op.mix.ratio Interpolate between (fuzzy) union and intersection as the set operation used to combine local fuzzy simplicial sets to obtain a global fuzzy simplicial sets.
515
#' @param local.connectivity The local connectivity required - i.e. the number of nearest neighbors
516
#' that should be assumed to be connected at a local level. The higher this value the more connected
517
#' the manifold becomes locally. In practice this should be not more than the local intrinsic dimension of the manifold.
518
#' @param repulsion.strength Weighting applied to negative samples in low dimensional embedding
519
#' optimization. Values higher than one will result in greater weight being given to negative samples.
520
#' @param negative.sample.rate The number of negative samples to select per positive sample in the
521
#' optimization process. Increasing this value will result in greater repulsive force being applied, greater optimization cost, but slightly more accuracy.
522
#' @param a More specific parameters controlling the embedding. If NULL, these values are set automatically as determined by min. dist and spread.
523
#' @param b More specific parameters controlling the embedding. If NULL, these values are set automatically as determined by min. dist and spread.
524
525
#'
526
#' @return
527
#' @export
528
#'
529
#' @examples
530
#' @importFrom Rtsne Rtsne
531
#' @importFrom reticulate py_module_available py_set_seed import
532
#'
533
reducedDims <- function(object, data.use = object@fit$H, do.scale = TRUE, do.center = TRUE, return.object = TRUE, method = "umap",
534
dim.embed = 2, dim.use = NULL, perplexity = 30, theta = 0.5, check_duplicates = F, rand.seed = 42L,
535
FItsne.path = NULL,
536
dimPC = 40,do.fast = TRUE, weight.by.var = TRUE,
537
n.neighbors = 30L, n.components = 2L, distance = "correlation",n.epochs = NULL,learning.rate = 1.0,min.dist = 0.3,spread = 1.0,set.op.mix.ratio = 1.0,local.connectivity = 1L,
538
repulsion.strength = 1,negative.sample.rate = 5,a = NULL,b = NULL) {
539
540
    data.use <- as.matrix(data.use)
541
    if (do.scale) {
542
        data.use <- t(scale(t(data.use), center = do.center, scale = TRUE))
543
        data.use[is.na(data.use)] <- 0
544
    }
545
    if (!is.null(dim.use)) {
546
        data.use = object@embed$pca[, dim.use]
547
    }
548
    if (method == "pca") {
549
        cell_coords <- runPCA(data.use, do.fast = do.fast, dimPC = dimPC, seed.use = rand.seed, weight.by.var = weight.by.var)
550
        object@embed$pca <- cell_coords
551
552
    } else if (method == "tsne") {
553
        set.seed(rand.seed)
554
        cell_coords <- Rtsne::Rtsne(t(data.use), pca = FALSE, dims = dim.embed, theta = theta, perplexity = perplexity, check_duplicates = F)$Y
555
        rownames(cell_coords) <- rownames(t(data.use))
556
        object@embed$tsne <- cell_coords
557
558
    } else if (method == "FItsne") {
559
        if (!exists("FItsne")) {
560
            if (is.null(fitsne.path)) {
561
                stop("Please pass in path to FIt-SNE directory as FItsne.path.")
562
            }
563
            source(paste0(fitsne.path, "/fast_tsne.R"), chdir = T)
564
        }
565
        cell_coords <- fftRtsne(t(data.use), pca = FALSE, dims = dim.embed, theta = theta, perplexity = perplexity, check_duplicates = F, rand_seed = rand.seed)
566
        rownames(cell_coords) <- rownames(t(data.use))
567
        object@embed$FItsne <- cell_coords
568
569
    } else if (method == "umap") {
570
        cell_coords <- runUMAP(data.use,
571
        n.neighbors = n.neighbors,
572
        n.components = n.components,
573
        distance = distance,
574
        n.epochs = n.epochs,
575
        learning.rate = learning.rate,
576
        min.dist = min.dist,
577
        spread = spread,
578
        set.op.mix.ratio = set.op.mix.ratio,
579
        local.connectivity = local.connectivity,
580
        repulsion.strength = repulsion.strength,
581
        negative.sample.rate = negative.sample.rate,
582
        a = a,
583
        b = b,
584
        seed.use = rand.seed
585
        )
586
        object@embed$umap <- cell_coords
587
588
    } else {
589
        stop("Error: unrecognized dimensionality reduction method.")
590
    }
591
    if (return.object) {
592
        return(object)
593
    } else {
594
        return(cell_coords)
595
    }
596
597
}
598
599
600
#' Dimension reduction using PCA
601
#'
602
#' @param data.use input data
603
#' @param do.fast whether do fast PCA
604
#' @param dimPC the number of components to keep
605
#' @param seed.use set a seed
606
#' @param weight.by.var whether use weighted pc.scores
607
#' @importFrom stats prcomp
608
#' @importFrom irlba irlba
609
#' @return
610
#' @export
611
#'
612
#' @examples
613
runPCA <- function(data.use, do.fast = T, dimPC = 50, seed.use = 42, weight.by.var = T) {
614
  set.seed(seed = seed.use)
615
  if (do.fast) {
616
    dimPC <- min(dimPC, nrow(data.use) - 1)
617
    pca.res <- irlba::irlba(t(data.use), nv = dimPC)
618
    sdev <- pca.res$d/sqrt(max(1, ncol(data.use) - 1))
619
    if (weight.by.var){
620
      pc.scores <- pca.res$u %*% diag(pca.res$d)
621
    } else {
622
      pc.scores <- pca.res$u
623
    }
624
  } else {
625
    dimPC <- min(dimPC, nrow(data.use) - 1)
626
    pca.res <- stats::prcomp(x = t(data.use), rank. = dimPC)
627
    sdev <- pca.res$sdev
628
    if (weight.by.var) {
629
      pc.scores <- pca.res$x %*% diag(pca.res$sdev[1:dimPC]^2)
630
    } else {
631
      pc.scores <- pca.res$x
632
    }
633
  }
634
  rownames(pc.scores) <- colnames(data.use)
635
  colnames(pc.scores) <- paste0('PC', 1:ncol(pc.scores))
636
  cell_coords <- pc.scores
637
  return(cell_coords)
638
}
639
640
#' Perform dimension reduction using UMAP
641
#'
642
#' @param data.use input data
643
#' @param n.neighbors This determines the number of neighboring points used in
644
#' local approximations of manifold structure. Larger values will result in more
645
#' global structure being preserved at the loss of detailed local structure. In general this parameter should often be in the range 5 to 50.
646
#' @param n.components The dimension of the space to embed into.
647
#' @param distance This determines the choice of metric used to measure distance in the input space.
648
#' @param n.epochs the number of training epochs to be used in optimizing the low dimensional embedding. Larger values result in more accurate embeddings. If NULL is specified, a value will be selected based on the size of the input dataset (200 for large datasets, 500 for small).
649
#' @param learning.rate The initial learning rate for the embedding optimization.
650
#' @param min.dist This controls how tightly the embedding is allowed compress points together.
651
#' Larger values ensure embedded points are moreevenly distributed, while smaller values allow the
652
#' algorithm to optimise more accurately with regard to local structure. Sensible values are in the range 0.001 to 0.5.
653
#' @param spread he effective scale of embedded points. In combination with min.dist this determines how clustered/clumped the embedded points are.
654
#' @param set.op.mix.ratio Interpolate between (fuzzy) union and intersection as the set operation used to combine local fuzzy simplicial sets to obtain a global fuzzy simplicial sets.
655
#' @param local.connectivity The local connectivity required - i.e. the number of nearest neighbors
656
#' that should be assumed to be connected at a local level. The higher this value the more connected
657
#' the manifold becomes locally. In practice this should be not more than the local intrinsic dimension of the manifold.
658
#' @param repulsion.strength Weighting applied to negative samples in low dimensional embedding
659
#' optimization. Values higher than one will result in greater weight being given to negative samples.
660
#' @param negative.sample.rate The number of negative samples to select per positive sample in the
661
#' optimization process. Increasing this value will result in greater repulsive force being applied, greater optimization cost, but slightly more accuracy.
662
#' @param a More specific parameters controlling the embedding. If NULL, these values are set automatically as determined by min. dist and spread.
663
#' @param b More specific parameters controlling the embedding. If NULL, these values are set automatically as determined by min. dist and spread.
664
#' @param seed.use Set a random seed. By default, sets the seed to 42.
665
#' @param metric.kwds A dictionary of arguments to pass on to the metric, such as the p value for Minkowski distance
666
#' @param angular.rp.forest Whether to use an angular random projection forest to initialise the
667
#' approximate nearest neighbor search. This can be faster, but is mostly on useful for metric that
668
#' use an angular style distance such as cosine, correlation etc. In the case of those metrics angular forests will be chosen automatically.
669
#' @param verbose Controls verbosity
670
#' This function is modified from Seurat package
671
#' @return
672
#' @export
673
#' @importFrom reticulate py_module_available py_set_seed import
674
#' @examples
675
runUMAP <- function(
676
  data.use,
677
  n.neighbors = 30L,
678
  n.components = 2L,
679
  distance = "correlation",
680
  n.epochs = NULL,
681
  learning.rate = 1.0,
682
  min.dist = 0.3,
683
  spread = 1.0,
684
  set.op.mix.ratio = 1.0,
685
  local.connectivity = 1L,
686
  repulsion.strength = 1,
687
  negative.sample.rate = 5,
688
  a = NULL,
689
  b = NULL,
690
  seed.use = 42L,
691
  metric.kwds = NULL,
692
  angular.rp.forest = FALSE,
693
  verbose = TRUE){
694
  if (!reticulate::py_module_available(module = 'umap')) {
695
    stop("Cannot find UMAP, please install through pip (e.g. pip install umap-learn or reticulate::py_install(packages = 'umap-learn')).")
696
  }
697
  set.seed(seed.use)
698
  reticulate::py_set_seed(seed.use)
699
  umap_import <- reticulate::import(module = "umap", delay_load = TRUE)
700
  umap <- umap_import$UMAP(
701
    n_neighbors = as.integer(n.neighbors),
702
    n_components = as.integer(n.components),
703
    metric = distance,
704
    n_epochs = n.epochs,
705
    learning_rate = learning.rate,
706
    min_dist = min.dist,
707
    spread = spread,
708
    set_op_mix_ratio = set.op.mix.ratio,
709
    local_connectivity = local.connectivity,
710
    repulsion_strength = repulsion.strength,
711
    negative_sample_rate = negative.sample.rate,
712
    a = a,
713
    b = b,
714
    metric_kwds = metric.kwds,
715
    angular_rp_forest = angular.rp.forest,
716
    verbose = verbose
717
  )
718
  Rumap <- umap$fit_transform
719
  umap_output <- Rumap(t(data.use))
720
  colnames(umap_output) <- paste0('UMAP', 1:ncol(umap_output))
721
  rownames(umap_output) <- colnames(data.use)
722
  return(umap_output)
723
}
724
725
726
727
728
#' Identify enriched features in each factor
729
#'
730
#' Rank features in each factor by Factor loading analysis
731
#' @param object scAI object
732
#' @param assay Name of assay to be analyzed
733
#' @param features a vector of features
734
#' @param cutoff.W Threshold of feature loading values
735
#' @param cutoff.H Threshold of cell loading values
736
#' @param thresh.pc Threshold of the percent of cells enriched in one factor
737
#' @param thresh.fc Threshold of Fold Change
738
#' @param thresh.p Threshold of p-values
739
#' @param n.top Number of top features to be returned
740
#' @importFrom stats sd wilcox.test
741
#' @importFrom dplyr top_n slice
742
#' @importFrom future nbrOfWorkers
743
#' @importFrom future.apply future_sapply
744
#' @importFrom pbapply pbsapply
745
#' @importFrom stats p.adjust
746
#'
747
#' @return
748
#' @export
749
#'
750
#' @examples
751
752
identifyFactorMarkers <- function(object, assay, features = NULL, cutoff.W = 0.5, cutoff.H = 0.5,
753
                                  thresh.pc = 0.05, thresh.fc = 0.25, thresh.p = 0.05, n.top = 10) {
754
    if (assay == "RNA") {
755
        X <- as.matrix(object@norm.data[[assay]])
756
    } else {
757
        X <- as.matrix(object@agg.data)
758
    }
759
    H <- object@fit$H
760
    W <- object@fit$W[[assay]]
761
    if (is.null(features)) {
762
        features <- row.names(W)
763
    }
764
765
    H <- sweep(H, 2, colSums(H), FUN = `/`)
766
    K = ncol(W)
767
    lib_W <- base::rowSums(W)
768
    lib_W[lib_W == 0] <- 1
769
    lib_W[lib_W < mean(lib_W) - 5 * sd(lib_W)] <- 1  #  omit the nearly null rows
770
    W <- sweep(W, 1, lib_W, FUN = `/`)
771
    MW <- base::colMeans(W)
772
    sW <- apply(W, 2, sd)
773
    # candidate markers for each factor
774
    IndexW_record <- vector("list", K)
775
    for (j in 1:K) {
776
        IndexW_record[[j]] <- which(W[, j] > MW[j] + cutoff.W * sW[j])
777
    }
778
779
    my.sapply <- ifelse(
780
    test = future::nbrOfWorkers() == 1,
781
    yes = pbapply::pbsapply,
782
    no = future.apply::future_sapply
783
    )
784
785
    # divided cells into two groups
786
    mH <- apply(H, 1, mean)
787
    sH <- apply(H, 1, sd)
788
    IndexH_record1 <- vector("list", K)
789
    IndexH_record2 <- vector("list", K)
790
    for (i in 1:K) {
791
        IndexH_record1[[i]] <- which(H[i, ] > mH[i] + cutoff.H * sH[i])
792
        IndexH_record2[[i]] <- base::setdiff(c(1:ncol(H)), IndexH_record1[[i]])
793
    }
794
795
    # identify factor-specific markers
796
    factor_markers = vector("list", K)
797
    markersTopn = vector("list", K)
798
    factors <- c()
799
    Features <- c()
800
    Pvalues <- c()
801
    Log2FC <- c()
802
    for (i in 1:K) {
803
        data1 <- X[IndexW_record[[i]], IndexH_record1[[i]]]
804
        data2 <- X[IndexW_record[[i]], IndexH_record2[[i]]]
805
        idx1 <- which(base::rowSums(data1 != 0) > thresh.pc * ncol(data1))  # at least expressed in thresh.pc% cells in one group
806
        FC <- log2(base::rowMeans(data1)/base::rowMeans(data2))
807
        idx2 <- which(FC > thresh.fc)
808
        pvalues <- my.sapply(
809
        X = 1:nrow(x = data1),
810
        FUN = function(x) {
811
            return(wilcox.test(data1[x, ], data2[x, ], alternative = "greater")$p.value)
812
        }
813
        )
814
815
        idx3 <- which(pvalues < thresh.p)
816
        idx = intersect(intersect(idx1, idx2), idx3)
817
818
        # order
819
        FC <- FC[idx]
820
        c = sort(FC, decreasing = T, index.return = T)$ix
821
        ri = IndexW_record[[i]]
822
823
        Pvalues <- c(Pvalues, pvalues[ri[idx[c]]])
824
        Log2FC <- c(Log2FC, FC[c])
825
        factors <- c(factors, rep(i, length(c)))
826
        Features <- c(Features, features[ri[idx[c]]])
827
828
    }
829
    # stopCluster(cl)
830
    markers.all <- cbind(factors = factors, features = Features, pvalues = Pvalues, log2FC = Log2FC)
831
832
    rownames(markers.all) <- Features
833
    markers.all <- as.data.frame(markers.all)
834
    markers.top <- markers.all %>% dplyr::group_by(factors) %>% top_n(n.top, log2FC) %>% dplyr::slice(1:n.top)
835
836
    markers = list(markers.all = markers.all, markers.top = markers.top)
837
838
    return(markers)
839
}
840
841
842
843
#' compute the 2D coordinates of embeded cells, genes, loci and factors using VscAI visualization
844
#'
845
#' @param object scAI object
846
#' @param genes.embed A vector of genes to be embedded
847
#' @param loci.embed A vector of loci to be embedded
848
#' @param alpha.embed Parameter controlling the distance between the cells and the factors
849
#' @param snn.embed Number of neighbors
850
#'
851
#' @return
852
#' @export
853
#'
854
#' @examples
855
#' @importFrom Matrix Matrix
856
getEmbeddings <- function(object, genes.embed = NULL, loci.embed = NULL, alpha.embed = 1.9, snn.embed = 5) {
857
  H <- object@fit$H
858
  W1 <- object@fit$W[[1]]
859
  W2 <- object@fit$W[[2]]
860
  Z <- object@fit$Z
861
862
  if (nrow(H) < 3 & length(object@fit.variedK) == 0) {
863
    print("VscAI needs at least three factors for embedding. Now rerun scAI with rank being 3...")
864
    outs <- run_scAI(object, K = 3, nrun = object@options$paras$nrun,
865
                     s = object@options$paras$s, alpha = object@options$paras$alpha, lambda = object@options$paras$lambda, gamma = object@options$paras$gamma,
866
                     maxIter = object@options$paras$maxIter, stop_rule = object@options$paras$stop_rule)
867
    W <- outs@fit$W
868
    H <- outs@fit$H
869
    Z <- outs@fit$Z
870
    W1 <- W[[1]]
871
    W2 <- W[[2]]
872
    object@fit.variedK$W <- W
873
    object@fit.variedK$H <- H
874
    object@fit.variedK$Z <- Z
875
  } else if (nrow(H) < 3 & length(object@fit.variedK) != 0) {
876
    print("Using the previous calculated factors for embedding.")
877
    W1 <- object@fit.variedK$W[[1]]
878
    W2 <- object@fit.variedK$W[[2]]
879
    H <- object@fit.variedK$H
880
    Z <- object@fit.variedK$Z
881
882
  }
883
884
  nmf.scores <- H
885
886
  Zsym <- Z/max(Z)
887
  Zsym[Zsym < 10^(-6)] <- 0
888
  Zsym <- (Zsym + t(Zsym))/2
889
  diag(Zsym) <- 1
890
  rownames(Zsym) <- colnames(H)
891
  colnames(Zsym) <- rownames(Zsym)
892
893
  alpha.exp <- alpha.embed  # Increase this > 1.0 to move the cells closer to the factors. Values > 2 start to distort the data.
894
  snn.exp <- snn.embed  # Lower this < 1.0 to move similar cells closer to each other
895
  n_pull <- nrow(H)  # The number of factors pulling on each cell. Must be at least 3.
896
897
  Zsym <- Matrix(data = Zsym, sparse = TRUE)
898
  snn <- Zsym
899
  swne.embedding <- swne::EmbedSWNE(nmf.scores, snn, alpha.exp = alpha.exp, snn.exp = snn.exp, n_pull = n_pull, proj.method = "sammon", dist.use = "cosine")
900
  # project cells and factors
901
  factor_coords <- swne.embedding$H.coords
902
  cell_coords <- swne.embedding$sample.coords
903
  rownames(factor_coords) <- rownames(H)
904
  rownames(cell_coords) <- colnames(H)
905
906
  # project genes onto the low dimension space by VscAI
907
  if (is.null(genes.embed)) {
908
    genes.embed <- rownames(W1)
909
  } else {
910
    genes.embed <- as.character(as.vector(as.matrix((genes.embed))))
911
  }
912
913
  swne.embedding <- swne::EmbedFeatures(swne.embedding, W1, genes.embed, n_pull = n_pull)
914
  gene_coords <- swne.embedding$feature.coords
915
  rownames(gene_coords) <- gene_coords$name
916
917
  # project loci
918
  if (is.null(loci.embed)) {
919
    loci.embed <- rownames(W2)
920
  } else {
921
    loci.embed <- as.character(as.vector(as.matrix((loci.embed))))
922
  }
923
924
  swne.embedding <- swne::EmbedFeatures(swne.embedding, W2, loci.embed, n_pull = n_pull)
925
  loci_coords <- swne.embedding$feature.coords
926
  rownames(loci_coords) <- loci_coords$name
927
928
  object@embed$VscAI$cells <- cell_coords
929
  object@embed$VscAI$factors <- factor_coords
930
  object@embed$VscAI$genes <- gene_coords
931
  object@embed$VscAI$loci <- loci_coords
932
  return(object)
933
}
934
935
936
937
#' Identify cell clusters
938
#'
939
#' @param object scAI object
940
#' @param partition.type Type of partition to use. Defaults to RBConfigurationVertexPartition. Options include: ModularityVertexPartition, RBERVertexPartition, CPMVertexPartition, MutableVertexPartition, SignificanceVertexPartition, SurpriseVertexPartition (see the Leiden python module documentation for more details)
941
#' @param seed.use set seed
942
#' @param n.iter number of iteration
943
#' @param initial.membership arameters to pass to the Python leidenalg function defaults initial_membership=None
944
#' @param weights defaults weights=None
945
#' @param node.sizes Parameters to pass to the Python leidenalg function
946
#' @param resolution A parameter controlling the coarseness of the clusters
947
#' @param K Number of clusters if performing hierarchical clustering of the consensus matrix
948
#' @return
949
#' @export
950
#'
951
#' @examples
952
#' @importFrom stats as.dist cophenetic cor cutree dist hclust
953
identifyClusters <- function(object, resolution = 1, partition.type = "RBConfigurationVertexPartition",
954
                             seed.use = 42L,n.iter = 10L,
955
                             initial.membership = NULL, weights = NULL, node.sizes = NULL,
956
                             K = NULL) {
957
  if (is.null(K) & !is.null(resolution)) {
958
    data.use <- object@fit$H
959
    data.use <- as.matrix(data.use)
960
    data.use <- t(scale(t(data.use), center = TRUE, scale = TRUE))
961
    snn <- swne::CalcSNN(data.use, k = 20, prune.SNN = 1/15)
962
    idents <- runLeiden(SNN = snn, resolution = resolution, partition_type = partition.type,
963
    seed.use = seed.use, n.iter = n.iter,
964
    initial.membership = initial.membership, weights = weights, node.sizes = node.sizes)
965
  } else {
966
    CM <- object@cluster$consensus
967
    d <- as.dist(1 - CM)
968
    hc <- hclust(d, "ave")
969
    idents<- hc %>% cutree(k = K)
970
    coph <- cophenetic(hc)
971
    cs <- cor(d, coph)
972
    object@cluster$cluster.stability <- cs
973
  }
974
  names(idents) <- rownames(object@pData)
975
  object@identity <- factor(idents)
976
  object@pData$cluster <- factor(idents)
977
  return(object)
978
}
979
980
981
#' Run Leiden clustering algorithm
982
#' This code is modified from Tom Kelly (https://github.com/TomKellyGenetics/leiden), where we added more parameters (seed.use and n.iter) to run the Python version. In addition, we also take care of the singleton issue after running leiden algorithm.
983
#' @description Implements the Leiden clustering algorithm in R using reticulate to run the Python version. Requires the python "leidenalg" and "igraph" modules to be installed. Returns a vector of partition indices.
984
#' @param SNN An adjacency matrix compatible with \code{\link[igraph]{igraph}} object.
985
#' @param seed.use set seed
986
#' @param n.iter number of iteration
987
#' @param initial.membership arameters to pass to the Python leidenalg function defaults initial_membership=None
988
#' @param node.sizes Parameters to pass to the Python leidenalg function
989
#' @param partition_type Type of partition to use. Defaults to RBConfigurationVertexPartition. Options include: ModularityVertexPartition, RBERVertexPartition, CPMVertexPartition, MutableVertexPartition, SignificanceVertexPartition, SurpriseVertexPartition (see the Leiden python module documentation for more details)
990
#' @param resolution A parameter controlling the coarseness of the clusters
991
#' @param weights defaults weights=None
992
#' @return A parition of clusters as a vector of integers
993
##'
994
##'
995
#' @keywords graph network igraph mvtnorm simulation
996
#' @importFrom reticulate import r_to_py
997
##' @export
998
999
runLeiden <- function(SNN = matrix(), resolution = 1, partition_type = c(
1000
  'RBConfigurationVertexPartition',
1001
  'ModularityVertexPartition',
1002
  'RBERVertexPartition',
1003
  'CPMVertexPartition',
1004
  'MutableVertexPartition',
1005
  'SignificanceVertexPartition',
1006
  'SurpriseVertexPartition'),
1007
seed.use = 42L,
1008
n.iter = 10L,
1009
initial.membership = NULL, weights = NULL, node.sizes = NULL) {
1010
  if (!reticulate::py_module_available(module = 'leidenalg')) {
1011
    stop("Cannot find Leiden algorithm, please install through pip (e.g. pip install leidenalg).")
1012
  }
1013
1014
  #import python modules with reticulate
1015
  leidenalg <- import("leidenalg", delay_load = TRUE)
1016
  ig <- import("igraph", delay_load = TRUE)
1017
1018
  resolution_parameter <- resolution
1019
  initial_membership <- initial.membership
1020
  node_sizes <- node.sizes
1021
  #convert matrix input (corrects for sparse matrix input)
1022
  adj_mat <- as.matrix(SNN)
1023
1024
  #compute weights if non-binary adjacency matrix given
1025
  is_pure_adj <- all(as.logical(adj_mat) == adj_mat)
1026
  if (is.null(weights) && !is_pure_adj) {
1027
    #assign weights to edges (without dependancy on igraph)
1028
    weights <- t(adj_mat)[t(adj_mat)!=0]
1029
    #remove zeroes from rows of matrix and return vector of length edges
1030
  }
1031
1032
  ##convert to python numpy.ndarray, then a list
1033
  adj_mat_py <- r_to_py(adj_mat)
1034
  adj_mat_py <- adj_mat_py$tolist()
1035
1036
  #convert graph structure to a Python compatible object
1037
  GraphClass <- if (!is.null(weights) && !is_pure_adj){
1038
    ig$Graph$Weighted_Adjacency
1039
  } else {
1040
    ig$Graph$Adjacency
1041
  }
1042
  snn_graph <- GraphClass(adj_mat_py)
1043
1044
  #compute partitions
1045
  partition_type <- match.arg(partition_type)
1046
  part <- switch(
1047
    EXPR = partition_type,
1048
    'RBConfigurationVertexPartition' = leidenalg$find_partition(
1049
      snn_graph,
1050
      leidenalg$RBConfigurationVertexPartition,
1051
      initial_membership = initial.membership, weights = weights,
1052
      resolution_parameter = resolution,
1053
      n_iterations = n.iter,
1054
      seed = seed.use
1055
    ),
1056
    'ModularityVertexPartition' = leidenalg$find_partition(
1057
      snn_graph,
1058
      leidenalg$ModularityVertexPartition,
1059
      initial_membership = initial.membership, weights = weights,
1060
      n_iterations = n.iter,
1061
      seed = seed.use
1062
    ),
1063
    'RBERVertexPartition' = leidenalg$find_partition(
1064
      snn_graph,
1065
      leidenalg$RBERVertexPartition,
1066
      initial_membership = initial.membership, weights = weights, node_sizes = node.sizes,
1067
      resolution_parameter = resolution_parameter,
1068
      n_iterations = n.iter,
1069
      seed = seed.use
1070
    ),
1071
    'CPMVertexPartition' = leidenalg$find_partition(
1072
      snn_graph,
1073
      leidenalg$CPMVertexPartition,
1074
      initial_membership = initial.membership, weights = weights, node_sizes = node.sizes,
1075
      resolution_parameter = resolution,
1076
      n_iterations = n.iter,
1077
      seed = seed.use
1078
    ),
1079
    'MutableVertexPartition' = leidenalg$find_partition(
1080
      snn_graph,
1081
      leidenalg$MutableVertexPartition,
1082
      initial_membership = initial.membership,
1083
      n_iterations = n.iter,
1084
      seed = seed.use
1085
    ),
1086
    'SignificanceVertexPartition' = leidenalg$find_partition(
1087
      snn_graph,
1088
      leidenalg$SignificanceVertexPartition,
1089
      initial_membership = initial.membership, node_sizes = node.sizes,
1090
      resolution_parameter = resolution,
1091
      n_iterations = n.iter,
1092
      seed = seed.use
1093
    ),
1094
    'SurpriseVertexPartition' = leidenalg$find_partition(
1095
      snn_graph,
1096
      leidenalg$SurpriseVertexPartition,
1097
      initial_membership = initial.membership, weights = weights, node_sizes = node.sizes,
1098
      n_iterations = n.iter,
1099
      seed = seed.use
1100
    ),
1101
    stop("please specify a partition type as a string out of those documented")
1102
  )
1103
  partition <- part$membership+1
1104
  idents <- partition
1105
1106
  if (min(table(idents)) == 1) {
1107
    idents <- assignSingletons(idents, SNN)
1108
  }
1109
  idents <- factor(idents)
1110
  names(idents) <- row.names(SNN)
1111
  return(idents)
1112
}
1113
1114
# Group single cells that make up their own cluster in with the cluster they are most connected to.
1115
#
1116
# @param idents  clustering result
1117
# @param SNN     SNN graph
1118
# @return        Returns scAI object with all singletons merged with most connected cluster
1119
1120
assignSingletons <- function(idents, SNN) {
1121
  # identify singletons
1122
  singletons <- c()
1123
  for (cluster in unique(idents)) {
1124
    if (length(which(idents %in% cluster)) == 1) {
1125
      singletons <- append(x = singletons, values = cluster)
1126
    }
1127
  }
1128
  #singletons = names(table(idents))[which(table(idents)==1)]
1129
  # calculate connectivity of singletons to other clusters, add singleton to cluster it is most connected to
1130
  cluster_names <- unique(x = idents)
1131
  cluster_names <- setdiff(x = cluster_names, y = singletons)
1132
  connectivity <- vector(mode="numeric", length = length(x = cluster_names))
1133
  names(x = connectivity) <- cluster_names
1134
  for (i in singletons) {
1135
    print(i)
1136
    for (j in cluster_names) {
1137
      subSNN = SNN[
1138
        which(idents %in% i),
1139
        which(idents %in% j)
1140
        ]
1141
      if (is.object(x = subSNN)) {
1142
        connectivity[j] <- sum(subSNN) / (nrow(x = subSNN) * ncol(x = subSNN))
1143
      } else {
1144
        connectivity[j] <- mean(x = subSNN)
1145
      }
1146
    }
1147
    m <- max(connectivity, na.rm = T)
1148
    mi <- which(x = connectivity == m, arr.ind = TRUE)
1149
    closest_cluster <- sample(x = names(x = connectivity[mi]), 1)
1150
    which(idents %in% i)[which(idents %in% i)] <- closest_cluster
1151
  }
1152
  if (length(x = singletons) > 0) {
1153
    message(paste(
1154
      length(x = singletons),
1155
      "singletons identified.",
1156
      length(x = unique(idents)),
1157
      "final clusters."
1158
    ))
1159
  }
1160
  return(idents)
1161
}
1162
1163
1164
#' Convert membership into an adjacent matrix
1165
#'
1166
#' @param memb Membership vector
1167
#'
1168
#' @return
1169
#' @export
1170
#'
1171
#' @examples
1172
clust2Mat <- function(memb) {
1173
  N <- length(memb)
1174
  return(as.numeric(outer(memb, memb, FUN = "==")) - outer(1:N, 1:N, "=="))
1175
}
1176
1177
1178
#' Identify markers in each cell cluster
1179
#'
1180
#' @param object scAI object
1181
#' @param assay Name of assay to be analyzed
1182
#' @param features a vector of features
1183
#' @param use.agg whether use the aggregated data
1184
#' @param test.use which test to use ("bimod" or "wilcox")
1185
#' @param thresh.pc Threshold of the percent of cells enriched in one cluster
1186
#' @param thresh.fc Threshold of Fold Change
1187
#' @param thresh.p Threshold of p-values
1188
#' @importFrom future nbrOfWorkers
1189
#' @importFrom pbapply pbsapply
1190
#' @importFrom future.apply future_sapply
1191
#' @importFrom stats sd wilcox.test
1192
#' @importFrom dplyr top_n slice
1193
#' @importFrom stats p.adjust
1194
#'
1195
#' @return
1196
#' @export
1197
#'
1198
#' @examples
1199
identifyClusterMarkers <- function(object, assay, features = NULL, use.agg = TRUE, test.use = "bimod",
1200
thresh.pc = 0.25, thresh.fc = 0.25, thresh.p = 0.01) {
1201
    if (assay == "RNA") {
1202
        X <- object@norm.data[[assay]]
1203
    } else {
1204
        if (use.agg) {
1205
            X <- object@agg.data
1206
        } else {
1207
            X <- object@norm.data[[assay]]
1208
        }
1209
    }
1210
    if (is.null(features)) {
1211
        features.use <- row.names(X)
1212
    } else {
1213
        features.use <- intersect(features, row.names(X))
1214
    }
1215
    data.use <- X[features.use,]
1216
    data.use <- as.matrix(data.use)
1217
1218
    groups <- object@identity
1219
    labels <- groups
1220
    level.use <- levels(labels)
1221
    numCluster <- length(level.use)
1222
1223
    my.sapply <- ifelse(
1224
    test = future::nbrOfWorkers() == 1,
1225
    yes = pbapply::pbsapply,
1226
    no = future.apply::future_sapply
1227
    )
1228
1229
1230
    mean.fxn <- function(x) {
1231
        return(log(x = mean(x = expm1(x = x)) + 1))
1232
    }
1233
    genes.de <- vector("list", length = numCluster)
1234
    for (i in 1:numCluster) {
1235
        features <- features.use
1236
        cell.use1 <- which(labels %in% level.use[i])
1237
        cell.use2 <- base::setdiff(1:length(labels), cell.use1)
1238
1239
# feature selection (based on percentages)
1240
    thresh.min <- 0
1241
    pct.1 <- round(
1242
      x = rowSums(x = data.use[features, cell.use1, drop = FALSE] > thresh.min) /
1243
        length(x = cell.use1),
1244
      digits = 3
1245
    )
1246
    pct.2 <- round(
1247
      x = rowSums(x = data.use[features, cell.use2, drop = FALSE] > thresh.min) /
1248
        length(x = cell.use2),
1249
      digits = 3
1250
    )
1251
    data.alpha <- cbind(pct.1, pct.2)
1252
    colnames(x = data.alpha) <- c("pct.1", "pct.2")
1253
    alpha.min <- apply(X = data.alpha, MARGIN = 1, FUN = max)
1254
    names(x = alpha.min) <- rownames(x = data.alpha)
1255
    features <- names(x = which(x = alpha.min > thresh.pc))
1256
    if (length(x = features) == 0) {
1257
      stop("No features pass thresh.pc threshold")
1258
    }
1259
1260
    # feature selection (based on average difference)
1261
    data.1 <- apply(X = data.use[features, cell.use1, drop = FALSE],MARGIN = 1,FUN = mean.fxn)
1262
    data.2 <- apply(X = data.use[features, cell.use2, drop = FALSE],MARGIN = 1,FUN = mean.fxn)
1263
    FC <- (data.1 - data.2)
1264
    features.diff <- names(x = which(x = FC > thresh.fc))
1265
    features <- intersect(x = features, y = features.diff)
1266
    if (length(x = features) == 0) {
1267
      stop("No features pass thresh.fc threshold")
1268
    }
1269
1270
    data1 <- data.use[features, cell.use1, drop = FALSE]
1271
    data2 <- data.use[features, cell.use2, drop = FALSE]
1272
1273
        if (test.use == "wilcox") {
1274
            pvalues <- unlist(
1275
            x = my.sapply(
1276
            X = 1:nrow(x = data1),
1277
            FUN = function(x) {
1278
                return(wilcox.test(data1[x, ], data2[x, ], alternative = "greater")$p.value)
1279
            }
1280
            )
1281
            )
1282
1283
        } else if (test.use == 'bimod') {
1284
            pvalues <- unlist(
1285
            x = my.sapply(
1286
            X = 1:nrow(x = data1),
1287
            FUN = function(x) {
1288
                return(DifferentialLRT(
1289
                x = as.numeric(x = data1[x,]),
1290
                y = as.numeric(x = data2[x,])
1291
                ))
1292
            }
1293
            )
1294
            )
1295
        }
1296
1297
        pval.adj = stats::p.adjust(
1298
        p = pvalues,
1299
        method = "bonferroni",
1300
        n = nrow(X)
1301
        )
1302
        genes.de[[i]] <- data.frame(clusters = level.use[i], features = as.character(rownames(data1)), pvalues = pvalues, pval_adj = pval.adj, logFC = FC[features], data.alpha[features,])
1303
    }
1304
1305
    markers.all <- data.frame()
1306
    for (i in 1:numCluster) {
1307
        gde <- genes.de[[i]]
1308
        gde <- gde[order(gde$pvalues, -gde$logFC), ]
1309
        gde <- subset(gde, subset = pvalues < thresh.p)
1310
        if (nrow(gde) > 0) {
1311
            markers.all <- rbind(markers.all, gde)
1312
        }
1313
    }
1314
    markers.all$features <- as.character(markers.all$features)
1315
    return(markers.all)
1316
}
1317
1318
# function to run mcdavid et al. DE test
1319
#' likelood ratio test
1320
#'
1321
#' @param x a vector
1322
#' @param y a vector
1323
#' @param xmin threshold for the values in the vector
1324
#'
1325
#'
1326
#' @importFrom stats pchisq
1327
DifferentialLRT <- function(x, y, xmin = 0) {
1328
  lrtX <- bimodLikData(x = x)
1329
  lrtY <- bimodLikData(x = y)
1330
  lrtZ <- bimodLikData(x = c(x, y))
1331
  lrt_diff <- 2 * (lrtX + lrtY - lrtZ)
1332
  return(pchisq(q = lrt_diff, df = 3, lower.tail = F))
1333
}
1334
1335
#' likelood ratio test
1336
#' @importFrom stats sd dnorm
1337
#' @param x a vector
1338
#' @param xmin threshold for the values in the vector
1339
1340
bimodLikData <- function(x, xmin = 0) {
1341
  x1 <- x[x <= xmin]
1342
  x2 <- x[x > xmin]
1343
  xal <- length(x = x2) / length(x = x)
1344
  xal[xal > 1 - 1e-5] <- 1 - 1e-5
1345
  xal[xal < 1e-5] <- 1e-5
1346
  likA <- length(x = x1) * log(x = 1 - xal)
1347
  if (length(x = x2) < 2) {
1348
    mysd <- 1
1349
  } else {
1350
    mysd <- sd(x = x2)
1351
  }
1352
  likB <- length(x = x2) *
1353
    log(x = xal) +
1354
    sum(dnorm(x = x2, mean = mean(x = x2), sd = mysd, log = TRUE))
1355
  return(likA + likB)
1356
}
1357
1358
1359
#' Infer regulatory relationships
1360
#'
1361
#' @param object scAI object
1362
#' @param gene.use genes to be inferred
1363
#' @param candinate_loci cadinate loci
1364
#' @param cutoff_H threshold of coefficient values
1365
#' @param cutoff the difference of correlation to be considered as significant
1366
#' @param thresh_corr threshold of correlation coefficient
1367
#'
1368
#' @return
1369
#' @export
1370
#'
1371
#' @examples
1372
inferRegulations <- function(object, gene.use, candinate_loci, cutoff_H = 0.5, cutoff = 0.1, thresh_corr = 0.2) {
1373
  H <- as.matrix(object@fit$H)
1374
  H <- sweep(H, 2, colSums(H), FUN = `/`)
1375
  K = nrow(H)
1376
  mH <- apply(H, 1, mean)
1377
  sH <- apply(H, 1, sd)
1378
  IndexH_record <- vector("list", K)
1379
  for (i in 1:K) {
1380
    IndexH_record[[i]] <- which(H[i, ] > mH[i] + cutoff_H * sH[i])
1381
  }
1382
  gene.use <- as.character(gene.use)
1383
  X1 <- object@norm.data[[1]]
1384
  X1 <- X1[gene.use, ]
1385
  X2a <- object@agg.data
1386
1387
  regulations <- vector("list", length(gene.use))
1388
  names(regulations) <- gene.use
1389
  for (i in 1:length(gene.use)) {
1390
    regulations_i = vector("list", K)
1391
    names(regulations_i) <- rownames(H)
1392
    for (j in 1:K) {
1393
      loci_j <- candinate_loci$markers.all$features[candinate_loci$markers.all$factors == j]
1394
      # compute the correlation between
1395
      x1 <- X1[i, ]
1396
      x2a <- X2a[loci_j, ]
1397
      cors1 <- cor(x1, t(x2a))
1398
1399
      # set the values of this gene and its candiate loci to be zero
1400
      X1_new <- X1
1401
      X2a_new <- X2a
1402
      X1_new[gene.use[i], IndexH_record[[j]]] <- 0
1403
      X2a_new[loci_j, IndexH_record[[j]]] <- 0
1404
      x1_new <- X1_new[gene.use[i], ]
1405
      x2a_new <- X2a_new[loci_j, ]
1406
      cors2 <- cor(x1_new, t(x2a))
1407
      cors3 <- cor(x1, t(x2a_new))
1408
      cors1[is.na(cors1)] <- 0
1409
      cors2[is.na(cors2)] <- 0
1410
      cors3[is.na(cors3)] <- 0
1411
      D <- rbind(cors1 - cors2, cors1 - cors3)
1412
      flag <- (rowSums(abs(D) > cutoff) > 0) & abs(cors1) > thresh_corr
1413
      regulations_i[[j]]$link <- as.character(loci_j[flag])
1414
      regulations_i[[j]]$intensity <- cors1[flag]
1415
    }
1416
    regulations[[i]] <- regulations_i
1417
  }
1418
  return(regulations)
1419
}
1420
1421
1422
1423
#' select highly variable features
1424
#'
1425
#' @param object scAI objecy
1426
#' @param assay Name of assay
1427
#' @param do.plot Whether showing plot
1428
#' @param do.text Whether adding feature names
1429
#' @param x.low.cutoff The minimum expression level
1430
#' @param x.high.cutoff The maximum expression level
1431
#' @param y.cutoff The minimum fano factor values
1432
#' @param y.high.cutoff The maximum fano factor values
1433
#' @param num.bin Number of bins
1434
#' @param pch.use Shape of dots in ggplot
1435
#' @param col.use Color of dots
1436
#' @param cex.text.use Size of text
1437
#'
1438
#' @return
1439
#' @export
1440
#'
1441
#' @examples
1442
#' @importFrom graphics smoothScatter text
1443
selectFeatures <- function(object, assay = "RNA", do.plot = TRUE, do.text = TRUE,
1444
x.low.cutoff = 0.01, x.high.cutoff = 3.5, y.cutoff = 0.5, y.high.cutoff = Inf,
1445
num.bin = 20, pch.use = 16, col.use = "black", cex.text.use = 0.5) {
1446
    # This function is modified from Seurat Package
1447
    data <- object@norm.data[[assay]]
1448
    genes.use <- rownames(data)
1449
1450
    gene.mean <- apply(X = data, MARGIN = 1, FUN = ExpMean)
1451
    gene.dispersion <- apply(X = data, MARGIN = 1, FUN = LogVMR)
1452
    gene.dispersion[is.na(x = gene.dispersion)] <- 0
1453
    gene.mean[is.na(x = gene.mean)] <- 0
1454
    names(x = gene.mean) <- genes.use
1455
    data_x_bin <- cut(x = gene.mean, breaks = num.bin)
1456
    names(x = data_x_bin) <- names(x = gene.mean)
1457
    mean_y <- tapply(X = gene.dispersion, INDEX = data_x_bin, FUN = mean)
1458
    sd_y <- tapply(X = gene.dispersion, INDEX = data_x_bin, FUN = sd)
1459
    gene.dispersion.scaled <- (gene.dispersion - mean_y[as.numeric(x = data_x_bin)])/sd_y[as.numeric(x = data_x_bin)]
1460
    gene.dispersion.scaled[is.na(x = gene.dispersion.scaled)] <- 0
1461
    names(x = gene.dispersion.scaled) <- names(x = gene.mean)
1462
1463
    mv.df <- data.frame(gene.mean, gene.dispersion, gene.dispersion.scaled)
1464
    rownames(x = mv.df) <- rownames(x = data)
1465
    hvg.info <- mv.df
1466
    names(x = gene.mean) <- names(x = gene.dispersion) <- names(x = gene.dispersion.scaled) <- rownames(hvg.info)
1467
    pass.cutoff <- names(x = gene.mean)[which(x = ((gene.mean > x.low.cutoff) & (gene.mean < x.high.cutoff)) & (gene.dispersion.scaled > y.cutoff) & (gene.dispersion.scaled < y.high.cutoff))]
1468
    if (do.plot) {
1469
        smoothScatter(x = gene.mean, y = gene.dispersion.scaled, pch = pch.use, cex = 0.5, col = col.use, xlab = "Average expression", ylab = "Dispersion", nrpoints = Inf)
1470
    }
1471
    if (do.text) {
1472
        text(x = gene.mean[pass.cutoff], y = gene.dispersion.scaled[pass.cutoff], labels = pass.cutoff, cex = cex.text.use)
1473
    }
1474
    hvg.info <- hvg.info[order(hvg.info$gene.dispersion, decreasing = TRUE), ]
1475
1476
    if (length(object@var.features) == 0) {
1477
        for (i in 1:length(object@raw.data)) {
1478
            object@var.features[[i]] <- vector("character")
1479
        }
1480
        names(object@var.features) <- names(object@raw.data)
1481
    }
1482
1483
    object@var.features[[assay]] <- pass.cutoff
1484
    return(object)
1485
1486
}
1487
1488
#' find chromsome regions of genes
1489
#'
1490
#' @param genes a given set of genes
1491
#' @param species mouse or human
1492
#' @importFrom biomaRt useMart getBM
1493
#' @export
1494
#'
1495
searchGeneRegions <- function(genes, species = "mouse") {
1496
    if (species == "mouse"){
1497
        mouse = biomaRt::useMart("ensembl", dataset = "mmusculus_gene_ensembl")
1498
        loci <- biomaRt::getBM(attributes = c( "mgi_symbol","chromosome_name",'start_position','end_position'), filters = "mgi_symbol", values = genes, mart = mouse)
1499
    } else{
1500
        human = biomaRt::useMart("ensembl", dataset = "hsapiens_gene_ensembl")
1501
        loci <- biomaRt::getBM(attributes = c( "hgnc_symbol","chromosome_name",'start_position','end_position'), filters = "hgnc_symbol", values = genes, mart = human)
1502
    }
1503
    return(loci)
1504
}
1505
1506
#' find the nearby loci in a given loci set for a given set of genes
1507
#'
1508
#' @param genes a given set of genes
1509
#' @param loci the background loci set
1510
#' @param width the width from TSS
1511
#' @param species mouse or human
1512
#' @importFrom bedr bedr.sort.region in.region
1513
#' @export
1514
#'
1515
1516
findNearbyLoci <- function(genes, loci, width = 50000, species = "mouse") {
1517
    gene.loci <- searchGeneRegions(genes, species)
1518
    gene.loci <- gene.loci[gene.loci$chromosome_name %in% c(1:22, "X","Y"), ]
1519
    gene.loci$start_position <- gene.loci$start_position-min(width, gene.loci$start_position)
1520
    gene.loci$end_position <- gene.loci$end_position + width
1521
    gene.loci.bed <- paste0("chr",gene.loci$chromosome_name, ":", gene.loci$start_position, "-",gene.loci$end_position)
1522
    gene.loci.bed.sort <- bedr::bedr.sort.region(gene.loci.bed);
1523
    a = grep(paste0("chr", c(1:22, "X","Y"), ":"), loci)
1524
    loci <- loci[grepl(paste(paste0("chr", c(1:22, "X","Y"), ":"), collapse="|"), loci)]
1525
1526
    loci.sort <- bedr::bedr.sort.region(loci);
1527
    is.region <- bedr::in.region(loci.sort, gene.loci.bed.sort);
1528
    loci.nearby <- loci.sort[is.region]
1529
    return(loci.nearby)
1530
}
1531
1532
#' Select number of the rank
1533
#'
1534
#' @param object scAI object
1535
#' @param rangeK A predefined range of K
1536
#'
1537
#' @return
1538
#' @export
1539
#'
1540
#' @examples
1541
selectK <- function(object, rangeK = c(2:15)) {
1542
  coph <- vector("double", length(rangeK))
1543
  i <- 0
1544
  for (k in rangeK) {
1545
    i <- i+1
1546
    outs <- run_scAI(object, K = k, nrun = object@options$paras$nrun,
1547
                     s = object@options$paras$s, alpha = object@options$paras$alpha, lambda = object@options$paras$lambda, gamma = object@options$paras$gamma,
1548
                     maxIter = object@options$paras$maxIter, stop_rule = object@options$paras$stop_rule)
1549
    CM <- outs@cluster$consensus
1550
    d1 <- dist(CM)
1551
    hc <- hclust(d1, method = "average")
1552
    d2 <- cophenetic(hc)
1553
    coph[i] <- cor(d1, d2)
1554
  }
1555
1556
  df <- data.frame(k = rangeK, Coph = coph)
1557
  gg <- ggplot(df, aes(x = k, y = Coph)) + geom_line(size=1) +
1558
    geom_point() +
1559
    theme_classic() + labs(x = 'K', y='Stability score (Coph)') +
1560
    scAI_theme_opts() + theme(text = element_text(size = 10)) + labs(x = 'K', y='Stability score (Coph)') +
1561
    theme(legend.position = "right", legend.title = NULL)
1562
  gg
1563
}
1564
1565
1566
1567
#' Reorder features according to the loading values
1568
#'
1569
#' @param W Basis matrix
1570
#' @param cutoff Threshold of the loading values
1571
#'
1572
#' @return
1573
#' @export
1574
#'
1575
#' @examples
1576
reorderFeatures <- function(W, cutoff = 0.5) {
1577
  M <- nrow(W)
1578
  K = ncol(W)
1579
  MW <- colMeans(W)
1580
  vw <- apply(W, 2, sd)
1581
  # order features
1582
  IndexW_record <- vector("list", K)
1583
  IndexW_record[[1]] <- which(W[, 1] > MW[1] + cutoff * VW[1])
1584
  n <- length(IndexW_record[[1]])
1585
  A <- c(1:M)
1586
  c <- match(A, IndexW_record[[1]])
1587
  A <- A[-c]
1588
  for (j in 2:K) {
1589
    IndexW_record[[j]] <- which(W[, j] > MW[j] + cutoff * VW[j])
1590
    s <- 0
1591
    for (k in 1:j - 1) {
1592
      s <- s + length(IndexW_record[[k]])
1593
    }
1594
    ir2 <- match(IndexW_record[[j]], 1:s)
1595
    IndexW_record[[j]] <- IndexW_record[[j]][-ir2]
1596
    n <- length(IndexW_record[[j]])
1597
    B <- union(1:s, IndexW_record[[j]])
1598
    A <- 1:M
1599
    c <- match(A, B)
1600
    A <- A[-c]
1601
    W0 <- W
1602
    W[s + 1:s + n, ] <- W0[IndexW_record[[j]], ]
1603
    W[s + n + 1:M, ] <- W0[A, ]
1604
  }
1605
  results <- list()
1606
  list[["W"]] <- W
1607
  list[["index_record"]] <- IndexW_record
1608
  return(results)
1609
}
1610
1611
#' Calculate the variance to mean ratio of logged values
1612
#'
1613
#' Calculate the variance to mean ratio (VMR) in non-logspace (return answer in
1614
#' log-space)
1615
#'
1616
#' @param x A vector of values
1617
#'
1618
#' @return Returns the VMR in log-space
1619
#'
1620
#' @importFrom stats var
1621
#'
1622
#' @export
1623
#'
1624
#' @examples
1625
#' LogVMR(x = c(1, 2, 3))
1626
#'
1627
LogVMR <- function(x) {
1628
    return(log(x = var(x = exp(x = x) - 1) / mean(x = exp(x = x) - 1)))
1629
}
1630
1631
#' Calculate the mean of logged values
1632
#'
1633
#' Calculate mean of logged values in non-log space (return answer in log-space)
1634
#'
1635
#' @param x A vector of values
1636
#'
1637
#' @return Returns the mean in log-space
1638
#'
1639
#' @export
1640
#'
1641
#' @examples
1642
#' ExpMean(x = c(1, 2, 3))
1643
#'
1644
ExpMean <- function(x) {
1645
    return(log(x = mean(x = exp(x = x) - 1) + 1))
1646
}
1647
1648
1649
1650
#' Convert a sparse matrix to a dense matrix
1651
#'
1652
#' @param mat A sparse matrix
1653
#' @export
1654
1655
as_matrix <- function(mat){
1656
  Rcpp::sourceCpp(code='
1657
#include <Rcpp.h>
1658
using namespace Rcpp;
1659
    // [[Rcpp::export]]
1660
    IntegerMatrix asMatrix(NumericVector rp,
1661
    NumericVector cp,
1662
    NumericVector z,
1663
    int nrows,
1664
    int ncols){
1665
1666
        int k = z.size() ;
1667
1668
        IntegerMatrix  mat(nrows, ncols);
1669
1670
        for (int i = 0; i < k; i++){
1671
            mat(rp[i],cp[i]) = z[i];
1672
        }
1673
1674
        return mat;
1675
    }
1676
    ' )
1677
1678
    row_pos <- mat@i
1679
    col_pos <- findInterval(seq(mat@x)-1,mat@p[-1])
1680
1681
    tmp <- asMatrix(rp = row_pos, cp = col_pos, z = mat@x,
1682
    nrows =  mat@Dim[1], ncols = mat@Dim[2])
1683
1684
    row.names(tmp) <- mat@Dimnames[[1]]
1685
    colnames(tmp) <- mat@Dimnames[[2]]
1686
    return(tmp)
1687
}
1688
1689
1690
1691