MOVICS / Git / Diff of /R/getClustNum.R

Models:
AlyssaS/
MOVICS
Downloads: 1
Diff of /R/getClustNum.R [000000] .. [494cbf]
Switch to side-by-side view

--- a
+++ b/R/getClustNum.R
@@ -0,0 +1,171 @@
+#' @name getClustNum
+#' @title Get estimation of optimal clustering number
+#' @description This function provides two measurements (i.e., clustering prediction index [CPI] and Gap-statistics) and aims to search the optimal number for multi-omics integrative clustering. In short, the peaks reach by the red (CPI) and blue (Gap-statistics) lines should be referred to determine `N.clust`.
+#' @param data List of matrices.
+#' @param is.binary A logicial vector to indicate if the subdata is binary matrix of 0 and 1 such as mutation.
+#' @param try.N.clust A integer vector to indicate possible choices of number of clusters.
+#' @param center A logical value to indicate if the variables should be centered. TRUE by default.
+#' @param scale A logical value to indicate if the variables should be scaled. FALSE by default.
+#' @param fig.path A string value to indicate the output figure path.
+#' @param fig.name A string value to indicate the name of the figure.
+#' @export
+#' @return A figure that helps to choose the optimal clustering number (argument of `N.clust`) for `get%algorithm_name%()` or `getMOIC()`, and a list contains the following components:
+#'
+#'         \code{CPI}   possible cluster number identified by clustering prediction index
+#'
+#'         \code{Gapk}  possible cluster number identified by Gap-statistics
+#' @import IntNMF
+#' @import mogsa
+#' @import SNFtool
+#' @importFrom ggplot2 alpha
+#' @importFrom dplyr %>%
+#' @importFrom grDevices dev.copy2pdf
+#' @examples # There is no example and please refer to vignette.
+#' @references Chalise P, Fridley BL (2017). Integrative clustering of multi-level omic data based on non-negative matrix factorization algorithm. PLoS One, 12(5):e0176278.
+#'
+#'Tibshirani, R., Walther, G., Hastie, T. (2001). Estimating the number of data clusters via the Gap statistic. J R Stat Soc Series B Stat Methodol, 63(2):411-423.
+getClustNum <- function(data        = NULL,
+                        is.binary   = rep(FALSE, length(data)),
+                        try.N.clust = 2:8,
+                        center      = TRUE,
+                        scale       = TRUE,
+                        fig.path    = getwd(),
+                        fig.name    = "optimal_number_cluster") {
+
+  # check data
+  n_dat <- length(data)
+  if(n_dat > 6){
+    stop('current verision of MOVICS can support up to 6 datasets.')
+  }
+  if(n_dat < 2){
+    stop('current verision of MOVICS needs at least 2 omics data.')
+  }
+
+  data.backup <- data # save a backup
+
+  #--------------------------------------------#
+  # Cluster Prediction Index (CPI) from IntNMF #
+  # remove features that made of categories not equal to 2 otherwise Error in svd(X) : a dimension is zero
+  if(!all(!is.binary)) {
+    bindex <- which(is.binary == TRUE)
+    for (i in bindex) {
+      a <- which(rowSums(data[[i]]) == 0)
+      b <- which(rowSums(data[[i]]) == ncol(data[[i]]))
+      if(length(a) > 0) {
+        data[[i]] <- data[[i]][which(rowSums(data[[i]]) != 0),] # remove all zero
+      }
+
+      if(length(b) > 0) {
+        data[[i]] <- data[[i]][which(rowSums(data[[i]]) != ncol(data[[i]])),] # remove all one
+      }
+
+      if(length(a) + length(b) > 0) {
+        message(paste0("--", names(data)[i],": a total of ",length(a) + length(b), " features were removed due to the categories were not equal to 2!"))
+      }
+    }
+  }
+
+  # In order to make the input data fit non-negativity constraint of intNMF,
+  # the values of the data were shifted to positive direction by adding absolute value of the smallest negative number.
+  # Further, each data was rescaled by dividing by maximum value of the data to make the magnitudes comparable (between 0 and 1) across the several datasets.
+  dat <- lapply(data, function (dd){
+    if (!all(dd >= 0)) dd <- pmax(dd + abs(min(dd)), 0) + .Machine$double.eps # .Machine$double.eps as The smallest positive floating-point number x
+    dd <- dd/max(dd)
+    return(dd %>% as.matrix)
+  })
+
+  #dat <- lapply(dat, t)
+  dat <- lapply(dat, function(x) t(x) + .Machine$double.eps)
+
+  message("calculating Cluster Prediction Index...")
+  optk1 <- IntNMF::nmf.opt.k(dat      = dat,
+                            n.runs    = 5,
+                            n.fold    = 5,
+                            k.range   = try.N.clust,
+                            st.count  = 10,
+                            maxiter   = 100,
+                            make.plot = FALSE)
+  optk1 <- as.data.frame(optk1)
+
+  #-------------------------------#
+  # Gap-statistics from MoCluster #
+  message("calculating Gap-statistics...")
+  moas <- data.backup %>% mogsa::mbpca(ncomp      = 2,
+                                       k          = "all",
+                                       method     = "globalScore",
+                                       center     = center,
+                                       scale      = scale,
+                                       moa        = TRUE,
+                                       svd.solver = "fast",
+                                       maxiter    = 1000,
+                                       verbose    = FALSE)
+  gap <- mogsa::moGap(moas, K.max = max(try.N.clust), cluster = "hclust", plot = FALSE)
+  optk2 <- as.data.frame(gap$Tab)[-1,] # remove k=1
+
+  #---------------------#
+  # Eigen-gaps from SNF #
+  # message("Calculating Eigen-gaps...")
+  # data <- lapply(data.backup, t)
+  # dat <- lapply(data, function (dd){
+  #   dd <- dd %>% as.matrix
+  #   W <- dd %>% SNFtool::dist2(dd) %>% SNFtool::affinityMatrix(K = 30, sigma = 0.5)
+  # })
+  # W <-  SNFtool::SNF(Wall = dat,
+  #                    K = 30,
+  #                    t = 20)
+  # optk3 <- SNFtool::estimateNumberOfClustersGivenGraph(W, NUMC = try.N.clust)
+
+  # nemo.ag <- nemo.affinity.graph(data.backup, k = 20)
+  # optk4 <- nemo.num.clusters(nemo.ag, NUMC = try.N.clust)
+
+  #---------------------------#
+  # calculate optimal N.clust #
+  # N.clust <- as.numeric(names(which(table(c(as.numeric((which.max(apply(optk1, 1, mean)) + 1)),
+  #                                           as.numeric((which.max(optk2$gap) + 1)),
+  #                                           optk3$`Eigen-gap best`)) >= 2)))
+
+  N.clust <- as.numeric(which.max(apply(optk1, 1, mean) + optk2$gap)) + 1
+  if(length(N.clust) == 0) {
+    message("--fail to define the optimal cluster number!")
+    N.clust <- "null"
+  }
+
+  #---------------#
+  # Visualization #
+  outFig <- paste0(fig.name,".pdf")
+  par(bty="o", mgp = c(1.9,.33,0), mar=c(3.1,3.1,2.1,3.1)+.1, las=1, tcl=-.25)
+  plot(NULL, NULL,
+       xlim = c(min(try.N.clust),max(try.N.clust)),
+       #ylim = c(min(optk1), max(optk1)),
+       ylim = c(0,1),
+       xlab = "Number of Multi-Omics Clusters",ylab = "")
+  rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4],col = "#EAE9E9",border = FALSE)
+  grid(col = "white", lty = 1, lwd = 1.5)
+  # for (m in 1:n.runs) points(try.N.clust, optk1[, m], pch = 20, cex = 1.5, col = "#224A8D")
+  points(try.N.clust, apply(optk1, 1, mean), pch = 19, col = ggplot2::alpha("#224A8D"), cex = 1.5)
+  lines(try.N.clust, apply(optk1, 1, mean), col = "#224A8D", lwd = 2, lty = 4)
+  mtext("Cluster Prediction Index", side = 2, line = 2, cex = 1.5, col = "#224A8D", las = 3)
+
+  par(new = TRUE, xpd = FALSE)
+  plot(NULL,NULL,
+       xlim = c(min(try.N.clust),max(try.N.clust)),
+       #ylim = c(min(optk2$gap), max(optk2$gap)),
+       ylim = c(0,1),
+       xlab = "",ylab = "",xaxt = "n",yaxt = "n")
+  points(try.N.clust, optk2$gap, pch = 19, col = ggplot2::alpha("#E51718",0.8), cex = 1.5)
+  lines(try.N.clust, optk2$gap, col = "#E51718", lwd = 2, lty = 4)
+  #axis(side = 4, at = pretty(range(optk2$gap), n = 6))
+  axis(side = 4, at = seq(0,1,0.2), labels = c("0.0","0.2","0.4","0.6","0.8","1.0"))
+  mtext("Gap-statistics", side = 4, line = 2,las = 3, cex = 1.5, col = "#E51718")
+
+  # abline(v = optk3$`Eigen-gap best`, col = "#008B8A", lwd = 2, lty = 4)
+  # text(optk3$`Eigen-gap best`, par("usr")[3] + 0.1,"Eigen-gaps", cex = 1.5, col = "#008B8A", adj = -0.05)
+  invisible(dev.copy2pdf(file = file.path(fig.path, outFig), width = 5.5, height = 5))
+
+  message("visualization done...")
+  if(N.clust > 1) {
+    message(paste0("--the imputed optimal cluster number is ", N.clust, " arbitrarily, but it would be better referring to other priori knowledge."))
+  }
+  #return(list(N.clust = N.clust, CPI = optk1, Gapk = optk2, Eigen = optk3))
+  return(list(N.clust = N.clust, CPI = optk1, Gapk = optk2))
+}