Diff of /R/getClustNum.R [000000] .. [494cbf]

Switch to unified view

a b/R/getClustNum.R
1
#' @name getClustNum
2
#' @title Get estimation of optimal clustering number
3
#' @description This function provides two measurements (i.e., clustering prediction index [CPI] and Gap-statistics) and aims to search the optimal number for multi-omics integrative clustering. In short, the peaks reach by the red (CPI) and blue (Gap-statistics) lines should be referred to determine `N.clust`.
4
#' @param data List of matrices.
5
#' @param is.binary A logicial vector to indicate if the subdata is binary matrix of 0 and 1 such as mutation.
6
#' @param try.N.clust A integer vector to indicate possible choices of number of clusters.
7
#' @param center A logical value to indicate if the variables should be centered. TRUE by default.
8
#' @param scale A logical value to indicate if the variables should be scaled. FALSE by default.
9
#' @param fig.path A string value to indicate the output figure path.
10
#' @param fig.name A string value to indicate the name of the figure.
11
#' @export
12
#' @return A figure that helps to choose the optimal clustering number (argument of `N.clust`) for `get%algorithm_name%()` or `getMOIC()`, and a list contains the following components:
13
#'
14
#'         \code{CPI}   possible cluster number identified by clustering prediction index
15
#'
16
#'         \code{Gapk}  possible cluster number identified by Gap-statistics
17
#' @import IntNMF
18
#' @import mogsa
19
#' @import SNFtool
20
#' @importFrom ggplot2 alpha
21
#' @importFrom dplyr %>%
22
#' @importFrom grDevices dev.copy2pdf
23
#' @examples # There is no example and please refer to vignette.
24
#' @references Chalise P, Fridley BL (2017). Integrative clustering of multi-level omic data based on non-negative matrix factorization algorithm. PLoS One, 12(5):e0176278.
25
#'
26
#'Tibshirani, R., Walther, G., Hastie, T. (2001). Estimating the number of data clusters via the Gap statistic. J R Stat Soc Series B Stat Methodol, 63(2):411-423.
27
getClustNum <- function(data        = NULL,
28
                        is.binary   = rep(FALSE, length(data)),
29
                        try.N.clust = 2:8,
30
                        center      = TRUE,
31
                        scale       = TRUE,
32
                        fig.path    = getwd(),
33
                        fig.name    = "optimal_number_cluster") {
34
35
  # check data
36
  n_dat <- length(data)
37
  if(n_dat > 6){
38
    stop('current verision of MOVICS can support up to 6 datasets.')
39
  }
40
  if(n_dat < 2){
41
    stop('current verision of MOVICS needs at least 2 omics data.')
42
  }
43
44
  data.backup <- data # save a backup
45
46
  #--------------------------------------------#
47
  # Cluster Prediction Index (CPI) from IntNMF #
48
  # remove features that made of categories not equal to 2 otherwise Error in svd(X) : a dimension is zero
49
  if(!all(!is.binary)) {
50
    bindex <- which(is.binary == TRUE)
51
    for (i in bindex) {
52
      a <- which(rowSums(data[[i]]) == 0)
53
      b <- which(rowSums(data[[i]]) == ncol(data[[i]]))
54
      if(length(a) > 0) {
55
        data[[i]] <- data[[i]][which(rowSums(data[[i]]) != 0),] # remove all zero
56
      }
57
58
      if(length(b) > 0) {
59
        data[[i]] <- data[[i]][which(rowSums(data[[i]]) != ncol(data[[i]])),] # remove all one
60
      }
61
62
      if(length(a) + length(b) > 0) {
63
        message(paste0("--", names(data)[i],": a total of ",length(a) + length(b), " features were removed due to the categories were not equal to 2!"))
64
      }
65
    }
66
  }
67
68
  # In order to make the input data fit non-negativity constraint of intNMF,
69
  # the values of the data were shifted to positive direction by adding absolute value of the smallest negative number.
70
  # Further, each data was rescaled by dividing by maximum value of the data to make the magnitudes comparable (between 0 and 1) across the several datasets.
71
  dat <- lapply(data, function (dd){
72
    if (!all(dd >= 0)) dd <- pmax(dd + abs(min(dd)), 0) + .Machine$double.eps # .Machine$double.eps as The smallest positive floating-point number x
73
    dd <- dd/max(dd)
74
    return(dd %>% as.matrix)
75
  })
76
77
  #dat <- lapply(dat, t)
78
  dat <- lapply(dat, function(x) t(x) + .Machine$double.eps)
79
80
  message("calculating Cluster Prediction Index...")
81
  optk1 <- IntNMF::nmf.opt.k(dat      = dat,
82
                            n.runs    = 5,
83
                            n.fold    = 5,
84
                            k.range   = try.N.clust,
85
                            st.count  = 10,
86
                            maxiter   = 100,
87
                            make.plot = FALSE)
88
  optk1 <- as.data.frame(optk1)
89
90
  #-------------------------------#
91
  # Gap-statistics from MoCluster #
92
  message("calculating Gap-statistics...")
93
  moas <- data.backup %>% mogsa::mbpca(ncomp      = 2,
94
                                       k          = "all",
95
                                       method     = "globalScore",
96
                                       center     = center,
97
                                       scale      = scale,
98
                                       moa        = TRUE,
99
                                       svd.solver = "fast",
100
                                       maxiter    = 1000,
101
                                       verbose    = FALSE)
102
  gap <- mogsa::moGap(moas, K.max = max(try.N.clust), cluster = "hclust", plot = FALSE)
103
  optk2 <- as.data.frame(gap$Tab)[-1,] # remove k=1
104
105
  #---------------------#
106
  # Eigen-gaps from SNF #
107
  # message("Calculating Eigen-gaps...")
108
  # data <- lapply(data.backup, t)
109
  # dat <- lapply(data, function (dd){
110
  #   dd <- dd %>% as.matrix
111
  #   W <- dd %>% SNFtool::dist2(dd) %>% SNFtool::affinityMatrix(K = 30, sigma = 0.5)
112
  # })
113
  # W <-  SNFtool::SNF(Wall = dat,
114
  #                    K = 30,
115
  #                    t = 20)
116
  # optk3 <- SNFtool::estimateNumberOfClustersGivenGraph(W, NUMC = try.N.clust)
117
118
  # nemo.ag <- nemo.affinity.graph(data.backup, k = 20)
119
  # optk4 <- nemo.num.clusters(nemo.ag, NUMC = try.N.clust)
120
121
  #---------------------------#
122
  # calculate optimal N.clust #
123
  # N.clust <- as.numeric(names(which(table(c(as.numeric((which.max(apply(optk1, 1, mean)) + 1)),
124
  #                                           as.numeric((which.max(optk2$gap) + 1)),
125
  #                                           optk3$`Eigen-gap best`)) >= 2)))
126
127
  N.clust <- as.numeric(which.max(apply(optk1, 1, mean) + optk2$gap)) + 1
128
  if(length(N.clust) == 0) {
129
    message("--fail to define the optimal cluster number!")
130
    N.clust <- "null"
131
  }
132
133
  #---------------#
134
  # Visualization #
135
  outFig <- paste0(fig.name,".pdf")
136
  par(bty="o", mgp = c(1.9,.33,0), mar=c(3.1,3.1,2.1,3.1)+.1, las=1, tcl=-.25)
137
  plot(NULL, NULL,
138
       xlim = c(min(try.N.clust),max(try.N.clust)),
139
       #ylim = c(min(optk1), max(optk1)),
140
       ylim = c(0,1),
141
       xlab = "Number of Multi-Omics Clusters",ylab = "")
142
  rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4],col = "#EAE9E9",border = FALSE)
143
  grid(col = "white", lty = 1, lwd = 1.5)
144
  # for (m in 1:n.runs) points(try.N.clust, optk1[, m], pch = 20, cex = 1.5, col = "#224A8D")
145
  points(try.N.clust, apply(optk1, 1, mean), pch = 19, col = ggplot2::alpha("#224A8D"), cex = 1.5)
146
  lines(try.N.clust, apply(optk1, 1, mean), col = "#224A8D", lwd = 2, lty = 4)
147
  mtext("Cluster Prediction Index", side = 2, line = 2, cex = 1.5, col = "#224A8D", las = 3)
148
149
  par(new = TRUE, xpd = FALSE)
150
  plot(NULL,NULL,
151
       xlim = c(min(try.N.clust),max(try.N.clust)),
152
       #ylim = c(min(optk2$gap), max(optk2$gap)),
153
       ylim = c(0,1),
154
       xlab = "",ylab = "",xaxt = "n",yaxt = "n")
155
  points(try.N.clust, optk2$gap, pch = 19, col = ggplot2::alpha("#E51718",0.8), cex = 1.5)
156
  lines(try.N.clust, optk2$gap, col = "#E51718", lwd = 2, lty = 4)
157
  #axis(side = 4, at = pretty(range(optk2$gap), n = 6))
158
  axis(side = 4, at = seq(0,1,0.2), labels = c("0.0","0.2","0.4","0.6","0.8","1.0"))
159
  mtext("Gap-statistics", side = 4, line = 2,las = 3, cex = 1.5, col = "#E51718")
160
161
  # abline(v = optk3$`Eigen-gap best`, col = "#008B8A", lwd = 2, lty = 4)
162
  # text(optk3$`Eigen-gap best`, par("usr")[3] + 0.1,"Eigen-gaps", cex = 1.5, col = "#008B8A", adj = -0.05)
163
  invisible(dev.copy2pdf(file = file.path(fig.path, outFig), width = 5.5, height = 5))
164
165
  message("visualization done...")
166
  if(N.clust > 1) {
167
    message(paste0("--the imputed optimal cluster number is ", N.clust, " arbitrarily, but it would be better referring to other priori knowledge."))
168
  }
169
  #return(list(N.clust = N.clust, CPI = optk1, Gapk = optk2, Eigen = optk3))
170
  return(list(N.clust = N.clust, CPI = optk1, Gapk = optk2))
171
}