[494cbf]: / R / getClustNum.R

Download this file

172 lines (154 with data), 8.4 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
#' @name getClustNum
#' @title Get estimation of optimal clustering number
#' @description This function provides two measurements (i.e., clustering prediction index [CPI] and Gap-statistics) and aims to search the optimal number for multi-omics integrative clustering. In short, the peaks reach by the red (CPI) and blue (Gap-statistics) lines should be referred to determine `N.clust`.
#' @param data List of matrices.
#' @param is.binary A logicial vector to indicate if the subdata is binary matrix of 0 and 1 such as mutation.
#' @param try.N.clust A integer vector to indicate possible choices of number of clusters.
#' @param center A logical value to indicate if the variables should be centered. TRUE by default.
#' @param scale A logical value to indicate if the variables should be scaled. FALSE by default.
#' @param fig.path A string value to indicate the output figure path.
#' @param fig.name A string value to indicate the name of the figure.
#' @export
#' @return A figure that helps to choose the optimal clustering number (argument of `N.clust`) for `get%algorithm_name%()` or `getMOIC()`, and a list contains the following components:
#'
#' \code{CPI} possible cluster number identified by clustering prediction index
#'
#' \code{Gapk} possible cluster number identified by Gap-statistics
#' @import IntNMF
#' @import mogsa
#' @import SNFtool
#' @importFrom ggplot2 alpha
#' @importFrom dplyr %>%
#' @importFrom grDevices dev.copy2pdf
#' @examples # There is no example and please refer to vignette.
#' @references Chalise P, Fridley BL (2017). Integrative clustering of multi-level omic data based on non-negative matrix factorization algorithm. PLoS One, 12(5):e0176278.
#'
#'Tibshirani, R., Walther, G., Hastie, T. (2001). Estimating the number of data clusters via the Gap statistic. J R Stat Soc Series B Stat Methodol, 63(2):411-423.
getClustNum <- function(data = NULL,
is.binary = rep(FALSE, length(data)),
try.N.clust = 2:8,
center = TRUE,
scale = TRUE,
fig.path = getwd(),
fig.name = "optimal_number_cluster") {
# check data
n_dat <- length(data)
if(n_dat > 6){
stop('current verision of MOVICS can support up to 6 datasets.')
}
if(n_dat < 2){
stop('current verision of MOVICS needs at least 2 omics data.')
}
data.backup <- data # save a backup
#--------------------------------------------#
# Cluster Prediction Index (CPI) from IntNMF #
# remove features that made of categories not equal to 2 otherwise Error in svd(X) : a dimension is zero
if(!all(!is.binary)) {
bindex <- which(is.binary == TRUE)
for (i in bindex) {
a <- which(rowSums(data[[i]]) == 0)
b <- which(rowSums(data[[i]]) == ncol(data[[i]]))
if(length(a) > 0) {
data[[i]] <- data[[i]][which(rowSums(data[[i]]) != 0),] # remove all zero
}
if(length(b) > 0) {
data[[i]] <- data[[i]][which(rowSums(data[[i]]) != ncol(data[[i]])),] # remove all one
}
if(length(a) + length(b) > 0) {
message(paste0("--", names(data)[i],": a total of ",length(a) + length(b), " features were removed due to the categories were not equal to 2!"))
}
}
}
# In order to make the input data fit non-negativity constraint of intNMF,
# the values of the data were shifted to positive direction by adding absolute value of the smallest negative number.
# Further, each data was rescaled by dividing by maximum value of the data to make the magnitudes comparable (between 0 and 1) across the several datasets.
dat <- lapply(data, function (dd){
if (!all(dd >= 0)) dd <- pmax(dd + abs(min(dd)), 0) + .Machine$double.eps # .Machine$double.eps as The smallest positive floating-point number x
dd <- dd/max(dd)
return(dd %>% as.matrix)
})
#dat <- lapply(dat, t)
dat <- lapply(dat, function(x) t(x) + .Machine$double.eps)
message("calculating Cluster Prediction Index...")
optk1 <- IntNMF::nmf.opt.k(dat = dat,
n.runs = 5,
n.fold = 5,
k.range = try.N.clust,
st.count = 10,
maxiter = 100,
make.plot = FALSE)
optk1 <- as.data.frame(optk1)
#-------------------------------#
# Gap-statistics from MoCluster #
message("calculating Gap-statistics...")
moas <- data.backup %>% mogsa::mbpca(ncomp = 2,
k = "all",
method = "globalScore",
center = center,
scale = scale,
moa = TRUE,
svd.solver = "fast",
maxiter = 1000,
verbose = FALSE)
gap <- mogsa::moGap(moas, K.max = max(try.N.clust), cluster = "hclust", plot = FALSE)
optk2 <- as.data.frame(gap$Tab)[-1,] # remove k=1
#---------------------#
# Eigen-gaps from SNF #
# message("Calculating Eigen-gaps...")
# data <- lapply(data.backup, t)
# dat <- lapply(data, function (dd){
# dd <- dd %>% as.matrix
# W <- dd %>% SNFtool::dist2(dd) %>% SNFtool::affinityMatrix(K = 30, sigma = 0.5)
# })
# W <- SNFtool::SNF(Wall = dat,
# K = 30,
# t = 20)
# optk3 <- SNFtool::estimateNumberOfClustersGivenGraph(W, NUMC = try.N.clust)
# nemo.ag <- nemo.affinity.graph(data.backup, k = 20)
# optk4 <- nemo.num.clusters(nemo.ag, NUMC = try.N.clust)
#---------------------------#
# calculate optimal N.clust #
# N.clust <- as.numeric(names(which(table(c(as.numeric((which.max(apply(optk1, 1, mean)) + 1)),
# as.numeric((which.max(optk2$gap) + 1)),
# optk3$`Eigen-gap best`)) >= 2)))
N.clust <- as.numeric(which.max(apply(optk1, 1, mean) + optk2$gap)) + 1
if(length(N.clust) == 0) {
message("--fail to define the optimal cluster number!")
N.clust <- "null"
}
#---------------#
# Visualization #
outFig <- paste0(fig.name,".pdf")
par(bty="o", mgp = c(1.9,.33,0), mar=c(3.1,3.1,2.1,3.1)+.1, las=1, tcl=-.25)
plot(NULL, NULL,
xlim = c(min(try.N.clust),max(try.N.clust)),
#ylim = c(min(optk1), max(optk1)),
ylim = c(0,1),
xlab = "Number of Multi-Omics Clusters",ylab = "")
rect(par("usr")[1],par("usr")[3],par("usr")[2],par("usr")[4],col = "#EAE9E9",border = FALSE)
grid(col = "white", lty = 1, lwd = 1.5)
# for (m in 1:n.runs) points(try.N.clust, optk1[, m], pch = 20, cex = 1.5, col = "#224A8D")
points(try.N.clust, apply(optk1, 1, mean), pch = 19, col = ggplot2::alpha("#224A8D"), cex = 1.5)
lines(try.N.clust, apply(optk1, 1, mean), col = "#224A8D", lwd = 2, lty = 4)
mtext("Cluster Prediction Index", side = 2, line = 2, cex = 1.5, col = "#224A8D", las = 3)
par(new = TRUE, xpd = FALSE)
plot(NULL,NULL,
xlim = c(min(try.N.clust),max(try.N.clust)),
#ylim = c(min(optk2$gap), max(optk2$gap)),
ylim = c(0,1),
xlab = "",ylab = "",xaxt = "n",yaxt = "n")
points(try.N.clust, optk2$gap, pch = 19, col = ggplot2::alpha("#E51718",0.8), cex = 1.5)
lines(try.N.clust, optk2$gap, col = "#E51718", lwd = 2, lty = 4)
#axis(side = 4, at = pretty(range(optk2$gap), n = 6))
axis(side = 4, at = seq(0,1,0.2), labels = c("0.0","0.2","0.4","0.6","0.8","1.0"))
mtext("Gap-statistics", side = 4, line = 2,las = 3, cex = 1.5, col = "#E51718")
# abline(v = optk3$`Eigen-gap best`, col = "#008B8A", lwd = 2, lty = 4)
# text(optk3$`Eigen-gap best`, par("usr")[3] + 0.1,"Eigen-gaps", cex = 1.5, col = "#008B8A", adj = -0.05)
invisible(dev.copy2pdf(file = file.path(fig.path, outFig), width = 5.5, height = 5))
message("visualization done...")
if(N.clust > 1) {
message(paste0("--the imputed optimal cluster number is ", N.clust, " arbitrarily, but it would be better referring to other priori knowledge."))
}
#return(list(N.clust = N.clust, CPI = optk1, Gapk = optk2, Eigen = optk3))
return(list(N.clust = N.clust, CPI = optk1, Gapk = optk2))
}