[d79ff0]: / R / tuneCluster.spca.R

Download this file

169 lines (141 with data), 6.8 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
#' Feature Selection Optimization for sPCA method
#'
#' This function identify the number of feautures to keep per component and thus by cluster in \code{mixOmics::spca}
#' by optimizing the silhouette coefficient, which assesses the quality of clustering.
#'
#' @param X numeric matrix (or data.frame) with features in columns and samples in rows
#' @param ncomp integer, number of component to include in the model
#' @param test.keepX vector of integer containing the different value of keepX to test for block \code{X}.
#' @param ... other parameters to be included in the spls model (see \code{mixOmics::spca})
#'
#' @return
#' \item{silhouette}{silhouette coef. computed for every combinasion of keepX/keepY}
#' \item{ncomp}{number of component included in the model}
#' \item{test.keepX}{list of tested keepX}
#' \item{block}{names of blocks}
#' \item{slopes}{"slopes" computed from the silhouette coef. for each keepX and keepY, used to determine the best keepX and keepY}
#' \item{choice.keepX}{best \code{keepX} for each component}
#'
#' @details
#' For each component and for each keepX value, a spls is done from these parameters.
#' Then the clustering is performed and the silhouette coefficient is calculated for this clustering.
#'
#' We then calculate "slopes" where keepX are the coordinates and the silhouette is the intensity.
#' A z-score is assigned to each slope.
#' We then identify the most significant slope which indicates a drop in the silhouette coefficient and thus a deterioration of the clustering.
#'
#'
#' @examples
#' demo <- suppressWarnings(get_demo_cluster())
#' X <- demo$X
#'
#' # tuning
#' tune.spca.res <- tuneCluster.spca(X = X, ncomp = 2, test.keepX = c(2:10))
#' keepX <- tune.spca.res$choice.keepX
#' plot(tune.spca.res)
#'
#' # final model
#' spca.res <- mixOmics::spca(X=X, ncomp = 2, keepX = keepX)
#' plotLong(spca.res)
#' @import mixOmics
#' @importFrom dplyr left_join
#' @importFrom dplyr mutate
#' @importFrom dplyr filter
#' @export
tuneCluster.spca <- function(X, ncomp = 2, test.keepX = rep(ncol(X), ncomp), ...){
#-- checking input parameters ---------------------------------------------#
#--------------------------------------------------------------------------#
#-- X
X <- validate_matrix_X(X)
#-- ncomp
ncomp <- validate_ncomp(ncomp, list(X))
#-- test.keepX
test.keepX <- validate_test_keepX(test.keepX = test.keepX, X = X)
min.test.keepX <- rep(min(test.keepX), ncomp)
#-- launch tuning --------------------------------------------------------#
#--------------------------------------------------------------------------#
#-- 0. set output object
result <- as.data.frame(matrix(ncol = 4, nrow = length(test.keepX)*ncomp))
colnames(result) <- c("comp", "X", "pos", "neg")
result.index <- 0
#-- 1. compute dissimilarity matrix for silhouette coef. (once and for all)
dmatrix <- dmatrix.spearman.dissimilarity(X)
cluster <- as.data.frame(list("feature" = rownames(dmatrix)))
#-- tuning
for(comp in 1:ncomp){
tmp.keepX <- min.test.keepX # foreach comp, keepX of other comp is set to minimum
for(keepX in test.keepX){
tmp.keepX[comp] <- keepX
#-- 2. run spca
kX = tmp.keepX
spca.res <- mixOmics::spca(X = X, ncomp = ncomp, keepX = kX)
#-- 3. extract clusters
tmp.cluster <- getCluster(spca.res)
tmp.cluster <- suppressWarnings(dplyr::left_join(cluster, tmp.cluster[c(1,4)],
by = c("feature"="molecule"))) %>%
dplyr::mutate(cluster = as.numeric(as.character(cluster))) %>%
dplyr::mutate(cluster = ifelse(is.na(cluster), 0, cluster))
#-- 4. compute silhouette
sil <- silhouette(dmatrix, tmp.cluster$cluster)
#-- 6. store
result.index <- result.index + 1
result[result.index, "comp"] <- comp
result[result.index, "X"] <- kX[comp]
pos.res <- sil$average.cluster %>%
dplyr::filter(cluster == comp) %>%
dplyr::pull(silhouette.coef)
result[result.index, "pos"] <- ifelse(length(pos.res) == 0, NA, pos.res)
neg.res <- sil$average.cluster %>%
dplyr::filter(cluster == -comp) %>%
dplyr::pull(silhouette.coef)
result[result.index, "neg"] <- ifelse(length(neg.res) == 0, NA, neg.res)
}
}
result <- list("silhouette" = result)
result[["ncomp"]] <- ncomp
result[["test.keepX"]] <- test.keepX
result[["block"]] <- c("X")
class(result) <- "spca.tune.silhouette"
#-- 7. choice.keepX
result[["slopes"]] <- tune.silhouette.get_slopes(result)
tmp <- tune.silhouette.get_choice_keepX(result) # choice keepX/keepY
result[["choice.keepX"]] <- unlist(lapply(tmp, function(x) x$X))
return(result)
}
#' @import mixOmics
#' @importFrom dplyr left_join mutate filter group_by top_n
#' @import ggplot2
#' @importFrom tidyr pivot_longer
#' @importFrom ggrepel geom_label_repel
#' @export
plot.spca.tune.silhouette <- function(x, ...){
#-- checking input parameters ---------------------------------------------#
#--------------------------------------------------------------------------#
#-- should be a spca.tune.silhouette" object.
ncomp <- x$ncomp
test.keepX <- x$test.keepX
tmp <- x$silhouette %>%
tidyr::pivot_longer(names_to = "contrib", values_to = "value", -c(comp,X)) %>%
na.omit %>% # remove NA intruced in pos/neg
dplyr::mutate(comp = as.factor(paste0("Comp ", comp)),
contrib = factor(contrib, levels = c("pos","neg"))) %>%
dplyr::group_by(comp, contrib)
choice <- list(comp= as.factor(paste0("Comp ",names(x$choice.keepX))),
X = unname(x$choice.keepX)) %>%
as.data.frame() %>%
dplyr::left_join(tmp, by = c("comp"="comp", "X"="X")) %>%
dplyr::group_by(comp, X) %>%
dplyr::top_n(n = 1, wt = value)
choice.vline <- choice %>%
dplyr::select(c("comp", "X"))
ggplot.df <- ggplot(tmp, aes(x=X, y =value, col = comp)) +
geom_line(aes(lty = contrib)) + facet_wrap(~comp) +
theme_bw() +
geom_point(data = choice, size = 5, pch = 18) +
ggrepel::geom_label_repel(data = choice, aes(label = X), col = "black") +
scale_color_manual(values = mixOmics::color.mixo(1:x$ncomp)) +
labs(x ="tested keepX", y = "Silhouette Coef.", color = "Comp.", lty = "Contrib.") +
geom_vline(data = choice.vline, aes(xintercept = X), lty = 5, col = "grey")
print(ggplot.df)
return(invisible(ggplot.df))
}