|
a |
|
b/R/tuneCluster.spca.R |
|
|
1 |
#' Feature Selection Optimization for sPCA method |
|
|
2 |
#' |
|
|
3 |
#' This function identify the number of feautures to keep per component and thus by cluster in \code{mixOmics::spca} |
|
|
4 |
#' by optimizing the silhouette coefficient, which assesses the quality of clustering. |
|
|
5 |
#' |
|
|
6 |
#' @param X numeric matrix (or data.frame) with features in columns and samples in rows |
|
|
7 |
#' @param ncomp integer, number of component to include in the model |
|
|
8 |
#' @param test.keepX vector of integer containing the different value of keepX to test for block \code{X}. |
|
|
9 |
#' @param ... other parameters to be included in the spls model (see \code{mixOmics::spca}) |
|
|
10 |
#' |
|
|
11 |
#' @return |
|
|
12 |
#' \item{silhouette}{silhouette coef. computed for every combinasion of keepX/keepY} |
|
|
13 |
#' \item{ncomp}{number of component included in the model} |
|
|
14 |
#' \item{test.keepX}{list of tested keepX} |
|
|
15 |
#' \item{block}{names of blocks} |
|
|
16 |
#' \item{slopes}{"slopes" computed from the silhouette coef. for each keepX and keepY, used to determine the best keepX and keepY} |
|
|
17 |
#' \item{choice.keepX}{best \code{keepX} for each component} |
|
|
18 |
#' |
|
|
19 |
#' @details |
|
|
20 |
#' For each component and for each keepX value, a spls is done from these parameters. |
|
|
21 |
#' Then the clustering is performed and the silhouette coefficient is calculated for this clustering. |
|
|
22 |
#' |
|
|
23 |
#' We then calculate "slopes" where keepX are the coordinates and the silhouette is the intensity. |
|
|
24 |
#' A z-score is assigned to each slope. |
|
|
25 |
#' We then identify the most significant slope which indicates a drop in the silhouette coefficient and thus a deterioration of the clustering. |
|
|
26 |
#' |
|
|
27 |
#' |
|
|
28 |
#' @examples |
|
|
29 |
#' demo <- suppressWarnings(get_demo_cluster()) |
|
|
30 |
#' X <- demo$X |
|
|
31 |
#' |
|
|
32 |
#' # tuning |
|
|
33 |
#' tune.spca.res <- tuneCluster.spca(X = X, ncomp = 2, test.keepX = c(2:10)) |
|
|
34 |
#' keepX <- tune.spca.res$choice.keepX |
|
|
35 |
#' plot(tune.spca.res) |
|
|
36 |
#' |
|
|
37 |
#' # final model |
|
|
38 |
#' spca.res <- mixOmics::spca(X=X, ncomp = 2, keepX = keepX) |
|
|
39 |
#' plotLong(spca.res) |
|
|
40 |
|
|
|
41 |
|
|
|
42 |
#' @import mixOmics |
|
|
43 |
#' @importFrom dplyr left_join |
|
|
44 |
#' @importFrom dplyr mutate |
|
|
45 |
#' @importFrom dplyr filter |
|
|
46 |
#' @export |
|
|
47 |
tuneCluster.spca <- function(X, ncomp = 2, test.keepX = rep(ncol(X), ncomp), ...){ |
|
|
48 |
#-- checking input parameters ---------------------------------------------# |
|
|
49 |
#--------------------------------------------------------------------------# |
|
|
50 |
|
|
|
51 |
#-- X |
|
|
52 |
X <- validate_matrix_X(X) |
|
|
53 |
|
|
|
54 |
#-- ncomp |
|
|
55 |
ncomp <- validate_ncomp(ncomp, list(X)) |
|
|
56 |
|
|
|
57 |
#-- test.keepX |
|
|
58 |
test.keepX <- validate_test_keepX(test.keepX = test.keepX, X = X) |
|
|
59 |
min.test.keepX <- rep(min(test.keepX), ncomp) |
|
|
60 |
|
|
|
61 |
|
|
|
62 |
#-- launch tuning --------------------------------------------------------# |
|
|
63 |
#--------------------------------------------------------------------------# |
|
|
64 |
|
|
|
65 |
#-- 0. set output object |
|
|
66 |
result <- as.data.frame(matrix(ncol = 4, nrow = length(test.keepX)*ncomp)) |
|
|
67 |
colnames(result) <- c("comp", "X", "pos", "neg") |
|
|
68 |
result.index <- 0 |
|
|
69 |
|
|
|
70 |
#-- 1. compute dissimilarity matrix for silhouette coef. (once and for all) |
|
|
71 |
dmatrix <- dmatrix.spearman.dissimilarity(X) |
|
|
72 |
cluster <- as.data.frame(list("feature" = rownames(dmatrix))) |
|
|
73 |
|
|
|
74 |
#-- tuning |
|
|
75 |
for(comp in 1:ncomp){ |
|
|
76 |
tmp.keepX <- min.test.keepX # foreach comp, keepX of other comp is set to minimum |
|
|
77 |
for(keepX in test.keepX){ |
|
|
78 |
tmp.keepX[comp] <- keepX |
|
|
79 |
|
|
|
80 |
#-- 2. run spca |
|
|
81 |
kX = tmp.keepX |
|
|
82 |
spca.res <- mixOmics::spca(X = X, ncomp = ncomp, keepX = kX) |
|
|
83 |
|
|
|
84 |
#-- 3. extract clusters |
|
|
85 |
tmp.cluster <- getCluster(spca.res) |
|
|
86 |
tmp.cluster <- suppressWarnings(dplyr::left_join(cluster, tmp.cluster[c(1,4)], |
|
|
87 |
by = c("feature"="molecule"))) %>% |
|
|
88 |
dplyr::mutate(cluster = as.numeric(as.character(cluster))) %>% |
|
|
89 |
dplyr::mutate(cluster = ifelse(is.na(cluster), 0, cluster)) |
|
|
90 |
|
|
|
91 |
#-- 4. compute silhouette |
|
|
92 |
sil <- silhouette(dmatrix, tmp.cluster$cluster) |
|
|
93 |
|
|
|
94 |
#-- 6. store |
|
|
95 |
result.index <- result.index + 1 |
|
|
96 |
result[result.index, "comp"] <- comp |
|
|
97 |
result[result.index, "X"] <- kX[comp] |
|
|
98 |
|
|
|
99 |
pos.res <- sil$average.cluster %>% |
|
|
100 |
dplyr::filter(cluster == comp) %>% |
|
|
101 |
dplyr::pull(silhouette.coef) |
|
|
102 |
result[result.index, "pos"] <- ifelse(length(pos.res) == 0, NA, pos.res) |
|
|
103 |
neg.res <- sil$average.cluster %>% |
|
|
104 |
dplyr::filter(cluster == -comp) %>% |
|
|
105 |
dplyr::pull(silhouette.coef) |
|
|
106 |
result[result.index, "neg"] <- ifelse(length(neg.res) == 0, NA, neg.res) |
|
|
107 |
} |
|
|
108 |
} |
|
|
109 |
result <- list("silhouette" = result) |
|
|
110 |
result[["ncomp"]] <- ncomp |
|
|
111 |
result[["test.keepX"]] <- test.keepX |
|
|
112 |
result[["block"]] <- c("X") |
|
|
113 |
class(result) <- "spca.tune.silhouette" |
|
|
114 |
|
|
|
115 |
#-- 7. choice.keepX |
|
|
116 |
result[["slopes"]] <- tune.silhouette.get_slopes(result) |
|
|
117 |
tmp <- tune.silhouette.get_choice_keepX(result) # choice keepX/keepY |
|
|
118 |
result[["choice.keepX"]] <- unlist(lapply(tmp, function(x) x$X)) |
|
|
119 |
return(result) |
|
|
120 |
} |
|
|
121 |
|
|
|
122 |
|
|
|
123 |
|
|
|
124 |
|
|
|
125 |
#' @import mixOmics |
|
|
126 |
#' @importFrom dplyr left_join mutate filter group_by top_n |
|
|
127 |
#' @import ggplot2 |
|
|
128 |
#' @importFrom tidyr pivot_longer |
|
|
129 |
#' @importFrom ggrepel geom_label_repel |
|
|
130 |
#' @export |
|
|
131 |
plot.spca.tune.silhouette <- function(x, ...){ |
|
|
132 |
#-- checking input parameters ---------------------------------------------# |
|
|
133 |
#--------------------------------------------------------------------------# |
|
|
134 |
|
|
|
135 |
#-- should be a spca.tune.silhouette" object. |
|
|
136 |
ncomp <- x$ncomp |
|
|
137 |
test.keepX <- x$test.keepX |
|
|
138 |
|
|
|
139 |
tmp <- x$silhouette %>% |
|
|
140 |
tidyr::pivot_longer(names_to = "contrib", values_to = "value", -c(comp,X)) %>% |
|
|
141 |
na.omit %>% # remove NA intruced in pos/neg |
|
|
142 |
dplyr::mutate(comp = as.factor(paste0("Comp ", comp)), |
|
|
143 |
contrib = factor(contrib, levels = c("pos","neg"))) %>% |
|
|
144 |
dplyr::group_by(comp, contrib) |
|
|
145 |
|
|
|
146 |
choice <- list(comp= as.factor(paste0("Comp ",names(x$choice.keepX))), |
|
|
147 |
X = unname(x$choice.keepX)) %>% |
|
|
148 |
as.data.frame() %>% |
|
|
149 |
dplyr::left_join(tmp, by = c("comp"="comp", "X"="X")) %>% |
|
|
150 |
dplyr::group_by(comp, X) %>% |
|
|
151 |
dplyr::top_n(n = 1, wt = value) |
|
|
152 |
|
|
|
153 |
choice.vline <- choice %>% |
|
|
154 |
dplyr::select(c("comp", "X")) |
|
|
155 |
|
|
|
156 |
ggplot.df <- ggplot(tmp, aes(x=X, y =value, col = comp)) + |
|
|
157 |
geom_line(aes(lty = contrib)) + facet_wrap(~comp) + |
|
|
158 |
theme_bw() + |
|
|
159 |
geom_point(data = choice, size = 5, pch = 18) + |
|
|
160 |
ggrepel::geom_label_repel(data = choice, aes(label = X), col = "black") + |
|
|
161 |
scale_color_manual(values = mixOmics::color.mixo(1:x$ncomp)) + |
|
|
162 |
labs(x ="tested keepX", y = "Silhouette Coef.", color = "Comp.", lty = "Contrib.") + |
|
|
163 |
geom_vline(data = choice.vline, aes(xintercept = X), lty = 5, col = "grey") |
|
|
164 |
|
|
|
165 |
|
|
|
166 |
print(ggplot.df) |
|
|
167 |
|
|
|
168 |
return(invisible(ggplot.df)) |
|
|
169 |
} |