|
a |
|
b/R/tuneCluster.spls.R |
|
|
1 |
#' Feature Selection Optimization for sPLS method |
|
|
2 |
#' |
|
|
3 |
#' This function identify the number of feautures to keep per component and thus by cluster in \code{mixOmics::spls} by optimizing the silhouette coefficient, which assesses the quality of clustering. |
|
|
4 |
#' |
|
|
5 |
#' @param X numeric matrix (or data.frame) with features in columns and samples in rows |
|
|
6 |
#' @param Y numeric matrix (or data.frame) with features in columns and samples in rows (same rows as \code{X}) |
|
|
7 |
#' @param ncomp integer, number of component to include in the model |
|
|
8 |
#' @param test.keepX vector of integer containing the different value of keepX to test for block \code{X}. |
|
|
9 |
#' @param test.keepY vector of integer containing the different value of keepY to test for block \code{Y}. |
|
|
10 |
#' @param ... other parameters to be included in the spls model (see \code{mixOmics::spls}) |
|
|
11 |
#' |
|
|
12 |
#' @return |
|
|
13 |
#' \item{silhouette}{silhouette coef. computed for every combinasion of keepX/keepY} |
|
|
14 |
#' \item{ncomp}{number of component included in the model} |
|
|
15 |
#' \item{test.keepX}{list of tested keepX} |
|
|
16 |
#' \item{test.keepY}{list of tested keepY} |
|
|
17 |
#' \item{block}{names of blocks} |
|
|
18 |
#' \item{slopes}{"slopes" computed from the silhouette coef. for each keepX and keepY, used to determine the best keepX and keepY} |
|
|
19 |
#' \item{choice.keepX}{best \code{keepX} for each component} |
|
|
20 |
#' \item{choice.keepY}{best \code{keepY} for each component} |
|
|
21 |
#' |
|
|
22 |
#' |
|
|
23 |
#' @details |
|
|
24 |
#' For each component and for each keepX/keepY value, a spls is done from these parameters. |
|
|
25 |
#' Then the clustering is performed and the silhouette coefficient is calculated for this clustering. |
|
|
26 |
#' |
|
|
27 |
#' We then calculate "slopes" where keepX/keepY are the coordinates and the silhouette is the intensity. |
|
|
28 |
#' A z-score is assigned to each slope. |
|
|
29 |
#' We then identify the most significant slope which indicates a drop in the silhouette coefficient and thus a deterioration of the clustering. |
|
|
30 |
#' |
|
|
31 |
#' |
|
|
32 |
#' @seealso |
|
|
33 |
#' \code{\link[mixOmics]{spls}}, \code{\link[timeOmics]{getCluster}}, \code{\link[timeOmics]{plotLong}} |
|
|
34 |
#' |
|
|
35 |
#' @examples |
|
|
36 |
#' demo <- suppressWarnings(get_demo_cluster()) |
|
|
37 |
#' X <- demo$X |
|
|
38 |
#' Y <- demo$Y |
|
|
39 |
#' |
|
|
40 |
#' # tuning |
|
|
41 |
#' tune.spls <- tuneCluster.spls(X, Y, ncomp= 2, test.keepX= c(5,10,15,20), test.keepY= c(2,4,6)) |
|
|
42 |
#' keepX <- tune.spls$choice.keepX |
|
|
43 |
#' keepY <- tune.spls$choice.keepY |
|
|
44 |
#' |
|
|
45 |
#' # final model |
|
|
46 |
#' spls.res <- mixOmics::spls(X, Y, ncomp= 2, keepX= keepX, keepY= keepY) |
|
|
47 |
#' |
|
|
48 |
#' # get clusters and plot longitudinal profile by cluster |
|
|
49 |
#' spls.cluster <- getCluster(spls.res) |
|
|
50 |
#' plotLong(spls.res) |
|
|
51 |
#' |
|
|
52 |
#' |
|
|
53 |
#' @export |
|
|
54 |
#' @import mixOmics |
|
|
55 |
#' @importFrom dplyr left_join |
|
|
56 |
#' @importFrom dplyr mutate |
|
|
57 |
#' @importFrom dplyr filter |
|
|
58 |
|
|
|
59 |
tuneCluster.spls <- function(X, Y, ncomp = 2, test.keepX = rep(ncol(X), ncomp), |
|
|
60 |
test.keepY = rep(ncol(Y), ncomp), ...){ |
|
|
61 |
#-- checking input parameters ------------ ---------------------------------# |
|
|
62 |
#--------------------------------------------------------------------------# |
|
|
63 |
|
|
|
64 |
#-- X |
|
|
65 |
X <- validate_matrix_X(X) |
|
|
66 |
Y <- validate_matrix_Y(Y) |
|
|
67 |
|
|
|
68 |
#-- ncomp |
|
|
69 |
ncomp <- validate_ncomp(ncomp, list(X,Y)) |
|
|
70 |
|
|
|
71 |
#-- test.keepX |
|
|
72 |
test.keepX <- validate_test_keepX(test.keepX = test.keepX, X = X) |
|
|
73 |
|
|
|
74 |
#-- test.keepY |
|
|
75 |
test.keepY <- validate_test_keepY(test.keepY = test.keepY, Y = Y) |
|
|
76 |
|
|
|
77 |
list.keepX.keepY <- list("keepX" = test.keepX, "keepY" = test.keepY) %>% |
|
|
78 |
expand.grid(stringsAsFactors = FALSE, KEEP.OUT.ATTRS = FALSE) |
|
|
79 |
|
|
|
80 |
|
|
|
81 |
#-- launch tuning --------------------------------------------------------# |
|
|
82 |
#--------------------------------------------------------------------------# |
|
|
83 |
|
|
|
84 |
#-- 0. set output object |
|
|
85 |
result <- as.data.frame(matrix(ncol = 5, nrow = nrow(list.keepX.keepY)*ncomp)) |
|
|
86 |
colnames(result) <- c("comp", "X", "Y", "pos", "neg") |
|
|
87 |
result.index <- 0 |
|
|
88 |
|
|
|
89 |
#--1. compute dissimilarity matrix for silhouette coef. (once and for all) |
|
|
90 |
all_data <- cbind(X, Y) |
|
|
91 |
dmatrix <- dmatrix.spearman.dissimilarity(all_data) |
|
|
92 |
|
|
|
93 |
cluster <- as.data.frame(list("feature" = rownames(dmatrix), |
|
|
94 |
"block" = c(rep("X", ncol(X)), rep("Y", ncol(Y))))) |
|
|
95 |
|
|
|
96 |
#--2. tuning |
|
|
97 |
for(comp in 1:ncomp){ |
|
|
98 |
for(index.list.kX.kY in 1:nrow(list.keepX.keepY)){ |
|
|
99 |
# foreach comp, keepX and keepY of other comp is set to minimum |
|
|
100 |
kX <- rep(min(test.keepX), ncomp) |
|
|
101 |
kY <- rep(min(test.keepY), ncomp) |
|
|
102 |
kX[comp] <- list.keepX.keepY[index.list.kX.kY,"keepX"] |
|
|
103 |
kY[comp] <- list.keepX.keepY[index.list.kX.kY,"keepY"] |
|
|
104 |
|
|
|
105 |
#--3. run spls |
|
|
106 |
spls.res <- mixOmics::spls(X = X, Y = Y, ncomp = ncomp, keepX = kX, keepY = kY, ...) |
|
|
107 |
|
|
|
108 |
#--4. extract clusters |
|
|
109 |
tmp.cluster <- getCluster(spls.res) |
|
|
110 |
tmp.cluster <- suppressWarnings(dplyr::left_join(cluster, tmp.cluster[c(1,4)], |
|
|
111 |
by = c("feature"="molecule"))) %>% |
|
|
112 |
dplyr::mutate(cluster = as.numeric(as.character(cluster))) %>% |
|
|
113 |
dplyr::mutate(cluster = ifelse(is.na(cluster), 0, cluster)) |
|
|
114 |
|
|
|
115 |
#--5. compute silhouette |
|
|
116 |
sil <- silhouette(dmatrix, tmp.cluster$cluster) |
|
|
117 |
|
|
|
118 |
#--6. store |
|
|
119 |
result.index <- result.index + 1 |
|
|
120 |
result[result.index, "comp"] <- comp |
|
|
121 |
result[result.index, "X"] <- kX[comp] |
|
|
122 |
result[result.index, "Y"] <- kY[comp] |
|
|
123 |
# result[result.index, "pos"] <- sil$average.cluster %>% |
|
|
124 |
# dplyr::filter(cluster == comp) %>% pull(silhouette.coef) |
|
|
125 |
# result[result.index, "neg"] <- sil$average.cluster %>% |
|
|
126 |
# dplyr::filter(cluster == -comp) %>% pull(silhouette.coef) |
|
|
127 |
pos.res <- sil$average.cluster %>% |
|
|
128 |
dplyr::filter(cluster == comp) %>% |
|
|
129 |
dplyr::pull(silhouette.coef) |
|
|
130 |
result[result.index, "pos"] <- ifelse(length(pos.res) == 0, NA, pos.res) |
|
|
131 |
neg.res <- sil$average.cluster %>% |
|
|
132 |
dplyr::filter(cluster == -comp) %>% |
|
|
133 |
dplyr::pull(silhouette.coef) |
|
|
134 |
result[result.index, "neg"] <- ifelse(length(neg.res) == 0, NA, neg.res) |
|
|
135 |
} |
|
|
136 |
} |
|
|
137 |
result <- list("silhouette" = result) |
|
|
138 |
result[["ncomp"]] <- ncomp |
|
|
139 |
result[["test.keepX"]] <- test.keepX |
|
|
140 |
result[["test.keepY"]] <- test.keepY |
|
|
141 |
result[["block"]] <- c("X", "Y") |
|
|
142 |
class(result) <- "spls.tune.silhouette" |
|
|
143 |
|
|
|
144 |
#-- 7. choice.keepX / choice.keepY |
|
|
145 |
result[["slopes"]] <- tune.silhouette.get_slopes(result) |
|
|
146 |
tmp <- tune.silhouette.get_choice_keepX(result) # choice keepX/keepY |
|
|
147 |
result[["choice.keepX"]] <- unlist(lapply(tmp, function(x) x$X)) |
|
|
148 |
result[["choice.keepY"]] <- unlist(lapply(tmp, function(x) x$Y)) |
|
|
149 |
return(result) |
|
|
150 |
} |