|
a |
|
b/R/tuneCluster.block.spls.R |
|
|
1 |
#' Feature Selection Optimization for block (s)PLS method |
|
|
2 |
#' |
|
|
3 |
#' This function identify the number of feautures to keep per component and thus by cluster in \code{mixOmics::block.spls} by optimizing the silhouette coefficient, which assesses the quality of clustering. |
|
|
4 |
#' |
|
|
5 |
#' @param X list of numeric matrix (or data.frame) with features in columns and samples in rows (with samples order matching in all data sets). |
|
|
6 |
#' @param Y (optional) numeric matrix (or data.frame) with features in columns and samples in rows (same rows as \code{X}). |
|
|
7 |
#' @param indY integer, to supply if Y is missing, indicates the position of the matrix response in the list \code{X}. |
|
|
8 |
#' @param ncomp integer, number of component to include in the model |
|
|
9 |
#' @param test.list.keepX list of integers with the same size as X. Each entry corresponds to the different keepX value to test for each block of \code{X}. |
|
|
10 |
#' @param test.keepY only if Y is provideid. Vector of integer containing the different value of keepY to test for block \code{Y}. |
|
|
11 |
#' @param ... other parameters to be included in the spls model (see \code{mixOmics::block.spls}) |
|
|
12 |
#' |
|
|
13 |
#' @return |
|
|
14 |
#' \item{silhouette}{silhouette coef. computed for every combinasion of keepX/keepY} |
|
|
15 |
#' \item{ncomp}{number of component included in the model} |
|
|
16 |
#' \item{test.keepX}{list of tested keepX} |
|
|
17 |
#' \item{test.keepY}{list of tested keepY} |
|
|
18 |
#' \item{block}{names of blocks} |
|
|
19 |
#' \item{slopes}{"slopes" computed from the silhouette coef. for each keepX and keepY, used to determine the best keepX and keepY} |
|
|
20 |
#' \item{choice.keepX}{best \code{keepX} for each component} |
|
|
21 |
#' \item{choice.keepY}{best \code{keepY} for each component} |
|
|
22 |
#' |
|
|
23 |
#' |
|
|
24 |
#' @details |
|
|
25 |
#' For each component and for each keepX/keepY value, a spls is done from these parameters. |
|
|
26 |
#' Then the clustering is performed and the silhouette coefficient is calculated for this clustering. |
|
|
27 |
#' |
|
|
28 |
#' We then calculate "slopes" where keepX/keepY are the coordinates and the silhouette is the intensity. |
|
|
29 |
#' A z-score is assigned to each slope. |
|
|
30 |
#' We then identify the most significant slope which indicates a drop in the silhouette coefficient and thus a deterioration of the clustering. |
|
|
31 |
#' |
|
|
32 |
#' |
|
|
33 |
#' @seealso |
|
|
34 |
#' \code{\link[mixOmics]{block.spls}}, \code{\link[timeOmics]{getCluster}}, \code{\link[timeOmics]{plotLong}} |
|
|
35 |
#' |
|
|
36 |
#' @examples |
|
|
37 |
#' demo <- suppressWarnings(get_demo_cluster()) |
|
|
38 |
#' X <- list(X = demo$X, Z = demo$Z) |
|
|
39 |
#' Y <- demo$Y |
|
|
40 |
#' test.list.keepX <- list("X" = c(5,10,15,20), "Z" = c(2,4,6,8)) |
|
|
41 |
#' test.keepY <- c(2:5) |
|
|
42 |
#' |
|
|
43 |
#' # tuning |
|
|
44 |
#' tune.block.spls <- tuneCluster.block.spls(X= X, Y= Y, |
|
|
45 |
#' test.list.keepX= test.list.keepX, |
|
|
46 |
#' test.keepY= test.keepY, |
|
|
47 |
#' mode= "canonical") |
|
|
48 |
#' keepX <- tune.block.spls$choice.keepX |
|
|
49 |
#' keepY <- tune.block.spls$choice.keepY |
|
|
50 |
#' |
|
|
51 |
#' # final model |
|
|
52 |
#' block.spls.res <- mixOmics::block.spls(X= X, Y= Y, keepX = keepX, |
|
|
53 |
#' keepY = keepY, ncomp = 2, mode = "canonical") |
|
|
54 |
#' # get clusters and plot longitudinal profile by cluster |
|
|
55 |
#' block.spls.cluster <- getCluster(block.spls.res) |
|
|
56 |
|
|
|
57 |
#' @import mixOmics |
|
|
58 |
#' @importFrom dplyr left_join |
|
|
59 |
#' @importFrom dplyr mutate |
|
|
60 |
#' @importFrom dplyr filter |
|
|
61 |
#' @export |
|
|
62 |
tuneCluster.block.spls <- function(X, Y = NULL, indY = NULL, ncomp = 2, |
|
|
63 |
test.list.keepX = NULL, test.keepY = NULL, ...){ |
|
|
64 |
#-- checking input parameters ---------------------------------------------# |
|
|
65 |
#--------------------------------------------------------------------------# |
|
|
66 |
|
|
|
67 |
#-- X |
|
|
68 |
X <- validate_list_matrix_X(X) |
|
|
69 |
|
|
|
70 |
#-- Y |
|
|
71 |
if(!is.null(Y)){ |
|
|
72 |
Y <- validate_matrix_Y(Y) |
|
|
73 |
} else { |
|
|
74 |
indY <- validate_indY(indY, X) |
|
|
75 |
} |
|
|
76 |
|
|
|
77 |
#-- ncomp |
|
|
78 |
ncomp <- validate_ncomp(ncomp = ncomp, X =X) |
|
|
79 |
|
|
|
80 |
#-- keepX |
|
|
81 |
test.list.keepX <- validate_test_list_keepX(test.keepX = test.list.keepX, X = X, ncomp = ncomp) |
|
|
82 |
|
|
|
83 |
#-- keepY |
|
|
84 |
test.keepY <- validate_test_keepY(test.keepY = test.keepY, Y = Y) |
|
|
85 |
|
|
|
86 |
list.keepX.keepY <- test.list.keepX |
|
|
87 |
if(!is.null(Y)) { |
|
|
88 |
list.keepX.keepY$Y <- test.keepY |
|
|
89 |
} |
|
|
90 |
list.keepX.keepY <- expand.grid(list.keepX.keepY, stringsAsFactors = FALSE, |
|
|
91 |
KEEP.OUT.ATTRS = FALSE) |
|
|
92 |
|
|
|
93 |
|
|
|
94 |
#-- launch tuning --------------------------------------------------------# |
|
|
95 |
#--------------------------------------------------------------------------# |
|
|
96 |
|
|
|
97 |
#-- 0. set output object |
|
|
98 |
result <- matrix(ncol = 3 + length(X),nrow = nrow(list.keepX.keepY)*ncomp) %>% |
|
|
99 |
as.data.frame() %>% |
|
|
100 |
purrr::set_names(c("comp", names(X), "pos", "neg")) |
|
|
101 |
if(!is.null(Y)){ |
|
|
102 |
result <- result %>% |
|
|
103 |
mutate("Y" = NA) |
|
|
104 |
} |
|
|
105 |
result.index <- 1 |
|
|
106 |
|
|
|
107 |
#--1. compute dissimilarity matrix for silhouette coef. (once and for all) |
|
|
108 |
all_data <- X |
|
|
109 |
all_data_name <- as.character(unlist(imap(X, ~rep(.y,ncol(.x))))) |
|
|
110 |
if(is.null(all_data$Y) & !is.null(Y)){ |
|
|
111 |
all_data$Y <- Y |
|
|
112 |
all_data_name <- c(all_data_name, rep("Y", ncol(Y))) |
|
|
113 |
} |
|
|
114 |
all_data <- do.call("cbind", all_data) |
|
|
115 |
dmatrix <- dmatrix.spearman.dissimilarity(all_data) |
|
|
116 |
|
|
|
117 |
cluster <- as.data.frame(list("feature" = rownames(dmatrix), |
|
|
118 |
"block" = all_data_name)) |
|
|
119 |
|
|
|
120 |
#--2. tuning |
|
|
121 |
for(comp in 1:ncomp){ |
|
|
122 |
for(index.list.kX.kY in 1:nrow(list.keepX.keepY)){ |
|
|
123 |
# foreach comp, keepX and keepY of other comp is set to minimum |
|
|
124 |
kX <- lapply(list.keepX.keepY, function(x) rep(min(x), ncomp)) |
|
|
125 |
for(block in names(list.keepX.keepY)){ |
|
|
126 |
kX[[block]][comp] <- list.keepX.keepY[index.list.kX.kY, block] |
|
|
127 |
result[result.index, block] = list.keepX.keepY[index.list.kX.kY, block] |
|
|
128 |
} |
|
|
129 |
if(!is.null(Y)){ |
|
|
130 |
kY <- kX[["Y"]] |
|
|
131 |
kX[["Y"]] <- NULL |
|
|
132 |
|
|
|
133 |
#--3. run spls |
|
|
134 |
block.spls.res <- mixOmics::block.spls(X = X, Y = Y, ncomp = ncomp, |
|
|
135 |
keepX = kX, keepY = kY, ...) |
|
|
136 |
} else { |
|
|
137 |
#--3. run spls |
|
|
138 |
block.spls.res <- mixOmics::block.spls(X = X, ncomp = ncomp, |
|
|
139 |
keepX = kX, indY = indY, ...) |
|
|
140 |
} |
|
|
141 |
|
|
|
142 |
|
|
|
143 |
#--4. extract clusters |
|
|
144 |
tmp.cluster <- getCluster(block.spls.res) |
|
|
145 |
tmp.cluster <- suppressWarnings(dplyr::left_join(cluster, tmp.cluster[c(1,4)], |
|
|
146 |
by = c("feature"="molecule"))) %>% |
|
|
147 |
mutate(cluster = as.numeric(as.character(cluster))) %>% |
|
|
148 |
mutate(cluster = ifelse(is.na(cluster), 0, cluster)) |
|
|
149 |
|
|
|
150 |
#--5. compute silhouette |
|
|
151 |
sil <- silhouette(dmatrix, tmp.cluster$cluster) |
|
|
152 |
|
|
|
153 |
#--6. store |
|
|
154 |
result[result.index, "comp"] <- comp |
|
|
155 |
pos.res <- sil$average.cluster %>% |
|
|
156 |
dplyr::filter(cluster == comp) %>% |
|
|
157 |
dplyr::pull(silhouette.coef) |
|
|
158 |
result[result.index, "pos"] <- ifelse(length(pos.res) == 0, NA, pos.res) |
|
|
159 |
neg.res <- sil$average.cluster %>% |
|
|
160 |
dplyr::filter(cluster == -comp) %>% |
|
|
161 |
dplyr::pull(silhouette.coef) |
|
|
162 |
result[result.index, "neg"] <- ifelse(length(neg.res) == 0, NA, neg.res) |
|
|
163 |
result.index <- result.index + 1 |
|
|
164 |
} |
|
|
165 |
} |
|
|
166 |
result <- list("silhouette" = result) |
|
|
167 |
result[["ncomp"]] <- ncomp |
|
|
168 |
result[["test.keepX"]] <- test.list.keepX |
|
|
169 |
result[["test.keepY"]] <- test.keepY |
|
|
170 |
result[["block"]] <- names(list.keepX.keepY) |
|
|
171 |
class(result) <- "block.spls.tune.silhouette" |
|
|
172 |
|
|
|
173 |
#-- 7. choice.keepX / choice.keepY |
|
|
174 |
result[["slopes"]] <- tune.silhouette.get_slopes(result) |
|
|
175 |
tmp <- tune.silhouette.get_choice_keepX(result) %>% |
|
|
176 |
do.call(what = "rbind") |
|
|
177 |
if(!is.null(Y)){ # choice keepY |
|
|
178 |
result[["choice.keepY"]] <- tmp$Y |
|
|
179 |
tmp <- dplyr::select(tmp, -Y) |
|
|
180 |
} |
|
|
181 |
result[["choice.keepX"]] <- as.list(tmp) |
|
|
182 |
return(result) |
|
|
183 |
} |
|
|
184 |
|
|
|
185 |
#' @importFrom purrr imap set_names map_dfr |
|
|
186 |
#' @importFrom dplyr filter mutate group_by summarise left_join n |
|
|
187 |
#' @importFrom stringr str_split |
|
|
188 |
tune.silhouette.get_slopes <- function(object){ |
|
|
189 |
stopifnot(class(object) %in% c("block.spls.tune.silhouette", "spls.tune.silhouette", "spca.tune.silhouette")) |
|
|
190 |
# tune.silhouette is a data.frame (comp, X, Y, bock ..., pos, neg) |
|
|
191 |
# tune.silhouette <- tune.block.spls$silhouette |
|
|
192 |
|
|
|
193 |
block <- object$block |
|
|
194 |
ncomp <- object$ncomp |
|
|
195 |
|
|
|
196 |
if(is(object, "block.spls.tune.silhouette")){ |
|
|
197 |
coord <- object$test.keepX |
|
|
198 |
if(!is.null(object[["test.keepY"]])){ |
|
|
199 |
coord[["Y"]] <- object$test.keepY |
|
|
200 |
coord <- lapply(coord, sort) |
|
|
201 |
} |
|
|
202 |
}else if(is(object, "spls.tune.silhouette")){ |
|
|
203 |
coord <- list(X= sort(object$test.keepX), |
|
|
204 |
Y= sort(object$test.keepY)) |
|
|
205 |
}else if(is(object, "spca.tune.silhouette")){ |
|
|
206 |
coord <- list(X= sort(object$test.keepX)) |
|
|
207 |
} |
|
|
208 |
|
|
|
209 |
# get all points |
|
|
210 |
all.points <- unique(object$silhouette[block]) |
|
|
211 |
|
|
|
212 |
# define neighbours |
|
|
213 |
neighbourhood <- map_dfr(1:nrow(all.points), ~tune.silhouette.get_neighbours(coord = coord, all.points[.x,,drop=FALSE])) |
|
|
214 |
|
|
|
215 |
# extract pos and neg and set it as named list for better performance |
|
|
216 |
split_by_comp <- split(object$silhouette, f= as.factor(object$silhouette$comp)) |
|
|
217 |
names_split_by_comp <- lapply(split_by_comp, |
|
|
218 |
function(x) as.vector(apply(x[block], 1, function(y) paste(y, collapse = "_")))) |
|
|
219 |
|
|
|
220 |
POS <- purrr::imap(split_by_comp, ~ purrr::set_names(.x[["pos"]], names_split_by_comp[[.y]])) |
|
|
221 |
NEG <- purrr::imap(split_by_comp, ~ purrr::set_names(.x[["neg"]], names_split_by_comp[[.y]])) |
|
|
222 |
|
|
|
223 |
# compute slope (pos / neg by comp) |
|
|
224 |
slopes <- purrr::map_dfr(as.character(1:ncomp), ~{ |
|
|
225 |
cbind(neighbourhood, |
|
|
226 |
"origin.pos" = POS[[.x]][neighbourhood$origin], |
|
|
227 |
"destination.pos" = POS[[.x]][neighbourhood$destination], |
|
|
228 |
"origin.neg" = NEG[[.x]][neighbourhood$origin], |
|
|
229 |
"destination.neg" = NEG[[.x]][neighbourhood$destination], |
|
|
230 |
"comp" = as.numeric(.x))}) |
|
|
231 |
|
|
|
232 |
slopes <- slopes %>% |
|
|
233 |
dplyr::filter(origin!=destination) %>% |
|
|
234 |
dplyr::mutate("slope.pos" = tune.silhouette.get_slopes_coef(x1 = lapply(stringr::str_split(.$origin, "_"), as.numeric), |
|
|
235 |
x2 = lapply(stringr::str_split(.$destination, "_"), as.numeric), |
|
|
236 |
y1 = .$origin.pos, |
|
|
237 |
y2 = .$destination.pos)) %>% |
|
|
238 |
dplyr::mutate("slope.neg" = tune.silhouette.get_slopes_coef(x1 = lapply(stringr::str_split(.$origin, "_"), as.numeric), |
|
|
239 |
x2 = lapply(stringr::str_split(.$destination, "_"), as.numeric), |
|
|
240 |
y1 = .$origin.neg, |
|
|
241 |
y2 = .$destination.neg)) %>% |
|
|
242 |
dplyr::mutate("distance_from_origin" = tune.silhouette.distance_from_origin(x1 = lapply(stringr::str_split(.$origin, "_"), as.numeric))) |
|
|
243 |
|
|
|
244 |
# cumpute SD by comp and direction |
|
|
245 |
SD <- slopes %>% |
|
|
246 |
dplyr::group_by(comp, direction) %>% |
|
|
247 |
dplyr::summarise(sd.pos = sd_new(slope.pos, na.rm = TRUE), |
|
|
248 |
sd.neg = sd_new(slope.neg, na.rm = TRUE), |
|
|
249 |
mean.pos = mean(slope.pos, na.rm = TRUE), |
|
|
250 |
mean.neg = mean(slope.neg, na.rm = TRUE), N=dplyr::n()) |
|
|
251 |
|
|
|
252 |
# add Pval for signif slopes |
|
|
253 |
slopes <- slopes %>% |
|
|
254 |
dplyr::left_join(SD, by = c("direction", "comp")) %>% |
|
|
255 |
# pos |
|
|
256 |
dplyr::mutate(Z_score.pos = ifelse(.$sd.pos == 0,0,(.$slope.pos - .$mean.pos)/.$sd.pos)) %>% |
|
|
257 |
dplyr::mutate(Pval.pos = ifelse(Z_score.pos >= 0,1-pnorm(Z_score.pos), pnorm(Z_score.pos))) %>% |
|
|
258 |
# neg |
|
|
259 |
dplyr::mutate(Z_score.neg = ifelse(.$sd.neg == 0,0,(.$slope.neg - .$mean.neg)/.$sd.neg)) %>% |
|
|
260 |
dplyr::mutate(Pval.neg = ifelse(Z_score.neg >= 0,1-pnorm(Z_score.neg), pnorm(Z_score.neg))) |
|
|
261 |
return(slopes) |
|
|
262 |
} |
|
|
263 |
|
|
|
264 |
#' @importFrom purrr map2 |
|
|
265 |
tune.silhouette.get_neighbours <- function(coord, point){ |
|
|
266 |
# return forward neighbourhood of a point |
|
|
267 |
|
|
|
268 |
# valid point in coord |
|
|
269 |
stopifnot(all(names(coord) == names(point))) |
|
|
270 |
|
|
|
271 |
neighbour.max <- point |
|
|
272 |
index <- purrr::map2(coord, point, ~which(.x == .y)) |
|
|
273 |
|
|
|
274 |
# get max neighbours |
|
|
275 |
for(i in names(index)){ |
|
|
276 |
if(!(length(coord[[i]]) == index[[i]])){ |
|
|
277 |
neighbour.max[[i]] <- coord[[i]][index[[i]]+1] |
|
|
278 |
} |
|
|
279 |
} # can be simplified |
|
|
280 |
|
|
|
281 |
# get all close possible neighbours |
|
|
282 |
neighbourhood <- as.list(rbind(as.data.frame(point), as.data.frame(neighbour.max))) %>% |
|
|
283 |
lapply(unique) %>% |
|
|
284 |
expand.grid(stringsAsFactors = FALSE, KEEP.OUT.ATTRS = FALSE) |
|
|
285 |
|
|
|
286 |
# get direction for standard deviation computation |
|
|
287 |
direction <- vector(mode = "numeric", length = nrow(neighbourhood)) |
|
|
288 |
for(i in 1:nrow(neighbourhood)){ |
|
|
289 |
direction[i] <- purrr::map2(coord, neighbourhood[i,], ~which(.x == .y)) %>% |
|
|
290 |
purrr::map2(index, ~{.x-.y}) %>% |
|
|
291 |
paste(collapse = "_") |
|
|
292 |
} |
|
|
293 |
|
|
|
294 |
neighbourhood["direction"] <- direction |
|
|
295 |
neighbourhood["origin"] <- paste(point, collapse = "_") |
|
|
296 |
neighbourhood["destination"] <- apply(neighbourhood[names(coord)], 1, |
|
|
297 |
function(x) paste(x, collapse = "_")) |
|
|
298 |
return(neighbourhood) |
|
|
299 |
} |
|
|
300 |
|
|
|
301 |
#' @importFrom purrr map_dbl |
|
|
302 |
tune.silhouette.get_slopes_coef <- function(x1,x2,y1,y2){ |
|
|
303 |
stopifnot(length(x1) == length(x2)) |
|
|
304 |
stopifnot(length(x2) == length(y1)) |
|
|
305 |
stopifnot(length(y1) == length(y2)) |
|
|
306 |
|
|
|
307 |
res <- purrr::map_dbl(seq_along(x1), ~{ |
|
|
308 |
euc.dist <- sqrt(sum((x1[[.x]] - x2[[.x]])^2)); |
|
|
309 |
ifelse(euc.dist == 0, 0, (y2[.x] - y1[.x])/euc.dist)}) |
|
|
310 |
return(res) |
|
|
311 |
} |
|
|
312 |
|
|
|
313 |
#' @importFrom purrr map_dbl |
|
|
314 |
tune.silhouette.distance_from_origin <- function(x1){ |
|
|
315 |
euc.dist <- purrr::map_dbl(seq_along(x1), ~{ sqrt(sum((x1[[.x]])^2))}) |
|
|
316 |
return(euc.dist) |
|
|
317 |
} |
|
|
318 |
|
|
|
319 |
#' @importFrom dplyr select summarise left_join |
|
|
320 |
#' @importFrom tidyr gather |
|
|
321 |
#' @importFrom magrittr %>% |
|
|
322 |
tune.silhouette.get_choice_keepX <- function(tune.block.spls){ |
|
|
323 |
# from slopes, keep useful columns and remove NAs. |
|
|
324 |
slopes <- tune.block.spls$slopes |
|
|
325 |
tmp <- slopes %>% |
|
|
326 |
dplyr::select(c(tune.block.spls$block, comp, direction, Pval.pos, Pval.neg, distance_from_origin)) %>% |
|
|
327 |
tidyr::gather(Pval.dir, Pval.value, -c(tune.block.spls$block, comp, direction, distance_from_origin)) |
|
|
328 |
|
|
|
329 |
# for each comp, arrange by Pvalue and distance from origin and get first result |
|
|
330 |
MIN <- split(tmp, f=tmp$comp) %>% |
|
|
331 |
na.omit() %>% |
|
|
332 |
lapply(function(x) x %>% |
|
|
333 |
dplyr::arrange(Pval.value, distance_from_origin) %>% |
|
|
334 |
.[1,] %>% |
|
|
335 |
dplyr::select(tune.block.spls$block)) |
|
|
336 |
|
|
|
337 |
return(MIN) |
|
|
338 |
} |