[d79ff0]: / R / tuneCluster.block.spls.R

Download this file

339 lines (295 with data), 15.1 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
#' Feature Selection Optimization for block (s)PLS method
#'
#' This function identify the number of feautures to keep per component and thus by cluster in \code{mixOmics::block.spls} by optimizing the silhouette coefficient, which assesses the quality of clustering.
#'
#' @param X list of numeric matrix (or data.frame) with features in columns and samples in rows (with samples order matching in all data sets).
#' @param Y (optional) numeric matrix (or data.frame) with features in columns and samples in rows (same rows as \code{X}).
#' @param indY integer, to supply if Y is missing, indicates the position of the matrix response in the list \code{X}.
#' @param ncomp integer, number of component to include in the model
#' @param test.list.keepX list of integers with the same size as X. Each entry corresponds to the different keepX value to test for each block of \code{X}.
#' @param test.keepY only if Y is provideid. Vector of integer containing the different value of keepY to test for block \code{Y}.
#' @param ... other parameters to be included in the spls model (see \code{mixOmics::block.spls})
#'
#' @return
#' \item{silhouette}{silhouette coef. computed for every combinasion of keepX/keepY}
#' \item{ncomp}{number of component included in the model}
#' \item{test.keepX}{list of tested keepX}
#' \item{test.keepY}{list of tested keepY}
#' \item{block}{names of blocks}
#' \item{slopes}{"slopes" computed from the silhouette coef. for each keepX and keepY, used to determine the best keepX and keepY}
#' \item{choice.keepX}{best \code{keepX} for each component}
#' \item{choice.keepY}{best \code{keepY} for each component}
#'
#'
#' @details
#' For each component and for each keepX/keepY value, a spls is done from these parameters.
#' Then the clustering is performed and the silhouette coefficient is calculated for this clustering.
#'
#' We then calculate "slopes" where keepX/keepY are the coordinates and the silhouette is the intensity.
#' A z-score is assigned to each slope.
#' We then identify the most significant slope which indicates a drop in the silhouette coefficient and thus a deterioration of the clustering.
#'
#'
#' @seealso
#' \code{\link[mixOmics]{block.spls}}, \code{\link[timeOmics]{getCluster}}, \code{\link[timeOmics]{plotLong}}
#'
#' @examples
#' demo <- suppressWarnings(get_demo_cluster())
#' X <- list(X = demo$X, Z = demo$Z)
#' Y <- demo$Y
#' test.list.keepX <- list("X" = c(5,10,15,20), "Z" = c(2,4,6,8))
#' test.keepY <- c(2:5)
#'
#' # tuning
#' tune.block.spls <- tuneCluster.block.spls(X= X, Y= Y,
#' test.list.keepX= test.list.keepX,
#' test.keepY= test.keepY,
#' mode= "canonical")
#' keepX <- tune.block.spls$choice.keepX
#' keepY <- tune.block.spls$choice.keepY
#'
#' # final model
#' block.spls.res <- mixOmics::block.spls(X= X, Y= Y, keepX = keepX,
#' keepY = keepY, ncomp = 2, mode = "canonical")
#' # get clusters and plot longitudinal profile by cluster
#' block.spls.cluster <- getCluster(block.spls.res)
#' @import mixOmics
#' @importFrom dplyr left_join
#' @importFrom dplyr mutate
#' @importFrom dplyr filter
#' @export
tuneCluster.block.spls <- function(X, Y = NULL, indY = NULL, ncomp = 2,
test.list.keepX = NULL, test.keepY = NULL, ...){
#-- checking input parameters ---------------------------------------------#
#--------------------------------------------------------------------------#
#-- X
X <- validate_list_matrix_X(X)
#-- Y
if(!is.null(Y)){
Y <- validate_matrix_Y(Y)
} else {
indY <- validate_indY(indY, X)
}
#-- ncomp
ncomp <- validate_ncomp(ncomp = ncomp, X =X)
#-- keepX
test.list.keepX <- validate_test_list_keepX(test.keepX = test.list.keepX, X = X, ncomp = ncomp)
#-- keepY
test.keepY <- validate_test_keepY(test.keepY = test.keepY, Y = Y)
list.keepX.keepY <- test.list.keepX
if(!is.null(Y)) {
list.keepX.keepY$Y <- test.keepY
}
list.keepX.keepY <- expand.grid(list.keepX.keepY, stringsAsFactors = FALSE,
KEEP.OUT.ATTRS = FALSE)
#-- launch tuning --------------------------------------------------------#
#--------------------------------------------------------------------------#
#-- 0. set output object
result <- matrix(ncol = 3 + length(X),nrow = nrow(list.keepX.keepY)*ncomp) %>%
as.data.frame() %>%
purrr::set_names(c("comp", names(X), "pos", "neg"))
if(!is.null(Y)){
result <- result %>%
mutate("Y" = NA)
}
result.index <- 1
#--1. compute dissimilarity matrix for silhouette coef. (once and for all)
all_data <- X
all_data_name <- as.character(unlist(imap(X, ~rep(.y,ncol(.x)))))
if(is.null(all_data$Y) & !is.null(Y)){
all_data$Y <- Y
all_data_name <- c(all_data_name, rep("Y", ncol(Y)))
}
all_data <- do.call("cbind", all_data)
dmatrix <- dmatrix.spearman.dissimilarity(all_data)
cluster <- as.data.frame(list("feature" = rownames(dmatrix),
"block" = all_data_name))
#--2. tuning
for(comp in 1:ncomp){
for(index.list.kX.kY in 1:nrow(list.keepX.keepY)){
# foreach comp, keepX and keepY of other comp is set to minimum
kX <- lapply(list.keepX.keepY, function(x) rep(min(x), ncomp))
for(block in names(list.keepX.keepY)){
kX[[block]][comp] <- list.keepX.keepY[index.list.kX.kY, block]
result[result.index, block] = list.keepX.keepY[index.list.kX.kY, block]
}
if(!is.null(Y)){
kY <- kX[["Y"]]
kX[["Y"]] <- NULL
#--3. run spls
block.spls.res <- mixOmics::block.spls(X = X, Y = Y, ncomp = ncomp,
keepX = kX, keepY = kY, ...)
} else {
#--3. run spls
block.spls.res <- mixOmics::block.spls(X = X, ncomp = ncomp,
keepX = kX, indY = indY, ...)
}
#--4. extract clusters
tmp.cluster <- getCluster(block.spls.res)
tmp.cluster <- suppressWarnings(dplyr::left_join(cluster, tmp.cluster[c(1,4)],
by = c("feature"="molecule"))) %>%
mutate(cluster = as.numeric(as.character(cluster))) %>%
mutate(cluster = ifelse(is.na(cluster), 0, cluster))
#--5. compute silhouette
sil <- silhouette(dmatrix, tmp.cluster$cluster)
#--6. store
result[result.index, "comp"] <- comp
pos.res <- sil$average.cluster %>%
dplyr::filter(cluster == comp) %>%
dplyr::pull(silhouette.coef)
result[result.index, "pos"] <- ifelse(length(pos.res) == 0, NA, pos.res)
neg.res <- sil$average.cluster %>%
dplyr::filter(cluster == -comp) %>%
dplyr::pull(silhouette.coef)
result[result.index, "neg"] <- ifelse(length(neg.res) == 0, NA, neg.res)
result.index <- result.index + 1
}
}
result <- list("silhouette" = result)
result[["ncomp"]] <- ncomp
result[["test.keepX"]] <- test.list.keepX
result[["test.keepY"]] <- test.keepY
result[["block"]] <- names(list.keepX.keepY)
class(result) <- "block.spls.tune.silhouette"
#-- 7. choice.keepX / choice.keepY
result[["slopes"]] <- tune.silhouette.get_slopes(result)
tmp <- tune.silhouette.get_choice_keepX(result) %>%
do.call(what = "rbind")
if(!is.null(Y)){ # choice keepY
result[["choice.keepY"]] <- tmp$Y
tmp <- dplyr::select(tmp, -Y)
}
result[["choice.keepX"]] <- as.list(tmp)
return(result)
}
#' @importFrom purrr imap set_names map_dfr
#' @importFrom dplyr filter mutate group_by summarise left_join n
#' @importFrom stringr str_split
tune.silhouette.get_slopes <- function(object){
stopifnot(class(object) %in% c("block.spls.tune.silhouette", "spls.tune.silhouette", "spca.tune.silhouette"))
# tune.silhouette is a data.frame (comp, X, Y, bock ..., pos, neg)
# tune.silhouette <- tune.block.spls$silhouette
block <- object$block
ncomp <- object$ncomp
if(is(object, "block.spls.tune.silhouette")){
coord <- object$test.keepX
if(!is.null(object[["test.keepY"]])){
coord[["Y"]] <- object$test.keepY
coord <- lapply(coord, sort)
}
}else if(is(object, "spls.tune.silhouette")){
coord <- list(X= sort(object$test.keepX),
Y= sort(object$test.keepY))
}else if(is(object, "spca.tune.silhouette")){
coord <- list(X= sort(object$test.keepX))
}
# get all points
all.points <- unique(object$silhouette[block])
# define neighbours
neighbourhood <- map_dfr(1:nrow(all.points), ~tune.silhouette.get_neighbours(coord = coord, all.points[.x,,drop=FALSE]))
# extract pos and neg and set it as named list for better performance
split_by_comp <- split(object$silhouette, f= as.factor(object$silhouette$comp))
names_split_by_comp <- lapply(split_by_comp,
function(x) as.vector(apply(x[block], 1, function(y) paste(y, collapse = "_"))))
POS <- purrr::imap(split_by_comp, ~ purrr::set_names(.x[["pos"]], names_split_by_comp[[.y]]))
NEG <- purrr::imap(split_by_comp, ~ purrr::set_names(.x[["neg"]], names_split_by_comp[[.y]]))
# compute slope (pos / neg by comp)
slopes <- purrr::map_dfr(as.character(1:ncomp), ~{
cbind(neighbourhood,
"origin.pos" = POS[[.x]][neighbourhood$origin],
"destination.pos" = POS[[.x]][neighbourhood$destination],
"origin.neg" = NEG[[.x]][neighbourhood$origin],
"destination.neg" = NEG[[.x]][neighbourhood$destination],
"comp" = as.numeric(.x))})
slopes <- slopes %>%
dplyr::filter(origin!=destination) %>%
dplyr::mutate("slope.pos" = tune.silhouette.get_slopes_coef(x1 = lapply(stringr::str_split(.$origin, "_"), as.numeric),
x2 = lapply(stringr::str_split(.$destination, "_"), as.numeric),
y1 = .$origin.pos,
y2 = .$destination.pos)) %>%
dplyr::mutate("slope.neg" = tune.silhouette.get_slopes_coef(x1 = lapply(stringr::str_split(.$origin, "_"), as.numeric),
x2 = lapply(stringr::str_split(.$destination, "_"), as.numeric),
y1 = .$origin.neg,
y2 = .$destination.neg)) %>%
dplyr::mutate("distance_from_origin" = tune.silhouette.distance_from_origin(x1 = lapply(stringr::str_split(.$origin, "_"), as.numeric)))
# cumpute SD by comp and direction
SD <- slopes %>%
dplyr::group_by(comp, direction) %>%
dplyr::summarise(sd.pos = sd_new(slope.pos, na.rm = TRUE),
sd.neg = sd_new(slope.neg, na.rm = TRUE),
mean.pos = mean(slope.pos, na.rm = TRUE),
mean.neg = mean(slope.neg, na.rm = TRUE), N=dplyr::n())
# add Pval for signif slopes
slopes <- slopes %>%
dplyr::left_join(SD, by = c("direction", "comp")) %>%
# pos
dplyr::mutate(Z_score.pos = ifelse(.$sd.pos == 0,0,(.$slope.pos - .$mean.pos)/.$sd.pos)) %>%
dplyr::mutate(Pval.pos = ifelse(Z_score.pos >= 0,1-pnorm(Z_score.pos), pnorm(Z_score.pos))) %>%
# neg
dplyr::mutate(Z_score.neg = ifelse(.$sd.neg == 0,0,(.$slope.neg - .$mean.neg)/.$sd.neg)) %>%
dplyr::mutate(Pval.neg = ifelse(Z_score.neg >= 0,1-pnorm(Z_score.neg), pnorm(Z_score.neg)))
return(slopes)
}
#' @importFrom purrr map2
tune.silhouette.get_neighbours <- function(coord, point){
# return forward neighbourhood of a point
# valid point in coord
stopifnot(all(names(coord) == names(point)))
neighbour.max <- point
index <- purrr::map2(coord, point, ~which(.x == .y))
# get max neighbours
for(i in names(index)){
if(!(length(coord[[i]]) == index[[i]])){
neighbour.max[[i]] <- coord[[i]][index[[i]]+1]
}
} # can be simplified
# get all close possible neighbours
neighbourhood <- as.list(rbind(as.data.frame(point), as.data.frame(neighbour.max))) %>%
lapply(unique) %>%
expand.grid(stringsAsFactors = FALSE, KEEP.OUT.ATTRS = FALSE)
# get direction for standard deviation computation
direction <- vector(mode = "numeric", length = nrow(neighbourhood))
for(i in 1:nrow(neighbourhood)){
direction[i] <- purrr::map2(coord, neighbourhood[i,], ~which(.x == .y)) %>%
purrr::map2(index, ~{.x-.y}) %>%
paste(collapse = "_")
}
neighbourhood["direction"] <- direction
neighbourhood["origin"] <- paste(point, collapse = "_")
neighbourhood["destination"] <- apply(neighbourhood[names(coord)], 1,
function(x) paste(x, collapse = "_"))
return(neighbourhood)
}
#' @importFrom purrr map_dbl
tune.silhouette.get_slopes_coef <- function(x1,x2,y1,y2){
stopifnot(length(x1) == length(x2))
stopifnot(length(x2) == length(y1))
stopifnot(length(y1) == length(y2))
res <- purrr::map_dbl(seq_along(x1), ~{
euc.dist <- sqrt(sum((x1[[.x]] - x2[[.x]])^2));
ifelse(euc.dist == 0, 0, (y2[.x] - y1[.x])/euc.dist)})
return(res)
}
#' @importFrom purrr map_dbl
tune.silhouette.distance_from_origin <- function(x1){
euc.dist <- purrr::map_dbl(seq_along(x1), ~{ sqrt(sum((x1[[.x]])^2))})
return(euc.dist)
}
#' @importFrom dplyr select summarise left_join
#' @importFrom tidyr gather
#' @importFrom magrittr %>%
tune.silhouette.get_choice_keepX <- function(tune.block.spls){
# from slopes, keep useful columns and remove NAs.
slopes <- tune.block.spls$slopes
tmp <- slopes %>%
dplyr::select(c(tune.block.spls$block, comp, direction, Pval.pos, Pval.neg, distance_from_origin)) %>%
tidyr::gather(Pval.dir, Pval.value, -c(tune.block.spls$block, comp, direction, distance_from_origin))
# for each comp, arrange by Pvalue and distance from origin and get first result
MIN <- split(tmp, f=tmp$comp) %>%
na.omit() %>%
lapply(function(x) x %>%
dplyr::arrange(Pval.value, distance_from_origin) %>%
.[1,] %>%
dplyr::select(tune.block.spls$block))
return(MIN)
}