a b/R/tuneCluster.block.spls.R
1
#' Feature Selection Optimization for block (s)PLS method 
2
#'
3
#' This function identify the number of feautures to keep per component and thus by cluster in \code{mixOmics::block.spls} by optimizing the silhouette coefficient, which assesses the quality of clustering.
4
#' 
5
#' @param X list of numeric matrix (or data.frame) with features in columns and samples in rows (with samples order matching in all data sets).
6
#' @param Y (optional) numeric matrix (or data.frame) with features in columns and samples in rows (same rows as \code{X}).
7
#' @param indY integer, to supply if Y is missing, indicates the position of the matrix response in the list \code{X}.
8
#' @param ncomp integer, number of component to include in the model
9
#' @param test.list.keepX list of integers with the same size as X. Each entry corresponds to the different keepX value to test for each block of \code{X}.
10
#' @param test.keepY only if Y is provideid. Vector of integer containing the different value of keepY to test for block \code{Y}.
11
#' @param ... other parameters to be included in the spls model (see \code{mixOmics::block.spls})
12
#' 
13
#' @return 
14
#' \item{silhouette}{silhouette coef. computed for every combinasion of keepX/keepY}
15
#' \item{ncomp}{number of component included in the model}
16
#' \item{test.keepX}{list of tested keepX}
17
#' \item{test.keepY}{list of tested keepY}
18
#' \item{block}{names of blocks}
19
#' \item{slopes}{"slopes" computed from the silhouette coef. for each keepX and keepY, used to determine the best keepX and keepY}
20
#' \item{choice.keepX}{best \code{keepX} for each component}
21
#' \item{choice.keepY}{best \code{keepY} for each component}
22
#' 
23
#'
24
#' @details
25
#' For each component and for each keepX/keepY value, a spls is done from these parameters.
26
#' Then the clustering is performed and the silhouette coefficient is calculated for this clustering.
27
#'
28
#' We then calculate "slopes" where keepX/keepY are the coordinates and the silhouette is the intensity.
29
#' A z-score is assigned to each slope.
30
#' We then identify the most significant slope which indicates a drop in the silhouette coefficient and thus a deterioration of the clustering.
31
#'
32
#' 
33
#' @seealso 
34
#' \code{\link[mixOmics]{block.spls}}, \code{\link[timeOmics]{getCluster}}, \code{\link[timeOmics]{plotLong}}
35
#'
36
#' @examples
37
#' demo <- suppressWarnings(get_demo_cluster())
38
#' X <- list(X = demo$X, Z = demo$Z)
39
#' Y <- demo$Y
40
#' test.list.keepX <- list("X" = c(5,10,15,20), "Z" = c(2,4,6,8))
41
#' test.keepY <- c(2:5)
42
#' 
43
#' # tuning
44
#' tune.block.spls <- tuneCluster.block.spls(X= X, Y= Y, 
45
#'                                           test.list.keepX= test.list.keepX, 
46
#'                                           test.keepY= test.keepY, 
47
#'                                           mode= "canonical")
48
#' keepX <- tune.block.spls$choice.keepX
49
#' keepY <- tune.block.spls$choice.keepY
50
#' 
51
#' # final model
52
#' block.spls.res <- mixOmics::block.spls(X= X, Y= Y, keepX = keepX, 
53
#'                              keepY = keepY, ncomp = 2, mode = "canonical")
54
#' # get clusters and plot longitudinal profile by cluster
55
#' block.spls.cluster <- getCluster(block.spls.res)
56
57
#' @import mixOmics
58
#' @importFrom dplyr left_join
59
#' @importFrom dplyr mutate
60
#' @importFrom dplyr filter
61
#' @export
62
tuneCluster.block.spls <- function(X, Y = NULL,  indY = NULL, ncomp = 2, 
63
                                   test.list.keepX = NULL, test.keepY = NULL, ...){
64
    #-- checking input parameters ---------------------------------------------#
65
    #--------------------------------------------------------------------------#
66
67
    #-- X
68
    X <- validate_list_matrix_X(X)
69
70
    #-- Y
71
    if(!is.null(Y)){
72
        Y <- validate_matrix_Y(Y)
73
    } else {
74
        indY <- validate_indY(indY, X)
75
    }
76
77
    #-- ncomp
78
    ncomp <- validate_ncomp(ncomp = ncomp, X =X)
79
80
    #-- keepX
81
    test.list.keepX <- validate_test_list_keepX(test.keepX = test.list.keepX, X = X, ncomp = ncomp)
82
83
    #-- keepY
84
    test.keepY <- validate_test_keepY(test.keepY = test.keepY, Y = Y)
85
86
    list.keepX.keepY <- test.list.keepX
87
    if(!is.null(Y)) {
88
        list.keepX.keepY$Y <- test.keepY
89
    }
90
    list.keepX.keepY <- expand.grid(list.keepX.keepY, stringsAsFactors = FALSE,
91
                                    KEEP.OUT.ATTRS = FALSE)
92
93
94
    #-- launch tuning  --------------------------------------------------------#
95
    #--------------------------------------------------------------------------#
96
97
    #-- 0. set output object
98
    result <- matrix(ncol = 3 + length(X),nrow = nrow(list.keepX.keepY)*ncomp) %>%
99
        as.data.frame() %>%
100
        purrr::set_names(c("comp", names(X), "pos", "neg"))
101
    if(!is.null(Y)){
102
        result <- result %>%
103
            mutate("Y" = NA)
104
    }
105
    result.index <- 1
106
107
    #--1. compute dissimilarity matrix for silhouette coef. (once and for all)
108
    all_data <- X
109
    all_data_name <- as.character(unlist(imap(X, ~rep(.y,ncol(.x)))))
110
    if(is.null(all_data$Y) & !is.null(Y)){
111
        all_data$Y <- Y
112
        all_data_name <- c(all_data_name, rep("Y", ncol(Y)))
113
    }
114
    all_data <- do.call("cbind", all_data)
115
    dmatrix <- dmatrix.spearman.dissimilarity(all_data)
116
117
    cluster <- as.data.frame(list("feature" = rownames(dmatrix),
118
                                  "block" = all_data_name))
119
120
    #--2. tuning
121
    for(comp in 1:ncomp){
122
        for(index.list.kX.kY in 1:nrow(list.keepX.keepY)){
123
            # foreach comp, keepX and keepY of other comp is set to minimum
124
            kX <- lapply(list.keepX.keepY, function(x) rep(min(x), ncomp))
125
            for(block in names(list.keepX.keepY)){
126
                kX[[block]][comp] <- list.keepX.keepY[index.list.kX.kY, block]
127
                result[result.index, block] = list.keepX.keepY[index.list.kX.kY, block]
128
            }
129
            if(!is.null(Y)){
130
                kY <- kX[["Y"]]
131
                kX[["Y"]] <- NULL
132
133
                #--3. run spls
134
                block.spls.res <- mixOmics::block.spls(X = X, Y = Y, ncomp =  ncomp,
135
                                                       keepX = kX, keepY = kY, ...)
136
            } else {
137
                #--3. run spls
138
                block.spls.res <- mixOmics::block.spls(X = X, ncomp =  ncomp,
139
                                                       keepX = kX, indY = indY, ...)
140
            }
141
142
143
            #--4. extract clusters
144
            tmp.cluster <- getCluster(block.spls.res)
145
            tmp.cluster <- suppressWarnings(dplyr::left_join(cluster, tmp.cluster[c(1,4)],
146
                                                             by = c("feature"="molecule"))) %>%
147
                mutate(cluster = as.numeric(as.character(cluster))) %>%
148
                mutate(cluster = ifelse(is.na(cluster), 0, cluster))
149
150
            #--5. compute silhouette
151
            sil <- silhouette(dmatrix, tmp.cluster$cluster)
152
153
            #--6. store
154
            result[result.index, "comp"] <- comp
155
            pos.res <-  sil$average.cluster  %>% 
156
                dplyr::filter(cluster == comp) %>%
157
                dplyr::pull(silhouette.coef)
158
            result[result.index, "pos"] <- ifelse(length(pos.res) == 0, NA, pos.res)
159
            neg.res <-  sil$average.cluster  %>%
160
                dplyr::filter(cluster == -comp) %>%
161
                dplyr::pull(silhouette.coef)
162
            result[result.index, "neg"] <- ifelse(length(neg.res) == 0, NA, neg.res)
163
            result.index <- result.index + 1
164
        }
165
    }
166
    result <- list("silhouette" = result)
167
    result[["ncomp"]] <- ncomp 
168
    result[["test.keepX"]] <- test.list.keepX
169
    result[["test.keepY"]] <- test.keepY
170
    result[["block"]] <- names(list.keepX.keepY)
171
    class(result) <- "block.spls.tune.silhouette"
172
    
173
    #-- 7. choice.keepX / choice.keepY
174
    result[["slopes"]] <- tune.silhouette.get_slopes(result)
175
    tmp <- tune.silhouette.get_choice_keepX(result) %>% 
176
        do.call(what = "rbind")
177
    if(!is.null(Y)){ # choice keepY
178
        result[["choice.keepY"]] <- tmp$Y
179
        tmp <- dplyr::select(tmp, -Y)
180
    }
181
    result[["choice.keepX"]] <- as.list(tmp)
182
    return(result)
183
}
184
185
#' @importFrom purrr imap set_names map_dfr
186
#' @importFrom dplyr filter mutate group_by summarise left_join n
187
#' @importFrom stringr str_split
188
tune.silhouette.get_slopes <- function(object){
189
    stopifnot(class(object) %in% c("block.spls.tune.silhouette", "spls.tune.silhouette", "spca.tune.silhouette"))
190
    # tune.silhouette is a data.frame (comp, X, Y, bock ..., pos, neg)
191
    # tune.silhouette <- tune.block.spls$silhouette
192
    
193
    block <- object$block
194
    ncomp <- object$ncomp
195
    
196
    if(is(object, "block.spls.tune.silhouette")){
197
        coord <- object$test.keepX
198
        if(!is.null(object[["test.keepY"]])){
199
            coord[["Y"]] <- object$test.keepY
200
            coord <- lapply(coord, sort)
201
        }
202
    }else if(is(object, "spls.tune.silhouette")){
203
        coord <- list(X= sort(object$test.keepX), 
204
                      Y= sort(object$test.keepY))
205
    }else if(is(object, "spca.tune.silhouette")){
206
        coord <- list(X= sort(object$test.keepX))
207
    }
208
209
    # get all points 
210
    all.points <- unique(object$silhouette[block])
211
    
212
    # define neighbours
213
    neighbourhood <- map_dfr(1:nrow(all.points), ~tune.silhouette.get_neighbours(coord = coord, all.points[.x,,drop=FALSE]))
214
    
215
    # extract pos and neg and set it as named list for better performance
216
    split_by_comp <- split(object$silhouette, f= as.factor(object$silhouette$comp))
217
    names_split_by_comp <- lapply(split_by_comp, 
218
                                  function(x) as.vector(apply(x[block], 1, function(y) paste(y, collapse = "_"))))
219
    
220
    POS <- purrr::imap(split_by_comp, ~ purrr::set_names(.x[["pos"]], names_split_by_comp[[.y]]))
221
    NEG <- purrr::imap(split_by_comp, ~ purrr::set_names(.x[["neg"]], names_split_by_comp[[.y]]))
222
    
223
    # compute slope (pos / neg by comp)
224
    slopes <- purrr::map_dfr(as.character(1:ncomp), ~{
225
        cbind(neighbourhood,
226
              "origin.pos" = POS[[.x]][neighbourhood$origin],
227
              "destination.pos" = POS[[.x]][neighbourhood$destination],
228
              "origin.neg" = NEG[[.x]][neighbourhood$origin],
229
              "destination.neg" = NEG[[.x]][neighbourhood$destination],
230
              "comp" = as.numeric(.x))})
231
    
232
    slopes <- slopes %>%
233
        dplyr::filter(origin!=destination) %>%
234
        dplyr::mutate("slope.pos" = tune.silhouette.get_slopes_coef(x1 = lapply(stringr::str_split(.$origin, "_"), as.numeric),
235
                                                             x2 = lapply(stringr::str_split(.$destination, "_"), as.numeric),
236
                                                             y1 = .$origin.pos,
237
                                                             y2 = .$destination.pos)) %>%
238
        dplyr::mutate("slope.neg" = tune.silhouette.get_slopes_coef(x1 = lapply(stringr::str_split(.$origin, "_"), as.numeric),
239
                                                             x2 = lapply(stringr::str_split(.$destination, "_"), as.numeric),
240
                                                             y1 = .$origin.neg,
241
                                                             y2 = .$destination.neg)) %>%
242
        dplyr::mutate("distance_from_origin" = tune.silhouette.distance_from_origin(x1 = lapply(stringr::str_split(.$origin, "_"), as.numeric)))
243
    
244
    # cumpute SD by comp and direction
245
    SD <- slopes %>% 
246
        dplyr::group_by(comp, direction) %>% 
247
        dplyr::summarise(sd.pos = sd_new(slope.pos, na.rm = TRUE),
248
                         sd.neg = sd_new(slope.neg, na.rm = TRUE),
249
                         mean.pos = mean(slope.pos, na.rm = TRUE),
250
                         mean.neg = mean(slope.neg, na.rm = TRUE), N=dplyr::n())
251
    
252
    # add Pval for signif slopes
253
    slopes <- slopes %>%
254
        dplyr::left_join(SD, by = c("direction", "comp")) %>%
255
        # pos
256
        dplyr::mutate(Z_score.pos = ifelse(.$sd.pos == 0,0,(.$slope.pos - .$mean.pos)/.$sd.pos)) %>%
257
        dplyr::mutate(Pval.pos = ifelse(Z_score.pos >= 0,1-pnorm(Z_score.pos), pnorm(Z_score.pos))) %>%
258
        # neg
259
        dplyr::mutate(Z_score.neg = ifelse(.$sd.neg == 0,0,(.$slope.neg - .$mean.neg)/.$sd.neg)) %>%
260
        dplyr::mutate(Pval.neg = ifelse(Z_score.neg >= 0,1-pnorm(Z_score.neg), pnorm(Z_score.neg)))
261
    return(slopes)
262
}
263
264
#' @importFrom purrr map2
265
tune.silhouette.get_neighbours <- function(coord, point){
266
    # return forward neighbourhood of a point
267
    
268
    # valid point in coord
269
    stopifnot(all(names(coord) == names(point)))
270
    
271
    neighbour.max <- point
272
    index <- purrr::map2(coord, point, ~which(.x == .y))
273
274
    # get max neighbours
275
    for(i in names(index)){
276
        if(!(length(coord[[i]]) == index[[i]])){ 
277
            neighbour.max[[i]] <- coord[[i]][index[[i]]+1]
278
        }
279
    } # can be simplified
280
    
281
    # get all close possible neighbours
282
    neighbourhood <- as.list(rbind(as.data.frame(point), as.data.frame(neighbour.max))) %>%
283
        lapply(unique) %>%
284
    expand.grid(stringsAsFactors = FALSE, KEEP.OUT.ATTRS = FALSE)
285
    
286
    # get direction for standard deviation computation
287
    direction <- vector(mode = "numeric", length = nrow(neighbourhood))
288
    for(i in 1:nrow(neighbourhood)){
289
        direction[i] <- purrr::map2(coord, neighbourhood[i,], ~which(.x == .y)) %>%
290
            purrr::map2(index, ~{.x-.y}) %>%
291
            paste(collapse = "_")
292
    }
293
    
294
    neighbourhood["direction"] <- direction
295
    neighbourhood["origin"] <- paste(point, collapse = "_")
296
    neighbourhood["destination"] <- apply(neighbourhood[names(coord)], 1, 
297
                                          function(x) paste(x, collapse = "_"))
298
    return(neighbourhood)
299
}
300
301
#' @importFrom purrr map_dbl
302
tune.silhouette.get_slopes_coef <- function(x1,x2,y1,y2){
303
    stopifnot(length(x1) == length(x2))
304
    stopifnot(length(x2) == length(y1))
305
    stopifnot(length(y1) == length(y2))
306
    
307
    res <- purrr::map_dbl(seq_along(x1), ~{
308
        euc.dist <- sqrt(sum((x1[[.x]] - x2[[.x]])^2));
309
        ifelse(euc.dist == 0, 0, (y2[.x] - y1[.x])/euc.dist)})
310
    return(res)
311
}
312
313
#' @importFrom purrr map_dbl
314
tune.silhouette.distance_from_origin <- function(x1){
315
    euc.dist <- purrr::map_dbl(seq_along(x1), ~{ sqrt(sum((x1[[.x]])^2))})
316
    return(euc.dist)
317
}
318
319
#' @importFrom dplyr select summarise left_join
320
#' @importFrom tidyr gather
321
#' @importFrom magrittr %>%
322
tune.silhouette.get_choice_keepX <- function(tune.block.spls){
323
    # from slopes, keep useful columns and remove NAs.
324
    slopes <- tune.block.spls$slopes
325
    tmp <- slopes %>%
326
        dplyr::select(c(tune.block.spls$block, comp, direction, Pval.pos, Pval.neg, distance_from_origin)) %>%
327
        tidyr::gather(Pval.dir, Pval.value, -c(tune.block.spls$block, comp, direction, distance_from_origin)) 
328
    
329
    # for each comp, arrange by Pvalue and distance from origin and get first result    
330
    MIN <- split(tmp, f=tmp$comp) %>% 
331
        na.omit() %>% 
332
        lapply(function(x) x %>%
333
                   dplyr::arrange(Pval.value, distance_from_origin) %>%
334
                   .[1,] %>%
335
                   dplyr::select(tune.block.spls$block))
336
    
337
    return(MIN)
338
}