timeOmics / Git / Diff of /R/tuneCluster.block.spls.R

Models:
AlyssaS/
timeOmics
Downloads: 1
Diff of /R/tuneCluster.block.spls.R [000000] .. [d79ff0]
Switch to side-by-side view

--- a
+++ b/R/tuneCluster.block.spls.R
@@ -0,0 +1,338 @@
+#' Feature Selection Optimization for block (s)PLS method 
+#'
+#' This function identify the number of feautures to keep per component and thus by cluster in \code{mixOmics::block.spls} by optimizing the silhouette coefficient, which assesses the quality of clustering.
+#' 
+#' @param X list of numeric matrix (or data.frame) with features in columns and samples in rows (with samples order matching in all data sets).
+#' @param Y (optional) numeric matrix (or data.frame) with features in columns and samples in rows (same rows as \code{X}).
+#' @param indY integer, to supply if Y is missing, indicates the position of the matrix response in the list \code{X}.
+#' @param ncomp integer, number of component to include in the model
+#' @param test.list.keepX list of integers with the same size as X. Each entry corresponds to the different keepX value to test for each block of \code{X}.
+#' @param test.keepY only if Y is provideid. Vector of integer containing the different value of keepY to test for block \code{Y}.
+#' @param ... other parameters to be included in the spls model (see \code{mixOmics::block.spls})
+#' 
+#' @return 
+#' \item{silhouette}{silhouette coef. computed for every combinasion of keepX/keepY}
+#' \item{ncomp}{number of component included in the model}
+#' \item{test.keepX}{list of tested keepX}
+#' \item{test.keepY}{list of tested keepY}
+#' \item{block}{names of blocks}
+#' \item{slopes}{"slopes" computed from the silhouette coef. for each keepX and keepY, used to determine the best keepX and keepY}
+#' \item{choice.keepX}{best \code{keepX} for each component}
+#' \item{choice.keepY}{best \code{keepY} for each component}
+#' 
+#'
+#' @details
+#' For each component and for each keepX/keepY value, a spls is done from these parameters.
+#' Then the clustering is performed and the silhouette coefficient is calculated for this clustering.
+#'
+#' We then calculate "slopes" where keepX/keepY are the coordinates and the silhouette is the intensity.
+#' A z-score is assigned to each slope.
+#' We then identify the most significant slope which indicates a drop in the silhouette coefficient and thus a deterioration of the clustering.
+#'
+#' 
+#' @seealso 
+#' \code{\link[mixOmics]{block.spls}}, \code{\link[timeOmics]{getCluster}}, \code{\link[timeOmics]{plotLong}}
+#'
+#' @examples
+#' demo <- suppressWarnings(get_demo_cluster())
+#' X <- list(X = demo$X, Z = demo$Z)
+#' Y <- demo$Y
+#' test.list.keepX <- list("X" = c(5,10,15,20), "Z" = c(2,4,6,8))
+#' test.keepY <- c(2:5)
+#' 
+#' # tuning
+#' tune.block.spls <- tuneCluster.block.spls(X= X, Y= Y, 
+#'                                           test.list.keepX= test.list.keepX, 
+#'                                           test.keepY= test.keepY, 
+#'                                           mode= "canonical")
+#' keepX <- tune.block.spls$choice.keepX
+#' keepY <- tune.block.spls$choice.keepY
+#' 
+#' # final model
+#' block.spls.res <- mixOmics::block.spls(X= X, Y= Y, keepX = keepX, 
+#'                              keepY = keepY, ncomp = 2, mode = "canonical")
+#' # get clusters and plot longitudinal profile by cluster
+#' block.spls.cluster <- getCluster(block.spls.res)
+
+#' @import mixOmics
+#' @importFrom dplyr left_join
+#' @importFrom dplyr mutate
+#' @importFrom dplyr filter
+#' @export
+tuneCluster.block.spls <- function(X, Y = NULL,  indY = NULL, ncomp = 2, 
+                                   test.list.keepX = NULL, test.keepY = NULL, ...){
+    #-- checking input parameters ---------------------------------------------#
+    #--------------------------------------------------------------------------#
+
+    #-- X
+    X <- validate_list_matrix_X(X)
+
+    #-- Y
+    if(!is.null(Y)){
+        Y <- validate_matrix_Y(Y)
+    } else {
+        indY <- validate_indY(indY, X)
+    }
+
+    #-- ncomp
+    ncomp <- validate_ncomp(ncomp = ncomp, X =X)
+
+    #-- keepX
+    test.list.keepX <- validate_test_list_keepX(test.keepX = test.list.keepX, X = X, ncomp = ncomp)
+
+    #-- keepY
+    test.keepY <- validate_test_keepY(test.keepY = test.keepY, Y = Y)
+
+    list.keepX.keepY <- test.list.keepX
+    if(!is.null(Y)) {
+        list.keepX.keepY$Y <- test.keepY
+    }
+    list.keepX.keepY <- expand.grid(list.keepX.keepY, stringsAsFactors = FALSE,
+                                    KEEP.OUT.ATTRS = FALSE)
+
+
+    #-- launch tuning  --------------------------------------------------------#
+    #--------------------------------------------------------------------------#
+
+    #-- 0. set output object
+    result <- matrix(ncol = 3 + length(X),nrow = nrow(list.keepX.keepY)*ncomp) %>%
+        as.data.frame() %>%
+        purrr::set_names(c("comp", names(X), "pos", "neg"))
+    if(!is.null(Y)){
+        result <- result %>%
+            mutate("Y" = NA)
+    }
+    result.index <- 1
+
+    #--1. compute dissimilarity matrix for silhouette coef. (once and for all)
+    all_data <- X
+    all_data_name <- as.character(unlist(imap(X, ~rep(.y,ncol(.x)))))
+    if(is.null(all_data$Y) & !is.null(Y)){
+        all_data$Y <- Y
+        all_data_name <- c(all_data_name, rep("Y", ncol(Y)))
+    }
+    all_data <- do.call("cbind", all_data)
+    dmatrix <- dmatrix.spearman.dissimilarity(all_data)
+
+    cluster <- as.data.frame(list("feature" = rownames(dmatrix),
+                                  "block" = all_data_name))
+
+    #--2. tuning
+    for(comp in 1:ncomp){
+        for(index.list.kX.kY in 1:nrow(list.keepX.keepY)){
+            # foreach comp, keepX and keepY of other comp is set to minimum
+            kX <- lapply(list.keepX.keepY, function(x) rep(min(x), ncomp))
+            for(block in names(list.keepX.keepY)){
+                kX[[block]][comp] <- list.keepX.keepY[index.list.kX.kY, block]
+                result[result.index, block] = list.keepX.keepY[index.list.kX.kY, block]
+            }
+            if(!is.null(Y)){
+                kY <- kX[["Y"]]
+                kX[["Y"]] <- NULL
+
+                #--3. run spls
+                block.spls.res <- mixOmics::block.spls(X = X, Y = Y, ncomp =  ncomp,
+                                                       keepX = kX, keepY = kY, ...)
+            } else {
+                #--3. run spls
+                block.spls.res <- mixOmics::block.spls(X = X, ncomp =  ncomp,
+                                                       keepX = kX, indY = indY, ...)
+            }
+
+
+            #--4. extract clusters
+            tmp.cluster <- getCluster(block.spls.res)
+            tmp.cluster <- suppressWarnings(dplyr::left_join(cluster, tmp.cluster[c(1,4)],
+                                                             by = c("feature"="molecule"))) %>%
+                mutate(cluster = as.numeric(as.character(cluster))) %>%
+                mutate(cluster = ifelse(is.na(cluster), 0, cluster))
+
+            #--5. compute silhouette
+            sil <- silhouette(dmatrix, tmp.cluster$cluster)
+
+            #--6. store
+            result[result.index, "comp"] <- comp
+            pos.res <-  sil$average.cluster  %>% 
+                dplyr::filter(cluster == comp) %>%
+                dplyr::pull(silhouette.coef)
+            result[result.index, "pos"] <- ifelse(length(pos.res) == 0, NA, pos.res)
+            neg.res <-  sil$average.cluster  %>%
+                dplyr::filter(cluster == -comp) %>%
+                dplyr::pull(silhouette.coef)
+            result[result.index, "neg"] <- ifelse(length(neg.res) == 0, NA, neg.res)
+            result.index <- result.index + 1
+        }
+    }
+    result <- list("silhouette" = result)
+    result[["ncomp"]] <- ncomp 
+    result[["test.keepX"]] <- test.list.keepX
+    result[["test.keepY"]] <- test.keepY
+    result[["block"]] <- names(list.keepX.keepY)
+    class(result) <- "block.spls.tune.silhouette"
+    
+    #-- 7. choice.keepX / choice.keepY
+    result[["slopes"]] <- tune.silhouette.get_slopes(result)
+    tmp <- tune.silhouette.get_choice_keepX(result) %>% 
+        do.call(what = "rbind")
+    if(!is.null(Y)){ # choice keepY
+        result[["choice.keepY"]] <- tmp$Y
+        tmp <- dplyr::select(tmp, -Y)
+    }
+    result[["choice.keepX"]] <- as.list(tmp)
+    return(result)
+}
+
+#' @importFrom purrr imap set_names map_dfr
+#' @importFrom dplyr filter mutate group_by summarise left_join n
+#' @importFrom stringr str_split
+tune.silhouette.get_slopes <- function(object){
+    stopifnot(class(object) %in% c("block.spls.tune.silhouette", "spls.tune.silhouette", "spca.tune.silhouette"))
+    # tune.silhouette is a data.frame (comp, X, Y, bock ..., pos, neg)
+    # tune.silhouette <- tune.block.spls$silhouette
+    
+    block <- object$block
+    ncomp <- object$ncomp
+    
+    if(is(object, "block.spls.tune.silhouette")){
+        coord <- object$test.keepX
+        if(!is.null(object[["test.keepY"]])){
+            coord[["Y"]] <- object$test.keepY
+            coord <- lapply(coord, sort)
+        }
+    }else if(is(object, "spls.tune.silhouette")){
+        coord <- list(X= sort(object$test.keepX), 
+                      Y= sort(object$test.keepY))
+    }else if(is(object, "spca.tune.silhouette")){
+        coord <- list(X= sort(object$test.keepX))
+    }
+
+    # get all points 
+    all.points <- unique(object$silhouette[block])
+    
+    # define neighbours
+    neighbourhood <- map_dfr(1:nrow(all.points), ~tune.silhouette.get_neighbours(coord = coord, all.points[.x,,drop=FALSE]))
+    
+    # extract pos and neg and set it as named list for better performance
+    split_by_comp <- split(object$silhouette, f= as.factor(object$silhouette$comp))
+    names_split_by_comp <- lapply(split_by_comp, 
+                                  function(x) as.vector(apply(x[block], 1, function(y) paste(y, collapse = "_"))))
+    
+    POS <- purrr::imap(split_by_comp, ~ purrr::set_names(.x[["pos"]], names_split_by_comp[[.y]]))
+    NEG <- purrr::imap(split_by_comp, ~ purrr::set_names(.x[["neg"]], names_split_by_comp[[.y]]))
+    
+    # compute slope (pos / neg by comp)
+    slopes <- purrr::map_dfr(as.character(1:ncomp), ~{
+        cbind(neighbourhood,
+              "origin.pos" = POS[[.x]][neighbourhood$origin],
+              "destination.pos" = POS[[.x]][neighbourhood$destination],
+              "origin.neg" = NEG[[.x]][neighbourhood$origin],
+              "destination.neg" = NEG[[.x]][neighbourhood$destination],
+              "comp" = as.numeric(.x))})
+    
+    slopes <- slopes %>%
+        dplyr::filter(origin!=destination) %>%
+        dplyr::mutate("slope.pos" = tune.silhouette.get_slopes_coef(x1 = lapply(stringr::str_split(.$origin, "_"), as.numeric),
+                                                             x2 = lapply(stringr::str_split(.$destination, "_"), as.numeric),
+                                                             y1 = .$origin.pos,
+                                                             y2 = .$destination.pos)) %>%
+        dplyr::mutate("slope.neg" = tune.silhouette.get_slopes_coef(x1 = lapply(stringr::str_split(.$origin, "_"), as.numeric),
+                                                             x2 = lapply(stringr::str_split(.$destination, "_"), as.numeric),
+                                                             y1 = .$origin.neg,
+                                                             y2 = .$destination.neg)) %>%
+        dplyr::mutate("distance_from_origin" = tune.silhouette.distance_from_origin(x1 = lapply(stringr::str_split(.$origin, "_"), as.numeric)))
+    
+    # cumpute SD by comp and direction
+    SD <- slopes %>% 
+        dplyr::group_by(comp, direction) %>% 
+        dplyr::summarise(sd.pos = sd_new(slope.pos, na.rm = TRUE),
+                         sd.neg = sd_new(slope.neg, na.rm = TRUE),
+                         mean.pos = mean(slope.pos, na.rm = TRUE),
+                         mean.neg = mean(slope.neg, na.rm = TRUE), N=dplyr::n())
+    
+    # add Pval for signif slopes
+    slopes <- slopes %>%
+        dplyr::left_join(SD, by = c("direction", "comp")) %>%
+        # pos
+        dplyr::mutate(Z_score.pos = ifelse(.$sd.pos == 0,0,(.$slope.pos - .$mean.pos)/.$sd.pos)) %>%
+        dplyr::mutate(Pval.pos = ifelse(Z_score.pos >= 0,1-pnorm(Z_score.pos), pnorm(Z_score.pos))) %>%
+        # neg
+        dplyr::mutate(Z_score.neg = ifelse(.$sd.neg == 0,0,(.$slope.neg - .$mean.neg)/.$sd.neg)) %>%
+        dplyr::mutate(Pval.neg = ifelse(Z_score.neg >= 0,1-pnorm(Z_score.neg), pnorm(Z_score.neg)))
+    return(slopes)
+}
+
+#' @importFrom purrr map2
+tune.silhouette.get_neighbours <- function(coord, point){
+    # return forward neighbourhood of a point
+    
+    # valid point in coord
+    stopifnot(all(names(coord) == names(point)))
+    
+    neighbour.max <- point
+    index <- purrr::map2(coord, point, ~which(.x == .y))
+
+    # get max neighbours
+    for(i in names(index)){
+        if(!(length(coord[[i]]) == index[[i]])){ 
+            neighbour.max[[i]] <- coord[[i]][index[[i]]+1]
+        }
+    } # can be simplified
+    
+    # get all close possible neighbours
+    neighbourhood <- as.list(rbind(as.data.frame(point), as.data.frame(neighbour.max))) %>%
+        lapply(unique) %>%
+    expand.grid(stringsAsFactors = FALSE, KEEP.OUT.ATTRS = FALSE)
+    
+    # get direction for standard deviation computation
+    direction <- vector(mode = "numeric", length = nrow(neighbourhood))
+    for(i in 1:nrow(neighbourhood)){
+        direction[i] <- purrr::map2(coord, neighbourhood[i,], ~which(.x == .y)) %>%
+            purrr::map2(index, ~{.x-.y}) %>%
+            paste(collapse = "_")
+    }
+    
+    neighbourhood["direction"] <- direction
+    neighbourhood["origin"] <- paste(point, collapse = "_")
+    neighbourhood["destination"] <- apply(neighbourhood[names(coord)], 1, 
+                                          function(x) paste(x, collapse = "_"))
+    return(neighbourhood)
+}
+
+#' @importFrom purrr map_dbl
+tune.silhouette.get_slopes_coef <- function(x1,x2,y1,y2){
+    stopifnot(length(x1) == length(x2))
+    stopifnot(length(x2) == length(y1))
+    stopifnot(length(y1) == length(y2))
+    
+    res <- purrr::map_dbl(seq_along(x1), ~{
+        euc.dist <- sqrt(sum((x1[[.x]] - x2[[.x]])^2));
+        ifelse(euc.dist == 0, 0, (y2[.x] - y1[.x])/euc.dist)})
+    return(res)
+}
+
+#' @importFrom purrr map_dbl
+tune.silhouette.distance_from_origin <- function(x1){
+    euc.dist <- purrr::map_dbl(seq_along(x1), ~{ sqrt(sum((x1[[.x]])^2))})
+    return(euc.dist)
+}
+
+#' @importFrom dplyr select summarise left_join
+#' @importFrom tidyr gather
+#' @importFrom magrittr %>%
+tune.silhouette.get_choice_keepX <- function(tune.block.spls){
+    # from slopes, keep useful columns and remove NAs.
+    slopes <- tune.block.spls$slopes
+    tmp <- slopes %>%
+        dplyr::select(c(tune.block.spls$block, comp, direction, Pval.pos, Pval.neg, distance_from_origin)) %>%
+        tidyr::gather(Pval.dir, Pval.value, -c(tune.block.spls$block, comp, direction, distance_from_origin)) 
+    
+    # for each comp, arrange by Pvalue and distance from origin and get first result    
+    MIN <- split(tmp, f=tmp$comp) %>% 
+        na.omit() %>% 
+        lapply(function(x) x %>%
+                   dplyr::arrange(Pval.value, distance_from_origin) %>%
+                   .[1,] %>%
+                   dplyr::select(tune.block.spls$block))
+    
+    return(MIN)
+}