--- a +++ b/R/tuneCluster.block.spls.R @@ -0,0 +1,338 @@ +#' Feature Selection Optimization for block (s)PLS method +#' +#' This function identify the number of feautures to keep per component and thus by cluster in \code{mixOmics::block.spls} by optimizing the silhouette coefficient, which assesses the quality of clustering. +#' +#' @param X list of numeric matrix (or data.frame) with features in columns and samples in rows (with samples order matching in all data sets). +#' @param Y (optional) numeric matrix (or data.frame) with features in columns and samples in rows (same rows as \code{X}). +#' @param indY integer, to supply if Y is missing, indicates the position of the matrix response in the list \code{X}. +#' @param ncomp integer, number of component to include in the model +#' @param test.list.keepX list of integers with the same size as X. Each entry corresponds to the different keepX value to test for each block of \code{X}. +#' @param test.keepY only if Y is provideid. Vector of integer containing the different value of keepY to test for block \code{Y}. +#' @param ... other parameters to be included in the spls model (see \code{mixOmics::block.spls}) +#' +#' @return +#' \item{silhouette}{silhouette coef. computed for every combinasion of keepX/keepY} +#' \item{ncomp}{number of component included in the model} +#' \item{test.keepX}{list of tested keepX} +#' \item{test.keepY}{list of tested keepY} +#' \item{block}{names of blocks} +#' \item{slopes}{"slopes" computed from the silhouette coef. for each keepX and keepY, used to determine the best keepX and keepY} +#' \item{choice.keepX}{best \code{keepX} for each component} +#' \item{choice.keepY}{best \code{keepY} for each component} +#' +#' +#' @details +#' For each component and for each keepX/keepY value, a spls is done from these parameters. +#' Then the clustering is performed and the silhouette coefficient is calculated for this clustering. +#' +#' We then calculate "slopes" where keepX/keepY are the coordinates and the silhouette is the intensity. +#' A z-score is assigned to each slope. +#' We then identify the most significant slope which indicates a drop in the silhouette coefficient and thus a deterioration of the clustering. +#' +#' +#' @seealso +#' \code{\link[mixOmics]{block.spls}}, \code{\link[timeOmics]{getCluster}}, \code{\link[timeOmics]{plotLong}} +#' +#' @examples +#' demo <- suppressWarnings(get_demo_cluster()) +#' X <- list(X = demo$X, Z = demo$Z) +#' Y <- demo$Y +#' test.list.keepX <- list("X" = c(5,10,15,20), "Z" = c(2,4,6,8)) +#' test.keepY <- c(2:5) +#' +#' # tuning +#' tune.block.spls <- tuneCluster.block.spls(X= X, Y= Y, +#' test.list.keepX= test.list.keepX, +#' test.keepY= test.keepY, +#' mode= "canonical") +#' keepX <- tune.block.spls$choice.keepX +#' keepY <- tune.block.spls$choice.keepY +#' +#' # final model +#' block.spls.res <- mixOmics::block.spls(X= X, Y= Y, keepX = keepX, +#' keepY = keepY, ncomp = 2, mode = "canonical") +#' # get clusters and plot longitudinal profile by cluster +#' block.spls.cluster <- getCluster(block.spls.res) + +#' @import mixOmics +#' @importFrom dplyr left_join +#' @importFrom dplyr mutate +#' @importFrom dplyr filter +#' @export +tuneCluster.block.spls <- function(X, Y = NULL, indY = NULL, ncomp = 2, + test.list.keepX = NULL, test.keepY = NULL, ...){ + #-- checking input parameters ---------------------------------------------# + #--------------------------------------------------------------------------# + + #-- X + X <- validate_list_matrix_X(X) + + #-- Y + if(!is.null(Y)){ + Y <- validate_matrix_Y(Y) + } else { + indY <- validate_indY(indY, X) + } + + #-- ncomp + ncomp <- validate_ncomp(ncomp = ncomp, X =X) + + #-- keepX + test.list.keepX <- validate_test_list_keepX(test.keepX = test.list.keepX, X = X, ncomp = ncomp) + + #-- keepY + test.keepY <- validate_test_keepY(test.keepY = test.keepY, Y = Y) + + list.keepX.keepY <- test.list.keepX + if(!is.null(Y)) { + list.keepX.keepY$Y <- test.keepY + } + list.keepX.keepY <- expand.grid(list.keepX.keepY, stringsAsFactors = FALSE, + KEEP.OUT.ATTRS = FALSE) + + + #-- launch tuning --------------------------------------------------------# + #--------------------------------------------------------------------------# + + #-- 0. set output object + result <- matrix(ncol = 3 + length(X),nrow = nrow(list.keepX.keepY)*ncomp) %>% + as.data.frame() %>% + purrr::set_names(c("comp", names(X), "pos", "neg")) + if(!is.null(Y)){ + result <- result %>% + mutate("Y" = NA) + } + result.index <- 1 + + #--1. compute dissimilarity matrix for silhouette coef. (once and for all) + all_data <- X + all_data_name <- as.character(unlist(imap(X, ~rep(.y,ncol(.x))))) + if(is.null(all_data$Y) & !is.null(Y)){ + all_data$Y <- Y + all_data_name <- c(all_data_name, rep("Y", ncol(Y))) + } + all_data <- do.call("cbind", all_data) + dmatrix <- dmatrix.spearman.dissimilarity(all_data) + + cluster <- as.data.frame(list("feature" = rownames(dmatrix), + "block" = all_data_name)) + + #--2. tuning + for(comp in 1:ncomp){ + for(index.list.kX.kY in 1:nrow(list.keepX.keepY)){ + # foreach comp, keepX and keepY of other comp is set to minimum + kX <- lapply(list.keepX.keepY, function(x) rep(min(x), ncomp)) + for(block in names(list.keepX.keepY)){ + kX[[block]][comp] <- list.keepX.keepY[index.list.kX.kY, block] + result[result.index, block] = list.keepX.keepY[index.list.kX.kY, block] + } + if(!is.null(Y)){ + kY <- kX[["Y"]] + kX[["Y"]] <- NULL + + #--3. run spls + block.spls.res <- mixOmics::block.spls(X = X, Y = Y, ncomp = ncomp, + keepX = kX, keepY = kY, ...) + } else { + #--3. run spls + block.spls.res <- mixOmics::block.spls(X = X, ncomp = ncomp, + keepX = kX, indY = indY, ...) + } + + + #--4. extract clusters + tmp.cluster <- getCluster(block.spls.res) + tmp.cluster <- suppressWarnings(dplyr::left_join(cluster, tmp.cluster[c(1,4)], + by = c("feature"="molecule"))) %>% + mutate(cluster = as.numeric(as.character(cluster))) %>% + mutate(cluster = ifelse(is.na(cluster), 0, cluster)) + + #--5. compute silhouette + sil <- silhouette(dmatrix, tmp.cluster$cluster) + + #--6. store + result[result.index, "comp"] <- comp + pos.res <- sil$average.cluster %>% + dplyr::filter(cluster == comp) %>% + dplyr::pull(silhouette.coef) + result[result.index, "pos"] <- ifelse(length(pos.res) == 0, NA, pos.res) + neg.res <- sil$average.cluster %>% + dplyr::filter(cluster == -comp) %>% + dplyr::pull(silhouette.coef) + result[result.index, "neg"] <- ifelse(length(neg.res) == 0, NA, neg.res) + result.index <- result.index + 1 + } + } + result <- list("silhouette" = result) + result[["ncomp"]] <- ncomp + result[["test.keepX"]] <- test.list.keepX + result[["test.keepY"]] <- test.keepY + result[["block"]] <- names(list.keepX.keepY) + class(result) <- "block.spls.tune.silhouette" + + #-- 7. choice.keepX / choice.keepY + result[["slopes"]] <- tune.silhouette.get_slopes(result) + tmp <- tune.silhouette.get_choice_keepX(result) %>% + do.call(what = "rbind") + if(!is.null(Y)){ # choice keepY + result[["choice.keepY"]] <- tmp$Y + tmp <- dplyr::select(tmp, -Y) + } + result[["choice.keepX"]] <- as.list(tmp) + return(result) +} + +#' @importFrom purrr imap set_names map_dfr +#' @importFrom dplyr filter mutate group_by summarise left_join n +#' @importFrom stringr str_split +tune.silhouette.get_slopes <- function(object){ + stopifnot(class(object) %in% c("block.spls.tune.silhouette", "spls.tune.silhouette", "spca.tune.silhouette")) + # tune.silhouette is a data.frame (comp, X, Y, bock ..., pos, neg) + # tune.silhouette <- tune.block.spls$silhouette + + block <- object$block + ncomp <- object$ncomp + + if(is(object, "block.spls.tune.silhouette")){ + coord <- object$test.keepX + if(!is.null(object[["test.keepY"]])){ + coord[["Y"]] <- object$test.keepY + coord <- lapply(coord, sort) + } + }else if(is(object, "spls.tune.silhouette")){ + coord <- list(X= sort(object$test.keepX), + Y= sort(object$test.keepY)) + }else if(is(object, "spca.tune.silhouette")){ + coord <- list(X= sort(object$test.keepX)) + } + + # get all points + all.points <- unique(object$silhouette[block]) + + # define neighbours + neighbourhood <- map_dfr(1:nrow(all.points), ~tune.silhouette.get_neighbours(coord = coord, all.points[.x,,drop=FALSE])) + + # extract pos and neg and set it as named list for better performance + split_by_comp <- split(object$silhouette, f= as.factor(object$silhouette$comp)) + names_split_by_comp <- lapply(split_by_comp, + function(x) as.vector(apply(x[block], 1, function(y) paste(y, collapse = "_")))) + + POS <- purrr::imap(split_by_comp, ~ purrr::set_names(.x[["pos"]], names_split_by_comp[[.y]])) + NEG <- purrr::imap(split_by_comp, ~ purrr::set_names(.x[["neg"]], names_split_by_comp[[.y]])) + + # compute slope (pos / neg by comp) + slopes <- purrr::map_dfr(as.character(1:ncomp), ~{ + cbind(neighbourhood, + "origin.pos" = POS[[.x]][neighbourhood$origin], + "destination.pos" = POS[[.x]][neighbourhood$destination], + "origin.neg" = NEG[[.x]][neighbourhood$origin], + "destination.neg" = NEG[[.x]][neighbourhood$destination], + "comp" = as.numeric(.x))}) + + slopes <- slopes %>% + dplyr::filter(origin!=destination) %>% + dplyr::mutate("slope.pos" = tune.silhouette.get_slopes_coef(x1 = lapply(stringr::str_split(.$origin, "_"), as.numeric), + x2 = lapply(stringr::str_split(.$destination, "_"), as.numeric), + y1 = .$origin.pos, + y2 = .$destination.pos)) %>% + dplyr::mutate("slope.neg" = tune.silhouette.get_slopes_coef(x1 = lapply(stringr::str_split(.$origin, "_"), as.numeric), + x2 = lapply(stringr::str_split(.$destination, "_"), as.numeric), + y1 = .$origin.neg, + y2 = .$destination.neg)) %>% + dplyr::mutate("distance_from_origin" = tune.silhouette.distance_from_origin(x1 = lapply(stringr::str_split(.$origin, "_"), as.numeric))) + + # cumpute SD by comp and direction + SD <- slopes %>% + dplyr::group_by(comp, direction) %>% + dplyr::summarise(sd.pos = sd_new(slope.pos, na.rm = TRUE), + sd.neg = sd_new(slope.neg, na.rm = TRUE), + mean.pos = mean(slope.pos, na.rm = TRUE), + mean.neg = mean(slope.neg, na.rm = TRUE), N=dplyr::n()) + + # add Pval for signif slopes + slopes <- slopes %>% + dplyr::left_join(SD, by = c("direction", "comp")) %>% + # pos + dplyr::mutate(Z_score.pos = ifelse(.$sd.pos == 0,0,(.$slope.pos - .$mean.pos)/.$sd.pos)) %>% + dplyr::mutate(Pval.pos = ifelse(Z_score.pos >= 0,1-pnorm(Z_score.pos), pnorm(Z_score.pos))) %>% + # neg + dplyr::mutate(Z_score.neg = ifelse(.$sd.neg == 0,0,(.$slope.neg - .$mean.neg)/.$sd.neg)) %>% + dplyr::mutate(Pval.neg = ifelse(Z_score.neg >= 0,1-pnorm(Z_score.neg), pnorm(Z_score.neg))) + return(slopes) +} + +#' @importFrom purrr map2 +tune.silhouette.get_neighbours <- function(coord, point){ + # return forward neighbourhood of a point + + # valid point in coord + stopifnot(all(names(coord) == names(point))) + + neighbour.max <- point + index <- purrr::map2(coord, point, ~which(.x == .y)) + + # get max neighbours + for(i in names(index)){ + if(!(length(coord[[i]]) == index[[i]])){ + neighbour.max[[i]] <- coord[[i]][index[[i]]+1] + } + } # can be simplified + + # get all close possible neighbours + neighbourhood <- as.list(rbind(as.data.frame(point), as.data.frame(neighbour.max))) %>% + lapply(unique) %>% + expand.grid(stringsAsFactors = FALSE, KEEP.OUT.ATTRS = FALSE) + + # get direction for standard deviation computation + direction <- vector(mode = "numeric", length = nrow(neighbourhood)) + for(i in 1:nrow(neighbourhood)){ + direction[i] <- purrr::map2(coord, neighbourhood[i,], ~which(.x == .y)) %>% + purrr::map2(index, ~{.x-.y}) %>% + paste(collapse = "_") + } + + neighbourhood["direction"] <- direction + neighbourhood["origin"] <- paste(point, collapse = "_") + neighbourhood["destination"] <- apply(neighbourhood[names(coord)], 1, + function(x) paste(x, collapse = "_")) + return(neighbourhood) +} + +#' @importFrom purrr map_dbl +tune.silhouette.get_slopes_coef <- function(x1,x2,y1,y2){ + stopifnot(length(x1) == length(x2)) + stopifnot(length(x2) == length(y1)) + stopifnot(length(y1) == length(y2)) + + res <- purrr::map_dbl(seq_along(x1), ~{ + euc.dist <- sqrt(sum((x1[[.x]] - x2[[.x]])^2)); + ifelse(euc.dist == 0, 0, (y2[.x] - y1[.x])/euc.dist)}) + return(res) +} + +#' @importFrom purrr map_dbl +tune.silhouette.distance_from_origin <- function(x1){ + euc.dist <- purrr::map_dbl(seq_along(x1), ~{ sqrt(sum((x1[[.x]])^2))}) + return(euc.dist) +} + +#' @importFrom dplyr select summarise left_join +#' @importFrom tidyr gather +#' @importFrom magrittr %>% +tune.silhouette.get_choice_keepX <- function(tune.block.spls){ + # from slopes, keep useful columns and remove NAs. + slopes <- tune.block.spls$slopes + tmp <- slopes %>% + dplyr::select(c(tune.block.spls$block, comp, direction, Pval.pos, Pval.neg, distance_from_origin)) %>% + tidyr::gather(Pval.dir, Pval.value, -c(tune.block.spls$block, comp, direction, distance_from_origin)) + + # for each comp, arrange by Pvalue and distance from origin and get first result + MIN <- split(tmp, f=tmp$comp) %>% + na.omit() %>% + lapply(function(x) x %>% + dplyr::arrange(Pval.value, distance_from_origin) %>% + .[1,] %>% + dplyr::select(tune.block.spls$block)) + + return(MIN) +}