a b/R/tuneCluster.spca.R
1
#' Feature Selection Optimization for sPCA method 
2
#'
3
#' This function identify the number of feautures to keep per component and thus by cluster in \code{mixOmics::spca} 
4
#' by optimizing the silhouette coefficient, which assesses the quality of clustering.
5
#' 
6
#' @param X numeric matrix (or data.frame) with features in columns and samples in rows
7
#' @param ncomp integer, number of component to include in the model
8
#' @param test.keepX vector of integer containing the different value of keepX to test for block \code{X}.
9
#' @param ... other parameters to be included in the spls model (see \code{mixOmics::spca})
10
#' 
11
#' @return 
12
#' \item{silhouette}{silhouette coef. computed for every combinasion of keepX/keepY}
13
#' \item{ncomp}{number of component included in the model}
14
#' \item{test.keepX}{list of tested keepX}
15
#' \item{block}{names of blocks}
16
#' \item{slopes}{"slopes" computed from the silhouette coef. for each keepX and keepY, used to determine the best keepX and keepY}
17
#' \item{choice.keepX}{best \code{keepX} for each component}
18
#' 
19
#' @details
20
#' For each component and for each keepX value, a spls is done from these parameters.
21
#' Then the clustering is performed and the silhouette coefficient is calculated for this clustering.
22
#'
23
#' We then calculate "slopes" where keepX are the coordinates and the silhouette is the intensity.
24
#' A z-score is assigned to each slope.
25
#' We then identify the most significant slope which indicates a drop in the silhouette coefficient and thus a deterioration of the clustering.
26
#'
27
#' 
28
#' @examples
29
#' demo <- suppressWarnings(get_demo_cluster())
30
#' X <- demo$X
31
#' 
32
#' # tuning
33
#' tune.spca.res <- tuneCluster.spca(X = X, ncomp = 2, test.keepX = c(2:10))
34
#' keepX <- tune.spca.res$choice.keepX
35
#' plot(tune.spca.res)
36
#' 
37
#' # final model
38
#' spca.res <- mixOmics::spca(X=X, ncomp = 2, keepX = keepX)
39
#' plotLong(spca.res)
40
41
42
#' @import mixOmics
43
#' @importFrom dplyr left_join
44
#' @importFrom dplyr mutate
45
#' @importFrom dplyr filter
46
#' @export
47
tuneCluster.spca <- function(X, ncomp = 2, test.keepX = rep(ncol(X), ncomp), ...){
48
    #-- checking input parameters ---------------------------------------------#
49
    #--------------------------------------------------------------------------#
50
51
    #-- X
52
    X <- validate_matrix_X(X)
53
54
    #-- ncomp
55
    ncomp <- validate_ncomp(ncomp, list(X))
56
    
57
    #-- test.keepX
58
    test.keepX <- validate_test_keepX(test.keepX = test.keepX, X = X)
59
    min.test.keepX <- rep(min(test.keepX), ncomp)
60
61
62
    #-- launch tuning  --------------------------------------------------------#
63
    #--------------------------------------------------------------------------#
64
65
    #-- 0. set output object
66
    result <- as.data.frame(matrix(ncol = 4, nrow = length(test.keepX)*ncomp))
67
    colnames(result) <- c("comp", "X", "pos", "neg")
68
    result.index <- 0
69
70
    #-- 1. compute dissimilarity matrix for silhouette coef. (once and for all)
71
    dmatrix <- dmatrix.spearman.dissimilarity(X)
72
    cluster <- as.data.frame(list("feature" = rownames(dmatrix)))
73
74
    #-- tuning
75
    for(comp in 1:ncomp){
76
        tmp.keepX <- min.test.keepX  # foreach comp, keepX of other comp is set to minimum
77
        for(keepX in test.keepX){
78
            tmp.keepX[comp] <- keepX
79
80
            #-- 2. run spca
81
            kX = tmp.keepX
82
            spca.res <- mixOmics::spca(X = X, ncomp = ncomp, keepX = kX)
83
84
            #-- 3. extract clusters
85
            tmp.cluster <- getCluster(spca.res)
86
            tmp.cluster <- suppressWarnings(dplyr::left_join(cluster, tmp.cluster[c(1,4)],
87
                                                             by = c("feature"="molecule"))) %>%
88
                dplyr::mutate(cluster = as.numeric(as.character(cluster))) %>%
89
                dplyr::mutate(cluster = ifelse(is.na(cluster), 0, cluster))
90
91
            #-- 4. compute silhouette
92
            sil <- silhouette(dmatrix, tmp.cluster$cluster)
93
            
94
            #-- 6. store
95
            result.index <- result.index + 1
96
            result[result.index, "comp"] <- comp
97
            result[result.index, "X"] <- kX[comp]
98
            
99
            pos.res <-  sil$average.cluster  %>% 
100
                dplyr::filter(cluster == comp) %>%
101
                dplyr::pull(silhouette.coef)
102
            result[result.index, "pos"] <- ifelse(length(pos.res) == 0, NA, pos.res)
103
            neg.res <-  sil$average.cluster  %>%
104
                dplyr::filter(cluster == -comp) %>%
105
                dplyr::pull(silhouette.coef)
106
            result[result.index, "neg"] <- ifelse(length(neg.res) == 0, NA, neg.res)
107
        }
108
    }
109
    result <- list("silhouette" = result)
110
    result[["ncomp"]] <- ncomp
111
    result[["test.keepX"]] <- test.keepX
112
    result[["block"]] <- c("X")
113
    class(result) <- "spca.tune.silhouette"
114
115
    #-- 7. choice.keepX
116
    result[["slopes"]] <- tune.silhouette.get_slopes(result)
117
    tmp <- tune.silhouette.get_choice_keepX(result) # choice keepX/keepY
118
    result[["choice.keepX"]] <- unlist(lapply(tmp, function(x) x$X))
119
    return(result)
120
}
121
122
123
124
125
#' @import mixOmics
126
#' @importFrom dplyr left_join mutate filter group_by top_n
127
#' @import ggplot2
128
#' @importFrom tidyr pivot_longer
129
#' @importFrom ggrepel geom_label_repel
130
#' @export
131
plot.spca.tune.silhouette <- function(x, ...){
132
    #-- checking input parameters ---------------------------------------------#
133
    #--------------------------------------------------------------------------#
134
135
    #-- should be a spca.tune.silhouette" object.
136
    ncomp <- x$ncomp
137
    test.keepX <- x$test.keepX
138
    
139
    tmp <- x$silhouette %>% 
140
        tidyr::pivot_longer(names_to = "contrib", values_to = "value", -c(comp,X)) %>%
141
        na.omit %>% # remove NA intruced in pos/neg
142
        dplyr::mutate(comp = as.factor(paste0("Comp ", comp)), 
143
               contrib = factor(contrib, levels = c("pos","neg"))) %>%
144
        dplyr::group_by(comp, contrib)
145
    
146
    choice <- list(comp= as.factor(paste0("Comp ",names(x$choice.keepX))), 
147
                   X = unname(x$choice.keepX)) %>%
148
        as.data.frame() %>%
149
        dplyr::left_join(tmp, by = c("comp"="comp", "X"="X")) %>% 
150
        dplyr::group_by(comp, X) %>%
151
        dplyr::top_n(n = 1, wt = value)
152
    
153
    choice.vline <- choice %>%
154
        dplyr::select(c("comp", "X"))
155
    
156
    ggplot.df <- ggplot(tmp, aes(x=X, y =value, col = comp)) + 
157
        geom_line(aes(lty = contrib)) + facet_wrap(~comp) +
158
        theme_bw() +
159
        geom_point(data = choice, size = 5, pch = 18) +
160
        ggrepel::geom_label_repel(data = choice, aes(label = X), col = "black") +
161
        scale_color_manual(values = mixOmics::color.mixo(1:x$ncomp)) +
162
        labs(x ="tested keepX",  y = "Silhouette Coef.", color = "Comp.", lty = "Contrib.") +
163
        geom_vline(data = choice.vline, aes(xintercept = X), lty = 5, col = "grey")
164
165
166
    print(ggplot.df)
167
    
168
    return(invisible(ggplot.df))
169
}