[d79ff0]: / R / silhouette.R

Download this file

141 lines (118 with data), 4.7 kB

  1
  2
  3
  4
  5
  6
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
#' Get data for silhouette demo
#'
#' @return A matrix of expression profile, sample in raws, time in columns.
#'
#' @examples
#' data <- get_demo_silhouette()
#'
#' @export
get_demo_silhouette <- function() {
readRDS(system.file("extdata/data_silhouette.rds", package="timeOmics",
mustWork = TRUE))
}
#' @importFrom dplyr group_by
#' @importFrom dplyr summarise
silhouette <- function(dmatrix, # distance matrix
cluster) # cluster vector of size ncol(dmatrix)
{
#-- checking input parameters ---------------------------------------------#
#--------------------------------------------------------------------------#
#-- check dmatrix
stopifnot(is.matrix(dmatrix) || is.data.frame(dmatrix))
dmatrix <- as.matrix(dmatrix)
stopifnot(nrow(dmatrix) == ncol(dmatrix))
#-- check cluster
stopifnot(is.vector(cluster) || is.factor(cluster))
stopifnot(length(cluster) == ncol(dmatrix))
cluster <- factor(cluster)
cluster.levels <- levels(cluster)
#- compute silhouette -----------------------------------------------------#
#--------------------------------------------------------------------------#
average.dist <- matrix(ncol = length(cluster.levels), nrow = nrow(dmatrix))
result <- vector(length = ncol(dmatrix))
for(i in 1:nrow(dmatrix)){
for(j in 1:length(cluster.levels)){
index.tmp <- cluster == cluster.levels[j]
if(cluster.levels[j] == cluster[i]) {
# we do not include d(i,i) in the sum
# can introduce NaN if size of cluster is 1
# but this is handle after because score is 0 when size of cluster is 1
index.tmp[i] <- FALSE
}
average.dist[i,j] <- mean(dmatrix[i, index.tmp])
}
A <- average.dist[i, cluster[i]] # a : inside
#B <- min(average.dist[i,-c(cluster[i])]) # b
# fix R 4.1
B <- min(average.dist[i,-which(cluster[i] %in% cluster.levels)]) # b
result[i] <- silhoutte.formula(A = A, B = B)
}
#-- return
to_return <- list()
#-- silhouette coefficient by feature
to_return[["feature"]] <- cbind(colnames(dmatrix), cluster, as.data.frame(result))
colnames(to_return[["feature"]]) <- c("feature", "cluster", "silhouette.coef")
#-- average silhouette coefficient
to_return[["average"]] <- mean(to_return[["feature"]][["silhouette.coef"]])
#-- average silhouette coefficient by cluster
to_return[["average.cluster"]] <- dplyr::group_by(to_return[["feature"]], cluster) %>%
dplyr::summarise(silhouette.coef = mean(silhouette.coef)) %>%
as.data.frame
return(to_return)
}
#' dmatrix.spearman.dissimilarity
#'
#' Compute the spearman dissimilarity distance.
#'
#' @param X A numeric matrix with feature in colnames
#'
#' @return
#' Return a dissimilarity matrix of size PxP.
#'
dmatrix.spearman.dissimilarity <- function(X){
# between 0 and 2
dmatrix <- cor(x = X, use = 'pairwise.complete.obs',
method = 'spearman')
dmatrix <- 1 - dmatrix
return(dmatrix)
}
# dmatrix.proportionality.distance <- function(X){
# # clr first
# dmatrix <- matrix(ncol = ncol(X), nrow = ncol(X))
# rownames(dmatrix) <- colnames(dmatrix) <- colnames(X)
# for(i in 1:ncol(X)){
# for(j in 1:ncol(X)){
# dmatrix[i,j] <- var(X[,i] - X[,j])/var(X[,i] + X[,j])
# }
# }
# return(dmatrix)
# }
silhoutte.formula <- function(A,B){
# A average dist inside cluster; # B: min average dist outside
stopifnot(is.vector(A) && is.vector(B))
stopifnot(is.numeric(A) && is.numeric(B))
if(!(is.finite(A))){
return(0)
}else{
return((B - A)/(max(A,B)))
}
}
wrapper.silhouette <- function(X, cluster)
{
#-- checking input parameters ---------------------------------------------#
#--------------------------------------------------------------------------#
#-- check X
stopifnot(is.matrix(X) || is.data.frame(X))
X <- as.matrix(X)
#-- check cluster
stopifnot(is.vector(cluster) || is.factor(cluster))
stopifnot(length(cluster) == ncol(X))
cluster <- factor(cluster)
cluster.levels <- levels(cluster)
#-- compute distance matrix -----------------------------------------------#
#--------------------------------------------------------------------------#
dmatrix <- dmatrix.spearman.dissimilarity(X)
#- compute silhouette -----------------------------------------------------#
#--------------------------------------------------------------------------#
silhouette(dmatrix = dmatrix, cluster = cluster)
}