Diff of /R/clustering.R [000000] .. [413088]

Switch to unified view

a b/R/clustering.R
1
#' @include allgenerics.R
2
#'
3
NULL
4
5
####
6
# Nearest Neighbor graphs ####
7
####
8
9
#' Get profile specific neighbors
10
#'
11
#' Get neighbors of spatial points
12
#'
13
#' @param object a VoltRon object
14
#' @param assay assay name (exp: Assay1) or assay class (exp: Visium, Xenium), see \link{SampleMetadata}. 
15
#' if NULL, the default assay will be used, see \link{vrMainAssay}.
16
#' @param method the method used for graph construction, SNN or kNN
17
#' @param k number of neighbors for kNN
18
#' @param data.type the type of embedding used for neighborhood calculation, e.g. raw counts (raw), normalized counts (norm), PCA embeddings (pca), UMAP embeddings (umap) etc.
19
#' @param dims the set of dimensions of the embedding data
20
#' @param graph.key the name of the graph
21
#'
22
#' @importFrom igraph add_edges simplify make_empty_graph vertices E<- E
23
#'
24
#' @export
25
getProfileNeighbors <- function(object, assay = NULL, method = "kNN", k = 10, data.type = "pca", dims = seq_len(30), graph.key = method){
26
27
  # get data
28
  if(data.type %in% c("raw", "norm")){
29
    nndata <- vrData(object, assay = assay, norm = (data.type == "norm"))
30
    nndata <- t(nndata)
31
  } else {
32
    embedding_names <- vrEmbeddingNames(object)
33
    if(data.type %in% vrEmbeddingNames(object)) {
34
      nndata <- vrEmbeddings(object, assay = assay, type = data.type, dims = dims)
35
    } else {
36
      stop("Please provide a data type from one of three choices: raw, norm and pca")
37
    }
38
  }
39
40
  # find profile neighbors
41
  # if(knn.method == "FNN"){
42
  #   nnedges <- FNN::get.knn(nndata, k = k + 1)
43
  nnedges <- knn_annoy(nndata, k = k + 1)
44
  names(nnedges) <- c("nn.index", "nn.dist")
45
  weights <- NULL
46
  nnedges <-
47
    switch(method,
48
           SNN = {
49
             g.out <- build_snn_number(nnedges$nn.index)
50
             nnedges <- g.out[[1]]
51
             weights <- g.out[[2]]
52
             weights <- weights/(2 * (k+2) - weights)
53
             nnedges
54
           },
55
           kNN = {
56
             nnedges <- nnedges$nn.index
57
             nnedges <- cbind(seq_len(nrow(nndata)), nnedges)
58
             nnedges <- apply(nnedges, 1, function(x){
59
               do.call(c,lapply(x[-1], function(y) return(c(x[1],y))))
60
             })
61
             nnedges
62
           })
63
  nnedges <- rownames(nndata)[nnedges]
64
65
  # make graph and add edges
66
  graph <- make_empty_graph(directed = FALSE) + vertices(rownames(nndata))
67
  graph <- add_edges(graph, edges = nnedges)
68
  if(!is.null(weights))
69
    igraph::E(graph)$weight <- weights
70
  graph <- simplify(graph, remove.multiple = TRUE, remove.loops = FALSE)
71
  vrGraph(object, graph.type = graph.key) <- graph
72
73
  # return
74
  return(object)
75
}
76
77
#' knn_annoy
78
#' 
79
#' knn engine employed by RcppAnnoy package, adapted from \code{BPCells} package.
80
#' 
81
#' @rdname knn
82
#' 
83
#' @details **knn_annoy**: Use RcppAnnoy as knn engine
84
#' 
85
#' @param data data
86
#' @param query query data (Default: data)
87
#' @param k number of neighbors for kNN
88
#' @param n_trees Number of trees during index build time. More trees gives higher accuracy
89
#' @param search_k Number of nodes to inspect during the query, or -1 for default value. Higher number gives higher accuracy
90
#' 
91
#' @importFrom RcppAnnoy AnnoyEuclidean
92
knn_annoy <- function(data, query = data, k = 10, n_trees = 50, search_k = -1) {
93
  annoy <- new(RcppAnnoy::AnnoyEuclidean, ncol(data))
94
  for (i in seq_len(nrow(data))) {
95
    annoy$addItem(i - 1, data[i, ])
96
  }
97
  annoy$build(n_trees)
98
  
99
  idx <- matrix(nrow = nrow(query), ncol = k)
100
  dist <- matrix(nrow = nrow(query), ncol = k)
101
  rownames(idx) <- rownames(query)
102
  rownames(dist) <- rownames(query)
103
  for (i in seq_len(nrow(query))) {
104
    res <- annoy$getNNsByVectorList(query[i, ], k, search_k, include_distances = TRUE)
105
    idx[i, ] <- res$item + 1
106
    dist[i, ] <- res$dist
107
  }
108
  list(idx = idx, dist = dist)
109
}
110
111
####
112
# Clustering ####
113
####
114
115
#' getClusters
116
#'
117
#' Get clustering of the VoltRon object
118
#'
119
#' @param object a VoltRon object
120
#' @param resolution the resolution parameter for leiden clustering.
121
#' @param nclus The number of cluster centers for K-means clustering.
122
#' @param assay assay name (exp: Assay1) or assay class (exp: Visium, Xenium), see \link{SampleMetadata}. 
123
#' if NULL, the default assay will be used, see \link{vrMainAssay}.
124
#' @param method The method of clustering. Use 'leiden' to perform graph clustering and 'kmeans' for K-means based clustering
125
#' @param label the name for the newly created clustering column in the metadata
126
#' @param graph the graph type to be used
127
#' @param seed seed
128
#' @param abundance_limit the minimum number of points for a cluster, hence clusters with abundance lower than this limit will be appointed to other nearby clusters
129
#'
130
#' @importFrom igraph cluster_leiden
131
#' @importFrom stats kmeans
132
#' 
133
#' @export
134
getClusters <- function(object, resolution = 1, nclus = integer(0), assay = NULL, method = "leiden", label = "clusters", graph = "kNN", seed = 1, abundance_limit = 2){
135
136
  # sample metadata
137
  sample.metadata <- SampleMetadata(object)
138
139
  # get assay names
140
  assay_names <- vrAssayNames(object, assay = assay)
141
142
  # get assays
143
  object_subset <- subsetVoltRon(object, assays = assay_names)
144
145
  # check clustering parameters
146
  .check_clustering_params(method, resolution, nclus, abundance_limit)
147
  
148
  # clustering
149
  set.seed(seed)
150
  if(method == "leiden"){
151
    object_graph <- vrGraph(object_subset, assay = assay, graph.type = graph)
152
    clusters <- igraph::cluster_leiden(object_graph, objective_function = "modularity", resolution = resolution) 
153
  } else if(method == "kmeans"){
154
    vrdata <- vrData(object_subset, norm = TRUE)
155
    clusters <- stats::kmeans(t(vrdata), centers = nclus)
156
    clusters <- list(names = names(clusters$cluster), membership = clusters$cluster)
157
  } else {
158
    stop("Unrecognized clustering method! Use either 'leiden' for graph clustering or 'kmeans' for K-means clustering")
159
  }
160
161
  # correct clustering
162
  clusters <- .correct_low_abundant_clusters(object_graph, clusters, abundance_limit)
163
    
164
  # update metadata
165
  spatialpoints <- vrSpatialPoints(object, assay = assay)
166
  membership <- setNames(rep(NA,length(spatialpoints)), spatialpoints)
167
  membership[clusters$names] <- clusters$membership
168
  object <- addMetadata(object, assay = assay, value = membership, label = label)
169
170
  # return
171
  return(object)
172
}
173
174
#' @noRd
175
.correct_low_abundant_clusters <- function(object_graph, clusters, abundance_limit){
176
177
  # cluster abundances
178
  cluster_abundance <- table(clusters$membership)
179
  
180
  # check if some clusters are low in abundance
181
  ind <- cluster_abundance < abundance_limit
182
  if(any(ind)){
183
    low_abundant_clusters <- names(cluster_abundance)[ind]
184
    clusters$membership[clusters$membership %in% low_abundant_clusters] <- NA
185
  } 
186
  return(clusters)
187
}
188
189
#' @noRd
190
.check_clustering_params <- function(method, resolution, nclus, abundance_limit){
191
  
192
  # method related params
193
  if(method == "leiden"){
194
    msg <- "Resolution must be a single numeric and above 0"
195
    if(!is.numeric(resolution))
196
      stop(msg) 
197
    if(length(resolution) > 1)
198
      stop(msg) 
199
    if(resolution == 0 | resolution < 0)
200
      stop(msg)
201
  } else if(method == "kmeans"){
202
    msg <- "Number of cluster centres (nclus) must be a single integer and should be above 1"
203
    if(!is.numeric(nclus))
204
      stop(msg) 
205
    if(length(nclus) > 1)
206
      stop(msg) 
207
    if(nclus %% 1 != 0)
208
      stop(msg) 
209
    if(nclus == 0)
210
      stop(msg) 
211
  }
212
  
213
  # low abundant clusters
214
  msg <- "Low abundance limit must be a single integer and should be above 0"
215
  if(!is.numeric(abundance_limit))
216
    stop(msg) 
217
  if(length(abundance_limit) > 1)
218
    stop(msg) 
219
  if(abundance_limit %% 1 != 0)
220
    stop(msg) 
221
  if(abundance_limit == 0 || abundance_limit < 0)
222
    stop(msg) 
223
}
224