|
a |
|
b/R/clustering.R |
|
|
1 |
#' @include allgenerics.R |
|
|
2 |
#' |
|
|
3 |
NULL |
|
|
4 |
|
|
|
5 |
#### |
|
|
6 |
# Nearest Neighbor graphs #### |
|
|
7 |
#### |
|
|
8 |
|
|
|
9 |
#' Get profile specific neighbors |
|
|
10 |
#' |
|
|
11 |
#' Get neighbors of spatial points |
|
|
12 |
#' |
|
|
13 |
#' @param object a VoltRon object |
|
|
14 |
#' @param assay assay name (exp: Assay1) or assay class (exp: Visium, Xenium), see \link{SampleMetadata}. |
|
|
15 |
#' if NULL, the default assay will be used, see \link{vrMainAssay}. |
|
|
16 |
#' @param method the method used for graph construction, SNN or kNN |
|
|
17 |
#' @param k number of neighbors for kNN |
|
|
18 |
#' @param data.type the type of embedding used for neighborhood calculation, e.g. raw counts (raw), normalized counts (norm), PCA embeddings (pca), UMAP embeddings (umap) etc. |
|
|
19 |
#' @param dims the set of dimensions of the embedding data |
|
|
20 |
#' @param graph.key the name of the graph |
|
|
21 |
#' |
|
|
22 |
#' @importFrom igraph add_edges simplify make_empty_graph vertices E<- E |
|
|
23 |
#' |
|
|
24 |
#' @export |
|
|
25 |
getProfileNeighbors <- function(object, assay = NULL, method = "kNN", k = 10, data.type = "pca", dims = seq_len(30), graph.key = method){ |
|
|
26 |
|
|
|
27 |
# get data |
|
|
28 |
if(data.type %in% c("raw", "norm")){ |
|
|
29 |
nndata <- vrData(object, assay = assay, norm = (data.type == "norm")) |
|
|
30 |
nndata <- t(nndata) |
|
|
31 |
} else { |
|
|
32 |
embedding_names <- vrEmbeddingNames(object) |
|
|
33 |
if(data.type %in% vrEmbeddingNames(object)) { |
|
|
34 |
nndata <- vrEmbeddings(object, assay = assay, type = data.type, dims = dims) |
|
|
35 |
} else { |
|
|
36 |
stop("Please provide a data type from one of three choices: raw, norm and pca") |
|
|
37 |
} |
|
|
38 |
} |
|
|
39 |
|
|
|
40 |
# find profile neighbors |
|
|
41 |
# if(knn.method == "FNN"){ |
|
|
42 |
# nnedges <- FNN::get.knn(nndata, k = k + 1) |
|
|
43 |
nnedges <- knn_annoy(nndata, k = k + 1) |
|
|
44 |
names(nnedges) <- c("nn.index", "nn.dist") |
|
|
45 |
weights <- NULL |
|
|
46 |
nnedges <- |
|
|
47 |
switch(method, |
|
|
48 |
SNN = { |
|
|
49 |
g.out <- build_snn_number(nnedges$nn.index) |
|
|
50 |
nnedges <- g.out[[1]] |
|
|
51 |
weights <- g.out[[2]] |
|
|
52 |
weights <- weights/(2 * (k+2) - weights) |
|
|
53 |
nnedges |
|
|
54 |
}, |
|
|
55 |
kNN = { |
|
|
56 |
nnedges <- nnedges$nn.index |
|
|
57 |
nnedges <- cbind(seq_len(nrow(nndata)), nnedges) |
|
|
58 |
nnedges <- apply(nnedges, 1, function(x){ |
|
|
59 |
do.call(c,lapply(x[-1], function(y) return(c(x[1],y)))) |
|
|
60 |
}) |
|
|
61 |
nnedges |
|
|
62 |
}) |
|
|
63 |
nnedges <- rownames(nndata)[nnedges] |
|
|
64 |
|
|
|
65 |
# make graph and add edges |
|
|
66 |
graph <- make_empty_graph(directed = FALSE) + vertices(rownames(nndata)) |
|
|
67 |
graph <- add_edges(graph, edges = nnedges) |
|
|
68 |
if(!is.null(weights)) |
|
|
69 |
igraph::E(graph)$weight <- weights |
|
|
70 |
graph <- simplify(graph, remove.multiple = TRUE, remove.loops = FALSE) |
|
|
71 |
vrGraph(object, graph.type = graph.key) <- graph |
|
|
72 |
|
|
|
73 |
# return |
|
|
74 |
return(object) |
|
|
75 |
} |
|
|
76 |
|
|
|
77 |
#' knn_annoy |
|
|
78 |
#' |
|
|
79 |
#' knn engine employed by RcppAnnoy package, adapted from \code{BPCells} package. |
|
|
80 |
#' |
|
|
81 |
#' @rdname knn |
|
|
82 |
#' |
|
|
83 |
#' @details **knn_annoy**: Use RcppAnnoy as knn engine |
|
|
84 |
#' |
|
|
85 |
#' @param data data |
|
|
86 |
#' @param query query data (Default: data) |
|
|
87 |
#' @param k number of neighbors for kNN |
|
|
88 |
#' @param n_trees Number of trees during index build time. More trees gives higher accuracy |
|
|
89 |
#' @param search_k Number of nodes to inspect during the query, or -1 for default value. Higher number gives higher accuracy |
|
|
90 |
#' |
|
|
91 |
#' @importFrom RcppAnnoy AnnoyEuclidean |
|
|
92 |
knn_annoy <- function(data, query = data, k = 10, n_trees = 50, search_k = -1) { |
|
|
93 |
annoy <- new(RcppAnnoy::AnnoyEuclidean, ncol(data)) |
|
|
94 |
for (i in seq_len(nrow(data))) { |
|
|
95 |
annoy$addItem(i - 1, data[i, ]) |
|
|
96 |
} |
|
|
97 |
annoy$build(n_trees) |
|
|
98 |
|
|
|
99 |
idx <- matrix(nrow = nrow(query), ncol = k) |
|
|
100 |
dist <- matrix(nrow = nrow(query), ncol = k) |
|
|
101 |
rownames(idx) <- rownames(query) |
|
|
102 |
rownames(dist) <- rownames(query) |
|
|
103 |
for (i in seq_len(nrow(query))) { |
|
|
104 |
res <- annoy$getNNsByVectorList(query[i, ], k, search_k, include_distances = TRUE) |
|
|
105 |
idx[i, ] <- res$item + 1 |
|
|
106 |
dist[i, ] <- res$dist |
|
|
107 |
} |
|
|
108 |
list(idx = idx, dist = dist) |
|
|
109 |
} |
|
|
110 |
|
|
|
111 |
#### |
|
|
112 |
# Clustering #### |
|
|
113 |
#### |
|
|
114 |
|
|
|
115 |
#' getClusters |
|
|
116 |
#' |
|
|
117 |
#' Get clustering of the VoltRon object |
|
|
118 |
#' |
|
|
119 |
#' @param object a VoltRon object |
|
|
120 |
#' @param resolution the resolution parameter for leiden clustering. |
|
|
121 |
#' @param nclus The number of cluster centers for K-means clustering. |
|
|
122 |
#' @param assay assay name (exp: Assay1) or assay class (exp: Visium, Xenium), see \link{SampleMetadata}. |
|
|
123 |
#' if NULL, the default assay will be used, see \link{vrMainAssay}. |
|
|
124 |
#' @param method The method of clustering. Use 'leiden' to perform graph clustering and 'kmeans' for K-means based clustering |
|
|
125 |
#' @param label the name for the newly created clustering column in the metadata |
|
|
126 |
#' @param graph the graph type to be used |
|
|
127 |
#' @param seed seed |
|
|
128 |
#' @param abundance_limit the minimum number of points for a cluster, hence clusters with abundance lower than this limit will be appointed to other nearby clusters |
|
|
129 |
#' |
|
|
130 |
#' @importFrom igraph cluster_leiden |
|
|
131 |
#' @importFrom stats kmeans |
|
|
132 |
#' |
|
|
133 |
#' @export |
|
|
134 |
getClusters <- function(object, resolution = 1, nclus = integer(0), assay = NULL, method = "leiden", label = "clusters", graph = "kNN", seed = 1, abundance_limit = 2){ |
|
|
135 |
|
|
|
136 |
# sample metadata |
|
|
137 |
sample.metadata <- SampleMetadata(object) |
|
|
138 |
|
|
|
139 |
# get assay names |
|
|
140 |
assay_names <- vrAssayNames(object, assay = assay) |
|
|
141 |
|
|
|
142 |
# get assays |
|
|
143 |
object_subset <- subsetVoltRon(object, assays = assay_names) |
|
|
144 |
|
|
|
145 |
# check clustering parameters |
|
|
146 |
.check_clustering_params(method, resolution, nclus, abundance_limit) |
|
|
147 |
|
|
|
148 |
# clustering |
|
|
149 |
set.seed(seed) |
|
|
150 |
if(method == "leiden"){ |
|
|
151 |
object_graph <- vrGraph(object_subset, assay = assay, graph.type = graph) |
|
|
152 |
clusters <- igraph::cluster_leiden(object_graph, objective_function = "modularity", resolution = resolution) |
|
|
153 |
} else if(method == "kmeans"){ |
|
|
154 |
vrdata <- vrData(object_subset, norm = TRUE) |
|
|
155 |
clusters <- stats::kmeans(t(vrdata), centers = nclus) |
|
|
156 |
clusters <- list(names = names(clusters$cluster), membership = clusters$cluster) |
|
|
157 |
} else { |
|
|
158 |
stop("Unrecognized clustering method! Use either 'leiden' for graph clustering or 'kmeans' for K-means clustering") |
|
|
159 |
} |
|
|
160 |
|
|
|
161 |
# correct clustering |
|
|
162 |
clusters <- .correct_low_abundant_clusters(object_graph, clusters, abundance_limit) |
|
|
163 |
|
|
|
164 |
# update metadata |
|
|
165 |
spatialpoints <- vrSpatialPoints(object, assay = assay) |
|
|
166 |
membership <- setNames(rep(NA,length(spatialpoints)), spatialpoints) |
|
|
167 |
membership[clusters$names] <- clusters$membership |
|
|
168 |
object <- addMetadata(object, assay = assay, value = membership, label = label) |
|
|
169 |
|
|
|
170 |
# return |
|
|
171 |
return(object) |
|
|
172 |
} |
|
|
173 |
|
|
|
174 |
#' @noRd |
|
|
175 |
.correct_low_abundant_clusters <- function(object_graph, clusters, abundance_limit){ |
|
|
176 |
|
|
|
177 |
# cluster abundances |
|
|
178 |
cluster_abundance <- table(clusters$membership) |
|
|
179 |
|
|
|
180 |
# check if some clusters are low in abundance |
|
|
181 |
ind <- cluster_abundance < abundance_limit |
|
|
182 |
if(any(ind)){ |
|
|
183 |
low_abundant_clusters <- names(cluster_abundance)[ind] |
|
|
184 |
clusters$membership[clusters$membership %in% low_abundant_clusters] <- NA |
|
|
185 |
} |
|
|
186 |
return(clusters) |
|
|
187 |
} |
|
|
188 |
|
|
|
189 |
#' @noRd |
|
|
190 |
.check_clustering_params <- function(method, resolution, nclus, abundance_limit){ |
|
|
191 |
|
|
|
192 |
# method related params |
|
|
193 |
if(method == "leiden"){ |
|
|
194 |
msg <- "Resolution must be a single numeric and above 0" |
|
|
195 |
if(!is.numeric(resolution)) |
|
|
196 |
stop(msg) |
|
|
197 |
if(length(resolution) > 1) |
|
|
198 |
stop(msg) |
|
|
199 |
if(resolution == 0 | resolution < 0) |
|
|
200 |
stop(msg) |
|
|
201 |
} else if(method == "kmeans"){ |
|
|
202 |
msg <- "Number of cluster centres (nclus) must be a single integer and should be above 1" |
|
|
203 |
if(!is.numeric(nclus)) |
|
|
204 |
stop(msg) |
|
|
205 |
if(length(nclus) > 1) |
|
|
206 |
stop(msg) |
|
|
207 |
if(nclus %% 1 != 0) |
|
|
208 |
stop(msg) |
|
|
209 |
if(nclus == 0) |
|
|
210 |
stop(msg) |
|
|
211 |
} |
|
|
212 |
|
|
|
213 |
# low abundant clusters |
|
|
214 |
msg <- "Low abundance limit must be a single integer and should be above 0" |
|
|
215 |
if(!is.numeric(abundance_limit)) |
|
|
216 |
stop(msg) |
|
|
217 |
if(length(abundance_limit) > 1) |
|
|
218 |
stop(msg) |
|
|
219 |
if(abundance_limit %% 1 != 0) |
|
|
220 |
stop(msg) |
|
|
221 |
if(abundance_limit == 0 || abundance_limit < 0) |
|
|
222 |
stop(msg) |
|
|
223 |
} |
|
|
224 |
|