|
a |
|
b/R/getNEMO.R |
|
|
1 |
#' @name getNEMO |
|
|
2 |
#' @title Get subtypes from NEMO |
|
|
3 |
#' @description This function wraps the NEMO (Neighborhood based multi-omics clustering) algorithm and provides standard output for `getMoHeatmap()` and `getConsensusMOIC()`. |
|
|
4 |
#' @param data List of matrices. |
|
|
5 |
#' @param N.clust Number of clusters. |
|
|
6 |
#' @param num.neighbors The number of neighbors to use for each omic. |
|
|
7 |
#' @param type Data type corresponding to the list of matrics, which can be gaussian, binomial or possion. |
|
|
8 |
#' @return A list with the following components: |
|
|
9 |
#' |
|
|
10 |
#' \code{fit} an object returned by \link[NEMO]{nemo.clustering}. |
|
|
11 |
#' |
|
|
12 |
#' \code{clust.res} a data.frame storing sample ID and corresponding clusters. |
|
|
13 |
#' |
|
|
14 |
#' \code{mo.method} a string value indicating the method used for multi-omics integrative clustering. |
|
|
15 |
#' @import SNFtool |
|
|
16 |
#' @export |
|
|
17 |
#' @references Rappoport N, Shamir R (2019). NEMO: cancer subtyping by integration of partial multi-omic data. Bioinformatics, 35(18):3348-3356. |
|
|
18 |
#' @examples # There is no example and please refer to vignette. |
|
|
19 |
getNEMO <- function(data = NULL, |
|
|
20 |
N.clust = NULL, |
|
|
21 |
type = rep("gaussian", length(data)), |
|
|
22 |
num.neighbors = NA) { |
|
|
23 |
|
|
|
24 |
# check data |
|
|
25 |
n_dat <- length(data) |
|
|
26 |
if(n_dat > 6){ |
|
|
27 |
stop('current verision of MOVICS can support up to 6 datasets.') |
|
|
28 |
} |
|
|
29 |
if(n_dat < 2){ |
|
|
30 |
stop('current verision of MOVICS needs at least 2 omics data.') |
|
|
31 |
} |
|
|
32 |
|
|
|
33 |
useless.argument <- type |
|
|
34 |
fit <- nemo.clustering(omics.list = data, |
|
|
35 |
num.clusters = N.clust, |
|
|
36 |
num.neighbors = num.neighbors) |
|
|
37 |
|
|
|
38 |
clustres <- data.frame(samID = colnames(data[[1]]), |
|
|
39 |
clust = as.numeric(fit), |
|
|
40 |
row.names = colnames(data[[1]]), |
|
|
41 |
stringsAsFactors = FALSE) |
|
|
42 |
#clustres <- clustres[order(clustres$clust),] |
|
|
43 |
|
|
|
44 |
return(list(fit = fit, clust.res = clustres, mo.method = "NEMO")) |
|
|
45 |
} |
|
|
46 |
|
|
|
47 |
#' @title affinityMatrix |
|
|
48 |
#' @name affinityMatrix |
|
|
49 |
#' @param Diff Distance Matrix. |
|
|
50 |
#' @param K Number of nearest neighbors. |
|
|
51 |
#' @param sigma Variance for local model. |
|
|
52 |
#' @author Bo Wang, Aziz Mezlini, Feyyaz Demir, Marc Fiume, Zhuowen Tu, Michael Brudno, Benjamin Haibe-Kains, Anna Goldenberg |
|
|
53 |
#' @references Wang B, Mezlini AM, Demir F, et al (2014). Similarity network fusion for aggregating data types on a genomic scale. Nat Methods, 11(3):333-337. |
|
|
54 |
#' @keywords internal |
|
|
55 |
#' @return affinityMatrix |
|
|
56 |
affinityMatrix <- function(Diff,K=20,sigma=0.5) { |
|
|
57 |
###This function constructs similarity networks. |
|
|
58 |
N = nrow(Diff) |
|
|
59 |
|
|
|
60 |
Diff = (Diff + t(Diff)) / 2 |
|
|
61 |
diag(Diff) = 0; |
|
|
62 |
sortedColumns = as.matrix(t(apply(Diff,2,sort))) |
|
|
63 |
finiteMean <- function(x) { mean(x[is.finite(x)]) } |
|
|
64 |
means = apply(sortedColumns[,1:K+1],1,finiteMean)+.Machine$double.eps; |
|
|
65 |
|
|
|
66 |
avg <- function(x,y) ((x+y)/2) |
|
|
67 |
Sig = outer(means,means,avg)/3*2 + Diff/3 + .Machine$double.eps; |
|
|
68 |
Sig[Sig <= .Machine$double.eps] = .Machine$double.eps |
|
|
69 |
densities = dnorm(Diff,0,sigma*Sig,log = FALSE) |
|
|
70 |
|
|
|
71 |
W = (densities + t(densities)) / 2 |
|
|
72 |
return(W) |
|
|
73 |
} |
|
|
74 |
|
|
|
75 |
#' @title dist2 |
|
|
76 |
#' @name dist2 |
|
|
77 |
#' @param X A data matrix where each row is a different data point. |
|
|
78 |
#' @param C A data matrix where each row is a different data point. If this matrix is the same as X, pairwise distances for all data points are computed. |
|
|
79 |
#' @author Bo Wang, Aziz Mezlini, Feyyaz Demir, Marc Fiume, Zhuowen Tu, Michael Brudno, Benjamin Haibe-Kains, Anna Goldenberg |
|
|
80 |
#' @references Wang B, Mezlini AM, Demir F, et al (2014). Similarity network fusion for aggregating data types on a genomic scale. Nat Methods, 11(3):333-337. |
|
|
81 |
#' @keywords internal |
|
|
82 |
#' @return dist2 |
|
|
83 |
dist2 <- function(X,C) { |
|
|
84 |
ndata = nrow(X) |
|
|
85 |
ncentres = nrow(C) |
|
|
86 |
|
|
|
87 |
sumsqX = rowSums(X^2) |
|
|
88 |
sumsqC = rowSums(C^2) |
|
|
89 |
|
|
|
90 |
XC = 2 * (X %*% t(C)) |
|
|
91 |
|
|
|
92 |
res = matrix(rep(sumsqX,times=ncentres),ndata,ncentres) + t(matrix(rep(sumsqC,times=ndata),ncentres,ndata)) - XC |
|
|
93 |
res[res < 0] = 0 |
|
|
94 |
return(res) |
|
|
95 |
} |
|
|
96 |
|
|
|
97 |
|
|
|
98 |
#' @title Spectral clustering |
|
|
99 |
#' @name spectralClustering |
|
|
100 |
#' @keywords internal |
|
|
101 |
#' @author Bo Wang, Aziz Mezlini, Feyyaz Demir, Marc Fiume, Zhuowen Tu, Michael Brudno, Benjamin Haibe-Kains, Anna Goldenberg |
|
|
102 |
#' @references Wang B, Mezlini AM, Demir F, et al (2014). Similarity network fusion for aggregating data types on a genomic scale. Nat Methods, 11(3):333-337. |
|
|
103 |
#' @return spectralClustering |
|
|
104 |
spectralClustering = SNFtool::spectralClustering |
|
|
105 |
|
|
|
106 |
#' @title NEMO num clusters |
|
|
107 |
#' @name nemo.num.clusters |
|
|
108 |
#' @description Estimates the number of clusters in an affinity graph. |
|
|
109 |
#' @param W the affinity graph. |
|
|
110 |
#' @param NUMC possible values for the number of clusters. Defaults to 2:15. |
|
|
111 |
#' @return the estimated number of clusters in the graph. |
|
|
112 |
#' @author Nimrod Rappoport |
|
|
113 |
#' @references Rappoport N, Shamir R (2019). NEMO: cancer subtyping by integration of partial multi-omic data. Bioinformatics, 35(18):3348-3356. |
|
|
114 |
#' @keywords internal |
|
|
115 |
nemo.num.clusters <- function(W, NUMC=2:15) { |
|
|
116 |
if (min(NUMC) == 1) { |
|
|
117 |
warning("Note that we always assume there are more than one cluster.") |
|
|
118 |
NUMC = NUMC[NUMC > 1] |
|
|
119 |
} |
|
|
120 |
W = (W + t(W))/2 |
|
|
121 |
diag(W) = 0 |
|
|
122 |
if (length(NUMC) > 0) { |
|
|
123 |
degs = rowSums(W) |
|
|
124 |
degs[degs == 0] = .Machine$double.eps |
|
|
125 |
D = diag(degs) |
|
|
126 |
L = D - W |
|
|
127 |
Di = diag(1/sqrt(degs)) |
|
|
128 |
L = Di %*% L %*% Di |
|
|
129 |
print(dim(L)) |
|
|
130 |
eigs = eigen(L) |
|
|
131 |
eigs_order = sort(eigs$values, index.return = TRUE)$ix |
|
|
132 |
eigs$values = eigs$values[eigs_order] |
|
|
133 |
eigs$vectors = eigs$vectors[, eigs_order] |
|
|
134 |
eigengap = abs(diff(eigs$values)) |
|
|
135 |
eigengap = (1:length(eigengap)) * eigengap |
|
|
136 |
|
|
|
137 |
t1 <- sort(eigengap[NUMC], decreasing = TRUE, index.return = TRUE)$ix |
|
|
138 |
return(NUMC[t1[1]]) |
|
|
139 |
} |
|
|
140 |
} |
|
|
141 |
|
|
|
142 |
#' @title NEMO affinity graph |
|
|
143 |
#' @name nemo.affinity.graph |
|
|
144 |
#' @description Constructs a single affinity graph measuring similarity across different omics. |
|
|
145 |
#' @param raw.data A list of the data to be clustered, where each an entry is a matrix of features x samples. |
|
|
146 |
#' @param k The number of neighbors to use for each omic. It can either be a number, a list of numbers |
|
|
147 |
#' or NA. If it is a number, this is the number of neighbors used for all omics. If this is a list, |
|
|
148 |
#' the number of neighbors are taken for each omic from that list. If it is NA, each omic chooses the |
|
|
149 |
#' number of neighbors to be the number of samples divided by NUM.NEIGHBORS.RATIO. |
|
|
150 |
#' @return A single matrix measuring similarity between the samples across all omics. |
|
|
151 |
#' @author Nimrod Rappoport |
|
|
152 |
#' @references Rappoport N, Shamir R (2019). NEMO: cancer subtyping by integration of partial multi-omic data. Bioinformatics, 35(18):3348-3356. |
|
|
153 |
#' @keywords internal |
|
|
154 |
nemo.affinity.graph <- function(raw.data, k = NA, NUM.NEIGHBORS.RATIO = 6) { |
|
|
155 |
if (is.na(k)) { |
|
|
156 |
k = as.numeric(lapply(1:length(raw.data), function(i) round(ncol(raw.data[[i]]) / NUM.NEIGHBORS.RATIO))) |
|
|
157 |
} else if (length(k) == 1) { |
|
|
158 |
k = rep(k, length(raw.data)) |
|
|
159 |
} |
|
|
160 |
sim.data = lapply(1:length(raw.data), function(i) {affinityMatrix(dist2(as.matrix(t(raw.data[[i]])), |
|
|
161 |
as.matrix(t(raw.data[[i]]))), k[i], 0.5)}) |
|
|
162 |
affinity.per.omic = lapply(1:length(raw.data), function(i) { |
|
|
163 |
sim.datum = sim.data[[i]] |
|
|
164 |
non.sym.knn = apply(sim.datum, 1, function(sim.row) { |
|
|
165 |
returned.row = sim.row |
|
|
166 |
threshold = sort(sim.row, decreasing = TRUE)[k[i]] |
|
|
167 |
returned.row[sim.row < threshold] = 0 |
|
|
168 |
row.sum = sum(returned.row) |
|
|
169 |
returned.row[sim.row >= threshold] = returned.row[sim.row >= threshold] / row.sum |
|
|
170 |
return(returned.row) |
|
|
171 |
}) |
|
|
172 |
sym.knn = non.sym.knn + t(non.sym.knn) |
|
|
173 |
return(sym.knn) |
|
|
174 |
}) |
|
|
175 |
patient.names = Reduce(union, lapply(raw.data, colnames)) |
|
|
176 |
num.patients = length(patient.names) |
|
|
177 |
returned.affinity.matrix = matrix(0, ncol = num.patients, nrow=num.patients) |
|
|
178 |
rownames(returned.affinity.matrix) = patient.names |
|
|
179 |
colnames(returned.affinity.matrix) = patient.names |
|
|
180 |
|
|
|
181 |
shared.omic.count = matrix(0, ncol = num.patients, nrow=num.patients) |
|
|
182 |
rownames(shared.omic.count) = patient.names |
|
|
183 |
colnames(shared.omic.count) = patient.names |
|
|
184 |
|
|
|
185 |
for (j in 1:length(raw.data)) { |
|
|
186 |
curr.omic.patients = colnames(raw.data[[j]]) |
|
|
187 |
returned.affinity.matrix[curr.omic.patients, curr.omic.patients] = returned.affinity.matrix[curr.omic.patients, curr.omic.patients] + affinity.per.omic[[j]][curr.omic.patients, curr.omic.patients] |
|
|
188 |
shared.omic.count[curr.omic.patients, curr.omic.patients] = shared.omic.count[curr.omic.patients, curr.omic.patients] + 1 |
|
|
189 |
} |
|
|
190 |
|
|
|
191 |
final.ret = returned.affinity.matrix / shared.omic.count |
|
|
192 |
lower.tri.ret = final.ret[lower.tri(final.ret)] |
|
|
193 |
final.ret[shared.omic.count == 0] = mean(lower.tri.ret[!is.na(lower.tri.ret)]) |
|
|
194 |
|
|
|
195 |
return(final.ret) |
|
|
196 |
} |
|
|
197 |
|
|
|
198 |
#' @title NEMO clustering |
|
|
199 |
#' @name nemo.clustering |
|
|
200 |
#' @description Performs multi-omic clustering on a datset using the NEMO algorithm. |
|
|
201 |
#' Uses nemo.num.clusters to estimate the number of clusters. |
|
|
202 |
#' @param omics.list A list of the data to be clustered, where each an entry is a matrix of features x samples. |
|
|
203 |
#' @param k The number of neighbors to use for each omic. It can either be a number, a list of numbers |
|
|
204 |
#' or NA. If it is a number, this is the number of neighbors used for all omics. If this is a list, |
|
|
205 |
#' the number of neighbors are taken for each omic from that list. If it is NA, each omic chooses the |
|
|
206 |
#' number of neighbors to be the number of samples divided by NUM.NEIGHBORS.RATIO. |
|
|
207 |
#' @return A single matrix measuring similarity between the samples across all omics. |
|
|
208 |
#' @author Nimrod Rappoport |
|
|
209 |
#' @references Rappoport N, Shamir R (2019). NEMO: cancer subtyping by integration of partial multi-omic data. Bioinformatics, 35(18):3348-3356. |
|
|
210 |
#' @keywords internal |
|
|
211 |
nemo.clustering <- function(omics.list, num.clusters = NULL, num.neighbors = NA) { |
|
|
212 |
if (is.null(num.clusters)) { |
|
|
213 |
num.clusters = NA |
|
|
214 |
} |
|
|
215 |
|
|
|
216 |
graph = nemo.affinity.graph(omics.list, k = num.neighbors) |
|
|
217 |
if (is.na(num.clusters)) { |
|
|
218 |
num.clusters = nemo.num.clusters(graph) |
|
|
219 |
} |
|
|
220 |
clustering = spectralClustering(graph, num.clusters) |
|
|
221 |
names(clustering) = colnames(graph) |
|
|
222 |
return(clustering) |
|
|
223 |
} |