|
a |
|
b/R/DIscBIO-generic-FindOutliers.R |
|
|
1 |
#' @title Inference of outlier cells |
|
|
2 |
#' @description This functions performs the outlier identification for k-means |
|
|
3 |
#' and model-based clustering |
|
|
4 |
#' @param object \code{DISCBIO} class object. |
|
|
5 |
#' @param outminc minimal transcript count of a gene in a clusters to be tested |
|
|
6 |
#' for being an outlier gene. Default is 5. |
|
|
7 |
#' @param outlg Minimum number of outlier genes required for being an outlier |
|
|
8 |
#' cell. Default is 2. |
|
|
9 |
#' @param probthr outlier probability threshold for a minimum of \code{outlg} |
|
|
10 |
#' genes to be an outlier cell. This probability is computed from a negative |
|
|
11 |
#' binomial background model of expression in a cluster. Default is 0.001. |
|
|
12 |
#' @param thr probability values for which the number of outliers is computed in |
|
|
13 |
#' order to plot the dependence of the number of outliers on the probability |
|
|
14 |
#' threshold. Default is 2**-(1:40).set |
|
|
15 |
#' @param outdistquant Real number between zero and one. Outlier cells are |
|
|
16 |
#' merged to outlier clusters if their distance smaller than the |
|
|
17 |
#' outdistquant-quantile of the distance distribution of pairs of cells in |
|
|
18 |
#' the orginal clusters after outlier removal. Default is 0.75. |
|
|
19 |
#' @param K Number of clusters to be used. |
|
|
20 |
#' @param plot if `TRUE`, produces a plot of -log10prob per K |
|
|
21 |
#' @param quiet if `TRUE`, intermediary output is suppressed |
|
|
22 |
#' @importFrom stats coef pnbinom |
|
|
23 |
#' @return A named vector of the genes containing outlying cells and the number |
|
|
24 |
#' of cells on each. |
|
|
25 |
#' @examples |
|
|
26 |
#' sc <- DISCBIO(valuesG1msTest) |
|
|
27 |
#' sc <- Clustexp(sc, cln = 2) # K-means clustering |
|
|
28 |
#' FindOutliers(sc, K = 2) |
|
|
29 |
#' |
|
|
30 |
setGeneric( |
|
|
31 |
"FindOutliers", |
|
|
32 |
function(object, K, outminc = 5, outlg = 2, probthr = 1e-3, thr = 2**-(1:40), |
|
|
33 |
outdistquant = .75, plot = TRUE, quiet = FALSE) { |
|
|
34 |
standardGeneric("FindOutliers") |
|
|
35 |
} |
|
|
36 |
) |
|
|
37 |
|
|
|
38 |
#' @rdname FindOutliers |
|
|
39 |
#' @export |
|
|
40 |
setMethod( |
|
|
41 |
"FindOutliers", |
|
|
42 |
signature = "DISCBIO", |
|
|
43 |
definition = function( |
|
|
44 |
object, K, outminc, outlg, probthr, thr, outdistquant, plot, quiet |
|
|
45 |
) { |
|
|
46 |
# ====================================================================== |
|
|
47 |
# Validating |
|
|
48 |
# ====================================================================== |
|
|
49 |
ran_k <- length(object@kmeans) > 0 |
|
|
50 |
ran_m <- length(object@MBclusters) > 0 |
|
|
51 |
if (ran_k) { |
|
|
52 |
clusters <- object@kmeans$kpart |
|
|
53 |
} else if (ran_m) { |
|
|
54 |
object <- Clustexp( |
|
|
55 |
object, |
|
|
56 |
clustnr = 20, |
|
|
57 |
bootnr = 50, |
|
|
58 |
metric = "pearson", |
|
|
59 |
do.gap = TRUE, |
|
|
60 |
SE.method = "Tibs2001SEmax", |
|
|
61 |
SE.factor = .25, |
|
|
62 |
B.gap = 50, |
|
|
63 |
cln = K, |
|
|
64 |
rseed = 17000, |
|
|
65 |
quiet = quiet |
|
|
66 |
) |
|
|
67 |
clusters <- object@MBclusters$clusterid |
|
|
68 |
} else { |
|
|
69 |
stop("run Clustexp before FindOutliers") |
|
|
70 |
} |
|
|
71 |
if (!is.numeric(outminc)) { |
|
|
72 |
stop("outminc has to be a non-negative integer") |
|
|
73 |
} else if (round(outminc) != outminc | outminc < 0) { |
|
|
74 |
stop("outminc has to be a non-negative integer") |
|
|
75 |
} |
|
|
76 |
if (!is.numeric(outlg)) { |
|
|
77 |
stop("outlg has to be a non-negative integer") |
|
|
78 |
} else if (round(outlg) != outlg | outlg < 0) { |
|
|
79 |
stop("outlg has to be a non-negative integer") |
|
|
80 |
} |
|
|
81 |
if (!is.numeric(probthr)) { |
|
|
82 |
stop("probthr has to be a number between 0 and 1") |
|
|
83 |
} else if (probthr < 0 | probthr > 1) { |
|
|
84 |
stop("probthr has to be a number between 0 and 1") |
|
|
85 |
} |
|
|
86 |
if (!is.numeric(thr)) { |
|
|
87 |
stop("thr hast to be a vector of numbers between 0 and 1") |
|
|
88 |
} else if (min(thr) < 0 | max(thr) > 1) { |
|
|
89 |
stop("thr hast to be a vector of numbers between 0 and 1") |
|
|
90 |
} |
|
|
91 |
if (!is.numeric(outdistquant)) { |
|
|
92 |
stop("outdistquant has to be a number between 0 and 1") |
|
|
93 |
} else if (outdistquant < 0 | outdistquant > 1) { |
|
|
94 |
stop("outdistquant has to be a number between 0 and 1") |
|
|
95 |
} |
|
|
96 |
|
|
|
97 |
object@outlierpar <- list( |
|
|
98 |
outminc = outminc, |
|
|
99 |
outlg = outlg, |
|
|
100 |
probthr = probthr, |
|
|
101 |
thr = thr, |
|
|
102 |
outdistquant = outdistquant |
|
|
103 |
) |
|
|
104 |
### calibrate background model |
|
|
105 |
EXP <- object@expdata + 0.1 |
|
|
106 |
m <- log2(apply(EXP, 1, mean)) |
|
|
107 |
v <- log2(apply(EXP, 1, var)) |
|
|
108 |
f <- m > -Inf & v > -Inf |
|
|
109 |
m <- m[f] |
|
|
110 |
v <- v[f] |
|
|
111 |
mm <- -8 |
|
|
112 |
repeat { |
|
|
113 |
fit <- lm(v ~ m + I(m^2)) |
|
|
114 |
if (coef(fit)[3] >= 0 | mm >= 3) { |
|
|
115 |
break |
|
|
116 |
} |
|
|
117 |
mm <- mm + .5 |
|
|
118 |
f <- m > mm |
|
|
119 |
m <- m[f] |
|
|
120 |
v <- v[f] |
|
|
121 |
} |
|
|
122 |
object@background <- list() |
|
|
123 |
object@background$vfit <- fit |
|
|
124 |
object@background$lvar <- |
|
|
125 |
function(x, object) { |
|
|
126 |
2**( |
|
|
127 |
coef(object@background$vfit)[1] + |
|
|
128 |
log2(x) * coef(object@background$vfit)[2] + |
|
|
129 |
coef(object@background$vfit)[3] * log2(x)**2 |
|
|
130 |
) |
|
|
131 |
} |
|
|
132 |
object@background$lsize <- |
|
|
133 |
function(x, object) { |
|
|
134 |
x**2 / (max(x + 1e-6, object@background$lvar(x, object)) - x) |
|
|
135 |
} |
|
|
136 |
|
|
|
137 |
### identify outliers |
|
|
138 |
out <- vector() |
|
|
139 |
stest <- rep(0, length(thr)) |
|
|
140 |
cprobs <- vector() |
|
|
141 |
for (n in 1:max(clusters)) { |
|
|
142 |
if (sum(clusters == n) == 1) { |
|
|
143 |
cprobs <- |
|
|
144 |
append(cprobs, .5) |
|
|
145 |
names(cprobs)[length(cprobs)] <- |
|
|
146 |
names(clusters)[clusters == n] |
|
|
147 |
next |
|
|
148 |
} |
|
|
149 |
x <- object@fdata[, clusters == n] |
|
|
150 |
x <- x[apply(x, 1, max) > outminc, ] |
|
|
151 |
z <- |
|
|
152 |
t(apply(x, 1, function(x) { |
|
|
153 |
apply(cbind( |
|
|
154 |
pnbinom( |
|
|
155 |
round(x, 0), |
|
|
156 |
mu = mean(x), |
|
|
157 |
size = object@background$lsize(mean(x), object) |
|
|
158 |
), |
|
|
159 |
1 - pnbinom( |
|
|
160 |
round(x, 0), |
|
|
161 |
mu = mean(x), |
|
|
162 |
size = object@background$lsize(mean(x), object) |
|
|
163 |
) |
|
|
164 |
), 1, min) |
|
|
165 |
})) |
|
|
166 |
cp <- |
|
|
167 |
apply(z, 2, function(x) { |
|
|
168 |
y <- |
|
|
169 |
p.adjust(x, method = "BH") |
|
|
170 |
y <- y[order(y, decreasing = FALSE)] |
|
|
171 |
return(y[outlg]) |
|
|
172 |
}) |
|
|
173 |
f <- cp < probthr |
|
|
174 |
cprobs <- append(cprobs, cp) |
|
|
175 |
if (sum(f) > 0) { |
|
|
176 |
out <- append(out, names(x)[f]) |
|
|
177 |
} |
|
|
178 |
for (j in seq_len(length(thr))) { |
|
|
179 |
stest[j] <- stest[j] + sum(cp < thr[j]) |
|
|
180 |
} |
|
|
181 |
} |
|
|
182 |
object@out <- |
|
|
183 |
list( |
|
|
184 |
out = out, |
|
|
185 |
stest = stest, |
|
|
186 |
thr = thr, |
|
|
187 |
cprobs = cprobs |
|
|
188 |
) |
|
|
189 |
|
|
|
190 |
### cluster outliers |
|
|
191 |
clp2p.cl <- vector() |
|
|
192 |
cols <- names(object@fdata) |
|
|
193 |
di <- as.data.frame(object@distances) |
|
|
194 |
for (i in 1:max(clusters)) { |
|
|
195 |
tcol <- cols[clusters == i] |
|
|
196 |
if (sum(!(tcol %in% out)) > 1) { |
|
|
197 |
clp2p.cl <- append( |
|
|
198 |
clp2p.cl, |
|
|
199 |
as.vector( |
|
|
200 |
t(di[tcol[!(tcol %in% out)], tcol[!(tcol %in% out)]]) |
|
|
201 |
) |
|
|
202 |
) |
|
|
203 |
} |
|
|
204 |
} |
|
|
205 |
clp2p.cl <- clp2p.cl[clp2p.cl > 0] |
|
|
206 |
|
|
|
207 |
cpart <- clusters |
|
|
208 |
cadd <- list() |
|
|
209 |
if (length(out) > 0) { |
|
|
210 |
if (length(out) == 1) { |
|
|
211 |
cadd <- list(out) |
|
|
212 |
} else { |
|
|
213 |
n <- out |
|
|
214 |
m <- as.data.frame(di[out, out]) |
|
|
215 |
|
|
|
216 |
for (i in seq_len(length(out))) { |
|
|
217 |
if (length(n) > 1) { |
|
|
218 |
o <- |
|
|
219 |
order( |
|
|
220 |
apply( |
|
|
221 |
cbind(m, seq_len(dim(m)[1])), |
|
|
222 |
1, |
|
|
223 |
function(x) { |
|
|
224 |
min(x[1:(length(x) - 1)][-x[length(x)]]) |
|
|
225 |
} |
|
|
226 |
), |
|
|
227 |
decreasing = FALSE |
|
|
228 |
) |
|
|
229 |
m <- m[o, o] |
|
|
230 |
n <- n[o] |
|
|
231 |
f <- m[, 1] < quantile(clp2p.cl, outdistquant) | |
|
|
232 |
m[, 1] == min(clp2p.cl) |
|
|
233 |
ind <- 1 |
|
|
234 |
if (sum(f) > 1) { |
|
|
235 |
for (j in 2:sum(f)) { |
|
|
236 |
comp1 <- m[f, f][j, c(ind, j)] |
|
|
237 |
} |
|
|
238 |
} |
|
|
239 |
comp2 <- quantile(clp2p.cl, outdistquant) |
|
|
240 |
if (apply(comp1 > comp2, 1, sum) == 0) { |
|
|
241 |
ind <- append(ind, j) |
|
|
242 |
} |
|
|
243 |
cadd[[i]] <- n[f][ind] |
|
|
244 |
g <- !n %in% n[f][ind] |
|
|
245 |
n <- n[g] |
|
|
246 |
m <- m[g, g] |
|
|
247 |
if (sum(g) == 0) { |
|
|
248 |
break |
|
|
249 |
} |
|
|
250 |
} else if (length(n) == 1) { |
|
|
251 |
cadd[[i]] <- n |
|
|
252 |
break |
|
|
253 |
} |
|
|
254 |
} |
|
|
255 |
} |
|
|
256 |
|
|
|
257 |
for (i in seq_len(length(cadd))) { |
|
|
258 |
cpart[cols %in% cadd[[i]]] <- max(cpart) + 1 |
|
|
259 |
} |
|
|
260 |
} |
|
|
261 |
|
|
|
262 |
### determine final clusters |
|
|
263 |
|
|
|
264 |
object@cpart <- cpart |
|
|
265 |
|
|
|
266 |
object@fcol <- rainbow(max(cpart)) |
|
|
267 |
p <- clusters[order(clusters, decreasing = FALSE)] |
|
|
268 |
x <- object@out$cprobs[names(p)] |
|
|
269 |
fcol <- c("black", "blue", "green", "red", "yellow", "gray") |
|
|
270 |
if (plot) { |
|
|
271 |
for (i in 1:max(p)) { |
|
|
272 |
y <- -log10(x + 2.2e-16) |
|
|
273 |
y[p != i] <- 0 |
|
|
274 |
if (i == 1) { |
|
|
275 |
b <- |
|
|
276 |
barplot( |
|
|
277 |
y, |
|
|
278 |
ylim = c(0, max(-log10( |
|
|
279 |
x + 2.2e-16 |
|
|
280 |
)) * 2.1), |
|
|
281 |
col = fcol[i], |
|
|
282 |
border = fcol[i], |
|
|
283 |
names.arg = FALSE, |
|
|
284 |
ylab = "-log10prob" |
|
|
285 |
) |
|
|
286 |
} else { |
|
|
287 |
barplot( |
|
|
288 |
y, |
|
|
289 |
add = TRUE, |
|
|
290 |
col = fcol[i], |
|
|
291 |
border = fcol[i], |
|
|
292 |
names.arg = FALSE, |
|
|
293 |
axes = FALSE |
|
|
294 |
) |
|
|
295 |
} |
|
|
296 |
} |
|
|
297 |
abline( |
|
|
298 |
-log10(object@outlierpar$probthr), |
|
|
299 |
0, |
|
|
300 |
col = "black", |
|
|
301 |
lty = 2 |
|
|
302 |
) |
|
|
303 |
d <- b[2, 1] - b[1, 1] |
|
|
304 |
y <- 0 |
|
|
305 |
for (i in 1:max(p)) { |
|
|
306 |
y <- append(y, b[sum(p <= i), 1] + d / 2) |
|
|
307 |
} |
|
|
308 |
axis(1, at = (y[1:(length(y) - 1)] + y[-1]) / 2, labels = 1:max(p)) |
|
|
309 |
box() |
|
|
310 |
} |
|
|
311 |
if (!quiet) { |
|
|
312 |
message( |
|
|
313 |
"The following cells are considered outliers: ", |
|
|
314 |
which(object@cpart > K), |
|
|
315 |
"\n" |
|
|
316 |
) |
|
|
317 |
print(which(object@cpart > K)) |
|
|
318 |
} |
|
|
319 |
LL <- which(object@cpart > K) |
|
|
320 |
return(LL) |
|
|
321 |
} |
|
|
322 |
) |