Switch to unified view

a b/R/DIscBIO-generic-FindOutliers.R
1
#' @title Inference of outlier cells
2
#' @description This functions performs the outlier identification for k-means
3
#' and model-based clustering
4
#' @param object \code{DISCBIO} class object.
5
#' @param outminc minimal transcript count of a gene in a clusters to be tested
6
#'   for being an outlier gene. Default is 5.
7
#' @param outlg Minimum number of outlier genes required for being an outlier
8
#'   cell. Default is 2.
9
#' @param probthr outlier probability threshold for a minimum of \code{outlg}
10
#'   genes to be an outlier cell. This probability is computed from a negative
11
#'   binomial background model of expression in a cluster. Default is 0.001.
12
#' @param thr probability values for which the number of outliers is computed in
13
#'   order to plot the dependence of the number of outliers on the probability
14
#'   threshold. Default is 2**-(1:40).set
15
#' @param outdistquant Real number between zero and one. Outlier cells are
16
#'   merged to outlier clusters if their distance smaller than the
17
#'   outdistquant-quantile of the distance distribution of  pairs of cells in
18
#'   the orginal clusters after outlier removal. Default is 0.75.
19
#' @param K Number of clusters to be used.
20
#' @param plot if `TRUE`, produces a plot of -log10prob per K
21
#' @param quiet if `TRUE`, intermediary output is suppressed
22
#' @importFrom stats coef pnbinom
23
#' @return A named vector of the genes containing outlying cells and the number
24
#'   of cells on each.
25
#' @examples
26
#' sc <- DISCBIO(valuesG1msTest)
27
#' sc <- Clustexp(sc, cln = 2) # K-means clustering
28
#' FindOutliers(sc, K = 2)
29
#'
30
setGeneric(
31
  "FindOutliers",
32
  function(object, K, outminc = 5, outlg = 2, probthr = 1e-3, thr = 2**-(1:40),
33
           outdistquant = .75, plot = TRUE, quiet = FALSE) {
34
    standardGeneric("FindOutliers")
35
  }
36
)
37
38
#' @rdname FindOutliers
39
#' @export
40
setMethod(
41
  "FindOutliers",
42
  signature = "DISCBIO",
43
  definition = function(
44
    object, K, outminc, outlg, probthr, thr, outdistquant, plot, quiet
45
  ) {
46
    # ======================================================================
47
    # Validating
48
    # ======================================================================
49
    ran_k <- length(object@kmeans) > 0
50
    ran_m <- length(object@MBclusters) > 0
51
    if (ran_k) {
52
      clusters <- object@kmeans$kpart
53
    } else if (ran_m) {
54
      object <- Clustexp(
55
        object,
56
        clustnr = 20,
57
        bootnr = 50,
58
        metric = "pearson",
59
        do.gap = TRUE,
60
        SE.method = "Tibs2001SEmax",
61
        SE.factor = .25,
62
        B.gap = 50,
63
        cln = K,
64
        rseed = 17000,
65
        quiet = quiet
66
      )
67
      clusters <- object@MBclusters$clusterid
68
    } else {
69
      stop("run Clustexp before FindOutliers")
70
    }
71
    if (!is.numeric(outminc)) {
72
      stop("outminc has to be a non-negative integer")
73
    } else if (round(outminc) != outminc | outminc < 0) {
74
      stop("outminc has to be a non-negative integer")
75
    }
76
    if (!is.numeric(outlg)) {
77
      stop("outlg has to be a non-negative integer")
78
    } else if (round(outlg) != outlg | outlg < 0) {
79
      stop("outlg has to be a non-negative integer")
80
    }
81
    if (!is.numeric(probthr)) {
82
      stop("probthr has to be a number between 0 and 1")
83
    } else if (probthr < 0 | probthr > 1) {
84
      stop("probthr has to be a number between 0 and 1")
85
    }
86
    if (!is.numeric(thr)) {
87
      stop("thr hast to be a vector of numbers between 0 and 1")
88
    } else if (min(thr) < 0 | max(thr) > 1) {
89
      stop("thr hast to be a vector of numbers between 0 and 1")
90
    }
91
    if (!is.numeric(outdistquant)) {
92
      stop("outdistquant has to be a number between 0 and 1")
93
    } else if (outdistquant < 0 | outdistquant > 1) {
94
      stop("outdistquant has to be a number between 0 and 1")
95
    }
96
97
    object@outlierpar <- list(
98
      outminc = outminc,
99
      outlg = outlg,
100
      probthr = probthr,
101
      thr = thr,
102
      outdistquant = outdistquant
103
    )
104
    ### calibrate background model
105
    EXP <- object@expdata + 0.1
106
    m <- log2(apply(EXP, 1, mean))
107
    v <- log2(apply(EXP, 1, var))
108
    f <- m > -Inf & v > -Inf
109
    m <- m[f]
110
    v <- v[f]
111
    mm <- -8
112
    repeat {
113
      fit <- lm(v ~ m + I(m^2))
114
      if (coef(fit)[3] >= 0 | mm >= 3) {
115
        break
116
      }
117
      mm <- mm + .5
118
      f <- m > mm
119
      m <- m[f]
120
      v <- v[f]
121
    }
122
    object@background <- list()
123
    object@background$vfit <- fit
124
    object@background$lvar <-
125
      function(x, object) {
126
        2**(
127
          coef(object@background$vfit)[1] +
128
            log2(x) * coef(object@background$vfit)[2] +
129
            coef(object@background$vfit)[3] * log2(x)**2
130
        )
131
      }
132
    object@background$lsize <-
133
      function(x, object) {
134
        x**2 / (max(x + 1e-6, object@background$lvar(x, object)) - x)
135
      }
136
137
    ### identify outliers
138
    out <- vector()
139
    stest <- rep(0, length(thr))
140
    cprobs <- vector()
141
    for (n in 1:max(clusters)) {
142
      if (sum(clusters == n) == 1) {
143
        cprobs <-
144
          append(cprobs, .5)
145
        names(cprobs)[length(cprobs)] <-
146
          names(clusters)[clusters == n]
147
        next
148
      }
149
      x <- object@fdata[, clusters == n]
150
      x <- x[apply(x, 1, max) > outminc, ]
151
      z <-
152
        t(apply(x, 1, function(x) {
153
          apply(cbind(
154
            pnbinom(
155
              round(x, 0),
156
              mu = mean(x),
157
              size = object@background$lsize(mean(x), object)
158
            ),
159
            1 - pnbinom(
160
              round(x, 0),
161
              mu = mean(x),
162
              size = object@background$lsize(mean(x), object)
163
            )
164
          ), 1, min)
165
        }))
166
      cp <-
167
        apply(z, 2, function(x) {
168
          y <-
169
            p.adjust(x, method = "BH")
170
          y <- y[order(y, decreasing = FALSE)]
171
          return(y[outlg])
172
        })
173
      f <- cp < probthr
174
      cprobs <- append(cprobs, cp)
175
      if (sum(f) > 0) {
176
        out <- append(out, names(x)[f])
177
      }
178
      for (j in seq_len(length(thr))) {
179
        stest[j] <- stest[j] + sum(cp < thr[j])
180
      }
181
    }
182
    object@out <-
183
      list(
184
        out = out,
185
        stest = stest,
186
        thr = thr,
187
        cprobs = cprobs
188
      )
189
190
    ### cluster outliers
191
    clp2p.cl <- vector()
192
    cols <- names(object@fdata)
193
    di <- as.data.frame(object@distances)
194
    for (i in 1:max(clusters)) {
195
      tcol <- cols[clusters == i]
196
      if (sum(!(tcol %in% out)) > 1) {
197
        clp2p.cl <- append(
198
          clp2p.cl,
199
          as.vector(
200
            t(di[tcol[!(tcol %in% out)], tcol[!(tcol %in% out)]])
201
          )
202
        )
203
      }
204
    }
205
    clp2p.cl <- clp2p.cl[clp2p.cl > 0]
206
207
    cpart <- clusters
208
    cadd <- list()
209
    if (length(out) > 0) {
210
      if (length(out) == 1) {
211
        cadd <- list(out)
212
      } else {
213
        n <- out
214
        m <- as.data.frame(di[out, out])
215
216
        for (i in seq_len(length(out))) {
217
          if (length(n) > 1) {
218
            o <-
219
              order(
220
                apply(
221
                  cbind(m, seq_len(dim(m)[1])),
222
                  1,
223
                  function(x) {
224
                    min(x[1:(length(x) - 1)][-x[length(x)]])
225
                  }
226
                ),
227
                decreasing = FALSE
228
              )
229
            m <- m[o, o]
230
            n <- n[o]
231
            f <- m[, 1] < quantile(clp2p.cl, outdistquant) |
232
              m[, 1] == min(clp2p.cl)
233
            ind <- 1
234
            if (sum(f) > 1) {
235
              for (j in 2:sum(f)) {
236
                comp1 <- m[f, f][j, c(ind, j)]
237
              }
238
            }
239
            comp2 <- quantile(clp2p.cl, outdistquant)
240
            if (apply(comp1 > comp2, 1, sum) == 0) {
241
              ind <- append(ind, j)
242
            }
243
            cadd[[i]] <- n[f][ind]
244
            g <- !n %in% n[f][ind]
245
            n <- n[g]
246
            m <- m[g, g]
247
            if (sum(g) == 0) {
248
              break
249
            }
250
          } else if (length(n) == 1) {
251
            cadd[[i]] <- n
252
            break
253
          }
254
        }
255
      }
256
257
      for (i in seq_len(length(cadd))) {
258
        cpart[cols %in% cadd[[i]]] <- max(cpart) + 1
259
      }
260
    }
261
262
    ### determine final clusters
263
264
    object@cpart <- cpart
265
266
    object@fcol <- rainbow(max(cpart))
267
    p <- clusters[order(clusters, decreasing = FALSE)]
268
    x <- object@out$cprobs[names(p)]
269
    fcol <- c("black", "blue", "green", "red", "yellow", "gray")
270
    if (plot) {
271
      for (i in 1:max(p)) {
272
        y <- -log10(x + 2.2e-16)
273
        y[p != i] <- 0
274
        if (i == 1) {
275
          b <-
276
            barplot(
277
              y,
278
              ylim = c(0, max(-log10(
279
                x + 2.2e-16
280
              )) * 2.1),
281
              col = fcol[i],
282
              border = fcol[i],
283
              names.arg = FALSE,
284
              ylab = "-log10prob"
285
            )
286
        } else {
287
          barplot(
288
            y,
289
            add = TRUE,
290
            col = fcol[i],
291
            border = fcol[i],
292
            names.arg = FALSE,
293
            axes = FALSE
294
          )
295
        }
296
      }
297
      abline(
298
        -log10(object@outlierpar$probthr),
299
        0,
300
        col = "black",
301
        lty = 2
302
      )
303
      d <- b[2, 1] - b[1, 1]
304
      y <- 0
305
      for (i in 1:max(p)) {
306
        y <- append(y, b[sum(p <= i), 1] + d / 2)
307
      }
308
      axis(1, at = (y[1:(length(y) - 1)] + y[-1]) / 2, labels = 1:max(p))
309
      box()
310
    }
311
    if (!quiet) {
312
      message(
313
        "The following cells are considered outliers: ",
314
        which(object@cpart > K),
315
        "\n"
316
      )
317
      print(which(object@cpart > K))
318
    }
319
    LL <- which(object@cpart > K)
320
    return(LL)
321
  }
322
)