DIscBIO / Git / Diff of /R/DIscBIO-generic-FindOutliers.R

Models:
DanielG/
DIscBIO
Downloads: 1
Diff of /R/DIscBIO-generic-FindOutliers.R [000000] .. [28e211]
Switch to side-by-side view

--- a
+++ b/R/DIscBIO-generic-FindOutliers.R
@@ -0,0 +1,322 @@
+#' @title Inference of outlier cells
+#' @description This functions performs the outlier identification for k-means
+#' and model-based clustering
+#' @param object \code{DISCBIO} class object.
+#' @param outminc minimal transcript count of a gene in a clusters to be tested
+#'   for being an outlier gene. Default is 5.
+#' @param outlg Minimum number of outlier genes required for being an outlier
+#'   cell. Default is 2.
+#' @param probthr outlier probability threshold for a minimum of \code{outlg}
+#'   genes to be an outlier cell. This probability is computed from a negative
+#'   binomial background model of expression in a cluster. Default is 0.001.
+#' @param thr probability values for which the number of outliers is computed in
+#'   order to plot the dependence of the number of outliers on the probability
+#'   threshold. Default is 2**-(1:40).set
+#' @param outdistquant Real number between zero and one. Outlier cells are
+#'   merged to outlier clusters if their distance smaller than the
+#'   outdistquant-quantile of the distance distribution of  pairs of cells in
+#'   the orginal clusters after outlier removal. Default is 0.75.
+#' @param K Number of clusters to be used.
+#' @param plot if `TRUE`, produces a plot of -log10prob per K
+#' @param quiet if `TRUE`, intermediary output is suppressed
+#' @importFrom stats coef pnbinom
+#' @return A named vector of the genes containing outlying cells and the number
+#'   of cells on each.
+#' @examples
+#' sc <- DISCBIO(valuesG1msTest)
+#' sc <- Clustexp(sc, cln = 2) # K-means clustering
+#' FindOutliers(sc, K = 2)
+#'
+setGeneric(
+  "FindOutliers",
+  function(object, K, outminc = 5, outlg = 2, probthr = 1e-3, thr = 2**-(1:40),
+           outdistquant = .75, plot = TRUE, quiet = FALSE) {
+    standardGeneric("FindOutliers")
+  }
+)
+
+#' @rdname FindOutliers
+#' @export
+setMethod(
+  "FindOutliers",
+  signature = "DISCBIO",
+  definition = function(
+    object, K, outminc, outlg, probthr, thr, outdistquant, plot, quiet
+  ) {
+    # ======================================================================
+    # Validating
+    # ======================================================================
+    ran_k <- length(object@kmeans) > 0
+    ran_m <- length(object@MBclusters) > 0
+    if (ran_k) {
+      clusters <- object@kmeans$kpart
+    } else if (ran_m) {
+      object <- Clustexp(
+        object,
+        clustnr = 20,
+        bootnr = 50,
+        metric = "pearson",
+        do.gap = TRUE,
+        SE.method = "Tibs2001SEmax",
+        SE.factor = .25,
+        B.gap = 50,
+        cln = K,
+        rseed = 17000,
+        quiet = quiet
+      )
+      clusters <- object@MBclusters$clusterid
+    } else {
+      stop("run Clustexp before FindOutliers")
+    }
+    if (!is.numeric(outminc)) {
+      stop("outminc has to be a non-negative integer")
+    } else if (round(outminc) != outminc | outminc < 0) {
+      stop("outminc has to be a non-negative integer")
+    }
+    if (!is.numeric(outlg)) {
+      stop("outlg has to be a non-negative integer")
+    } else if (round(outlg) != outlg | outlg < 0) {
+      stop("outlg has to be a non-negative integer")
+    }
+    if (!is.numeric(probthr)) {
+      stop("probthr has to be a number between 0 and 1")
+    } else if (probthr < 0 | probthr > 1) {
+      stop("probthr has to be a number between 0 and 1")
+    }
+    if (!is.numeric(thr)) {
+      stop("thr hast to be a vector of numbers between 0 and 1")
+    } else if (min(thr) < 0 | max(thr) > 1) {
+      stop("thr hast to be a vector of numbers between 0 and 1")
+    }
+    if (!is.numeric(outdistquant)) {
+      stop("outdistquant has to be a number between 0 and 1")
+    } else if (outdistquant < 0 | outdistquant > 1) {
+      stop("outdistquant has to be a number between 0 and 1")
+    }
+
+    object@outlierpar <- list(
+      outminc = outminc,
+      outlg = outlg,
+      probthr = probthr,
+      thr = thr,
+      outdistquant = outdistquant
+    )
+    ### calibrate background model
+    EXP <- object@expdata + 0.1
+    m <- log2(apply(EXP, 1, mean))
+    v <- log2(apply(EXP, 1, var))
+    f <- m > -Inf & v > -Inf
+    m <- m[f]
+    v <- v[f]
+    mm <- -8
+    repeat {
+      fit <- lm(v ~ m + I(m^2))
+      if (coef(fit)[3] >= 0 | mm >= 3) {
+        break
+      }
+      mm <- mm + .5
+      f <- m > mm
+      m <- m[f]
+      v <- v[f]
+    }
+    object@background <- list()
+    object@background$vfit <- fit
+    object@background$lvar <-
+      function(x, object) {
+        2**(
+          coef(object@background$vfit)[1] +
+            log2(x) * coef(object@background$vfit)[2] +
+            coef(object@background$vfit)[3] * log2(x)**2
+        )
+      }
+    object@background$lsize <-
+      function(x, object) {
+        x**2 / (max(x + 1e-6, object@background$lvar(x, object)) - x)
+      }
+
+    ### identify outliers
+    out <- vector()
+    stest <- rep(0, length(thr))
+    cprobs <- vector()
+    for (n in 1:max(clusters)) {
+      if (sum(clusters == n) == 1) {
+        cprobs <-
+          append(cprobs, .5)
+        names(cprobs)[length(cprobs)] <-
+          names(clusters)[clusters == n]
+        next
+      }
+      x <- object@fdata[, clusters == n]
+      x <- x[apply(x, 1, max) > outminc, ]
+      z <-
+        t(apply(x, 1, function(x) {
+          apply(cbind(
+            pnbinom(
+              round(x, 0),
+              mu = mean(x),
+              size = object@background$lsize(mean(x), object)
+            ),
+            1 - pnbinom(
+              round(x, 0),
+              mu = mean(x),
+              size = object@background$lsize(mean(x), object)
+            )
+          ), 1, min)
+        }))
+      cp <-
+        apply(z, 2, function(x) {
+          y <-
+            p.adjust(x, method = "BH")
+          y <- y[order(y, decreasing = FALSE)]
+          return(y[outlg])
+        })
+      f <- cp < probthr
+      cprobs <- append(cprobs, cp)
+      if (sum(f) > 0) {
+        out <- append(out, names(x)[f])
+      }
+      for (j in seq_len(length(thr))) {
+        stest[j] <- stest[j] + sum(cp < thr[j])
+      }
+    }
+    object@out <-
+      list(
+        out = out,
+        stest = stest,
+        thr = thr,
+        cprobs = cprobs
+      )
+
+    ### cluster outliers
+    clp2p.cl <- vector()
+    cols <- names(object@fdata)
+    di <- as.data.frame(object@distances)
+    for (i in 1:max(clusters)) {
+      tcol <- cols[clusters == i]
+      if (sum(!(tcol %in% out)) > 1) {
+        clp2p.cl <- append(
+          clp2p.cl,
+          as.vector(
+            t(di[tcol[!(tcol %in% out)], tcol[!(tcol %in% out)]])
+          )
+        )
+      }
+    }
+    clp2p.cl <- clp2p.cl[clp2p.cl > 0]
+
+    cpart <- clusters
+    cadd <- list()
+    if (length(out) > 0) {
+      if (length(out) == 1) {
+        cadd <- list(out)
+      } else {
+        n <- out
+        m <- as.data.frame(di[out, out])
+
+        for (i in seq_len(length(out))) {
+          if (length(n) > 1) {
+            o <-
+              order(
+                apply(
+                  cbind(m, seq_len(dim(m)[1])),
+                  1,
+                  function(x) {
+                    min(x[1:(length(x) - 1)][-x[length(x)]])
+                  }
+                ),
+                decreasing = FALSE
+              )
+            m <- m[o, o]
+            n <- n[o]
+            f <- m[, 1] < quantile(clp2p.cl, outdistquant) |
+              m[, 1] == min(clp2p.cl)
+            ind <- 1
+            if (sum(f) > 1) {
+              for (j in 2:sum(f)) {
+                comp1 <- m[f, f][j, c(ind, j)]
+              }
+            }
+            comp2 <- quantile(clp2p.cl, outdistquant)
+            if (apply(comp1 > comp2, 1, sum) == 0) {
+              ind <- append(ind, j)
+            }
+            cadd[[i]] <- n[f][ind]
+            g <- !n %in% n[f][ind]
+            n <- n[g]
+            m <- m[g, g]
+            if (sum(g) == 0) {
+              break
+            }
+          } else if (length(n) == 1) {
+            cadd[[i]] <- n
+            break
+          }
+        }
+      }
+
+      for (i in seq_len(length(cadd))) {
+        cpart[cols %in% cadd[[i]]] <- max(cpart) + 1
+      }
+    }
+
+    ### determine final clusters
+
+    object@cpart <- cpart
+
+    object@fcol <- rainbow(max(cpart))
+    p <- clusters[order(clusters, decreasing = FALSE)]
+    x <- object@out$cprobs[names(p)]
+    fcol <- c("black", "blue", "green", "red", "yellow", "gray")
+    if (plot) {
+      for (i in 1:max(p)) {
+        y <- -log10(x + 2.2e-16)
+        y[p != i] <- 0
+        if (i == 1) {
+          b <-
+            barplot(
+              y,
+              ylim = c(0, max(-log10(
+                x + 2.2e-16
+              )) * 2.1),
+              col = fcol[i],
+              border = fcol[i],
+              names.arg = FALSE,
+              ylab = "-log10prob"
+            )
+        } else {
+          barplot(
+            y,
+            add = TRUE,
+            col = fcol[i],
+            border = fcol[i],
+            names.arg = FALSE,
+            axes = FALSE
+          )
+        }
+      }
+      abline(
+        -log10(object@outlierpar$probthr),
+        0,
+        col = "black",
+        lty = 2
+      )
+      d <- b[2, 1] - b[1, 1]
+      y <- 0
+      for (i in 1:max(p)) {
+        y <- append(y, b[sum(p <= i), 1] + d / 2)
+      }
+      axis(1, at = (y[1:(length(y) - 1)] + y[-1]) / 2, labels = 1:max(p))
+      box()
+    }
+    if (!quiet) {
+      message(
+        "The following cells are considered outliers: ",
+        which(object@cpart > K),
+        "\n"
+      )
+      print(which(object@cpart > K))
+    }
+    LL <- which(object@cpart > K)
+    return(LL)
+  }
+)