DIscBIO / Git / [381c22] /R/DIscBIO-generic-FindOutliers.R

Models:
DanielG/
DIscBIO
Downloads: 1
[381c22]: / R / DIscBIO-generic-FindOutliers.R
History
Download this file
323 lines (312 with data), 9.5 kB

#' @title Inference of outlier cells
#' @description This functions performs the outlier identification for k-means
#' and model-based clustering
#' @param object \code{DISCBIO} class object.
#' @param outminc minimal transcript count of a gene in a clusters to be tested
#'   for being an outlier gene. Default is 5.
#' @param outlg Minimum number of outlier genes required for being an outlier
#'   cell. Default is 2.
#' @param probthr outlier probability threshold for a minimum of \code{outlg}
#'   genes to be an outlier cell. This probability is computed from a negative
#'   binomial background model of expression in a cluster. Default is 0.001.
#' @param thr probability values for which the number of outliers is computed in
#'   order to plot the dependence of the number of outliers on the probability
#'   threshold. Default is 2**-(1:40).set
#' @param outdistquant Real number between zero and one. Outlier cells are
#'   merged to outlier clusters if their distance smaller than the
#'   outdistquant-quantile of the distance distribution of  pairs of cells in
#'   the orginal clusters after outlier removal. Default is 0.75.
#' @param K Number of clusters to be used.
#' @param plot if `TRUE`, produces a plot of -log10prob per K
#' @param quiet if `TRUE`, intermediary output is suppressed
#' @importFrom stats coef pnbinom
#' @return A named vector of the genes containing outlying cells and the number
#'   of cells on each.
#' @examples
#' sc <- DISCBIO(valuesG1msTest)
#' sc <- Clustexp(sc, cln = 2) # K-means clustering
#' FindOutliers(sc, K = 2)
#'
setGeneric(
  "FindOutliers",
  function(object, K, outminc = 5, outlg = 2, probthr = 1e-3, thr = 2**-(1:40),
           outdistquant = .75, plot = TRUE, quiet = FALSE) {
    standardGeneric("FindOutliers")
  }
)

#' @rdname FindOutliers
#' @export
setMethod(
  "FindOutliers",
  signature = "DISCBIO",
  definition = function(
    object, K, outminc, outlg, probthr, thr, outdistquant, plot, quiet
  ) {
    # ======================================================================
    # Validating
    # ======================================================================
    ran_k <- length(object@kmeans) > 0
    ran_m <- length(object@MBclusters) > 0
    if (ran_k) {
      clusters <- object@kmeans$kpart
    } else if (ran_m) {
      object <- Clustexp(
        object,
        clustnr = 20,
        bootnr = 50,
        metric = "pearson",
        do.gap = TRUE,
        SE.method = "Tibs2001SEmax",
        SE.factor = .25,
        B.gap = 50,
        cln = K,
        rseed = 17000,
        quiet = quiet
      )
      clusters <- object@MBclusters$clusterid
    } else {
      stop("run Clustexp before FindOutliers")
    }
    if (!is.numeric(outminc)) {
      stop("outminc has to be a non-negative integer")
    } else if (round(outminc) != outminc | outminc < 0) {
      stop("outminc has to be a non-negative integer")
    }
    if (!is.numeric(outlg)) {
      stop("outlg has to be a non-negative integer")
    } else if (round(outlg) != outlg | outlg < 0) {
      stop("outlg has to be a non-negative integer")
    }
    if (!is.numeric(probthr)) {
      stop("probthr has to be a number between 0 and 1")
    } else if (probthr < 0 | probthr > 1) {
      stop("probthr has to be a number between 0 and 1")
    }
    if (!is.numeric(thr)) {
      stop("thr hast to be a vector of numbers between 0 and 1")
    } else if (min(thr) < 0 | max(thr) > 1) {
      stop("thr hast to be a vector of numbers between 0 and 1")
    }
    if (!is.numeric(outdistquant)) {
      stop("outdistquant has to be a number between 0 and 1")
    } else if (outdistquant < 0 | outdistquant > 1) {
      stop("outdistquant has to be a number between 0 and 1")
    }

    object@outlierpar <- list(
      outminc = outminc,
      outlg = outlg,
      probthr = probthr,
      thr = thr,
      outdistquant = outdistquant
    )
    ### calibrate background model
    EXP <- object@expdata + 0.1
    m <- log2(apply(EXP, 1, mean))
    v <- log2(apply(EXP, 1, var))
    f <- m > -Inf & v > -Inf
    m <- m[f]
    v <- v[f]
    mm <- -8
    repeat {
      fit <- lm(v ~ m + I(m^2))
      if (coef(fit)[3] >= 0 | mm >= 3) {
        break
      }
      mm <- mm + .5
      f <- m > mm
      m <- m[f]
      v <- v[f]
    }
    object@background <- list()
    object@background$vfit <- fit
    object@background$lvar <-
      function(x, object) {
        2**(
          coef(object@background$vfit)[1] +
            log2(x) * coef(object@background$vfit)[2] +
            coef(object@background$vfit)[3] * log2(x)**2
        )
      }
    object@background$lsize <-
      function(x, object) {
        x**2 / (max(x + 1e-6, object@background$lvar(x, object)) - x)
      }

    ### identify outliers
    out <- vector()
    stest <- rep(0, length(thr))
    cprobs <- vector()
    for (n in 1:max(clusters)) {
      if (sum(clusters == n) == 1) {
        cprobs <-
          append(cprobs, .5)
        names(cprobs)[length(cprobs)] <-
          names(clusters)[clusters == n]
        next
      }
      x <- object@fdata[, clusters == n]
      x <- x[apply(x, 1, max) > outminc, ]
      z <-
        t(apply(x, 1, function(x) {
          apply(cbind(
            pnbinom(
              round(x, 0),
              mu = mean(x),
              size = object@background$lsize(mean(x), object)
            ),
            1 - pnbinom(
              round(x, 0),
              mu = mean(x),
              size = object@background$lsize(mean(x), object)
            )
          ), 1, min)
        }))
      cp <-
        apply(z, 2, function(x) {
          y <-
            p.adjust(x, method = "BH")
          y <- y[order(y, decreasing = FALSE)]
          return(y[outlg])
        })
      f <- cp < probthr
      cprobs <- append(cprobs, cp)
      if (sum(f) > 0) {
        out <- append(out, names(x)[f])
      }
      for (j in seq_len(length(thr))) {
        stest[j] <- stest[j] + sum(cp < thr[j])
      }
    }
    object@out <-
      list(
        out = out,
        stest = stest,
        thr = thr,
        cprobs = cprobs
      )

    ### cluster outliers
    clp2p.cl <- vector()
    cols <- names(object@fdata)
    di <- as.data.frame(object@distances)
    for (i in 1:max(clusters)) {
      tcol <- cols[clusters == i]
      if (sum(!(tcol %in% out)) > 1) {
        clp2p.cl <- append(
          clp2p.cl,
          as.vector(
            t(di[tcol[!(tcol %in% out)], tcol[!(tcol %in% out)]])
          )
        )
      }
    }
    clp2p.cl <- clp2p.cl[clp2p.cl > 0]

    cpart <- clusters
    cadd <- list()
    if (length(out) > 0) {
      if (length(out) == 1) {
        cadd <- list(out)
      } else {
        n <- out
        m <- as.data.frame(di[out, out])

        for (i in seq_len(length(out))) {
          if (length(n) > 1) {
            o <-
              order(
                apply(
                  cbind(m, seq_len(dim(m)[1])),
                  1,
                  function(x) {
                    min(x[1:(length(x) - 1)][-x[length(x)]])
                  }
                ),
                decreasing = FALSE
              )
            m <- m[o, o]
            n <- n[o]
            f <- m[, 1] < quantile(clp2p.cl, outdistquant) |
              m[, 1] == min(clp2p.cl)
            ind <- 1
            if (sum(f) > 1) {
              for (j in 2:sum(f)) {
                comp1 <- m[f, f][j, c(ind, j)]
              }
            }
            comp2 <- quantile(clp2p.cl, outdistquant)
            if (apply(comp1 > comp2, 1, sum) == 0) {
              ind <- append(ind, j)
            }
            cadd[[i]] <- n[f][ind]
            g <- !n %in% n[f][ind]
            n <- n[g]
            m <- m[g, g]
            if (sum(g) == 0) {
              break
            }
          } else if (length(n) == 1) {
            cadd[[i]] <- n
            break
          }
        }
      }

      for (i in seq_len(length(cadd))) {
        cpart[cols %in% cadd[[i]]] <- max(cpart) + 1
      }
    }

    ### determine final clusters

    object@cpart <- cpart

    object@fcol <- rainbow(max(cpart))
    p <- clusters[order(clusters, decreasing = FALSE)]
    x <- object@out$cprobs[names(p)]
    fcol <- c("black", "blue", "green", "red", "yellow", "gray")
    if (plot) {
      for (i in 1:max(p)) {
        y <- -log10(x + 2.2e-16)
        y[p != i] <- 0
        if (i == 1) {
          b <-
            barplot(
              y,
              ylim = c(0, max(-log10(
                x + 2.2e-16
              )) * 2.1),
              col = fcol[i],
              border = fcol[i],
              names.arg = FALSE,
              ylab = "-log10prob"
            )
        } else {
          barplot(
            y,
            add = TRUE,
            col = fcol[i],
            border = fcol[i],
            names.arg = FALSE,
            axes = FALSE
          )
        }
      }
      abline(
        -log10(object@outlierpar$probthr),
        0,
        col = "black",
        lty = 2
      )
      d <- b[2, 1] - b[1, 1]
      y <- 0
      for (i in 1:max(p)) {
        y <- append(y, b[sum(p <= i), 1] + d / 2)
      }
      axis(1, at = (y[1:(length(y) - 1)] + y[-1]) / 2, labels = 1:max(p))
      box()
    }
    if (!quiet) {
      message(
        "The following cells are considered outliers: ",
        which(object@cpart > K),
        "\n"
      )
      print(which(object@cpart > K))
    }
    LL <- which(object@cpart > K)
    return(LL)
  }
)