deepvariant-r1.6.1 / Git / [9b26b7] /third_party/nucleus/util/genomics_math

Models:
cathy-stones/
deepvariant-r1.6.1
Downloads: 0
[9b26b7]: / third_party / nucleus / util / genomics_math_test.py
History
Download this file
240 lines (225 with data), 8.8 kB

# Copyright 2018 Google LLC.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from this
#    software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.

"""Tests for third_party.nucleus.util.genomics_math."""

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from absl.testing import absltest
from absl.testing import parameterized
import numpy as np
import numpy.testing as npt

from third_party.nucleus.util import genomics_math


class MathTests(parameterized.TestCase):

  @parameterized.parameters(
      (0.9000000, None, 10.0),
      (0.9900000, None, 20.0),
      (0.9990000, None, 30.0),
      (0.9999000, None, 40.0),
      (0.9999900, None, 50.0),
      (0.9999990, None, 60.0),
      (0.9999999, None, 70.0),
      # Check that bounding works.
      (0.9999999, 1 - 1e-1, 10.0),
      (0.9999999, 1 - 1e-2, 20.0),
      (0.9999999, 1 - 1e-3, 30.0),
      (0.9999999, 1 - 1e-9, 70.0),
  )
  def test_phred_scale(self, prob, bound, expected):
    if bound:
      actual = genomics_math.ptrue_to_bounded_phred(prob, bound)
    else:
      actual = genomics_math.ptrue_to_bounded_phred(prob)
    self.assertAlmostEqual(actual, expected, places=6)

  @parameterized.parameters(
      (1.000000, None, 0.0),
      (0.100000, None, -1.0),
      (0.010000, None, -2.0),
      (0.001000, None, -3.0),
      (0.000100, None, -4.0),
      (0.000010, None, -5.0),
      (0.000001, None, -6.0),
      # Check that bounding works.
      (0.000100, 1e-1, -1.0),
      (0.000100, 1e-2, -2.0),
      (0.000100, 1e-3, -3.0),
      (0.000100, 1e-4, -4.0),
      (0.000100, 1e-5, -4.0),
      (0.000100, 1e-6, -4.0),
  )
  def test_log10_prob(self, prob, bound, expected):
    if bound:
      actual = genomics_math.perror_to_bounded_log10_perror(prob, bound)
    else:
      actual = genomics_math.perror_to_bounded_log10_perror(prob)
    self.assertAlmostEqual(actual, expected, places=6)

  @parameterized.parameters(
      (np.log10(0.900000), -1.0, 10.0),
      (np.log10(0.990000), -1.0, 20.0),
      (np.log10(0.999000), -1.0, 30.0),
      # A huge negative value is handled correctly.
      (-10000000.0, -1.0, 0.0),
      # Check that we can pass in a 0.0 probability and get a good value.
      (0.0, -1.0, -1.0),
      # This probability is still calculated correctly, included for safety.
      (0 - 1e-16, -1.0, 156.53559774527022),
      # Pass in a prob close to one, making sure we get bound value back.
      (0 - 1e-32, -1.0, -1.0),
  )
  def test_log10_ptrue_to_phred(self, prob, value_if_not_finite, expected):
    actual = genomics_math.log10_ptrue_to_phred(prob, value_if_not_finite)
    self.assertAlmostEqual(actual, expected, places=6)

  # R code to produce the expectation table.
  # expected <- function(k, n, p) {
  #   pbin <- dbinom(k, n, p, log=T) * log10(exp(1))
  #   likelihoods = paste(sprintf("%.6f", pbin), collapse=", ")
  #   result = paste(k, n, p, pbin, sep=", ")
  #   cat(paste("(", result, "),\n", sep=""))
  # }
  #
  # for (n in c(0, 5, 10)) {
  #   for (k in seq(0, n)) {
  #     for (p in c(0.01, 0.5)) {
  #       expected(k, n, p)
  #     }
  #   }
  # }
  # expected(0, 1000, 0.5)
  # expected(0, 10000, 0.5)
  # expected(100, 10000, 0.5)
  @parameterized.parameters(
      (0, 0, 0.01, 0),
      (0, 0, 0.5, 0),
      (0, 5, 0.01, -0.0218240270122504),
      (0, 5, 0.5, -1.50514997831991),
      (1, 5, 0.01, -1.31848921727378),
      (1, 5, 0.5, -0.806179973983887),
      (2, 5, 0.01, -3.01309441620735),
      (2, 5, 0.5, -0.505149978319906),
      (3, 5, 0.01, -5.0087296108049),
      (3, 5, 0.5, -0.505149978319906),
      (4, 5, 0.01, -7.30539480106643),
      (4, 5, 0.5, -0.806179973983887),
      (5, 5, 0.01, -10),
      (5, 5, 0.5, -1.50514997831991),
      (0, 10, 0.01, -0.0436480540245008),
      (0, 10, 0.5, -3.01029995663981),
      (1, 10, 0.01, -1.03928324862205),
      (1, 10, 0.5, -2.01029995663981),
      (2, 10, 0.01, -2.38170592944426),
      (2, 10, 0.5, -1.35708744286447),
      (3, 10, 0.01, -3.95137239176953),
      (3, 10, 0.5, -0.931118710592187),
      (4, 10, 0.01, -5.70396953768078),
      (4, 10, 0.5, -0.688080661905893),
      (5, 10, 0.01, -7.62042348623071),
      (5, 10, 0.5, -0.608899415858268),
      (6, 10, 0.01, -9.69523992687588),
      (6, 10, 0.5, -0.688080661905893),
      (7, 10, 0.01, -11.9339131701597),
      (7, 10, 0.5, -0.931118710592187),
      (8, 10, 0.01, -14.3555170970296),
      (8, 10, 0.5, -1.35708744286447),
      (9, 10, 0.01, -17.0043648054024),
      (9, 10, 0.5, -2.01029995663981),
      (10, 10, 0.01, -20),
      (10, 10, 0.5, -3.01029995663981),
      (0, 1000, 0.5, -301.029995663981),
      (0, 10000, 0.5, -3010.29995663981),
      (100, 10000, 0.5, -2768.48565263445),
  )
  def test_log10_binomial(self, k, n, p, expected):
    self.assertAlmostEqual(genomics_math.log10_binomial(k, n, p), expected)

  @parameterized.parameters(
      ([0], 0.0),
      ([0.0], 0.0),
      ([0.0, -10000.0], 0.0),
      ([-1000.0, -10000.0], -1000.0),
      ([-1, -10, -100], -1.0),
      ([-1, -10, -1], -0.69897),
      ([-1, -1, -1], -0.5228787),
      ([-1, -1, -1, -100], -0.5228787),
      ([-1, -1, -1, -100, -1000], -0.5228787),
      ([-1, -1, -1, -100, -1000, -10000], -0.5228787),
      ([-1, -1, -1, -100, -1000, -10000, -100000], -0.5228787),
  )
  def test_log10sumexp(self, log10_probs, expected):
    self.assertAlmostEqual(genomics_math.log10sumexp(log10_probs), expected)

  # R code to compute expected results.
  # expected <- function(lprobs) {
  #   result = lprobs - log10(sum(10^lprobs))
  #   lprob_str = paste("[", paste(sprintf("%.6f", lprobs), collapse=", "),
  #                     "]", sep="")
  #   result_str = paste("[", paste(sprintf("%.6f", result), collapse=", "),
  #                     "]", sep="")
  #   cat(paste("(", lprob_str, ", ", result_str, "),\n", sep=""))
  # }
  #
  # expected(c(0))
  # expected(c(-1, -10))
  # expected(c(-1, -100))
  # expected(c(-1, -1000))
  # expected(c(-1, -2))
  # expected(c(-1, -2, -3))
  # expected(c(-1, -2, -3, -100))
  # expected(c(-1, -2, -100))
  # expected(c(-1, -2, -100, -100))
  @parameterized.parameters(
      dict(
          log10_probs=[0.000000],
          expected=[0.000000]),
      dict(
          log10_probs=[-1.000000, -10.000000],
          expected=[-0.000000, -9.000000]),
      dict(
          log10_probs=[-1.000000, -100.000000],
          expected=[0.000000, -99.000000]),
      dict(
          log10_probs=[-1.000000, -1000.000000],
          expected=[0.000000, -999.000000]),
      dict(
          log10_probs=[-1.000000, -2.000000],
          expected=[-0.041393, -1.041393]),
      dict(
          log10_probs=[-1.000000, -2.000000, -3.000000],
          expected=[-0.045323, -1.045323, -2.045323]),
      dict(
          log10_probs=[-1.000000, -2.000000, -3.000000, -100.000000],
          expected=[-0.045323, -1.045323, -2.045323, -99.045323]),
      dict(
          log10_probs=[-1.000000, -2.000000, -100.000000],
          expected=[-0.041393, -1.041393, -99.041393]),
      dict(
          log10_probs=[-1.000000, -2.000000, -100.000000, -100.000000],
          expected=[-0.041393, -1.041393, -99.041393, -99.041393]),
  )
  def test_normalize_log10_probs(self, log10_probs, expected):
    npt.assert_allclose(
        genomics_math.normalize_log10_probs(log10_probs), expected, atol=1e-6)


if __name__ == '__main__':
  absltest.main()