deepvariant-r1.6.1 / Git / [9b26b7] /deepvariant/labeler/positional_labeler

Models:
cathy-stones/
deepvariant-r1.6.1
Downloads: 0
[9b26b7]: / deepvariant / labeler / positional_labeler_test.py
History
Download this file
248 lines (229 with data), 9.6 kB

# Copyright 2017 Google LLC.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from this
#    software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""Tests for deepvariant .variant_labeler."""



from absl.testing import absltest
from absl.testing import parameterized

from third_party.nucleus.io import vcf
from third_party.nucleus.testing import test_utils
from third_party.nucleus.util import ranges
from deepvariant import testdata
from deepvariant.labeler import positional_labeler


def setUpModule():
  """Calls testdata.init() as the module-level setup hook for these tests."""
  testdata.init()


class PositionalVariantLabelerTest(parameterized.TestCase):
  """Tests for positional_labeler.PositionalVariantLabeler.

  The class-level variants below form a small truth set. Tests wrap a list of
  truth variants in a vcf.InMemoryVcfReader, pair it with a RangeSet of
  confident regions, and check what the labeler's _match() and
  label_variants() return for a given candidate variant.
  """

  # Confident variants: SNP, deletion, and multi-allelic.
  snp = test_utils.make_variant(start=10, alleles=['A', 'C'], gt=[0, 1])
  deletion = test_utils.make_variant(start=20, alleles=['ACG', 'A'], gt=[1, 1])
  multiallelic = test_utils.make_variant(
      start=30, alleles=['ACT', 'ACTGT', 'A'], gt=[1, 2]
  )
  # Outside our confident regions.
  non_confident = test_utils.make_variant(
      start=200, alleles=['A', 'C'], gt=[0, 1]
  )
  # Truth variant marked with the 'FAILED' filter.
  filtered = test_utils.make_variant(start=40, filters='FAILED', gt=[0, 1])

  # The truth set used by test_label_variants.
  variants = [snp, deletion, multiallelic, non_confident, filtered]

  def _make_labeler(self, variants, confident_regions):
    """Builds a PositionalVariantLabeler over in-memory truth variants.

    Args:
      variants: Truth variants to serve through a vcf.InMemoryVcfReader.
      confident_regions: The confident regions to hand to the labeler (the
        tests pass a ranges.RangeSet).

    Returns:
      A positional_labeler.PositionalVariantLabeler.
    """
    return positional_labeler.PositionalVariantLabeler(
        truth_vcf_reader=vcf.InMemoryVcfReader(variants),
        confident_regions=confident_regions,
    )

  @parameterized.parameters(
      # Simple tests: we get back our matching variants in the confident regions
      dict(candidate=snp, expected_confident=True, expected_truth=snp),
      dict(
          candidate=deletion, expected_confident=True, expected_truth=deletion
      ),
      dict(
          candidate=multiallelic,
          expected_confident=True,
          expected_truth=multiallelic,
      ),
      # Test the behavior outside of our confident regions.
      # If we provide a variant outside the confident regions (non_confident) we
      # don't get back any expected_truth variants.
      dict(
          candidate=non_confident, expected_confident=False, expected_truth=None
      ),
      # No matching truth variant and outside the confident regions, so we get
      # a None as well as False.
      dict(
          candidate=test_utils.make_variant(start=300, alleles=['A', 'C']),
          expected_confident=False,
          expected_truth=None,
      ),
      # This variant doesn't have any match but we're confident in it, so the
      # expected truth is the candidate itself with a (0, 0) genotype.
      dict(
          candidate=test_utils.make_variant(start=15, alleles=['C', 'A']),
          expected_confident=True,
          expected_genotype=(0, 0),
          expected_truth=test_utils.make_variant(
              start=15, alleles=['C', 'A'], gt=[0, 0]
          ),
      ),
      # These variants start at our SNP but have different alleles. We are
      # confident and we get back the true snp variant, despite the differing
      # alleles; the expected label genotype is (0, 0) rather than snp's own.
      # snp has alleles=['A', 'C'] and gt=[0, 1].
      dict(
          candidate=test_utils.make_variant(
              start=snp.start, alleles=['A', 'G']
          ),
          expected_confident=True,
          expected_genotype=(0, 0),
          expected_truth=snp,
      ),
      dict(
          candidate=test_utils.make_variant(
              start=snp.start, alleles=['AC', 'C']
          ),
          expected_confident=True,
          expected_genotype=(0, 0),
          expected_truth=snp,
      ),
      dict(
          candidate=test_utils.make_variant(
              start=snp.start, alleles=['A', 'CA']
          ),
          expected_confident=True,
          expected_genotype=(0, 0),
          expected_truth=snp,
      ),
      # Checks that we don't match against the filtered truth variant in our
      # database. This means that we return not the filtered variant but one
      # with a (0, 0) genotype.
      dict(
          candidate=test_utils.make_variant(start=filtered.start),
          expected_confident=True,
          expected_genotype=(0, 0),
          expected_truth=test_utils.make_variant(
              start=filtered.start, gt=(0, 0)
          ),
      ),
  )
  def test_label_variants(
      self,
      candidate,
      expected_confident,
      expected_truth,
      expected_genotype=None,
  ):
    """Checks _match() and label_variants() for a single candidate.

    The labeler is built over the class-level truth set with one confident
    region spanning [10, 100) on snp's reference.

    Args:
      candidate: The candidate variant to label.
      expected_confident: Expected is_confident value for the candidate.
      expected_truth: Expected truth variant returned by _match(), or None.
      expected_genotype: Expected label genotype. When None and
        expected_truth is not None, the genotype of expected_truth's first
        call is used instead.
    """
    labeler = self._make_labeler(
        self.variants,
        ranges.RangeSet([ranges.make_range(self.snp.reference_name, 10, 100)]),
    )

    # Call _match so we can compare our expected truth with the actual one.
    is_confident, truth_variant = labeler._match(candidate)
    self.assertEqual(expected_truth, truth_variant)
    self.assertEqual(expected_confident, is_confident)

    # Now call label_variants to exercise the higher-level API.
    if expected_genotype is None and expected_truth is not None:
      expected_genotype = tuple(expected_truth.calls[0].genotype)
    labels = list(labeler.label_variants([candidate]))
    self.assertLen(labels, 1)
    self.assertEqual(candidate, labels[0].variant)
    self.assertEqual(expected_confident, labels[0].is_confident)
    self.assertEqual(expected_genotype, labels[0].genotype)

  def test_match_selects_variant_by_start(self):
    """Checks that _match() keys on the start position, not the alleles."""
    # Tests that _match() selects the truth variant at the same start as the
    # candidate, even though that variant has different alleles from the
    # candidate and an overlapping variant at another start has the same
    # alleles as the candidate.
    overlapping = [
        test_utils.make_variant(start=20, alleles=['CC', 'A'], gt=[1, 1]),
        test_utils.make_variant(start=21, alleles=['AAA', 'A'], gt=[0, 1]),
        test_utils.make_variant(start=22, alleles=['AA', 'A'], gt=[1, 1]),
    ]
    candidate = test_utils.make_variant(start=21, alleles=['CC', 'A'])

    labeler = self._make_labeler(
        overlapping,
        ranges.RangeSet(
            [ranges.make_range(overlapping[0].reference_name, 0, 100)]
        ),
    )
    is_confident, truth_variant = labeler._match(candidate)
    self.assertTrue(is_confident)
    self.assertEqual(overlapping[1], truth_variant)

  @parameterized.parameters(
      # One truth variant has exactly the candidate's alleles, so it is chosen.
      dict(
          overlapping_variants=[
              test_utils.make_variant(start=20, alleles=['A', 'CC'], gt=[1, 1]),
              test_utils.make_variant(
                  start=20, alleles=['A', 'AAA'], gt=[0, 1]
              ),
              test_utils.make_variant(start=20, alleles=['A', 'AA'], gt=[1, 1]),
          ],
          candidate=test_utils.make_variant(start=20, alleles=['A', 'AAA']),
          expected_confident=True,
          truth_variant_idx=1,
      ),
      # No truth variant with a matching alt, so fall back to the first one.
      dict(
          overlapping_variants=[
              test_utils.make_variant(start=20, alleles=['A', 'CC'], gt=[1, 1]),
              test_utils.make_variant(
                  start=20, alleles=['A', 'AAA'], gt=[0, 1]
              ),
              test_utils.make_variant(start=20, alleles=['A', 'AA'], gt=[1, 1]),
          ],
          candidate=test_utils.make_variant(start=20, alleles=['A', 'TT']),
          expected_confident=True,
          truth_variant_idx=0,
      ),
      # GAAA->GAA is the same as GA->G (the second one in matches), but if we
      # don't simplify the alleles before comparing, there will be no match and
      # we will incorrectly fall back to the first one.
      dict(
          overlapping_variants=[
              test_utils.make_variant(
                  start=20, alleles=['GAA', 'G'], gt=[1, 1]
              ),
              test_utils.make_variant(start=20, alleles=['GA', 'G'], gt=[0, 1]),
          ],
          candidate=test_utils.make_variant(start=20, alleles=['GAAA', 'GAA']),
          expected_confident=True,
          truth_variant_idx=1,
      ),
  )
  def test_match_multiple_matches(
      self,
      overlapping_variants,
      candidate,
      expected_confident,
      truth_variant_idx,
  ):
    """Checks which truth variant _match() picks among several at one start.

    Args:
      overlapping_variants: Truth variants, all starting at the candidate's
        start position.
      candidate: The candidate variant to match.
      expected_confident: Expected is_confident value returned by _match().
      truth_variant_idx: Index into overlapping_variants of the truth variant
        that _match() is expected to return.
    """
    labeler = self._make_labeler(
        overlapping_variants,
        ranges.RangeSet(
            [ranges.make_range(overlapping_variants[0].reference_name, 0, 100)]
        ),
    )
    is_confident, variant_match = labeler._match(candidate)
    expected_variant = overlapping_variants[truth_variant_idx]
    self.assertEqual(expected_confident, is_confident)
    self.assertEqual(expected_variant, variant_match)


# Run this module's tests through absltest when executed as a script.
if __name__ == '__main__':
  absltest.main()