deepvariant-r1.6.1 / Git / [9b26b7] /deepvariant/labeler/variant_labeler

Models:
cathy-stones/
deepvariant-r1.6.1
Downloads: 0
[9b26b7]: / deepvariant / labeler / variant_labeler_test.py
History
Download this file
304 lines (278 with data), 13.1 kB

# Copyright 2017 Google LLC.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
#
# 1. Redistributions of source code must retain the above copyright notice,
#    this list of conditions and the following disclaimer.
#
# 2. Redistributions in binary form must reproduce the above copyright
#    notice, this list of conditions and the following disclaimer in the
#    documentation and/or other materials provided with the distribution.
#
# 3. Neither the name of the copyright holder nor the names of its
#    contributors may be used to endorse or promote products derived from this
#    software without specific prior written permission.
#
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
# POSSIBILITY OF SUCH DAMAGE.
"""Tests for deepvariant .variant_labeler."""



from absl.testing import absltest
from absl.testing import parameterized
from third_party.nucleus.io import vcf
from third_party.nucleus.testing import test_utils
from third_party.nucleus.util import ranges
from deepvariant import testdata
from deepvariant.labeler import variant_labeler


def setUpModule():
  testdata.init()


class PlaceholderVariantLabeler(variant_labeler.VariantLabeler):
  """A placeholder VariantLabeler.

  This class provides a label_variants implementation and so allows the base
  class to be instantiated and its methods tested.
  """

  def __init__(self, *pos, **kwargs):
    super(PlaceholderVariantLabeler, self).__init__(*pos, **kwargs)

  def label_variants(self, variants, region):
    raise NotImplementedError


class VariantLabelerTest(parameterized.TestCase):
  snp = test_utils.make_variant(start=10, alleles=['A', 'C'], gt=[0, 1])

  def test_get_truth_variants(self):
    v1 = test_utils.make_variant(chrom='1', start=10)
    v2 = test_utils.make_variant(chrom='1', start=20)
    v3_filtered = test_utils.make_variant(chrom='1', start=30, filters=['FAIL'])
    v4_del = test_utils.make_variant(chrom='1', start=40, alleles=['AAAA', 'A'])
    v5_non_confident = test_utils.make_variant(chrom='1', start=150)

    variants = [v1, v2, v3_filtered, v4_del, v5_non_confident]
    reader = vcf.InMemoryVcfReader(variants=variants)
    confident_regions = ranges.RangeSet([ranges.make_range('1', 1, 100)])
    labeler = PlaceholderVariantLabeler(
        truth_vcf_reader=reader, confident_regions=confident_regions
    )

    # Check that we get v1 and v2 specifically when only they are covered by the
    # query.
    self.assertEqual(
        list(labeler._get_truth_variants(ranges.parse_literal('1:1-15'))), [v1]
    )
    self.assertEqual(
        list(labeler._get_truth_variants(ranges.parse_literal('1:15-25'))), [v2]
    )

    # We don't include filtered variants.
    self.assertEqual(
        list(labeler._get_truth_variants(ranges.parse_literal('1:25-35'))), []
    )

    # Check that we get all overlapping variants of our query.
    for del_query in ['1:35-45', '1:42-43', '1:38-42', '1:42-50']:
      self.assertEqual(
          list(labeler._get_truth_variants(ranges.parse_literal(del_query))),
          [v4_del],
      )

    # Checks that a simple query gets all our non-filtered variants.
    self.assertEqual(
        list(labeler._get_truth_variants(ranges.parse_literal('1:1-100'))),
        [v1, v2, v4_del],
    )
    # Even through our query covers v5, it's not confident, so we don't get it.
    self.assertEqual(
        list(labeler._get_truth_variants(ranges.parse_literal('1:1-1000'))),
        [v1, v2, v4_del],
    )

  @parameterized.parameters(
      # Make sure we get the right alt counts for all diploid genotypes.
      (['A', 'C'], ['C'], ['A', 'C'], [0, 0], (0, 0), 0),
      (['A', 'C'], ['C'], ['A', 'C'], [0, 1], (0, 1), 1),
      (['A', 'C'], ['C'], ['A', 'C'], [1, 0], (0, 1), 1),
      (['A', 'C'], ['C'], ['A', 'C'], [1, 1], (1, 1), 2),
      # Make sure get back a zero alt count for a reference variant.
      (['A'], [], ['A'], [0, 0], (0, 0), 0),
      # Basic multi-allelic tests, without having to deal with simplifying
      # alleles as all of the alleles are SNPs. Our candidates have an extra
      # allele, but the true GT is A/C.
      (['A', 'C', 'G'], ['C'], ['A', 'C'], [0, 1], (0, 1), 1),
      (['A', 'C', 'G'], ['C'], ['A', 'C'], [1, 1], (1, 1), 2),
      # When considering A/G our answer should be 0 as we have no copies
      # of the G allele.
      (['A', 'C', 'G'], ['G'], ['A', 'C'], [0, 1], (0, 1), 0),
      (['A', 'C', 'G'], ['G'], ['A', 'C'], [1, 1], (1, 1), 0),
      # We are considering the het-alt configuration here of A vs. C+G. We've
      # got one copy of the C allele so our true genotype is het. If truth is
      # hom-var for the C, though, we again label the composite as hom_var as
      # we have two copies of the C/G alt.
      (['A', 'C', 'G'], ['C', 'G'], ['A', 'C'], [0, 1], (0, 1), 1),
      (['A', 'C', 'G'], ['C', 'G'], ['A', 'C'], [1, 1], (1, 1), 2),
      # Here we have an extra allele in truth, while candidate is bi-allelic.
      # This example 'G' is unused in truth, so we are simply the normal
      # bi-allelic result.
      (['A', 'C'], ['C'], ['A', 'C', 'G'], [0, 0], (0, 0), 0),
      (['A', 'C'], ['C'], ['A', 'C', 'G'], [0, 1], (0, 1), 1),
      (['A', 'C'], ['C'], ['A', 'C', 'G'], [1, 1], (1, 1), 2),
      # We check here that we get the bi-allelic result even when the extra
      # allele is in position 1 not 2.
      (['A', 'G'], ['G'], ['A', 'C', 'G'], [0, 0], (0, 0), 0),
      (['A', 'G'], ['G'], ['A', 'C', 'G'], [0, 2], (0, 1), 1),
      (['A', 'G'], ['G'], ['A', 'C', 'G'], [2, 2], (1, 1), 2),
      # Now for a real het-alt. We've got three alleles in both, and the true
      # genotype is 1/2.
      (['A', 'C', 'G'], ['C'], ['A', 'C', 'G'], [1, 2], (1, 2), 1),
      (['A', 'C', 'G'], ['G'], ['A', 'C', 'G'], [1, 2], (1, 2), 1),
      (['A', 'C', 'G'], ['C', 'G'], ['A', 'C', 'G'], [1, 2], (1, 2), 2),
      # Test all possible values in candidate against het-alt:
      (['A', 'C', 'G', 'T'], ['C'], ['A', 'C', 'G'], [1, 2], (1, 2), 1),
      (['A', 'C', 'G', 'T'], ['G'], ['A', 'C', 'G'], [1, 2], (1, 2), 1),
      (['A', 'C', 'G', 'T'], ['T'], ['A', 'C', 'G'], [1, 2], (1, 2), 0),
      (['A', 'C', 'G', 'T'], ['C', 'G'], ['A', 'C', 'G'], [1, 2], (1, 2), 2),
      (['A', 'C', 'G', 'T'], ['C', 'T'], ['A', 'C', 'G'], [1, 2], (1, 2), 1),
      (['A', 'C', 'G', 'T'], ['G', 'T'], ['A', 'C', 'G'], [1, 2], (1, 2), 1),
      # Simple start for indel alleles => exact matching works here.
      (['A', 'AC'], ['AC'], ['A', 'AC'], [0, 0], (0, 0), 0),
      (['A', 'AC'], ['AC'], ['A', 'AC'], [0, 1], (0, 1), 1),
      (['A', 'AC'], ['AC'], ['A', 'AC'], [1, 1], (1, 1), 2),
      # We've got a multi-allelic truth, but again exact matching is enough.
      (['A', 'AC'], ['AC'], ['A', 'AC', 'ACC'], [0, 0], (0, 0), 0),
      (['A', 'AC'], ['AC'], ['A', 'AC', 'ACC'], [0, 1], (0, 1), 1),
      (['A', 'AC'], ['AC'], ['A', 'AC', 'ACC'], [1, 1], (1, 1), 2),
      (['A', 'AC'], ['AC'], ['A', 'AC', 'ACC'], [0, 2], (0, 0), 0),
      (['A', 'AC'], ['AC'], ['A', 'AC', 'ACC'], [1, 2], (0, 1), 1),
      (['A', 'AC'], ['AC'], ['A', 'AC', 'ACC'], [2, 2], (0, 0), 0),
      # This case has an extra allele (A) in truth but the true genotype
      # corresponds to our candidate alleles exactly.
      (['A', 'AC'], ['AC'], ['AC', 'A', 'ACC'], [0, 2], (0, 1), 1),
      (['A', 'AC'], ['AC'], ['AC', 'A', 'ACC'], [2, 2], (1, 1), 2),
      # If the true genotype involved just the deletion (A) allele, we don't
      # have that allele in our candidate so we always get 0 copies.
      (['A', 'AC'], ['AC'], ['AC', 'A', 'ACC'], [0, 1], (0, 0), 0),
      (['A', 'AC'], ['AC'], ['AC', 'A', 'ACC'], [1, 1], (0, 0), 0),
      # If the truth is het-alt, we can't match the deletion A allele but we do
      # in fact have the A => AC allele as this matches the AC => ACC allele in
      # truth set.
      (['A', 'AC'], ['AC'], ['AC', 'A', 'ACC'], [1, 2], (0, 1), 1),
      # We have a multi-allelic candidate but a simple bi-allelic truth. Make
      # sure we match correctly. This is an key case, as we should expect that
      # our candidates frequently have extra alleles changing the represention
      # relative to our truth candidates.
      (['ACT', 'A', 'AACT'], ['A'], ['A', 'AA'], [0, 1], (0, 2), 0),
      (['ACT', 'A', 'AACT'], ['A'], ['A', 'AA'], [1, 1], (2, 2), 0),
      (['ACT', 'A', 'AACT'], ['AACT'], ['A', 'AA'], [0, 1], (0, 2), 1),
      (['ACT', 'A', 'AACT'], ['AACT'], ['A', 'AA'], [1, 1], (2, 2), 2),
      (['ACT', 'A', 'AACT'], ['A', 'AACT'], ['A', 'AA'], [0, 1], (0, 2), 1),
      (['ACT', 'A', 'AACT'], ['A', 'AACT'], ['A', 'AA'], [1, 1], (2, 2), 2),
      # The whole complexity: multi-allelic candidate and truth, all with
      # different allele representations.
      # True genotype here is A/AGTGT where ref is AGT [common
      # dinucleotide expansion]. Both candidate and truth have this but each
      # as a different ref so none of the alleles exactly match.
      #
      # Truth     : AGT   => A [1] + AGTGT [2]
      # Candidate : AGTGT => AGT [2] + AGTGTGT [3]
      (
          ['AGTGT', 'A', 'AGT', 'AGTGTGT'],
          ['A'],
          ['AGT', 'A', 'AGTGT', 'AGTGTGT'],
          [1, 2],
          (2, 3),
          0,
      ),
      (
          ['AGTGT', 'A', 'AGT', 'AGTGTGT'],
          ['AGT'],
          ['AGT', 'A', 'AGTGT', 'AGTGTGT'],
          [1, 2],
          (2, 3),
          1,
      ),
      (
          ['AGTGT', 'A', 'AGT', 'AGTGTGT'],
          ['AGTGTGT'],
          ['AGT', 'A', 'AGTGT', 'AGTGTGT'],
          [1, 2],
          (2, 3),
          1,
      ),
      (
          ['AGTGT', 'A', 'AGT', 'AGTGTGT'],
          ['A', 'AGT'],
          ['AGT', 'A', 'AGTGT', 'AGTGTGT'],
          [1, 2],
          (2, 3),
          1,
      ),
      (
          ['AGTGT', 'A', 'AGT', 'AGTGTGT'],
          ['A', 'AGTGTGT'],
          ['AGT', 'A', 'AGTGT', 'AGTGTGT'],
          [1, 2],
          (2, 3),
          1,
      ),
      (
          ['AGTGT', 'A', 'AGT', 'AGTGTGT'],
          ['AGT', 'AGTGTGT'],
          ['AGT', 'A', 'AGTGT', 'AGTGTGT'],
          [1, 2],
          (2, 3),
          2,
      ),
      # Misc. checks with block substititions.
      (['AT', 'A', 'GC'], ['A'], ['ATT', 'AT', 'A'], [0, 1], (0, 1), 1),
      (['AT', 'A', 'GT'], ['A'], ['A', 'G'], [0, 1], (0, 2), 0),
      (['AT', 'A', 'GT'], ['GT'], ['A', 'G'], [0, 1], (0, 2), 1),
  )
  def test_genotype_from_matched_truth(
      self,
      variant_alleles,
      alt_alleles,
      truth_alleles,
      truth_gt,
      expected_genotype,
      expected_label,
  ):
    variant = test_utils.make_variant(start=10, alleles=variant_alleles)
    truth_variant = test_utils.make_variant(
        start=10, alleles=truth_alleles, gt=truth_gt
    )
    self.assertEqual(
        expected_genotype,
        variant_labeler._genotype_from_matched_truth(variant, truth_variant),
    )
    labeled = variant_labeler.VariantLabel(
        is_confident=True, variant=variant, genotype=expected_genotype
    )
    indices = [variant_alleles.index(alt) - 1 for alt in alt_alleles]
    self.assertEqual(labeled.label_for_alt_alleles(indices), expected_label)

  def test_genotype_from_matched_truth_none_truth_variant_raises(self):
    with self.assertRaisesRegex(ValueError, 'truth_variant cannot be None'):
      variant_labeler._genotype_from_matched_truth(self.snp, None)

  def test_genotype_from_matched_truth_no_call_truth_variant_raises(self):
    with self.assertRaisesRegex(ValueError, 'Expected exactly one VariantCal'):
      variant_labeler._genotype_from_matched_truth(
          self.snp,
          test_utils.make_variant(
              start=10,
              alleles=['A', 'C'],
          ),
      )

  def test_genotype_from_matched_truth_no_gt_truth_variant_raises(self):
    with self.assertRaisesRegex(ValueError, 'truth_variant needs genotypes'):
      variant_labeler._genotype_from_matched_truth(
          self.snp,
          test_utils.make_variant(
              start=10,
              alleles=['A', 'C'],
              gt=[-1, -1],
          ),
      )

  def test_genotype_from_matched_truth_none_variant_raises(self):
    with self.assertRaisesRegex(ValueError, 'variant cannot be None'):
      variant_labeler._genotype_from_matched_truth(None, self.snp)


if __name__ == '__main__':
  absltest.main()