Switch to unified view

a b/deepvariant/labeler/positional_labeler_test.py
1
# Copyright 2017 Google LLC.
2
#
3
# Redistribution and use in source and binary forms, with or without
4
# modification, are permitted provided that the following conditions
5
# are met:
6
#
7
# 1. Redistributions of source code must retain the above copyright notice,
8
#    this list of conditions and the following disclaimer.
9
#
10
# 2. Redistributions in binary form must reproduce the above copyright
11
#    notice, this list of conditions and the following disclaimer in the
12
#    documentation and/or other materials provided with the distribution.
13
#
14
# 3. Neither the name of the copyright holder nor the names of its
15
#    contributors may be used to endorse or promote products derived from this
16
#    software without specific prior written permission.
17
#
18
# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
19
# AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21
# ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE
22
# LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
23
# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
24
# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
25
# INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
26
# CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
27
# ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
28
# POSSIBILITY OF SUCH DAMAGE.
29
"""Tests for deepvariant.labeler.positional_labeler."""
30
31
from absl.testing import absltest
32
from absl.testing import parameterized
33
34
from third_party.nucleus.io import vcf
35
from third_party.nucleus.testing import test_utils
36
from third_party.nucleus.util import ranges
37
from deepvariant import testdata
38
from deepvariant.labeler import positional_labeler
39
40
41
def setUpModule():
  """Module-level fixture: initializes the deepvariant testdata module."""
  testdata.init()
43
44
45
class PositionalVariantLabelerTest(parameterized.TestCase):
  """Exercises PositionalVariantLabeler against small in-memory truth sets.

  The class-level variants below form the truth set shared by
  test_label_variants; the remaining tests build their own truth sets.
  """

  # Truth variants whose starts fall inside the [10, 100) confident region
  # used by test_label_variants: a SNP, a deletion, and a multi-allelic site.
  snp = test_utils.make_variant(start=10, alleles=['A', 'C'], gt=[0, 1])
  deletion = test_utils.make_variant(start=20, alleles=['ACG', 'A'], gt=[1, 1])
  multiallelic = test_utils.make_variant(
      start=30, alleles=['ACT', 'ACTGT', 'A'], gt=[1, 2]
  )
  # Starts beyond the confident region used by test_label_variants.
  non_confident = test_utils.make_variant(
      start=200, alleles=['A', 'C'], gt=[0, 1]
  )
  # Inside the confident region but carrying a filter.
  filtered = test_utils.make_variant(start=40, filters='FAILED', gt=[0, 1])

  variants = [snp, deletion, multiallelic, non_confident, filtered]

  def _make_labeler(self, variants, confident_regions):
    """Builds a PositionalVariantLabeler over an in-memory truth set.

    Args:
      variants: Truth variants, served through a vcf.InMemoryVcfReader.
      confident_regions: Passed to the labeler as its confident regions.

    Returns:
      The constructed positional_labeler.PositionalVariantLabeler.
    """
    truth_reader = vcf.InMemoryVcfReader(variants)
    return positional_labeler.PositionalVariantLabeler(
        truth_vcf_reader=truth_reader,
        confident_regions=confident_regions,
    )

  @parameterized.parameters(
      # Simple cases: a candidate identical to a truth variant inside the
      # confident region gets that truth variant back.
      dict(candidate=snp, expected_confident=True, expected_truth=snp),
      dict(
          candidate=deletion, expected_confident=True, expected_truth=deletion
      ),
      dict(
          candidate=multiallelic,
          expected_confident=True,
          expected_truth=multiallelic,
      ),
      # Behavior outside the confident region: even though non_confident is in
      # the truth set, no truth variant is returned for it.
      dict(
          candidate=non_confident, expected_confident=False, expected_truth=None
      ),
      # Outside the confident region and absent from the truth set: None for
      # the truth variant and False for confidence.
      dict(
          candidate=test_utils.make_variant(start=300, alleles=['A', 'C']),
          expected_confident=False,
          expected_truth=None,
      ),
      # No truth variant at this position, but it is inside the confident
      # region: the expected truth mirrors the candidate with a (0, 0) genotype.
      dict(
          candidate=test_utils.make_variant(start=15, alleles=['C', 'A']),
          expected_confident=True,
          expected_genotype=(0, 0),
          expected_truth=test_utils.make_variant(
              start=15, alleles=['C', 'A'], gt=[0, 0]
          ),
      ),
      # These candidates start at our SNP but have different alleles. We are
      # still confident and get back the true snp variant (alleles=['A', 'C'],
      # gt=[0, 1]) despite the allele mismatch; the label genotype is (0, 0).
      dict(
          candidate=test_utils.make_variant(
              start=snp.start, alleles=['A', 'G']
          ),
          expected_confident=True,
          expected_genotype=(0, 0),
          expected_truth=snp,
      ),
      dict(
          candidate=test_utils.make_variant(
              start=snp.start, alleles=['AC', 'C']
          ),
          expected_confident=True,
          expected_genotype=(0, 0),
          expected_truth=snp,
      ),
      dict(
          candidate=test_utils.make_variant(
              start=snp.start, alleles=['A', 'CA']
          ),
          expected_confident=True,
          expected_genotype=(0, 0),
          expected_truth=snp,
      ),
      # Checks that the filtered truth variant is not used as a match: instead
      # of the filtered variant we expect one with a (0, 0) genotype.
      dict(
          candidate=test_utils.make_variant(start=filtered.start),
          expected_confident=True,
          expected_genotype=(0, 0),
          expected_truth=test_utils.make_variant(
              start=filtered.start, gt=(0, 0)
          ),
      ),
  )
  def test_label_variants(
      self,
      candidate,
      expected_confident,
      expected_truth,
      expected_genotype=None,
  ):
    # Labels a single candidate against the class-level truth set and checks
    # both the low-level _match result and the label from label_variants.
    confident_regions = ranges.RangeSet(
        [ranges.make_range(self.snp.reference_name, 10, 100)]
    )
    labeler = self._make_labeler(self.variants, confident_regions)

    # Call _match directly so the matched truth variant itself can be compared
    # with the expected one.
    is_confident, truth_variant = labeler._match(candidate)
    self.assertEqual(expected_truth, truth_variant)
    self.assertEqual(is_confident, expected_confident)

    # Now exercise the higher-level API. Without an explicit expected genotype,
    # fall back to the genotype of the expected truth variant's first call.
    if expected_genotype is None and expected_truth is not None:
      expected_genotype = tuple(expected_truth.calls[0].genotype)
    labels = list(labeler.label_variants([candidate]))
    self.assertLen(labels, 1)
    label = labels[0]
    self.assertEqual(candidate, label.variant)
    self.assertEqual(expected_confident, label.is_confident)
    self.assertEqual(expected_genotype, label.genotype)

  def test_match_selects_variant_by_start(self):
    # _match must select the truth variant with the same start as the
    # candidate, even when that variant has different alleles and an
    # overlapping truth variant has exactly the candidate's alleles.
    same_alleles = test_utils.make_variant(
        start=20, alleles=['CC', 'A'], gt=[1, 1]
    )
    same_start = test_utils.make_variant(
        start=21, alleles=['AAA', 'A'], gt=[0, 1]
    )
    downstream = test_utils.make_variant(
        start=22, alleles=['AA', 'A'], gt=[1, 1]
    )
    truth_variants = [same_alleles, same_start, downstream]
    candidate = test_utils.make_variant(start=21, alleles=['CC', 'A'])

    confident_regions = ranges.RangeSet(
        [ranges.make_range(truth_variants[0].reference_name, 0, 100)]
    )
    labeler = self._make_labeler(truth_variants, confident_regions)
    is_confident, truth_variant = labeler._match(candidate)
    self.assertEqual(is_confident, True)
    self.assertEqual(truth_variant, same_start)

  # Three truth variants sharing start=20 and ref 'A' but with different alts;
  # used by the first two test_match_multiple_matches cases.
  _same_start_truths = [
      test_utils.make_variant(start=20, alleles=['A', 'CC'], gt=[1, 1]),
      test_utils.make_variant(start=20, alleles=['A', 'AAA'], gt=[0, 1]),
      test_utils.make_variant(start=20, alleles=['A', 'AA'], gt=[1, 1]),
  ]

  @parameterized.parameters(
      # Several truth variants share the candidate's start; the one whose alt
      # matches the candidate's alt is selected.
      dict(
          overlapping_variants=_same_start_truths,
          candidate=test_utils.make_variant(start=20, alleles=['A', 'AAA']),
          expected_confident=True,
          truth_variant_idx=1,
      ),
      # No truth variant has a matching alt, so the first truth variant is
      # used.
      dict(
          overlapping_variants=_same_start_truths,
          candidate=test_utils.make_variant(start=20, alleles=['A', 'TT']),
          expected_confident=True,
          truth_variant_idx=0,
      ),
      # GAAA->GAA is the same as GA->G (the second truth variant), but if the
      # alleles are not simplified before comparing, there will be no match and
      # we would incorrectly fall back to the first one.
      dict(
          overlapping_variants=[
              test_utils.make_variant(
                  start=20, alleles=['GAA', 'G'], gt=[1, 1]
              ),
              test_utils.make_variant(start=20, alleles=['GA', 'G'], gt=[0, 1]),
          ],
          candidate=test_utils.make_variant(start=20, alleles=['GAAA', 'GAA']),
          expected_confident=True,
          truth_variant_idx=1,
      ),
  )
  def test_match_multiple_matches(
      self,
      overlapping_variants,
      candidate,
      expected_confident,
      truth_variant_idx,
  ):
    # Checks which of several truth variants at the candidate's start is
    # returned by _match.
    confident_regions = ranges.RangeSet(
        [ranges.make_range(overlapping_variants[0].reference_name, 0, 100)]
    )
    labeler = self._make_labeler(overlapping_variants, confident_regions)
    is_confident, variant_match = labeler._match(candidate)
    expected_variant = overlapping_variants[truth_variant_idx]
    self.assertEqual(is_confident, expected_confident)
    self.assertEqual(variant_match, expected_variant)
242
243
244
# Hand control to absltest's entry point when this file is run as a script.
if __name__ == '__main__':
  absltest.main()